author    Stephen Hines <srhines@google.com>  2014-04-23 16:57:46 -0700
committer Stephen Hines <srhines@google.com>  2014-04-24 15:53:16 -0700
commit    36b56886974eae4f9c5ebc96befd3e7bfe5de338 (patch)
tree      e6cfb69fbbd937f450eeb83bfb83b9da3b01275a /test/CodeGen
parent    69a8640022b04415ae9fac62f8ab090601d8f889 (diff)
download  external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.zip
external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.tar.gz
external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.tar.bz2
Update to LLVM 3.5a.
Change-Id: Ifadecab779f128e62e430c2b4f6ddd84953ed617
Diffstat (limited to 'test/CodeGen')
-rw-r--r--test/CodeGen/AArch64/128bit_load_store.ll53
-rw-r--r--test/CodeGen/AArch64/adc.ll33
-rw-r--r--test/CodeGen/AArch64/assertion-rc-mismatch.ll24
-rw-r--r--test/CodeGen/AArch64/atomic-ops.ll89
-rw-r--r--test/CodeGen/AArch64/concatvector-bugs.ll68
-rw-r--r--test/CodeGen/AArch64/cpus.ll13
-rw-r--r--test/CodeGen/AArch64/fcvt-int.ll2
-rw-r--r--test/CodeGen/AArch64/fp-dp3.ll54
-rw-r--r--test/CodeGen/AArch64/func-argpassing.ll13
-rw-r--r--test/CodeGen/AArch64/func-calls.ll8
-rw-r--r--test/CodeGen/AArch64/i128-shift.ll43
-rw-r--r--test/CodeGen/AArch64/init-array.ll1
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints.ll2
-rw-r--r--test/CodeGen/AArch64/inline-asm-modifiers.ll2
-rw-r--r--test/CodeGen/AArch64/jump-table.ll16
-rw-r--r--test/CodeGen/AArch64/mature-mc-support.ll12
-rw-r--r--test/CodeGen/AArch64/misched-basic-A53.ll112
-rw-r--r--test/CodeGen/AArch64/mul-lohi.ll19
-rw-r--r--test/CodeGen/AArch64/neon-2velem.ll303
-rw-r--r--test/CodeGen/AArch64/neon-3vdiff.ll27
-rw-r--r--test/CodeGen/AArch64/neon-across.ll20
-rw-r--r--test/CodeGen/AArch64/neon-add-pairwise.ll9
-rw-r--r--test/CodeGen/AArch64/neon-add-sub.ll84
-rw-r--r--test/CodeGen/AArch64/neon-bitcast.ll12
-rw-r--r--test/CodeGen/AArch64/neon-bitwise-instructions.ll723
-rw-r--r--test/CodeGen/AArch64/neon-bsl.ll13
-rw-r--r--test/CodeGen/AArch64/neon-copy.ll887
-rw-r--r--test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll47
-rw-r--r--test/CodeGen/AArch64/neon-crypto.ll63
-rw-r--r--test/CodeGen/AArch64/neon-extract.ll32
-rw-r--r--test/CodeGen/AArch64/neon-facge-facgt.ll24
-rw-r--r--test/CodeGen/AArch64/neon-fma.ll50
-rw-r--r--test/CodeGen/AArch64/neon-fpround_f128.ll18
-rw-r--r--test/CodeGen/AArch64/neon-load-store-v1i32.ll29
-rw-r--r--test/CodeGen/AArch64/neon-max-min-pairwise.ll36
-rw-r--r--test/CodeGen/AArch64/neon-misc.ll399
-rw-r--r--test/CodeGen/AArch64/neon-mla-mls.ll24
-rw-r--r--test/CodeGen/AArch64/neon-mov.ll82
-rw-r--r--test/CodeGen/AArch64/neon-mul-div.ll597
-rw-r--r--test/CodeGen/AArch64/neon-or-combine.ll29
-rw-r--r--test/CodeGen/AArch64/neon-perm.ll1441
-rw-r--r--test/CodeGen/AArch64/neon-scalar-add-sub.ll12
-rw-r--r--test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll24
-rw-r--r--test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll28
-rw-r--r--test/CodeGen/AArch64/neon-scalar-compare.ll28
-rw-r--r--test/CodeGen/AArch64/neon-scalar-copy.ll43
-rw-r--r--test/CodeGen/AArch64/neon-scalar-cvt.ll52
-rw-r--r--test/CodeGen/AArch64/neon-scalar-ext.ll113
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fabd.ll14
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fcvt.ll108
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fp-compare.ll254
-rw-r--r--test/CodeGen/AArch64/neon-scalar-recip.ll72
-rw-r--r--test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll212
-rw-r--r--test/CodeGen/AArch64/neon-scalar-rounding-shift.ll8
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll32
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll20
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-shift.ll20
-rw-r--r--test/CodeGen/AArch64/neon-scalar-shift.ll206
-rw-r--r--test/CodeGen/AArch64/neon-select_cc.ll202
-rw-r--r--test/CodeGen/AArch64/neon-shift-left-long.ll10
-rw-r--r--test/CodeGen/AArch64/neon-shl-ashr-lshr.ll333
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-one.ll188
-rw-r--r--test/CodeGen/AArch64/neon-simd-tbl.ll176
-rw-r--r--test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll30
-rw-r--r--test/CodeGen/AArch64/neon-truncStore-extLoad.ll57
-rw-r--r--test/CodeGen/AArch64/neon-v1i1-setcc.ll68
-rw-r--r--test/CodeGen/AArch64/neon-vector-list-spill.ll175
-rw-r--r--test/CodeGen/AArch64/pic-eh-stubs.ll5
-rw-r--r--test/CodeGen/AArch64/ragreedy-csr.ll297
-rw-r--r--test/CodeGen/AArch64/sext_inreg.ll198
-rw-r--r--test/CodeGen/AArch64/sincospow-vector-expansion.ll96
-rw-r--r--test/CodeGen/AArch64/variadic.ll65
-rw-r--r--test/CodeGen/ARM/2006-11-10-CycleInDAG.ll2
-rw-r--r--test/CodeGen/ARM/2007-04-03-PEIBug.ll5
-rw-r--r--test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll2
-rw-r--r--test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll5
-rw-r--r--test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll5
-rw-r--r--test/CodeGen/ARM/2008-07-17-Fdiv.ll2
-rw-r--r--test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll2
-rw-r--r--test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll2
-rw-r--r--test/CodeGen/ARM/2009-03-09-AddrModeBug.ll2
-rw-r--r--test/CodeGen/ARM/2009-04-06-AsmModifier.ll5
-rw-r--r--test/CodeGen/ARM/2009-04-08-AggregateAddr.ll2
-rw-r--r--test/CodeGen/ARM/2009-04-08-FREM.ll2
-rw-r--r--test/CodeGen/ARM/2009-04-08-FloatUndef.ll2
-rw-r--r--test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll2
-rw-r--r--test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll3
-rw-r--r--test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll4
-rw-r--r--test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll2
-rw-r--r--test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll2
-rw-r--r--test/CodeGen/ARM/2009-08-23-linkerprivate.ll8
-rw-r--r--test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll3
-rw-r--r--test/CodeGen/ARM/2009-09-10-postdec.ll2
-rw-r--r--test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll2
-rw-r--r--test/CodeGen/ARM/2009-09-24-spill-align.ll2
-rw-r--r--test/CodeGen/ARM/2010-03-04-stm-undef-addr.ll2
-rw-r--r--test/CodeGen/ARM/2010-04-09-NeonSelect.ll4
-rw-r--r--test/CodeGen/ARM/2010-04-14-SplitVector.ll2
-rw-r--r--test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll2
-rw-r--r--test/CodeGen/ARM/2010-05-21-BuildVector.ll2
-rw-r--r--test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll4
-rw-r--r--test/CodeGen/ARM/2010-06-29-SubregImpDefs.ll2
-rw-r--r--test/CodeGen/ARM/2010-07-26-GlobalMerge.ll2
-rw-r--r--test/CodeGen/ARM/2010-08-04-StackVariable.ll2
-rw-r--r--test/CodeGen/ARM/2010-12-07-PEIBug.ll6
-rw-r--r--test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll6
-rw-r--r--test/CodeGen/ARM/2011-04-12-AlignBug.ll8
-rw-r--r--test/CodeGen/ARM/2011-06-09-TailCallByVal.ll2
-rw-r--r--test/CodeGen/ARM/2011-06-16-TailCallByVal.ll4
-rw-r--r--test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll6
-rw-r--r--test/CodeGen/ARM/2011-10-26-memset-inline.ll8
-rw-r--r--test/CodeGen/ARM/2011-10-26-memset-with-neon.ll2
-rw-r--r--test/CodeGen/ARM/2011-11-07-PromoteVectorLoadStore.ll2
-rw-r--r--test/CodeGen/ARM/2011-11-09-BitcastVectorDouble.ll2
-rw-r--r--test/CodeGen/ARM/2011-11-09-IllegalVectorFPIntConvert.ll2
-rw-r--r--test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll2
-rw-r--r--test/CodeGen/ARM/2012-04-10-DAGCombine.ll2
-rw-r--r--test/CodeGen/ARM/2012-05-04-vmov.ll8
-rw-r--r--test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll2
-rw-r--r--test/CodeGen/ARM/2012-08-23-legalize-vmull.ll2
-rw-r--r--test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll2
-rw-r--r--test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll2
-rw-r--r--test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll2
-rw-r--r--test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll4
-rw-r--r--test/CodeGen/ARM/2013-04-16-AAPCS-C4-vs-VFP.ll2
-rw-r--r--test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll6
-rw-r--r--test/CodeGen/ARM/2013-05-05-IfConvertBug.ll21
-rw-r--r--test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll56
-rw-r--r--test/CodeGen/ARM/2014-02-05-vfp-regs-after-stack.ll22
-rw-r--r--test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll114
-rw-r--r--test/CodeGen/ARM/DbgValueOtherTargets.test2
-rw-r--r--test/CodeGen/ARM/Windows/aapcs.ll16
-rw-r--r--test/CodeGen/ARM/Windows/hard-float.ll10
-rw-r--r--test/CodeGen/ARM/Windows/mangling.ll9
-rw-r--r--test/CodeGen/ARM/Windows/no-aeabi.ll10
-rw-r--r--test/CodeGen/ARM/Windows/no-arm-mode.ll5
-rw-r--r--test/CodeGen/ARM/Windows/no-ehabi.ll21
-rw-r--r--test/CodeGen/ARM/a15-SD-dep.ll59
-rw-r--r--test/CodeGen/ARM/a15-mla.ll3
-rw-r--r--test/CodeGen/ARM/a15.ll2
-rw-r--r--test/CodeGen/ARM/addrmode.ll5
-rw-r--r--test/CodeGen/ARM/addrspacecast.ll2
-rw-r--r--test/CodeGen/ARM/arm-abi-attr.ll28
-rw-r--r--test/CodeGen/ARM/arm-and-tst-peephole.ll9
-rw-r--r--test/CodeGen/ARM/arm-asm.ll2
-rw-r--r--test/CodeGen/ARM/arm-modifier.ll2
-rw-r--r--test/CodeGen/ARM/arm-negative-stride.ll2
-rw-r--r--test/CodeGen/ARM/arm-ttype-target2.ll2
-rw-r--r--test/CodeGen/ARM/atomic-64bit.ll157
-rw-r--r--test/CodeGen/ARM/atomic-cmp.ll2
-rw-r--r--test/CodeGen/ARM/atomic-load-store.ll2
-rw-r--r--test/CodeGen/ARM/atomic-op.ll37
-rw-r--r--test/CodeGen/ARM/atomic-ops-v8.ll413
-rw-r--r--test/CodeGen/ARM/atomicrmw_minmax.ll4
-rw-r--r--test/CodeGen/ARM/bfc.ll2
-rw-r--r--test/CodeGen/ARM/bfi.ll2
-rw-r--r--test/CodeGen/ARM/bfx.ll2
-rw-r--r--test/CodeGen/ARM/bic.ll2
-rw-r--r--test/CodeGen/ARM/bits.ll2
-rw-r--r--test/CodeGen/ARM/build-attributes-encoding.s10
-rw-r--r--test/CodeGen/ARM/build-attributes.ll (renamed from test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll)193
-rw-r--r--test/CodeGen/ARM/cache-intrinsic.ll26
-rw-r--r--test/CodeGen/ARM/call-tc.ll6
-rw-r--r--test/CodeGen/ARM/call.ll12
-rw-r--r--test/CodeGen/ARM/carry.ll2
-rw-r--r--test/CodeGen/ARM/clz.ll2
-rw-r--r--test/CodeGen/ARM/coalesce-dbgvalue.ll2
-rw-r--r--test/CodeGen/ARM/compare-call.ll6
-rw-r--r--test/CodeGen/ARM/constantfp.ll12
-rw-r--r--test/CodeGen/ARM/crash-O0.ll2
-rw-r--r--test/CodeGen/ARM/cse-ldrlit.ll61
-rw-r--r--test/CodeGen/ARM/ctz.ll2
-rw-r--r--test/CodeGen/ARM/debug-frame-large-stack.ll99
-rw-r--r--test/CodeGen/ARM/debug-frame-no-debug.ll97
-rw-r--r--test/CodeGen/ARM/debug-frame-vararg.ll141
-rw-r--r--test/CodeGen/ARM/debug-frame.ll574
-rw-r--r--test/CodeGen/ARM/debug-info-qreg.ll10
-rw-r--r--test/CodeGen/ARM/debug-info-s16-reg.ll6
-rw-r--r--test/CodeGen/ARM/debug-info-sreg2.ll14
-rw-r--r--test/CodeGen/ARM/debug-segmented-stacks.ll80
-rw-r--r--test/CodeGen/ARM/default-float-abi.ll22
-rw-r--r--test/CodeGen/ARM/divmod-eabi.ll5
-rw-r--r--test/CodeGen/ARM/dyn-stackalloc.ll2
-rw-r--r--test/CodeGen/ARM/ehabi-filters.ll2
-rw-r--r--test/CodeGen/ARM/ehabi-no-landingpad.ll3
-rw-r--r--test/CodeGen/ARM/ehabi-unwind.ll5
-rw-r--r--test/CodeGen/ARM/ehabi.ll22
-rw-r--r--test/CodeGen/ARM/extload-knownzero.ll2
-rw-r--r--test/CodeGen/ARM/extloadi1.ll3
-rw-r--r--test/CodeGen/ARM/fadds.ll23
-rw-r--r--test/CodeGen/ARM/fast-isel-call.ll2
-rw-r--r--test/CodeGen/ARM/fast-isel-crash2.ll4
-rw-r--r--test/CodeGen/ARM/fast-isel-frameaddr.ll16
-rw-r--r--test/CodeGen/ARM/fast-isel-intrinsic.ll42
-rw-r--r--test/CodeGen/ARM/fast-isel-static.ll8
-rw-r--r--test/CodeGen/ARM/fast-tail-call.ll2
-rw-r--r--test/CodeGen/ARM/fastcc-vfp.ll40
-rw-r--r--test/CodeGen/ARM/fastisel-thumb-litpool.ll11
-rw-r--r--test/CodeGen/ARM/fdivs.ll8
-rw-r--r--test/CodeGen/ARM/fixunsdfdi.ll7
-rw-r--r--test/CodeGen/ARM/fmacs.ll10
-rw-r--r--test/CodeGen/ARM/fmdrr-fmrrd.ll8
-rw-r--r--test/CodeGen/ARM/fmscs.ll6
-rw-r--r--test/CodeGen/ARM/fmuls.ll23
-rw-r--r--test/CodeGen/ARM/fnegs.ll23
-rw-r--r--test/CodeGen/ARM/fnmacs.ll6
-rw-r--r--test/CodeGen/ARM/fnmscs.ll23
-rw-r--r--test/CodeGen/ARM/fnmul.ll10
-rw-r--r--test/CodeGen/ARM/fnmuls.ll8
-rw-r--r--test/CodeGen/ARM/fold-const.ll2
-rw-r--r--test/CodeGen/ARM/fold-stack-adjust.ll76
-rw-r--r--test/CodeGen/ARM/formal.ll2
-rw-r--r--test/CodeGen/ARM/fp-arg-shuffle.ll2
-rw-r--r--test/CodeGen/ARM/fp-fast.ll3
-rw-r--r--test/CodeGen/ARM/fp.ll2
-rw-r--r--test/CodeGen/ARM/fp16.ll2
-rw-r--r--test/CodeGen/ARM/fp_convert.ll23
-rw-r--r--test/CodeGen/ARM/fpcmp-opt.ll4
-rw-r--r--test/CodeGen/ARM/fpcmp.ll2
-rw-r--r--test/CodeGen/ARM/fpconsts.ll2
-rw-r--r--test/CodeGen/ARM/fpconv.ll4
-rw-r--r--test/CodeGen/ARM/fpmem.ll2
-rw-r--r--test/CodeGen/ARM/fpow.ll2
-rw-r--r--test/CodeGen/ARM/fptoint.ll2
-rw-r--r--test/CodeGen/ARM/fsubs.ll19
-rw-r--r--test/CodeGen/ARM/hello.ll21
-rw-r--r--test/CodeGen/ARM/iabs.ll2
-rw-r--r--test/CodeGen/ARM/ifconv-kills.ll2
-rw-r--r--test/CodeGen/ARM/ifcvt-branch-weight-bug.ll62
-rw-r--r--test/CodeGen/ARM/ifcvt-branch-weight.ll42
-rw-r--r--test/CodeGen/ARM/ifcvt1.ll4
-rw-r--r--test/CodeGen/ARM/ifcvt2.ll2
-rw-r--r--test/CodeGen/ARM/ifcvt3.ll14
-rw-r--r--test/CodeGen/ARM/ifcvt4.ll2
-rw-r--r--test/CodeGen/ARM/ifcvt9.ll2
-rw-r--r--test/CodeGen/ARM/illegal-vector-bitcast.ll4
-rw-r--r--test/CodeGen/ARM/imm.ll5
-rw-r--r--test/CodeGen/ARM/indirect-reg-input.ll2
-rw-r--r--test/CodeGen/ARM/indirectbr.ll11
-rw-r--r--test/CodeGen/ARM/inline-diagnostics.ll16
-rw-r--r--test/CodeGen/ARM/inlineasm-64bit.ll4
-rw-r--r--test/CodeGen/ARM/inlineasm-imm-arm.ll2
-rw-r--r--test/CodeGen/ARM/inlineasm-ldr-pseudo.ll17
-rw-r--r--test/CodeGen/ARM/inlineasm-switch-mode-oneway-from-arm.ll18
-rw-r--r--test/CodeGen/ARM/inlineasm-switch-mode-oneway-from-thumb.ll18
-rw-r--r--test/CodeGen/ARM/inlineasm-switch-mode.ll22
-rw-r--r--test/CodeGen/ARM/inlineasm.ll2
-rw-r--r--test/CodeGen/ARM/inlineasm2.ll2
-rw-r--r--test/CodeGen/ARM/inlineasm3.ll3
-rw-r--r--test/CodeGen/ARM/inlineasm4.ll2
-rw-r--r--test/CodeGen/ARM/insn-sched1.ll11
-rw-r--r--test/CodeGen/ARM/integer_insertelement.ll2
-rw-r--r--test/CodeGen/ARM/interrupt-attr.ll30
-rw-r--r--test/CodeGen/ARM/intrinsics-crypto.ll58
-rw-r--r--test/CodeGen/ARM/ispositive.ll2
-rw-r--r--test/CodeGen/ARM/large-stack.ll2
-rw-r--r--test/CodeGen/ARM/ldaex-stlex.ll92
-rw-r--r--test/CodeGen/ARM/ldm.ll4
-rw-r--r--test/CodeGen/ARM/ldr.ll2
-rw-r--r--test/CodeGen/ARM/ldr_ext.ll2
-rw-r--r--test/CodeGen/ARM/ldr_frame.ll5
-rw-r--r--test/CodeGen/ARM/ldr_post.ll4
-rw-r--r--test/CodeGen/ARM/ldr_pre.ll4
-rw-r--r--test/CodeGen/ARM/ldrd.ll8
-rw-r--r--test/CodeGen/ARM/ldstrex.ll28
-rw-r--r--test/CodeGen/ARM/load.ll13
-rw-r--r--test/CodeGen/ARM/long-setcc.ll7
-rw-r--r--test/CodeGen/ARM/long.ll2
-rw-r--r--test/CodeGen/ARM/longMAC.ll28
-rw-r--r--test/CodeGen/ARM/long_shift.ll2
-rw-r--r--test/CodeGen/ARM/lsr-scale-addr-mode.ll5
-rw-r--r--test/CodeGen/ARM/lsr-unfolded-offset.ll2
-rw-r--r--test/CodeGen/ARM/machine-licm.ll10
-rw-r--r--test/CodeGen/ARM/mature-mc-support.ll12
-rw-r--r--test/CodeGen/ARM/mem.ll8
-rw-r--r--test/CodeGen/ARM/memcpy-inline.ll6
-rw-r--r--test/CodeGen/ARM/memfunc.ll3
-rw-r--r--test/CodeGen/ARM/minsize-imms.ll57
-rw-r--r--test/CodeGen/ARM/minsize-litpools.ll26
-rw-r--r--test/CodeGen/ARM/mls.ll5
-rw-r--r--test/CodeGen/ARM/movt-movw-global.ll8
-rw-r--r--test/CodeGen/ARM/movt.ll2
-rw-r--r--test/CodeGen/ARM/mul_const.ll2
-rw-r--r--test/CodeGen/ARM/mulhi.ll6
-rw-r--r--test/CodeGen/ARM/mult-alt-generic-arm.ll2
-rw-r--r--test/CodeGen/ARM/mvn.ll15
-rw-r--r--test/CodeGen/ARM/neon_arith1.ll5
-rw-r--r--test/CodeGen/ARM/neon_cmp.ll3
-rw-r--r--test/CodeGen/ARM/neon_div.ll3
-rw-r--r--test/CodeGen/ARM/neon_fpconv.ll2
-rw-r--r--test/CodeGen/ARM/neon_ld1.ll2
-rw-r--r--test/CodeGen/ARM/neon_ld2.ll4
-rw-r--r--test/CodeGen/ARM/neon_minmax.ll2
-rw-r--r--test/CodeGen/ARM/neon_shift.ll2
-rw-r--r--test/CodeGen/ARM/neon_vabs.ll2
-rw-r--r--test/CodeGen/ARM/none-macho.ll101
-rw-r--r--test/CodeGen/ARM/noreturn.ll17
-rw-r--r--test/CodeGen/ARM/optimize-dmbs-v7.ll74
-rw-r--r--test/CodeGen/ARM/optselect-regclass.ll3
-rw-r--r--test/CodeGen/ARM/pack.ll2
-rw-r--r--test/CodeGen/ARM/phi.ll3
-rw-r--r--test/CodeGen/ARM/popcnt.ll2
-rw-r--r--test/CodeGen/ARM/prefetch-thumb.ll22
-rw-r--r--test/CodeGen/ARM/prefetch.ll28
-rw-r--r--test/CodeGen/ARM/reg_sequence.ll8
-rw-r--r--test/CodeGen/ARM/ret0.ll2
-rw-r--r--test/CodeGen/ARM/ret_arg1.ll2
-rw-r--r--test/CodeGen/ARM/ret_arg2.ll2
-rw-r--r--test/CodeGen/ARM/ret_arg3.ll3
-rw-r--r--test/CodeGen/ARM/ret_arg4.ll2
-rw-r--r--test/CodeGen/ARM/ret_arg5.ll2
-rw-r--r--test/CodeGen/ARM/ret_f32_arg2.ll2
-rw-r--r--test/CodeGen/ARM/ret_f32_arg5.ll2
-rw-r--r--test/CodeGen/ARM/ret_f64_arg2.ll2
-rw-r--r--test/CodeGen/ARM/ret_f64_arg_reg_split.ll2
-rw-r--r--test/CodeGen/ARM/ret_f64_arg_split.ll2
-rw-r--r--test/CodeGen/ARM/ret_f64_arg_stack.ll2
-rw-r--r--test/CodeGen/ARM/ret_i128_arg2.ll2
-rw-r--r--test/CodeGen/ARM/ret_i64_arg2.ll2
-rw-r--r--test/CodeGen/ARM/ret_i64_arg3.ll2
-rw-r--r--test/CodeGen/ARM/ret_i64_arg_split.ll2
-rw-r--r--test/CodeGen/ARM/ret_void.ll2
-rw-r--r--test/CodeGen/ARM/returned-ext.ll4
-rw-r--r--test/CodeGen/ARM/returned-trunc-tail-calls.ll2
-rw-r--r--test/CodeGen/ARM/rev.ll2
-rw-r--r--test/CodeGen/ARM/saxpy10-a9.ll135
-rw-r--r--test/CodeGen/ARM/sbfx.ll2
-rw-r--r--test/CodeGen/ARM/segmented-stacks-dynamic.ll62
-rw-r--r--test/CodeGen/ARM/segmented-stacks.ll235
-rw-r--r--test/CodeGen/ARM/select-imm.ll10
-rw-r--r--test/CodeGen/ARM/select-undef.ll3
-rw-r--r--test/CodeGen/ARM/select.ll10
-rw-r--r--test/CodeGen/ARM/setcc-sentinals.ll4
-rw-r--r--test/CodeGen/ARM/smul.ll4
-rw-r--r--test/CodeGen/ARM/ssp-data-layout.ll528
-rw-r--r--test/CodeGen/ARM/stack-frame.ll5
-rw-r--r--test/CodeGen/ARM/str_post.ll2
-rw-r--r--test/CodeGen/ARM/str_pre.ll8
-rw-r--r--test/CodeGen/ARM/str_trunc.ll12
-rw-r--r--test/CodeGen/ARM/struct_byval_arm_t1_t2.ll2
-rw-r--r--test/CodeGen/ARM/sub.ll2
-rw-r--r--test/CodeGen/ARM/subreg-remat.ll2
-rw-r--r--test/CodeGen/ARM/sxt_rot.ll2
-rw-r--r--test/CodeGen/ARM/t2-imm.ll2
-rw-r--r--test/CodeGen/ARM/tail-call.ll21
-rw-r--r--test/CodeGen/ARM/taildup-branch-weight.ll54
-rw-r--r--test/CodeGen/ARM/this-return.ll4
-rw-r--r--test/CodeGen/ARM/thumb-litpool.ll15
-rw-r--r--test/CodeGen/ARM/thumb2-it-block.ll2
-rw-r--r--test/CodeGen/ARM/tls-models.ll32
-rw-r--r--test/CodeGen/ARM/tls1.ll2
-rw-r--r--test/CodeGen/ARM/tls2.ll4
-rw-r--r--test/CodeGen/ARM/trunc_ldr.ll10
-rw-r--r--test/CodeGen/ARM/truncstore-dag-combine.ll7
-rw-r--r--test/CodeGen/ARM/tst_teq.ll7
-rw-r--r--test/CodeGen/ARM/twoaddrinstr.ll2
-rw-r--r--test/CodeGen/ARM/unaligned_load_store.ll11
-rw-r--r--test/CodeGen/ARM/unaligned_load_store_vector.ll2
-rw-r--r--test/CodeGen/ARM/unord.ll10
-rw-r--r--test/CodeGen/ARM/uxt_rot.ll14
-rw-r--r--test/CodeGen/ARM/v1-constant-fold.ll4
-rw-r--r--test/CodeGen/ARM/vaba.ll2
-rw-r--r--test/CodeGen/ARM/vabd.ll2
-rw-r--r--test/CodeGen/ARM/vabs.ll10
-rw-r--r--test/CodeGen/ARM/vadd.ll2
-rw-r--r--test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll31
-rw-r--r--test/CodeGen/ARM/vargs.ll3
-rw-r--r--test/CodeGen/ARM/vbits.ll2
-rw-r--r--test/CodeGen/ARM/vbsl.ll2
-rw-r--r--test/CodeGen/ARM/vceq.ll2
-rw-r--r--test/CodeGen/ARM/vcge.ll10
-rw-r--r--test/CodeGen/ARM/vcgt.ll12
-rw-r--r--test/CodeGen/ARM/vcnt.ll2
-rw-r--r--test/CodeGen/ARM/vcombine.ll2
-rw-r--r--test/CodeGen/ARM/vcvt.ll2
-rw-r--r--test/CodeGen/ARM/vdup.ll35
-rw-r--r--test/CodeGen/ARM/vext.ll2
-rw-r--r--test/CodeGen/ARM/vfcmp.ll2
-rw-r--r--test/CodeGen/ARM/vfp-regs-dwarf.ll44
-rw-r--r--test/CodeGen/ARM/vhadd.ll2
-rw-r--r--test/CodeGen/ARM/vhsub.ll2
-rw-r--r--test/CodeGen/ARM/vicmp.ll2
-rw-r--r--test/CodeGen/ARM/vld1.ll6
-rw-r--r--test/CodeGen/ARM/vld2.ll2
-rw-r--r--test/CodeGen/ARM/vld3.ll17
-rw-r--r--test/CodeGen/ARM/vld4.ll15
-rw-r--r--test/CodeGen/ARM/vlddup.ll2
-rw-r--r--test/CodeGen/ARM/vldlane.ll6
-rw-r--r--test/CodeGen/ARM/vminmax.ll2
-rw-r--r--test/CodeGen/ARM/vmla.ll2
-rw-r--r--test/CodeGen/ARM/vmls.ll2
-rw-r--r--test/CodeGen/ARM/vmov.ll2
-rw-r--r--test/CodeGen/ARM/vmul.ll2
-rw-r--r--test/CodeGen/ARM/vneg.ll2
-rw-r--r--test/CodeGen/ARM/vpadal.ll2
-rw-r--r--test/CodeGen/ARM/vpadd.ll13
-rw-r--r--test/CodeGen/ARM/vpminmax.ll2
-rw-r--r--test/CodeGen/ARM/vqadd.ll2
-rw-r--r--test/CodeGen/ARM/vqshl.ll2
-rw-r--r--test/CodeGen/ARM/vqshrn.ll2
-rw-r--r--test/CodeGen/ARM/vqsub.ll2
-rw-r--r--test/CodeGen/ARM/vrec.ll2
-rw-r--r--test/CodeGen/ARM/vrev.ll2
-rw-r--r--test/CodeGen/ARM/vsel.ll8
-rw-r--r--test/CodeGen/ARM/vselect_imax.ll2
-rw-r--r--test/CodeGen/ARM/vshift.ll34
-rw-r--r--test/CodeGen/ARM/vshiftins.ll2
-rw-r--r--test/CodeGen/ARM/vshl.ll2
-rw-r--r--test/CodeGen/ARM/vshll.ll101
-rw-r--r--test/CodeGen/ARM/vshrn.ll49
-rw-r--r--test/CodeGen/ARM/vsra.ll36
-rw-r--r--test/CodeGen/ARM/vst1.ll2
-rw-r--r--test/CodeGen/ARM/vst2.ll2
-rw-r--r--test/CodeGen/ARM/vst3.ll14
-rw-r--r--test/CodeGen/ARM/vst4.ll14
-rw-r--r--test/CodeGen/ARM/vstlane.ll2
-rw-r--r--test/CodeGen/ARM/vsub.ll2
-rw-r--r--test/CodeGen/ARM/vtbl.ll2
-rw-r--r--test/CodeGen/ARM/vtrn.ll2
-rw-r--r--test/CodeGen/ARM/vuzp.ll2
-rw-r--r--test/CodeGen/ARM/vzip.ll2
-rw-r--r--test/CodeGen/ARM/warn-stack.ll2
-rw-r--r--test/CodeGen/ARM/weak.ll6
-rw-r--r--test/CodeGen/ARM/weak2.ll5
-rw-r--r--test/CodeGen/ARM/zero-cycle-zero.ll70
-rw-r--r--test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll47
-rw-r--r--test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll45
-rw-r--r--test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll12
-rw-r--r--test/CodeGen/ARM64/2011-04-21-CPSRBug.ll26
-rw-r--r--test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll31
-rw-r--r--test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll40
-rw-r--r--test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll20
-rw-r--r--test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll21
-rw-r--r--test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll22
-rw-r--r--test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll50
-rw-r--r--test/CodeGen/ARM64/2012-06-06-FPToUI.ll65
-rw-r--r--test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll56
-rw-r--r--test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll19
-rw-r--r--test/CodeGen/ARM64/2013-01-23-frem-crash.ll15
-rw-r--r--test/CodeGen/ARM64/2013-01-23-sext-crash.ll37
-rw-r--r--test/CodeGen/ARM64/2013-02-12-shufv8i8.ll11
-rw-r--r--test/CodeGen/ARM64/AdvSIMD-Scalar.ll38
-rw-r--r--test/CodeGen/ARM64/aapcs.ll86
-rw-r--r--test/CodeGen/ARM64/abi-varargs.ll191
-rw-r--r--test/CodeGen/ARM64/abi.ll236
-rw-r--r--test/CodeGen/ARM64/abi_align.ll529
-rw-r--r--test/CodeGen/ARM64/addp.ll32
-rw-r--r--test/CodeGen/ARM64/addr-mode-folding.ll171
-rw-r--r--test/CodeGen/ARM64/addr-type-promotion.ll82
-rw-r--r--test/CodeGen/ARM64/addrmode.ll72
-rw-r--r--test/CodeGen/ARM64/alloc-no-stack-realign.ll21
-rw-r--r--test/CodeGen/ARM64/alloca-frame-pointer-offset.ll29
-rw-r--r--test/CodeGen/ARM64/andCmpBrToTBZ.ll72
-rw-r--r--test/CodeGen/ARM64/anyregcc-crash.ll19
-rw-r--r--test/CodeGen/ARM64/anyregcc.ll363
-rw-r--r--test/CodeGen/ARM64/arith-saturating.ll153
-rw-r--r--test/CodeGen/ARM64/arith.ll262
-rw-r--r--test/CodeGen/ARM64/atomic-128.ll213
-rw-r--r--test/CodeGen/ARM64/atomic.ll343
-rw-r--r--test/CodeGen/ARM64/basic-pic.ll54
-rw-r--r--test/CodeGen/ARM64/big-imm-offsets.ll14
-rw-r--r--test/CodeGen/ARM64/big-stack.ll21
-rw-r--r--test/CodeGen/ARM64/bitfield-extract.ll406
-rw-r--r--test/CodeGen/ARM64/blockaddress.ll30
-rw-r--r--test/CodeGen/ARM64/build-vector.ll35
-rw-r--r--test/CodeGen/ARM64/call-tailcalls.ll91
-rw-r--r--test/CodeGen/ARM64/cast-opt.ll31
-rw-r--r--test/CodeGen/ARM64/ccmp-heuristics.ll190
-rw-r--r--test/CodeGen/ARM64/ccmp.ll289
-rw-r--r--test/CodeGen/ARM64/coalesce-ext.ll17
-rw-r--r--test/CodeGen/ARM64/code-model-large-abs.ll72
-rw-r--r--test/CodeGen/ARM64/collect-loh-garbage-crash.ll37
-rw-r--r--test/CodeGen/ARM64/collect-loh-str.ll23
-rw-r--r--test/CodeGen/ARM64/collect-loh.ll47
-rw-r--r--test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S17
-rw-r--r--test/CodeGen/ARM64/complex-ret.ll7
-rw-r--r--test/CodeGen/ARM64/convert-v2f64-v2i32.ll24
-rw-r--r--test/CodeGen/ARM64/convert-v2i32-v2f64.ll29
-rw-r--r--test/CodeGen/ARM64/copy-tuple.ll146
-rw-r--r--test/CodeGen/ARM64/crc32.ll71
-rw-r--r--test/CodeGen/ARM64/crypto.ll135
-rw-r--r--test/CodeGen/ARM64/cse.ll59
-rw-r--r--test/CodeGen/ARM64/csel.ll222
-rw-r--r--test/CodeGen/ARM64/cvt.ll401
-rw-r--r--test/CodeGen/ARM64/dagcombiner-convergence.ll19
-rw-r--r--test/CodeGen/ARM64/dagcombiner-load-slicing.ll102
-rw-r--r--test/CodeGen/ARM64/dup.ll322
-rw-r--r--test/CodeGen/ARM64/early-ifcvt.ll423
-rw-r--r--test/CodeGen/ARM64/elf-calls.ll20
-rw-r--r--test/CodeGen/ARM64/elf-constpool.ll13
-rw-r--r--test/CodeGen/ARM64/elf-globals.ll115
-rw-r--r--test/CodeGen/ARM64/ext.ll101
-rw-r--r--test/CodeGen/ARM64/extend-int-to-fp.ll19
-rw-r--r--test/CodeGen/ARM64/extend.ll15
-rw-r--r--test/CodeGen/ARM64/extern-weak.ll51
-rw-r--r--test/CodeGen/ARM64/extload-knownzero.ll28
-rw-r--r--test/CodeGen/ARM64/extract.ll58
-rw-r--r--test/CodeGen/ARM64/extract_subvector.ll51
-rw-r--r--test/CodeGen/ARM64/fast-isel-addr-offset.ll47
-rw-r--r--test/CodeGen/ARM64/fast-isel-alloca.ll24
-rw-r--r--test/CodeGen/ARM64/fast-isel-br.ll155
-rw-r--r--test/CodeGen/ARM64/fast-isel-call.ll91
-rw-r--r--test/CodeGen/ARM64/fast-isel-conversion.ll416
-rw-r--r--test/CodeGen/ARM64/fast-isel-fcmp.ll146
-rw-r--r--test/CodeGen/ARM64/fast-isel-gv.ll38
-rw-r--r--test/CodeGen/ARM64/fast-isel-icmp.ll214
-rw-r--r--test/CodeGen/ARM64/fast-isel-indirectbr.ll36
-rw-r--r--test/CodeGen/ARM64/fast-isel-intrinsic.ll135
-rw-r--r--test/CodeGen/ARM64/fast-isel-materialize.ll27
-rw-r--r--test/CodeGen/ARM64/fast-isel-noconvert.ll36
-rw-r--r--test/CodeGen/ARM64/fast-isel-rem.ll33
-rw-r--r--test/CodeGen/ARM64/fast-isel-ret.ll63
-rw-r--r--test/CodeGen/ARM64/fast-isel-select.ll63
-rw-r--r--test/CodeGen/ARM64/fast-isel.ll95
-rw-r--r--test/CodeGen/ARM64/fastcc-tailcall.ll24
-rw-r--r--test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll18
-rw-r--r--test/CodeGen/ARM64/fcmp-opt.ll173
-rw-r--r--test/CodeGen/ARM64/fcopysign.ll51
-rw-r--r--test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll15
-rw-r--r--test/CodeGen/ARM64/fmadd.ll92
-rw-r--r--test/CodeGen/ARM64/fmax.ll21
-rw-r--r--test/CodeGen/ARM64/fminv.ll101
-rw-r--r--test/CodeGen/ARM64/fmuladd.ll88
-rw-r--r--test/CodeGen/ARM64/fold-address.ll79
-rw-r--r--test/CodeGen/ARM64/fold-lsl.ll79
-rw-r--r--test/CodeGen/ARM64/fp-imm.ll32
-rw-r--r--test/CodeGen/ARM64/fp.ll8
-rw-r--r--test/CodeGen/ARM64/fp128-folding.ll17
-rw-r--r--test/CodeGen/ARM64/fp128.ll274
-rw-r--r--test/CodeGen/ARM64/frame-index.ll11
-rw-r--r--test/CodeGen/ARM64/frameaddr.ll15
-rw-r--r--test/CodeGen/ARM64/global-address.ll14
-rw-r--r--test/CodeGen/ARM64/hello.ll38
-rw-r--r--test/CodeGen/ARM64/i16-subreg-extract.ll12
-rw-r--r--test/CodeGen/ARM64/icmp-opt.ll17
-rw-r--r--test/CodeGen/ARM64/illegal-float-ops.ll295
-rw-r--r--test/CodeGen/ARM64/indexed-memory.ll351
-rw-r--r--test/CodeGen/ARM64/inline-asm-error-I.ll11
-rw-r--r--test/CodeGen/ARM64/inline-asm-error-J.ll11
-rw-r--r--test/CodeGen/ARM64/inline-asm-error-K.ll11
-rw-r--r--test/CodeGen/ARM64/inline-asm-error-L.ll11
-rw-r--r--test/CodeGen/ARM64/inline-asm-error-M.ll11
-rw-r--r--test/CodeGen/ARM64/inline-asm-error-N.ll11
-rw-r--r--test/CodeGen/ARM64/inline-asm-zero-reg-error.ll11
-rw-r--r--test/CodeGen/ARM64/inline-asm.ll230
-rw-r--r--test/CodeGen/ARM64/join-reserved.ll17
-rw-r--r--test/CodeGen/ARM64/jumptable.ll35
-rw-r--r--test/CodeGen/ARM64/ld1.ll1345
-rw-r--r--test/CodeGen/ARM64/ldp.ll149
-rw-r--r--test/CodeGen/ARM64/ldur.ll67
-rw-r--r--test/CodeGen/ARM64/ldxr-stxr.ll143
-rw-r--r--test/CodeGen/ARM64/leaf-compact-unwind.ll161
-rw-r--r--test/CodeGen/ARM64/leaf.ll13
-rw-r--r--test/CodeGen/ARM64/lit.local.cfg11
-rw-r--r--test/CodeGen/ARM64/long-shift.ll59
-rw-r--r--test/CodeGen/ARM64/memcpy-inline.ll112
-rw-r--r--test/CodeGen/ARM64/memset-inline.ll27
-rw-r--r--test/CodeGen/ARM64/memset-to-bzero.ll101
-rw-r--r--test/CodeGen/ARM64/movi.ll202
-rw-r--r--test/CodeGen/ARM64/mul.ll90
-rw-r--r--test/CodeGen/ARM64/neg.ll71
-rw-r--r--test/CodeGen/ARM64/neon-compare-instructions.ll1191
-rw-r--r--test/CodeGen/ARM64/patchpoint.ll163
-rw-r--r--test/CodeGen/ARM64/platform-reg.ll26
-rw-r--r--test/CodeGen/ARM64/popcnt.ll43
-rw-r--r--test/CodeGen/ARM64/prefetch.ll88
-rw-r--r--test/CodeGen/ARM64/promote-const.ll255
-rw-r--r--test/CodeGen/ARM64/redzone.ll18
-rw-r--r--test/CodeGen/ARM64/register-offset-addressing.ll12
-rw-r--r--test/CodeGen/ARM64/register-pairing.ll53
-rw-r--r--test/CodeGen/ARM64/regress-f128csel-flags.ll27
-rw-r--r--test/CodeGen/ARM64/regress-interphase-shift.ll29
-rw-r--r--test/CodeGen/ARM64/return-vector.ll11
-rw-r--r--test/CodeGen/ARM64/returnaddr.ll26
-rw-r--r--test/CodeGen/ARM64/rev.ll221
-rw-r--r--test/CodeGen/ARM64/rounding.ll208
-rw-r--r--test/CodeGen/ARM64/scaled_iv.ll38
-rw-r--r--test/CodeGen/ARM64/scvt.ll830
-rw-r--r--test/CodeGen/ARM64/shifted-sext.ll277
-rw-r--r--test/CodeGen/ARM64/simd-scalar-to-vector.ll22
-rw-r--r--test/CodeGen/ARM64/simplest-elf.ll18
-rw-r--r--test/CodeGen/ARM64/sincos.ll42
-rw-r--r--test/CodeGen/ARM64/sitofp-combine-chains.ll22
-rw-r--r--test/CodeGen/ARM64/sli-sri-opt.ll41
-rw-r--r--test/CodeGen/ARM64/smaxv.ll74
-rw-r--r--test/CodeGen/ARM64/sminv.ll74
-rw-r--r--test/CodeGen/ARM64/spill-lr.ll74
-rw-r--r--test/CodeGen/ARM64/spill.ll15
-rw-r--r--test/CodeGen/ARM64/st1.ll676
-rw-r--r--test/CodeGen/ARM64/stack-no-frame.ll20
-rw-r--r--test/CodeGen/ARM64/stackmap.ll288
-rw-r--r--test/CodeGen/ARM64/stacksave.ll20
-rw-r--r--test/CodeGen/ARM64/stp.ll101
-rw-r--r--test/CodeGen/ARM64/strict-align.ll25
-rw-r--r--test/CodeGen/ARM64/stur.ll98
-rw-r--r--test/CodeGen/ARM64/subvector-extend.ll141
-rw-r--r--test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll36
-rw-r--r--test/CodeGen/ARM64/tbl.ll132
-rw-r--r--test/CodeGen/ARM64/this-return.ll83
-rw-r--r--test/CodeGen/ARM64/tls-darwin.ll18
-rw-r--r--test/CodeGen/ARM64/tls-dynamic-together.ll18
-rw-r--r--test/CodeGen/ARM64/tls-dynamics.ll135
-rw-r--r--test/CodeGen/ARM64/tls-execs.ll63
-rw-r--r--test/CodeGen/ARM64/trap.ll8
-rw-r--r--test/CodeGen/ARM64/trn.ll134
-rw-r--r--test/CodeGen/ARM64/trunc-store.ll75
-rw-r--r--test/CodeGen/ARM64/umaxv.ll92
-rw-r--r--test/CodeGen/ARM64/uminv.ll92
-rw-r--r--test/CodeGen/ARM64/umov.ll33
-rw-r--r--test/CodeGen/ARM64/unaligned_ldst.ll41
-rw-r--r--test/CodeGen/ARM64/uzp.ll107
-rw-r--r--test/CodeGen/ARM64/vaargs.ll20
-rw-r--r--test/CodeGen/ARM64/vabs.ll804
-rw-r--r--test/CodeGen/ARM64/vadd.ll941
-rw-r--r--test/CodeGen/ARM64/vaddlv.ll26
-rw-r--r--test/CodeGen/ARM64/vaddv.ll233
-rw-r--r--test/CodeGen/ARM64/variadic-aapcs.ll143
-rw-r--r--test/CodeGen/ARM64/vbitwise.ll91
-rw-r--r--test/CodeGen/ARM64/vclz.ll109
-rw-r--r--test/CodeGen/ARM64/vcmp.ll227
-rw-r--r--test/CodeGen/ARM64/vcnt.ll56
-rw-r--r--test/CodeGen/ARM64/vcombine.ll17
-rw-r--r--test/CodeGen/ARM64/vcvt.ll686
-rw-r--r--test/CodeGen/ARM64/vcvt_f.ll82
-rw-r--r--test/CodeGen/ARM64/vcvt_f32_su32.ll73
-rw-r--r--test/CodeGen/ARM64/vcvt_n.ll49
-rw-r--r--test/CodeGen/ARM64/vcvt_su32_f32.ll34
-rw-r--r--test/CodeGen/ARM64/vcvtxd_f32_f64.ll11
-rw-r--r--test/CodeGen/ARM64/vecCmpBr.ll207
-rw-r--r--test/CodeGen/ARM64/vecFold.ll145
-rw-r--r--test/CodeGen/ARM64/vector-ext.ll16
-rw-r--r--test/CodeGen/ARM64/vector-imm.ll134
-rw-r--r--test/CodeGen/ARM64/vector-ldst.ll601
-rw-r--r--test/CodeGen/ARM64/vext.ll464
-rw-r--r--test/CodeGen/ARM64/vfloatintrinsics.ll375
-rw-r--r--test/CodeGen/ARM64/vhadd.ll249
-rw-r--r--test/CodeGen/ARM64/vhsub.ll125
-rw-r--r--test/CodeGen/ARM64/virtual_base.ll51
-rw-r--r--test/CodeGen/ARM64/vmax.ll679
-rw-r--r--test/CodeGen/ARM64/vminmaxnm.ll68
-rw-r--r--test/CodeGen/ARM64/vmovn.ll242
-rw-r--r--test/CodeGen/ARM64/vmul.ll2003
-rw-r--r--test/CodeGen/ARM64/volatile.ll27
-rw-r--r--test/CodeGen/ARM64/vqadd.ll332
-rw-r--r--test/CodeGen/ARM64/vqsub.ll147
-rw-r--r--test/CodeGen/ARM64/vselect.ll18
-rw-r--r--test/CodeGen/ARM64/vsetcc_fp.ll11
-rw-r--r--test/CodeGen/ARM64/vshift.ll1909
-rw-r--r--test/CodeGen/ARM64/vshr.ll63
-rw-r--r--test/CodeGen/ARM64/vshuffle.ll115
-rw-r--r--test/CodeGen/ARM64/vsqrt.ll232
-rw-r--r--test/CodeGen/ARM64/vsra.ll150
-rw-r--r--test/CodeGen/ARM64/vsub.ll417
-rw-r--r--test/CodeGen/ARM64/weak-reference.ll10
-rw-r--r--test/CodeGen/ARM64/xaluo.ll524
-rw-r--r--test/CodeGen/ARM64/zero-cycle-regmov.ll17
-rw-r--r--test/CodeGen/ARM64/zero-cycle-zeroing.ll49
-rw-r--r--test/CodeGen/ARM64/zext.ll11
-rw-r--r--test/CodeGen/ARM64/zextload-unscaled.ll40
-rw-r--r--test/CodeGen/ARM64/zip.ll107
-rw-r--r--test/CodeGen/CPP/attributes.ll7
-rw-r--r--test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll2
-rw-r--r--test/CodeGen/Generic/2007-04-27-InlineAsm-X-Dest.ll2
-rw-r--r--test/CodeGen/Generic/2007-04-27-LargeMemObject.ll2
-rw-r--r--test/CodeGen/Generic/2007-12-17-InvokeAsm.ll2
-rw-r--r--test/CodeGen/Generic/2008-02-20-MatchingMem.ll2
-rw-r--r--test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll19
-rw-r--r--test/CodeGen/Generic/asm-large-immediate.ll2
-rw-r--r--test/CodeGen/Generic/inline-asm-mem-clobber.ll2
-rw-r--r--test/CodeGen/Generic/inline-asm-special-strings.ll2
-rw-r--r--test/CodeGen/Generic/no-target.ll3
-rw-r--r--test/CodeGen/Generic/print-after.ll2
-rw-r--r--test/CodeGen/Hexagon/hwloop-dbg.ll5
-rw-r--r--test/CodeGen/Hexagon/packetize_cond_inst.ll2
-rw-r--r--test/CodeGen/MSP430/fp.ll12
-rw-r--r--test/CodeGen/MSP430/misched-msp430.ll20
-rw-r--r--test/CodeGen/Mips/2008-07-16-SignExtInReg.ll6
-rw-r--r--test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll8
-rw-r--r--test/CodeGen/Mips/abicalls.ll15
-rw-r--r--test/CodeGen/Mips/addi.ll2
-rw-r--r--test/CodeGen/Mips/align16.ll8
-rw-r--r--test/CodeGen/Mips/alloca16.ll4
-rw-r--r--test/CodeGen/Mips/atomic.ll6
-rw-r--r--test/CodeGen/Mips/atomicops.ll2
-rw-r--r--test/CodeGen/Mips/blez_bgez.ll2
-rw-r--r--test/CodeGen/Mips/blockaddr.ll16
-rw-r--r--test/CodeGen/Mips/bswap.ll83
-rw-r--r--test/CodeGen/Mips/buildpairextractelementf64.ll8
-rw-r--r--test/CodeGen/Mips/cache-intrinsic.ll26
-rw-r--r--test/CodeGen/Mips/call-optimization.ll91
-rw-r--r--test/CodeGen/Mips/ci2.ll39
-rw-r--r--test/CodeGen/Mips/cmov.ll59
-rw-r--r--test/CodeGen/Mips/const-mult.ll3
-rw-r--r--test/CodeGen/Mips/const4a.ll4
-rw-r--r--test/CodeGen/Mips/const6.ll2
-rw-r--r--test/CodeGen/Mips/const6a.ll2
-rw-r--r--test/CodeGen/Mips/ctlz.ll2
-rw-r--r--test/CodeGen/Mips/elf_eflags.ll83
-rw-r--r--test/CodeGen/Mips/elf_st_other.ll12
-rw-r--r--test/CodeGen/Mips/ex2.ll9
-rw-r--r--test/CodeGen/Mips/f16abs.ll2
-rw-r--r--test/CodeGen/Mips/fastcc.ll13
-rw-r--r--test/CodeGen/Mips/fixdfsf.ll4
-rw-r--r--test/CodeGen/Mips/fmadd1.ll8
-rw-r--r--test/CodeGen/Mips/fp-indexed-ls.ll6
-rw-r--r--test/CodeGen/Mips/fp16instrinsmc.ll4
-rw-r--r--test/CodeGen/Mips/fp16mix.ll20
-rw-r--r--test/CodeGen/Mips/fp16static.ll2
-rw-r--r--test/CodeGen/Mips/fpneeded.ll16
-rw-r--r--test/CodeGen/Mips/fpnotneeded.ll16
-rw-r--r--test/CodeGen/Mips/fptr2.ll2
-rw-r--r--test/CodeGen/Mips/global-address.ll8
-rw-r--r--test/CodeGen/Mips/helloworld.ll9
-rw-r--r--test/CodeGen/Mips/hf16_1.ll4
-rw-r--r--test/CodeGen/Mips/hf16call32.ll2
-rw-r--r--test/CodeGen/Mips/hf16call32_body.ll2
-rw-r--r--test/CodeGen/Mips/hf1_body.ll2
-rw-r--r--test/CodeGen/Mips/hfptrcall.ll2
-rw-r--r--test/CodeGen/Mips/i32k.ll2
-rw-r--r--test/CodeGen/Mips/l3mc.ll114
-rw-r--r--test/CodeGen/Mips/largefr1.ll74
-rw-r--r--test/CodeGen/Mips/lcb2.ll133
-rw-r--r--test/CodeGen/Mips/lcb3c.ll59
-rw-r--r--test/CodeGen/Mips/lcb4a.ll69
-rw-r--r--test/CodeGen/Mips/lcb5.ll240
-rw-r--r--test/CodeGen/Mips/mature-mc-support.ll32
-rw-r--r--test/CodeGen/Mips/mbrsize4a.ll37
-rw-r--r--test/CodeGen/Mips/micromips-atomic.ll18
-rw-r--r--test/CodeGen/Mips/micromips-jal.ll48
-rw-r--r--test/CodeGen/Mips/micromips-load-effective-address.ll29
-rw-r--r--test/CodeGen/Mips/micromips-long-branch.ll16437
-rw-r--r--test/CodeGen/Mips/mips16-hf-attr.ll45
-rw-r--r--test/CodeGen/Mips/mips16_32_1.ll5
-rw-r--r--test/CodeGen/Mips/mips16_32_10.ll9
-rw-r--r--test/CodeGen/Mips/mips16_32_3.ll21
-rw-r--r--test/CodeGen/Mips/mips16_32_4.ll24
-rw-r--r--test/CodeGen/Mips/mips16_32_5.ll18
-rw-r--r--test/CodeGen/Mips/mips16_32_6.ll15
-rw-r--r--test/CodeGen/Mips/mips16_32_7.ll21
-rw-r--r--test/CodeGen/Mips/mips16_32_8.ll9
-rw-r--r--test/CodeGen/Mips/mips16_32_9.ll12
-rw-r--r--test/CodeGen/Mips/mips16_fpret.ll8
-rw-r--r--test/CodeGen/Mips/mips16fpe.ll2
-rw-r--r--test/CodeGen/Mips/mips64fpldst.ll4
-rw-r--r--test/CodeGen/Mips/mips64intldst.ll4
-rw-r--r--test/CodeGen/Mips/msa/2r_vector_scalar.ll69
-rw-r--r--test/CodeGen/Mips/msa/3r-s.ll86
-rw-r--r--test/CodeGen/Mips/msa/arithmetic_float.ll3
-rw-r--r--test/CodeGen/Mips/msa/basic_operations.ll30
-rw-r--r--test/CodeGen/Mips/msa/basic_operations_float.ll45
-rw-r--r--test/CodeGen/Mips/msa/bitwise.ll5
-rw-r--r--test/CodeGen/Mips/msa/compare.ll34
-rw-r--r--test/CodeGen/Mips/msa/compare_float.ll28
-rw-r--r--test/CodeGen/Mips/msa/elm_copy.ll136
-rw-r--r--test/CodeGen/Mips/msa/elm_insv.ll138
-rw-r--r--test/CodeGen/Mips/msa/elm_shift_slide.ll32
-rw-r--r--test/CodeGen/Mips/msa/frameindex.ll309
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll2
-rw-r--r--test/CodeGen/Mips/msa/shift-dagcombine.ll3
-rw-r--r--test/CodeGen/Mips/msa/shuffle.ll46
-rw-r--r--test/CodeGen/Mips/msa/special.ll40
-rw-r--r--test/CodeGen/Mips/msa/vec.ll168
-rw-r--r--test/CodeGen/Mips/nacl-align.ll96
-rw-r--r--test/CodeGen/Mips/nacl-branch-delay.ll71
-rw-r--r--test/CodeGen/Mips/nacl-reserved-regs.ll51
-rw-r--r--test/CodeGen/Mips/nomips16.ll2
-rw-r--r--test/CodeGen/Mips/null.ll2
-rw-r--r--test/CodeGen/Mips/octeon.ll29
-rw-r--r--test/CodeGen/Mips/octeon_popcnt.ll47
-rw-r--r--test/CodeGen/Mips/optimize-pic-o0.ll33
-rw-r--r--test/CodeGen/Mips/powif64_16.ll2
-rw-r--r--test/CodeGen/Mips/rotate.ll2
-rw-r--r--test/CodeGen/Mips/s2rem.ll92
-rw-r--r--test/CodeGen/Mips/sel1c.ll2
-rw-r--r--test/CodeGen/Mips/sel2c.ll2
-rw-r--r--test/CodeGen/Mips/sr1.ll60
-rw-r--r--test/CodeGen/Mips/tail16.ll20
-rw-r--r--test/CodeGen/Mips/tls.ll6
-rw-r--r--test/CodeGen/Mips/trap1.ll2
-rw-r--r--test/CodeGen/NVPTX/addrspacecast.ll99
-rw-r--r--test/CodeGen/NVPTX/aggr-param.ll20
-rw-r--r--test/CodeGen/NVPTX/bug17709.ll2
-rw-r--r--test/CodeGen/NVPTX/call-with-alloca-buffer.ll66
-rw-r--r--test/CodeGen/NVPTX/div-ri.ll8
-rw-r--r--test/CodeGen/NVPTX/ldparam-v4.ll10
-rw-r--r--test/CodeGen/NVPTX/noduplicate-syncthreads.ll74
-rw-r--r--test/CodeGen/NVPTX/symbol-naming.ll31
-rw-r--r--test/CodeGen/NVPTX/vec-param-load.ll6
-rw-r--r--test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll4
-rw-r--r--test/CodeGen/PowerPC/2007-05-03-InlineAsm-S-Constraint.ll2
-rw-r--r--test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll2
-rw-r--r--test/CodeGen/PowerPC/2008-12-12-EH.ll9
-rw-r--r--test/CodeGen/PowerPC/2009-08-23-linkerprivate.ll8
-rw-r--r--test/CodeGen/PowerPC/Atomics-32.ll32
-rw-r--r--test/CodeGen/PowerPC/Atomics-64.ll32
-rw-r--r--test/CodeGen/PowerPC/aa-tbaa.ll41
-rw-r--r--test/CodeGen/PowerPC/anon_aggr.ll6
-rw-r--r--test/CodeGen/PowerPC/atomic-1.ll2
-rw-r--r--test/CodeGen/PowerPC/atomic-2.ll2
-rw-r--r--test/CodeGen/PowerPC/bdzlr.ll9
-rw-r--r--test/CodeGen/PowerPC/byval-agg-info.ll17
-rw-r--r--test/CodeGen/PowerPC/coalesce-ext.ll2
-rw-r--r--test/CodeGen/PowerPC/complex-return.ll2
-rw-r--r--test/CodeGen/PowerPC/crash.ll17
-rw-r--r--test/CodeGen/PowerPC/crbit-asm.ll59
-rw-r--r--test/CodeGen/PowerPC/crbits.ll192
-rw-r--r--test/CodeGen/PowerPC/ctrloop-large-ec.ll2
-rw-r--r--test/CodeGen/PowerPC/ctrloop-udivti3.ll31
-rw-r--r--test/CodeGen/PowerPC/dbg.ll2
-rw-r--r--test/CodeGen/PowerPC/early-ret2.ll6
-rw-r--r--test/CodeGen/PowerPC/fast-isel-conversion-p5.ll153
-rw-r--r--test/CodeGen/PowerPC/float-to-int.ll49
-rw-r--r--test/CodeGen/PowerPC/fold-zero.ll13
-rw-r--r--test/CodeGen/PowerPC/hello-reloc.s50
-rw-r--r--test/CodeGen/PowerPC/i1-to-double.ll21
-rw-r--r--test/CodeGen/PowerPC/i32-to-float.ll25
-rw-r--r--test/CodeGen/PowerPC/i64-to-float.ll25
-rw-r--r--test/CodeGen/PowerPC/inlineasm-copy.ll2
-rw-r--r--test/CodeGen/PowerPC/jaggedstructs.ll2
-rw-r--r--test/CodeGen/PowerPC/lsa.ll2
-rw-r--r--test/CodeGen/PowerPC/mature-mc-support.ll27
-rw-r--r--test/CodeGen/PowerPC/optcmp.ll2
-rw-r--r--test/CodeGen/PowerPC/ppc32-i1-vaarg.ll20
-rw-r--r--test/CodeGen/PowerPC/ppc32-vacopy.ll2
-rw-r--r--test/CodeGen/PowerPC/pr17168.ll2
-rw-r--r--test/CodeGen/PowerPC/private.ll28
-rw-r--r--test/CodeGen/PowerPC/pwr7-gt-nop.ll31
-rw-r--r--test/CodeGen/PowerPC/rlwimi-and.ll2
-rw-r--r--test/CodeGen/PowerPC/sdag-ppcf128.ll2
-rw-r--r--test/CodeGen/PowerPC/setcc_no_zext.ll4
-rw-r--r--test/CodeGen/PowerPC/seteq-0.ll7
-rw-r--r--test/CodeGen/PowerPC/sjlj.ll4
-rw-r--r--test/CodeGen/PowerPC/spill-nor0.ll23
-rw-r--r--test/CodeGen/PowerPC/srl-mask.ll16
-rw-r--r--test/CodeGen/PowerPC/stfiwx.ll35
-rw-r--r--test/CodeGen/PowerPC/structsinmem.ll2
-rw-r--r--test/CodeGen/PowerPC/structsinregs.ll2
-rw-r--r--test/CodeGen/PowerPC/subsumes-pred-regs.ll2
-rw-r--r--test/CodeGen/PowerPC/tls-2.ll15
-rw-r--r--test/CodeGen/PowerPC/tls-gd.ll23
-rw-r--r--test/CodeGen/PowerPC/tls-ie.ll22
-rw-r--r--test/CodeGen/PowerPC/tls-ld-2.ll24
-rw-r--r--test/CodeGen/PowerPC/tls-ld.ll24
-rw-r--r--test/CodeGen/PowerPC/tls-pic.ll55
-rw-r--r--test/CodeGen/PowerPC/tls.ll33
-rw-r--r--test/CodeGen/PowerPC/unaligned.ll8
-rw-r--r--test/CodeGen/PowerPC/unwind-dw2-g.ll2
-rw-r--r--test/CodeGen/PowerPC/vec_cmp.ll2
-rw-r--r--test/CodeGen/PowerPC/vsx-args.ll26
-rw-r--r--test/CodeGen/PowerPC/vsx-fma-m.ll238
-rw-r--r--test/CodeGen/PowerPC/vsx-self-copy.ll27
-rw-r--r--test/CodeGen/PowerPC/vsx-spill.ll49
-rw-r--r--test/CodeGen/PowerPC/vsx.ll651
-rw-r--r--test/CodeGen/PowerPC/vtable-reloc.ll11
-rw-r--r--test/CodeGen/PowerPC/weak_def_can_be_hidden.ll50
-rw-r--r--test/CodeGen/R600/32-bit-local-address-space.ll64
-rw-r--r--test/CodeGen/R600/add.ll102
-rw-r--r--test/CodeGen/R600/add_i64.ll39
-rw-r--r--test/CodeGen/R600/address-space.ll9
-rw-r--r--test/CodeGen/R600/anyext.ll14
-rw-r--r--test/CodeGen/R600/array-ptr-calc-i32.ll31
-rw-r--r--test/CodeGen/R600/atomic_load_add.ll4
-rw-r--r--test/CodeGen/R600/atomic_load_sub.ll4
-rw-r--r--test/CodeGen/R600/basic-branch.ll15
-rw-r--r--test/CodeGen/R600/basic-loop.ll18
-rw-r--r--test/CodeGen/R600/bfe_uint.ll2
-rw-r--r--test/CodeGen/R600/bitcast.ll9
-rw-r--r--test/CodeGen/R600/cayman-loop-bug.ll32
-rw-r--r--test/CodeGen/R600/cf-stack-bug.ll227
-rw-r--r--test/CodeGen/R600/codegen-prepare-addrmode-sext.ll19
-rw-r--r--test/CodeGen/R600/elf.r600.ll2
-rw-r--r--test/CodeGen/R600/extload.ll91
-rw-r--r--test/CodeGen/R600/fabs.ll14
-rw-r--r--test/CodeGen/R600/fadd.ll37
-rw-r--r--test/CodeGen/R600/fceil.ll84
-rw-r--r--test/CodeGen/R600/ffloor.ll84
-rw-r--r--test/CodeGen/R600/fneg-fabs.ll55
-rw-r--r--test/CodeGen/R600/fneg.ll14
-rw-r--r--test/CodeGen/R600/ftrunc.ll84
-rw-r--r--test/CodeGen/R600/gep-address-space.ll14
-rw-r--r--test/CodeGen/R600/gv-const-addrspace.ll41
-rw-r--r--test/CodeGen/R600/icmp64.ll92
-rw-r--r--test/CodeGen/R600/indirect-private-64.ll75
-rw-r--r--test/CodeGen/R600/infinite-loop-evergreen.ll10
-rw-r--r--test/CodeGen/R600/infinite-loop.ll17
-rw-r--r--test/CodeGen/R600/insert_vector_elt.ll179
-rw-r--r--test/CodeGen/R600/insert_vector_elt_f64.ll36
-rw-r--r--test/CodeGen/R600/jump-address.ll2
-rw-r--r--test/CodeGen/R600/lds-oqap-crash.ll28
-rw-r--r--test/CodeGen/R600/lds-output-queue.ll2
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll40
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll40
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.bfi.ll41
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.bfm.ll40
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.imax.ll29
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.imin.ll29
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.kill.ll22
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.umax.ll44
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.umin.ll44
-rw-r--r--test/CodeGen/R600/llvm.SI.load.dword.ll40
-rw-r--r--test/CodeGen/R600/llvm.SI.sample-masked.ll16
-rw-r--r--test/CodeGen/R600/llvm.SI.sample.ll6
-rw-r--r--test/CodeGen/R600/llvm.SI.sampled.ll4
-rw-r--r--test/CodeGen/R600/llvm.SI.sendmsg.ll21
-rw-r--r--test/CodeGen/R600/llvm.SI.tbuffer.store.ll18
-rw-r--r--test/CodeGen/R600/llvm.exp2.ll26
-rw-r--r--test/CodeGen/R600/llvm.pow.ll29
-rw-r--r--test/CodeGen/R600/llvm.trunc.ll13
-rw-r--r--test/CodeGen/R600/load.ll197
-rw-r--r--test/CodeGen/R600/load64.ll20
-rw-r--r--test/CodeGen/R600/local-64.ll158
-rw-r--r--test/CodeGen/R600/local-memory-two-objects.ll11
-rw-r--r--test/CodeGen/R600/local-memory.ll6
-rw-r--r--test/CodeGen/R600/loop-idiom.ll54
-rw-r--r--test/CodeGen/R600/mad_uint24.ll16
-rw-r--r--test/CodeGen/R600/mubuf.ll98
-rw-r--r--test/CodeGen/R600/mul.ll12
-rw-r--r--test/CodeGen/R600/mul_uint24.ll16
-rw-r--r--test/CodeGen/R600/or.ll134
-rw-r--r--test/CodeGen/R600/private-memory.ll121
-rw-r--r--test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll59
-rw-r--r--test/CodeGen/R600/register-count-comments.ll20
-rw-r--r--test/CodeGen/R600/salu-to-valu.ll48
-rw-r--r--test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll162
-rw-r--r--test/CodeGen/R600/select-vectors.ll155
-rw-r--r--test/CodeGen/R600/select64.ll15
-rw-r--r--test/CodeGen/R600/setcc-equivalent.ll30
-rw-r--r--test/CodeGen/R600/sext-in-reg.ll271
-rw-r--r--test/CodeGen/R600/si-annotate-cf-assertion.ll1
-rw-r--r--test/CodeGen/R600/si-sgpr-spill.ll880
-rw-r--r--test/CodeGen/R600/smrd.ll80
-rw-r--r--test/CodeGen/R600/store-v3i32.ll12
-rw-r--r--test/CodeGen/R600/store-v3i64.ll28
-rw-r--r--test/CodeGen/R600/store-vector-ptrs.ll1
-rw-r--r--test/CodeGen/R600/store.ll23
-rw-r--r--test/CodeGen/R600/trunc-store-i1.ll32
-rw-r--r--test/CodeGen/R600/trunc.ll36
-rw-r--r--test/CodeGen/R600/unhandled-loop-condition-assertion.ll114
-rw-r--r--test/CodeGen/R600/unroll.ll37
-rw-r--r--test/CodeGen/R600/v1i64-kernel-arg.ll17
-rw-r--r--test/CodeGen/R600/v_cndmask.ll13
-rw-r--r--test/CodeGen/R600/vtx-fetch-branch.ll29
-rw-r--r--test/CodeGen/R600/vtx-schedule.ll4
-rw-r--r--test/CodeGen/R600/xor.ll18
-rw-r--r--test/CodeGen/R600/zero_extend.ll10
-rw-r--r--test/CodeGen/SPARC/2009-08-28-PIC.ll40
-rw-r--r--test/CodeGen/SPARC/2011-01-11-Call.ll8
-rw-r--r--test/CodeGen/SPARC/2011-01-11-FrameAddr.ll40
-rw-r--r--test/CodeGen/SPARC/2011-01-19-DelaySlot.ll24
-rw-r--r--test/CodeGen/SPARC/64abi.ll59
-rw-r--r--test/CodeGen/SPARC/64bit.ll20
-rw-r--r--test/CodeGen/SPARC/64cond.ll7
-rw-r--r--test/CodeGen/SPARC/64spill.ll116
-rw-r--r--test/CodeGen/SPARC/atomics.ll153
-rw-r--r--test/CodeGen/SPARC/constpool.ll16
-rw-r--r--test/CodeGen/SPARC/ctpop.ll25
-rw-r--r--test/CodeGen/SPARC/exception.ll119
-rw-r--r--test/CodeGen/SPARC/fp128.ll19
-rw-r--r--test/CodeGen/SPARC/globals.ll10
-rw-r--r--test/CodeGen/SPARC/inlineasm.ll45
-rw-r--r--test/CodeGen/SPARC/leafproc.ll12
-rw-r--r--test/CodeGen/SPARC/mature-mc-support.ll20
-rw-r--r--test/CodeGen/SPARC/missinglabel.ll23
-rw-r--r--test/CodeGen/SPARC/obj-relocs.ll31
-rw-r--r--test/CodeGen/SPARC/parts.ll14
-rw-r--r--test/CodeGen/SPARC/rem.ll4
-rw-r--r--test/CodeGen/SPARC/setjmp.ll2
-rw-r--r--test/CodeGen/SPARC/spillsize.ll25
-rw-r--r--test/CodeGen/SPARC/tls.ll54
-rw-r--r--test/CodeGen/SPARC/trap.ll11
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-01.py4
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-02.py2
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-03.py4
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-04.py4
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-05.py4
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-06.py4
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-09.py4
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-10.py4
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-11.py8
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-12.py8
-rw-r--r--test/CodeGen/SystemZ/alias-01.ll3
-rw-r--r--test/CodeGen/SystemZ/atomic-load-01.ll5
-rw-r--r--test/CodeGen/SystemZ/atomic-load-02.ll5
-rw-r--r--test/CodeGen/SystemZ/atomic-load-03.ll8
-rw-r--r--test/CodeGen/SystemZ/atomic-load-04.ll8
-rw-r--r--test/CodeGen/SystemZ/atomic-store-01.ll5
-rw-r--r--test/CodeGen/SystemZ/atomic-store-02.ll5
-rw-r--r--test/CodeGen/SystemZ/atomic-store-03.ll8
-rw-r--r--test/CodeGen/SystemZ/atomic-store-04.ll8
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-add-05.ll64
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-add-06.ll64
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-and-05.ll64
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-and-06.ll64
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-or-05.ll64
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-or-06.ll64
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-sub-05.ll69
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-sub-06.ll69
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-xor-05.ll64
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-xor-06.ll64
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-01.ll4
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-02.ll4
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-03.ll24
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-04.ll18
-rw-r--r--test/CodeGen/SystemZ/cond-store-01.ll7
-rw-r--r--test/CodeGen/SystemZ/cond-store-02.ll7
-rw-r--r--test/CodeGen/SystemZ/cond-store-03.ll4
-rw-r--r--test/CodeGen/SystemZ/cond-store-04.ll4
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-04.ll61
-rw-r--r--test/CodeGen/SystemZ/fp-conv-06.ll4
-rw-r--r--test/CodeGen/SystemZ/fp-conv-08.ll4
-rw-r--r--test/CodeGen/SystemZ/fp-conv-10.ll4
-rw-r--r--test/CodeGen/SystemZ/fp-conv-12.ll4
-rw-r--r--test/CodeGen/SystemZ/fp-conv-13.ll64
-rw-r--r--test/CodeGen/SystemZ/fp-conv-14.ll63
-rw-r--r--test/CodeGen/SystemZ/frame-08.ll4
-rw-r--r--test/CodeGen/SystemZ/frame-11.ll13
-rw-r--r--test/CodeGen/SystemZ/frame-13.ll13
-rw-r--r--test/CodeGen/SystemZ/frame-14.ll14
-rw-r--r--test/CodeGen/SystemZ/insert-06.ll14
-rw-r--r--test/CodeGen/SystemZ/int-abs-01.ll64
-rw-r--r--test/CodeGen/SystemZ/int-cmp-05.ll17
-rw-r--r--test/CodeGen/SystemZ/int-cmp-06.ll30
-rw-r--r--test/CodeGen/SystemZ/int-cmp-44.ll92
-rw-r--r--test/CodeGen/SystemZ/int-cmp-45.ll2
-rw-r--r--test/CodeGen/SystemZ/int-cmp-47.ll109
-rw-r--r--test/CodeGen/SystemZ/int-neg-02.ll133
-rw-r--r--test/CodeGen/SystemZ/mature-mc-support.ll15
-rw-r--r--test/CodeGen/SystemZ/risbg-01.ll26
-rw-r--r--test/CodeGen/SystemZ/rnsbg-01.ll11
-rw-r--r--test/CodeGen/SystemZ/rosbg-01.ll11
-rw-r--r--test/CodeGen/SystemZ/rxsbg-01.ll11
-rw-r--r--test/CodeGen/SystemZ/selectcc-01.ll178
-rw-r--r--test/CodeGen/SystemZ/selectcc-02.ll178
-rw-r--r--test/CodeGen/SystemZ/selectcc-03.ll187
-rw-r--r--test/CodeGen/SystemZ/serialize-01.ll21
-rw-r--r--test/CodeGen/SystemZ/shift-04.ll101
-rw-r--r--test/CodeGen/SystemZ/shift-10.ll19
-rw-r--r--test/CodeGen/SystemZ/spill-01.ll2
-rw-r--r--test/CodeGen/Thumb/cortex-m0-unaligned-access.ll13
-rw-r--r--test/CodeGen/Thumb/inlineasm-imm-thumb.ll2
-rw-r--r--test/CodeGen/Thumb/mature-mc-support.ll12
-rw-r--r--test/CodeGen/Thumb/segmented-stacks-dynamic.ll63
-rw-r--r--test/CodeGen/Thumb/segmented-stacks.ll247
-rw-r--r--test/CodeGen/Thumb/sjljehprepare-lower-vector.ll23
-rw-r--r--test/CodeGen/Thumb/triple.ll7
-rw-r--r--test/CodeGen/Thumb/unord.ll11
-rw-r--r--test/CodeGen/Thumb2/2011-12-16-T2SizeReduceAssert.ll23
-rw-r--r--test/CodeGen/Thumb2/bfx.ll2
-rw-r--r--test/CodeGen/Thumb2/carry.ll2
-rw-r--r--test/CodeGen/Thumb2/cortex-fp.ll10
-rw-r--r--test/CodeGen/Thumb2/div.ll2
-rw-r--r--test/CodeGen/Thumb2/large-stack.ll6
-rw-r--r--test/CodeGen/Thumb2/longMACt.ll2
-rw-r--r--test/CodeGen/Thumb2/mul_const.ll2
-rw-r--r--test/CodeGen/Thumb2/segmented-stacks.ll32
-rw-r--r--test/CodeGen/Thumb2/tail-call-r9.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-adc.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add3.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add4.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add5.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add6.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-and.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-and2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-asr.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-asr2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-bcc.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-bfc.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-bic.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-clz.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmn.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmn2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmp.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmp2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-eor.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-eor2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-jtb.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr_ext.ll8
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr_post.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr_pre.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldrb.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldrh.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsl.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsl2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsr.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsr2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsr3.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mla.ll5
-rw-r--r--test/CodeGen/Thumb2/thumb2-mls.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mov.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mul.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mulhi.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mvn2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-neg.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-orn.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-orn2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-orr.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-orr2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-pack.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-rev.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-rev16.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ror.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-rsb.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-rsb2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sbc.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-select.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-select_xform.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-smla.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-smul.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-str.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-str_post.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-str_pre.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-strb.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-strh.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub3.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub4.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub5.ll3
-rw-r--r--test/CodeGen/Thumb2/thumb2-sxt_rot.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-teq.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-teq2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-tst.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-tst2.ll2
-rw-r--r--test/CodeGen/Thumb2/tls1.ll2
-rw-r--r--test/CodeGen/Thumb2/tls2.ll4
-rw-r--r--test/CodeGen/Thumb2/v8_IT_5.ll3
-rw-r--r--test/CodeGen/Thumb2/v8_IT_6.ll100
-rw-r--r--test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll2
-rw-r--r--test/CodeGen/X86/2006-07-20-InlineAsm.ll2
-rw-r--r--test/CodeGen/X86/2006-07-31-SingleRegClass.ll2
-rw-r--r--test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll2
-rw-r--r--test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll2
-rw-r--r--test/CodeGen/X86/2007-05-05-Personality.ll13
-rw-r--r--test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll67
-rw-r--r--test/CodeGen/X86/2007-10-17-IllegalAsm.ll87
-rw-r--r--test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll2
-rw-r--r--test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll2
-rw-r--r--test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll2
-rw-r--r--test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll2
-rw-r--r--test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll2
-rw-r--r--test/CodeGen/X86/2008-03-14-SpillerCrash.ll2
-rw-r--r--test/CodeGen/X86/2008-04-02-unnamedEH.ll6
-rw-r--r--test/CodeGen/X86/2008-04-08-CoalescerCrash.ll2
-rw-r--r--test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll2
-rw-r--r--test/CodeGen/X86/2008-08-31-EH_RETURN64.ll2
-rw-r--r--test/CodeGen/X86/2008-09-18-inline-asm-2.ll6
-rw-r--r--test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll4
-rw-r--r--test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll4
-rw-r--r--test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll10
-rw-r--r--test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll2
-rw-r--r--test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll2
-rw-r--r--test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll2
-rw-r--r--test/CodeGen/X86/2009-06-05-VZextByteShort.ll2
-rw-r--r--test/CodeGen/X86/2009-08-23-linkerprivate.ll8
-rw-r--r--test/CodeGen/X86/2009-09-19-earlyclobber.ll2
-rw-r--r--test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll2
-rw-r--r--test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll2
-rw-r--r--test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll2
-rw-r--r--test/CodeGen/X86/2010-05-26-DotDebugLoc.ll4
-rw-r--r--test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll2
-rw-r--r--test/CodeGen/X86/2010-06-25-asm-RA-crash.ll2
-rw-r--r--test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll2
-rw-r--r--test/CodeGen/X86/2010-06-28-matched-g-constraint.ll2
-rw-r--r--test/CodeGen/X86/2010-07-02-asm-alignstack.ll2
-rw-r--r--test/CodeGen/X86/2010-07-06-asm-RIP.ll2
-rw-r--r--test/CodeGen/X86/2010-07-13-indirectXconstraint.ll2
-rw-r--r--test/CodeGen/X86/2010-09-16-EmptyFilename.ll2
-rw-r--r--test/CodeGen/X86/2010-10-08-cmpxchg8b.ll2
-rw-r--r--test/CodeGen/X86/2010-12-02-MC-Set.ll27
-rw-r--r--test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll20
-rw-r--r--test/CodeGen/X86/2011-05-09-loaduse.ll2
-rw-r--r--test/CodeGen/X86/2011-10-11-SpillDead.ll2
-rw-r--r--test/CodeGen/X86/2011-10-19-widen_vselect.ll18
-rw-r--r--test/CodeGen/X86/2011-12-28-vselecti8.ll19
-rw-r--r--test/CodeGen/X86/2012-08-17-legalizer-crash.ll2
-rw-r--r--test/CodeGen/X86/2012-11-30-handlemove-dbg.ll6
-rw-r--r--test/CodeGen/X86/2012-11-30-misched-dbg.ll7
-rw-r--r--test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll8
-rw-r--r--test/CodeGen/X86/3addr-16bit.ll6
-rw-r--r--test/CodeGen/X86/Atomics-64.ll40
-rw-r--r--test/CodeGen/X86/GC/ocaml-gc.ll6
-rw-r--r--test/CodeGen/X86/MachineBranchProb.ll34
-rw-r--r--test/CodeGen/X86/MachineSink-DbgValue.ll4
-rw-r--r--test/CodeGen/X86/alias-error.ll5
-rw-r--r--test/CodeGen/X86/anyregcc-crash.ll4
-rw-r--r--test/CodeGen/X86/anyregcc.ll184
-rw-r--r--test/CodeGen/X86/asm-block-labels.ll2
-rw-r--r--test/CodeGen/X86/asm-global-imm.ll2
-rw-r--r--test/CodeGen/X86/atom-cmpb.ll36
-rw-r--r--test/CodeGen/X86/atomic16.ll2
-rw-r--r--test/CodeGen/X86/atomic32.ll2
-rw-r--r--test/CodeGen/X86/atomic64.ll2
-rw-r--r--test/CodeGen/X86/atomic6432.ll2
-rw-r--r--test/CodeGen/X86/atomic8.ll2
-rw-r--r--test/CodeGen/X86/atomic_op.ll6
-rw-r--r--test/CodeGen/X86/avx-blend.ll21
-rw-r--r--test/CodeGen/X86/avx-cvt-2.ll43
-rw-r--r--test/CodeGen/X86/avx-shift.ll4
-rw-r--r--test/CodeGen/X86/avx-shuffle.ll9
-rw-r--r--test/CodeGen/X86/avx-trunc.ll8
-rw-r--r--test/CodeGen/X86/avx-vbroadcast.ll63
-rw-r--r--test/CodeGen/X86/avx-vzeroupper.ll37
-rw-r--r--test/CodeGen/X86/avx2-gather.ll16
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86.ll8
-rw-r--r--test/CodeGen/X86/avx2-shift.ll33
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll228
-rw-r--r--test/CodeGen/X86/avx2-vector-shifts.ll55
-rw-r--r--test/CodeGen/X86/avx512-arith.ll139
-rw-r--r--test/CodeGen/X86/avx512-build-vector.ll12
-rw-r--r--test/CodeGen/X86/avx512-cmp.ll67
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll34
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll78
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll75
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll435
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll49
-rw-r--r--test/CodeGen/X86/avx512-mov.ll32
-rw-r--r--test/CodeGen/X86/avx512-select.ll19
-rw-r--r--test/CodeGen/X86/avx512-shuffle.ll29
-rw-r--r--test/CodeGen/X86/avx512-trunc-ext.ll31
-rw-r--r--test/CodeGen/X86/avx512-vbroadcast.ll19
-rw-r--r--test/CodeGen/X86/avx512-vec-cmp.ll87
-rw-r--r--test/CodeGen/X86/avx512-vselect-crash.ll11
-rw-r--r--test/CodeGen/X86/avx512-zext-load-crash.ll14
-rw-r--r--test/CodeGen/X86/barrier-sse.ll11
-rw-r--r--test/CodeGen/X86/blend-msb.ll21
-rw-r--r--test/CodeGen/X86/block-placement.ll2
-rw-r--r--test/CodeGen/X86/bswap-vector.ll19
-rw-r--r--test/CodeGen/X86/bt.ll2
-rw-r--r--test/CodeGen/X86/cache-intrinsic.ll26
-rw-r--r--test/CodeGen/X86/call-imm.ll2
-rw-r--r--test/CodeGen/X86/cas.ll2
-rw-r--r--test/CodeGen/X86/catch.ll21
-rw-r--r--test/CodeGen/X86/cdecl-method-return.ll69
-rw-r--r--test/CodeGen/X86/cfstring.ll6
-rw-r--r--test/CodeGen/X86/cmov.ll4
-rw-r--r--test/CodeGen/X86/cmpxchg16b.ll2
-rw-r--r--test/CodeGen/X86/coalescer-remat.ll2
-rw-r--r--test/CodeGen/X86/codegen-prepare-addrmode-sext.ll303
-rw-r--r--test/CodeGen/X86/codegen-prepare-cast.ll4
-rw-r--r--test/CodeGen/X86/codegen-prepare-extload.ll2
-rw-r--r--test/CodeGen/X86/combine-or.ll269
-rw-r--r--test/CodeGen/X86/combine-vec-shuffle.ll253
-rw-r--r--test/CodeGen/X86/const-base-addr.ll24
-rw-r--r--test/CodeGen/X86/crash.ll6
-rw-r--r--test/CodeGen/X86/cse-add-with-overflow.ll43
-rw-r--r--test/CodeGen/X86/ctpop-combine.ll2
-rw-r--r--test/CodeGen/X86/darwin-no-dead-strip.ll12
-rw-r--r--test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll109
-rw-r--r--test/CodeGen/X86/dbg-changes-codegen.ll83
-rw-r--r--test/CodeGen/X86/dll-linkage.ll14
-rw-r--r--test/CodeGen/X86/dllexport-x86_64.ll104
-rw-r--r--test/CodeGen/X86/dllexport.ll125
-rw-r--r--test/CodeGen/X86/dllimport-x86_64.ll48
-rw-r--r--test/CodeGen/X86/dllimport.ll59
-rw-r--r--test/CodeGen/X86/dwarf-comp-dir.ll6
-rw-r--r--test/CodeGen/X86/dynamic-alloca-in-entry.ll19
-rw-r--r--test/CodeGen/X86/exedepsfix-broadcast.ll128
-rw-r--r--test/CodeGen/X86/extract-store.ll22
-rw-r--r--test/CodeGen/X86/fast-isel-args-fail.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-select.ll16
-rw-r--r--test/CodeGen/X86/fast-isel-x86.ll6
-rw-r--r--test/CodeGen/X86/fast-isel.ll4
-rw-r--r--test/CodeGen/X86/fastcall-correct-mangling.ll25
-rw-r--r--test/CodeGen/X86/fma.ll33
-rw-r--r--test/CodeGen/X86/fma3-intrinsics.ll4
-rw-r--r--test/CodeGen/X86/fma4-intrinsics-x86_64.ll2
-rw-r--r--test/CodeGen/X86/fold-call-oper.ll48
-rw-r--r--test/CodeGen/X86/fold-vector-sext-crash.ll12
-rw-r--r--test/CodeGen/X86/fold-vector-sext-zext.ll291
-rw-r--r--test/CodeGen/X86/fold-xmm-zero.ll2
-rw-r--r--test/CodeGen/X86/fp-fast.ll2
-rw-r--r--test/CodeGen/X86/gcc_except_table.ll35
-rw-r--r--test/CodeGen/X86/global-sections.ll36
-rw-r--r--test/CodeGen/X86/hidden-vis-pic.ll7
-rw-r--r--test/CodeGen/X86/i64-mem-copy.ll2
-rw-r--r--test/CodeGen/X86/inalloca-ctor.ll34
-rw-r--r--test/CodeGen/X86/inalloca-invoke.ll54
-rw-r--r--test/CodeGen/X86/inalloca-stdcall.ll26
-rw-r--r--test/CodeGen/X86/inalloca.ll65
-rw-r--r--test/CodeGen/X86/inline-asm-flag-clobber.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-fpstack.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-h.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-modifier-n.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-modifier-q.ll12
-rw-r--r--test/CodeGen/X86/inline-asm-mrv.ll8
-rw-r--r--test/CodeGen/X86/inline-asm-q-regs.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll17
-rw-r--r--test/CodeGen/X86/inline-asm-stack-realign.ll16
-rw-r--r--test/CodeGen/X86/inline-asm-stack-realign2.ll16
-rw-r--r--test/CodeGen/X86/inline-asm-stack-realign3.ll29
-rw-r--r--test/CodeGen/X86/inline-asm-tied.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-x-scalar.ll2
-rw-r--r--test/CodeGen/X86/inline-asm.ll17
-rw-r--r--test/CodeGen/X86/ins_split_regalloc.ll33
-rw-r--r--test/CodeGen/X86/isint.ll26
-rw-r--r--test/CodeGen/X86/large-constants.ll67
-rw-r--r--test/CodeGen/X86/load-slice.ll2
-rw-r--r--test/CodeGen/X86/lsr-interesting-step.ll14
-rw-r--r--test/CodeGen/X86/lsr-normalization.ll9
-rw-r--r--test/CodeGen/X86/machine-cp.ll26
-rw-r--r--test/CodeGen/X86/mature-mc-support.ll18
-rw-r--r--test/CodeGen/X86/memcmp.ll22
-rw-r--r--test/CodeGen/X86/memset-2.ll4
-rw-r--r--test/CodeGen/X86/misched-aa-colored.ll189
-rw-r--r--test/CodeGen/X86/misched-aa-mmos.ll37
-rw-r--r--test/CodeGen/X86/misched-matmul.ll2
-rw-r--r--test/CodeGen/X86/movbe.ll45
-rw-r--r--test/CodeGen/X86/ms-inline-asm.ll35
-rw-r--r--test/CodeGen/X86/mul128_sext_loop.ll32
-rw-r--r--test/CodeGen/X86/mult-alt-generic-i686.ll2
-rw-r--r--test/CodeGen/X86/mult-alt-generic-x86_64.ll2
-rw-r--r--test/CodeGen/X86/mult-alt-x86.ll2
-rw-r--r--test/CodeGen/X86/multiple-loop-post-inc.ll2
-rw-r--r--test/CodeGen/X86/negate-add-zero.ll17
-rw-r--r--test/CodeGen/X86/no-elf-compact-unwind.ll48
-rw-r--r--test/CodeGen/X86/nocx16.ll2
-rw-r--r--test/CodeGen/X86/opaque-constant-asm.ll13
-rw-r--r--test/CodeGen/X86/osx-private-labels.ll71
-rw-r--r--test/CodeGen/X86/patchpoint.ll76
-rw-r--r--test/CodeGen/X86/peephole-multiple-folds.ll29
-rw-r--r--test/CodeGen/X86/personality.ll15
-rw-r--r--test/CodeGen/X86/personality_size.ll4
-rw-r--r--test/CodeGen/X86/pic.ll3
-rw-r--r--test/CodeGen/X86/pr10420.ll21
-rw-r--r--test/CodeGen/X86/pr14090.ll70
-rw-r--r--test/CodeGen/X86/pr1462.ll3
-rw-r--r--test/CodeGen/X86/pr16031.ll2
-rw-r--r--test/CodeGen/X86/pr19049.ll7
-rw-r--r--test/CodeGen/X86/preserve_allcc64.ll104
-rw-r--r--test/CodeGen/X86/preserve_mostcc64.ll86
-rw-r--r--test/CodeGen/X86/private-2.ll2
-rw-r--r--test/CodeGen/X86/ragreedy-bug.ll292
-rw-r--r--test/CodeGen/X86/ragreedy-hoist-spill.ll389
-rw-r--r--test/CodeGen/X86/ragreedy-last-chance-recoloring.ll168
-rw-r--r--test/CodeGen/X86/rot16.ll2
-rw-r--r--test/CodeGen/X86/rotate3.ll76
-rw-r--r--test/CodeGen/X86/rotate4.ll134
-rw-r--r--test/CodeGen/X86/saddo-redundant-add.ll34
-rw-r--r--test/CodeGen/X86/segmented-stacks.ll57
-rw-r--r--test/CodeGen/X86/setjmp-spills.ll141
-rw-r--r--test/CodeGen/X86/shift-combine-crash.ll57
-rw-r--r--test/CodeGen/X86/shift-double.ll2
-rw-r--r--test/CodeGen/X86/shift-pcmp.ll30
-rw-r--r--test/CodeGen/X86/shl_undef.ll6
-rw-r--r--test/CodeGen/X86/shrink-compare.ll8
-rw-r--r--test/CodeGen/X86/sibcall-5.ll9
-rw-r--r--test/CodeGen/X86/sibcall.ll4
-rw-r--r--test/CodeGen/X86/sse-scalar-fp-arith-2.ll423
-rw-r--r--test/CodeGen/X86/sse-scalar-fp-arith.ll310
-rw-r--r--test/CodeGen/X86/sse1.ll14
-rw-r--r--test/CodeGen/X86/sse2-blend.ll22
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86.ll7
-rw-r--r--test/CodeGen/X86/sse2-vector-shifts.ll180
-rw-r--r--test/CodeGen/X86/sse2.ll6
-rw-r--r--test/CodeGen/X86/sse41-blend.ll12
-rw-r--r--test/CodeGen/X86/ssp-data-layout.ll510
-rw-r--r--test/CodeGen/X86/stack-align-memcpy.ll27
-rw-r--r--test/CodeGen/X86/stack-protector-dbginfo.ll2
-rw-r--r--test/CodeGen/X86/stackmap-liveness.ll245
-rw-r--r--test/CodeGen/X86/stackmap-nops.ll230
-rw-r--r--test/CodeGen/X86/stackmap.ll333
-rw-r--r--test/CodeGen/X86/stdcall-notailcall.ll10
-rw-r--r--test/CodeGen/X86/stdcall.ll4
-rw-r--r--test/CodeGen/X86/stores-merging.ll23
-rw-r--r--test/CodeGen/X86/sunkaddr-ext.ll26
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-x86_64.ll2
-rw-r--r--test/CodeGen/X86/v2f32.ll2
-rw-r--r--test/CodeGen/X86/v4i32load-crash.ll5
-rw-r--r--test/CodeGen/X86/vaargs.ll67
-rw-r--r--test/CodeGen/X86/vastart-defs-eflags.ll23
-rw-r--r--test/CodeGen/X86/vbinop-simplify-bug.ll23
-rw-r--r--test/CodeGen/X86/vec_round.ll2
-rw-r--r--test/CodeGen/X86/vec_setcc-2.ll96
-rw-r--r--test/CodeGen/X86/vec_setcc.ll18
-rw-r--r--test/CodeGen/X86/vec_shift4.ll2
-rw-r--r--test/CodeGen/X86/vec_shift5.ll160
-rw-r--r--test/CodeGen/X86/vec_shift6.ll134
-rw-r--r--test/CodeGen/X86/vec_shuf-insert.ll29
-rw-r--r--test/CodeGen/X86/vec_shuffle-40.ll22
-rw-r--r--test/CodeGen/X86/vector-gep.ll31
-rw-r--r--test/CodeGen/X86/viabs.ll87
-rw-r--r--test/CodeGen/X86/vselect-2.ll33
-rw-r--r--test/CodeGen/X86/vselect.ll264
-rw-r--r--test/CodeGen/X86/vshift-4.ll2
-rw-r--r--test/CodeGen/X86/vshift-6.ll36
-rw-r--r--test/CodeGen/X86/warn-stack.ll2
-rw-r--r--test/CodeGen/X86/weak_def_can_be_hidden.ll37
-rw-r--r--test/CodeGen/X86/widen_load-2.ll2
-rw-r--r--test/CodeGen/X86/win32_sret.ll98
-rw-r--r--test/CodeGen/X86/win64_alloca_dynalloca.ll6
-rw-r--r--test/CodeGen/X86/win_chkstk.ll6
-rw-r--r--test/CodeGen/X86/x86-64-double-precision-shift-left.ll77
-rw-r--r--test/CodeGen/X86/x86-64-double-precision-shift-right.ll74
-rw-r--r--test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll67
-rw-r--r--test/CodeGen/X86/x86-64-double-shifts-var.ll57
-rw-r--r--test/CodeGen/X86/x86-shifts.ll2
-rw-r--r--test/CodeGen/X86/zlib-longest-match.ll240
-rw-r--r--test/CodeGen/XCore/align.ll15
-rw-r--r--test/CodeGen/XCore/atomic.ll76
-rw-r--r--test/CodeGen/XCore/bigstructret.ll39
-rw-r--r--test/CodeGen/XCore/byVal.ll2
-rw-r--r--test/CodeGen/XCore/call.ll10
-rw-r--r--test/CodeGen/XCore/codemodel.ll213
-rw-r--r--test/CodeGen/XCore/epilogue_prologue.ll222
-rw-r--r--test/CodeGen/XCore/exception.ll3
-rw-r--r--test/CodeGen/XCore/globals.ll53
-rw-r--r--test/CodeGen/XCore/inline-asm.ll21
-rw-r--r--test/CodeGen/XCore/linkage.ll12
-rw-r--r--test/CodeGen/XCore/llvm-intrinsics.ll363
-rw-r--r--test/CodeGen/XCore/load.ll2
-rw-r--r--test/CodeGen/XCore/memcpy.ll32
-rw-r--r--test/CodeGen/XCore/resources.ll16
-rw-r--r--test/CodeGen/XCore/resources_combine.ll93
-rw-r--r--test/CodeGen/XCore/scavenging.ll69
1417 files changed, 87220 insertions, 4801 deletions
diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll
new file mode 100644
index 0000000..502fd70
--- /dev/null
+++ b/test/CodeGen/AArch64/128bit_load_store.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+
+define void @test_store_f128(fp128* %ptr, fp128 %val) #0 {
+; CHECK: test_store_f128
+; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+ store fp128 %val, fp128* %ptr, align 16
+ ret void
+}
+
+define fp128 @test_load_f128(fp128* readonly %ptr) #2 {
+; CHECK: test_load_f128
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+ %0 = load fp128* %ptr, align 16
+ ret fp128 %0
+}
+
+define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
+; CHECK: test_vstrq_p128
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #8]
+; CHECK-NEXT: str {{x[0-9]+}}, [{{x[0-9]+}}]
+entry:
+ %0 = bitcast i128* %ptr to fp128*
+ %1 = bitcast i128 %val to fp128
+ store fp128 %1, fp128* %0, align 16
+ ret void
+}
+
+define i128 @test_vldrq_p128(i128* readonly %ptr) #2 {
+; CHECK: test_vldrq_p128
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-NEXT: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #8]
+entry:
+ %0 = bitcast i128* %ptr to fp128*
+ %1 = load fp128* %0, align 16
+ %2 = bitcast fp128 %1 to i128
+ ret i128 %2
+}
+
+define void @test_ld_st_p128(i128* nocapture %ptr) #0 {
+; CHECK: test_ld_st_p128
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-NEXT: str {{q[0-9]+}}, [{{x[0-9]+}}, #16]
+entry:
+ %0 = bitcast i128* %ptr to fp128*
+ %1 = load fp128* %0, align 16
+ %add.ptr = getelementptr inbounds i128* %ptr, i64 1
+ %2 = bitcast i128* %add.ptr to fp128*
+ store fp128 %1, fp128* %2, align 16
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll
index 26fd3e6..29637d3 100644
--- a/test/CodeGen/AArch64/adc.ll
+++ b/test/CodeGen/AArch64/adc.ll
@@ -1,15 +1,20 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
define i128 @test_simple(i128 %a, i128 %b, i128 %c) {
; CHECK-LABEL: test_simple:
%valadd = add i128 %a, %b
-; CHECK: adds [[ADDLO:x[0-9]+]], x0, x2
-; CHECK-NEXT: adcs [[ADDHI:x[0-9]+]], x1, x3
+; CHECK-LE: adds [[ADDLO:x[0-9]+]], x0, x2
+; CHECK-LE-NEXT: adcs [[ADDHI:x[0-9]+]], x1, x3
+; CHECK-BE: adds [[ADDLO:x[0-9]+]], x1, x3
+; CHECK-BE-NEXT: adcs [[ADDHI:x[0-9]+]], x0, x2
%valsub = sub i128 %valadd, %c
-; CHECK: subs x0, [[ADDLO]], x4
-; CHECK: sbcs x1, [[ADDHI]], x5
+; CHECK-LE: subs x0, [[ADDLO]], x4
+; CHECK-LE: sbcs x1, [[ADDHI]], x5
+; CHECK-BE: subs x1, [[ADDLO]], x5
+; CHECK-BE: sbcs x0, [[ADDHI]], x4
ret i128 %valsub
; CHECK: ret
@@ -19,8 +24,10 @@ define i128 @test_imm(i128 %a) {
; CHECK-LABEL: test_imm:
%val = add i128 %a, 12
-; CHECK: adds x0, x0, #12
-; CHECK: adcs x1, x1, {{x[0-9]|xzr}}
+; CHECK-LE: adds x0, x0, #12
+; CHECK-LE: adcs x1, x1, {{x[0-9]|xzr}}
+; CHECK-BE: adds x1, x1, #12
+; CHECK-BE: adcs x0, x0, {{x[0-9]|xzr}}
ret i128 %val
; CHECK: ret
@@ -32,8 +39,10 @@ define i128 @test_shifted(i128 %a, i128 %b) {
%rhs = shl i128 %b, 45
%val = add i128 %a, %rhs
-; CHECK: adds x0, x0, x2, lsl #45
-; CHECK: adcs x1, x1, {{x[0-9]}}
+; CHECK-LE: adds x0, x0, x2, lsl #45
+; CHECK-LE: adcs x1, x1, {{x[0-9]}}
+; CHECK-BE: adds x1, x1, x3, lsl #45
+; CHECK-BE: adcs x0, x0, {{x[0-9]}}
ret i128 %val
; CHECK: ret
@@ -46,8 +55,10 @@ define i128 @test_extended(i128 %a, i16 %b) {
%rhs = shl i128 %ext, 3
%val = add i128 %a, %rhs
-; CHECK: adds x0, x0, w2, sxth #3
-; CHECK: adcs x1, x1, {{x[0-9]}}
+; CHECK-LE: adds x0, x0, w2, sxth #3
+; CHECK-LE: adcs x1, x1, {{x[0-9]}}
+; CHECK-BE: adds x1, x1, w2, sxth #3
+; CHECK-BE: adcs x0, x0, {{x[0-9]}}
ret i128 %val
; CHECK: ret
diff --git a/test/CodeGen/AArch64/assertion-rc-mismatch.ll b/test/CodeGen/AArch64/assertion-rc-mismatch.ll
new file mode 100644
index 0000000..02b0c0e
--- /dev/null
+++ b/test/CodeGen/AArch64/assertion-rc-mismatch.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; Test case related to <rdar://problem/15633429>.
+
+; CHECK-LABEL: small
+define i64 @small(i64 %encodedBase) {
+cmp:
+ %lnot.i.i = icmp eq i64 %encodedBase, 0
+ br i1 %lnot.i.i, label %if, label %else
+if:
+ %tmp1 = call i8* @llvm.returnaddress(i32 0)
+ br label %end
+else:
+ %tmp3 = call i8* @llvm.returnaddress(i32 0)
+ %ptr = getelementptr inbounds i8* %tmp3, i64 -16
+ %ld = load i8* %ptr, align 4
+ %tmp2 = inttoptr i8 %ld to i8*
+ br label %end
+end:
+ %tmp = phi i8* [ %tmp1, %if ], [ %tmp2, %else ]
+ %coerce.val.pi56 = ptrtoint i8* %tmp to i64
+ ret i64 %coerce.val.pi56
+}
+
+declare i8* @llvm.returnaddress(i32)
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index de84ff4..5fe2936 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-REG %s
@var8 = global i8 0
@var16 = global i16 0
@@ -17,6 +18,8 @@ define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -37,6 +40,8 @@ define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -57,6 +62,8 @@ define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -77,6 +84,8 @@ define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:x[0-9]+]], x[[OLD]], x0
+; CHECK-REG: add x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
+; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -97,6 +106,8 @@ define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -117,6 +128,8 @@ define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stlxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -137,6 +150,8 @@ define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -157,6 +172,8 @@ define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:x[0-9]+]], x[[OLD]], x0
+; CHECK-REG: sub x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
+; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -177,6 +194,8 @@ define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -197,6 +216,8 @@ define i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -217,6 +238,8 @@ define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -237,6 +260,8 @@ define i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:x[0-9]+]], x[[OLD]], x0
+; CHECK-REG: and x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
+; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -257,6 +282,8 @@ define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -277,6 +304,8 @@ define i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -297,6 +326,8 @@ define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -317,6 +348,8 @@ define i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:x[0-9]+]], x[[OLD]], x0
+; CHECK-REG: orr x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
+; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -337,6 +370,8 @@ define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -357,6 +392,8 @@ define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -377,6 +414,8 @@ define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
+; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
+; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -397,6 +436,8 @@ define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:x[0-9]+]], x[[OLD]], x0
+; CHECK-REG: eor x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
+; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -416,6 +457,7 @@ define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
+; CHECK-REG-NOT: stxrb w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -435,6 +477,7 @@ define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
+; CHECK-REG-NOT: stlxrh w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -454,6 +497,7 @@ define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
+; CHECK-REG-NOT: stlxr w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -473,6 +517,7 @@ define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
+; CHECK-REG-NOT: stxr w0, x0, [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], x0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -495,6 +540,8 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]], sxtb
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
+; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -516,6 +563,8 @@ define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]], sxth
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
+; CHECK-REG-NOT: stlxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -537,6 +586,8 @@ define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]]
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
+; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -558,6 +609,8 @@ define i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp x0, x[[OLD]]
; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt
+; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, gt
+; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -579,6 +632,8 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]], sxtb
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
+; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -600,6 +655,8 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]], sxth
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
+; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -621,6 +678,8 @@ define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]]
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
+; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -642,6 +701,8 @@ define i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp x0, x[[OLD]]
; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lt
+; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, lt
+; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -663,6 +724,8 @@ define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]], uxtb
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
+; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -684,6 +747,8 @@ define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]], uxth
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
+; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -705,6 +770,8 @@ define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]]
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
+; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -726,6 +793,8 @@ define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp x0, x[[OLD]]
; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi
+; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, hi
+; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -747,6 +816,8 @@ define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]], uxtb
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
+; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -768,6 +839,8 @@ define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]], uxth
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
+; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -789,6 +862,8 @@ define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp w0, w[[OLD]]
; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
+; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
+; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -810,6 +885,8 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
; function there.
; CHECK-NEXT: cmp x0, x[[OLD]]
; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lo
+; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, lo
+; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -820,7 +897,7 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i8:
- %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire
+ %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
@@ -832,6 +909,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
+; CHECK-REG-NOT: stxrb w1, w1, [x{{[0-9]+}}]
; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -842,7 +920,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i16:
- %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst
+ %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
@@ -854,6 +932,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
+; CHECK-REG-NOT: stlxrh w1, w1, [x{{[0-9]+}}]
; CHECK: stlxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -864,7 +943,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i32:
- %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release
+ %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
@@ -876,6 +955,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
+; CHECK-REG-NOT: stlxr w1, w1, [x{{[0-9]+}}]
; CHECK: stlxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -886,7 +966,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i64:
- %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic
+ %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
@@ -898,6 +978,7 @@ define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK-NEXT: cmp x[[OLD]], x0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
+; CHECK-REG-NOT: stxr w1, x1, [x{{[0-9]+}}]
; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
diff --git a/test/CodeGen/AArch64/concatvector-bugs.ll b/test/CodeGen/AArch64/concatvector-bugs.ll
new file mode 100644
index 0000000..5889e22
--- /dev/null
+++ b/test/CodeGen/AArch64/concatvector-bugs.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon
+; Bug: i8 type in FPR8 register but not registered with the register class causes a segmentation fault.
+; Fix: Removed i8 type from FPR8 register class.
+
+define void @test_concatvector_v8i8() {
+entry.split:
+ br i1 undef, label %if.then, label %if.end
+
+if.then: ; preds = %entry.split
+ unreachable
+
+if.end: ; preds = %entry.split
+ br i1 undef, label %if.then9, label %if.end18
+
+if.then9: ; preds = %if.end
+ unreachable
+
+if.end18: ; preds = %if.end
+ br label %for.body
+
+for.body: ; preds = %for.inc, %if.end18
+ br i1 false, label %if.then30, label %for.inc
+
+if.then30: ; preds = %for.body
+ unreachable
+
+for.inc: ; preds = %for.body
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc
+ br label %for.body77
+
+for.body77: ; preds = %for.body77, %for.end
+ br i1 undef, label %for.end106, label %for.body77
+
+for.end106: ; preds = %for.body77
+ br i1 undef, label %for.body130.us.us, label %stmt.for.body130.us.us
+
+stmt.for.body130.us.us: ; preds = %stmt.for.body130.us.us, %for.end106
+ %_p_splat.us = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
+ store <8 x i8> %_p_splat.us, <8 x i8>* undef, align 1
+ br label %stmt.for.body130.us.us
+
+for.body130.us.us: ; preds = %for.body130.us.us, %for.end106
+ br label %for.body130.us.us
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32)
+
+define <8 x i16> @test_splat(i32 %l) nounwind {
+; CHECK-LABEL: test_splat:
+; CHECK: ret
+ %lhs = insertelement <1 x i32> undef, i32 %l, i32 0
+ %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11)
+ %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %vec
+}
+
+
+define <8 x i16> @test_notsplat(<8 x i16> %a, <8 x i16> %b, i32 %l) nounwind {
+; CHECK-LABEL: test_notsplat:
+; CHECK: ret
+entry:
+ %lhs = insertelement <1 x i32> undef, i32 %l, i32 0
+ %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11)
+ %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i16> %vec
+}
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
new file mode 100644
index 0000000..f0b60f0
--- /dev/null
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -0,0 +1,13 @@
+; This tests that llc accepts all valid AArch64 CPUs
+
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
+
+; CHECK-NOT: {{.*}} is not a recognized processor for this target
+; INVALID: {{.*}} is not a recognized processor for this target
+
+define i32 @f(i64 %z) {
+ ret i32 0
+}
diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll
index b28eb3e..97427a7 100644
--- a/test/CodeGen/AArch64/fcvt-int.ll
+++ b/test/CodeGen/AArch64/fcvt-int.ll
@@ -69,7 +69,7 @@ define float @test_i32tofloat(i32 %in) {
; CHECK-DAG: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}}
%res = fsub float %signed, %unsigned
-; CHECL: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]]
+; CHECK: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]]
ret float %res
; CHECK: ret
}
diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll
index 590557f..2a6790e 100644
--- a/test/CodeGen/AArch64/fp-dp3.ll
+++ b/test/CodeGen/AArch64/fp-dp3.ll
@@ -26,8 +26,9 @@ define float @test_fmsub(float %a, float %b, float %c) {
define float @test_fnmadd(float %a, float %b, float %c) {
; CHECK-LABEL: test_fnmadd:
; CHECK-NOFAST-LABEL: test_fnmadd:
+ %nega = fsub float -0.0, %a
%negc = fsub float -0.0, %c
- %val = call float @llvm.fma.f32(float %a, float %b, float %negc)
+ %val = call float @llvm.fma.f32(float %nega, float %b, float %negc)
; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-NOFAST: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %val
@@ -36,9 +37,8 @@ define float @test_fnmadd(float %a, float %b, float %c) {
define float @test_fnmsub(float %a, float %b, float %c) {
; CHECK-LABEL: test_fnmsub:
; CHECK-NOFAST-LABEL: test_fnmsub:
- %nega = fsub float -0.0, %a
%negc = fsub float -0.0, %c
- %val = call float @llvm.fma.f32(float %nega, float %b, float %negc)
+ %val = call float @llvm.fma.f32(float %a, float %b, float %negc)
; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-NOFAST: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret float %val
@@ -66,8 +66,9 @@ define double @testd_fmsub(double %a, double %b, double %c) {
define double @testd_fnmadd(double %a, double %b, double %c) {
; CHECK-LABEL: testd_fnmadd:
; CHECK-NOFAST-LABEL: testd_fnmadd:
+ %nega = fsub double -0.0, %a
%negc = fsub double -0.0, %c
- %val = call double @llvm.fma.f64(double %a, double %b, double %negc)
+ %val = call double @llvm.fma.f64(double %nega, double %b, double %negc)
; CHECK: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK-NOFAST: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret double %val
@@ -76,9 +77,8 @@ define double @testd_fnmadd(double %a, double %b, double %c) {
define double @testd_fnmsub(double %a, double %b, double %c) {
; CHECK-LABEL: testd_fnmsub:
; CHECK-NOFAST-LABEL: testd_fnmsub:
- %nega = fsub double -0.0, %a
%negc = fsub double -0.0, %c
- %val = call double @llvm.fma.f64(double %nega, double %b, double %negc)
+ %val = call double @llvm.fma.f64(double %a, double %b, double %negc)
; CHECK: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK-NOFAST: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret double %val
@@ -113,12 +113,13 @@ define float @test_fnmadd_unfused(float %a, float %b, float %c) {
; CHECK-NOFAST-LABEL: test_fnmadd_unfused:
%nega = fsub float -0.0, %a
%prod = fmul float %b, %c
- %sum = fadd float %nega, %prod
+ %diff = fsub float %nega, %prod
; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-NOFAST-NOT: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret float %sum
+; CHECK-NOFAST: ret
+ ret float %diff
}
define float @test_fnmsub_unfused(float %a, float %b, float %c) {
@@ -126,12 +127,37 @@ define float @test_fnmsub_unfused(float %a, float %b, float %c) {
; CHECK-NOFAST-LABEL: test_fnmsub_unfused:
%nega = fsub float -0.0, %a
%prod = fmul float %b, %c
- %diff = fsub float %nega, %prod
+ %sum = fadd float %nega, %prod
; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-NOFAST-NOT: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST-DAG: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST-DAG: fneg {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST-DAG: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST: ret
- ret float %diff
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret float %sum
}
+
+; Another set of tests that check for multiply single use
+
+define float @test_fmadd_unfused_su(float %a, float %b, float %c) {
+; CHECK-LABEL: test_fmadd_unfused_su:
+ %prod = fmul float %b, %c
+ %sum = fadd float %a, %prod
+ %res = fadd float %sum, %prod
+; CHECK-NOT: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret float %res
+}
+
+define float @test_fmsub_unfused_su(float %a, float %b, float %c) {
+; CHECK-LABEL: test_fmsub_unfused_su:
+ %prod = fmul float %b, %c
+ %diff = fsub float %a, %prod
+ %res = fsub float %diff, %prod
+; CHECK-NOT: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret float %res
+}
+
diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll
index 430d77f..f307686 100644
--- a/test/CodeGen/AArch64/func-argpassing.ll
+++ b/test/CodeGen/AArch64/func-argpassing.ll
@@ -1,5 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
%myStruct = type { i64 , i8, i32 }
@@ -146,7 +148,8 @@ define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var
%retval = load volatile i32* %stacked
ret i32 %retval
-; CHECK: ldr w0, [sp, #16]
+; CHECK-LE: ldr w0, [sp, #16]
+; CHECK-BE: ldr w0, [sp, #20]
}
define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
@@ -180,8 +183,10 @@ define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3,
; CHECK: check_i128_stackalign
store i128 %stack2, i128* @var128
; Nothing local on the stack in current codegen, so the first stack slot is 16 bytes away
-; CHECK: add x[[REG:[0-9]+]], sp, #16
-; CHECK: ldr {{x[0-9]+}}, [x[[REG]], #8]
+; CHECK-LE: add x[[REG:[0-9]+]], sp, #16
+; CHECK-LE: ldr {{x[0-9]+}}, [x[[REG]], #8]
+; CHECK-BE: ldr {{x[0-9]+}}, [sp, #24]
+
; Important point is that we address sp+24 for second dword
; CHECK: ldr {{x[0-9]+}}, [sp, #16]
ret void
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index ac188bb..f029bf2 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -1,5 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-BE --check-prefix=CHECK-NOFP %s
%myStruct = type { i64 , i8, i32 }
@@ -126,8 +128,10 @@ define void @check_i128_align() {
call void @check_i128_regalign(i32 0, i128 42)
; CHECK-NOT: mov x1
-; CHECK: movz x2, #42
-; CHECK: mov x3, xzr
+; CHECK-LE: movz x2, #42
+; CHECK-LE: mov x3, xzr
+; CHECK-BE: movz x3, #42
+; CHECK-BE: mov x2, xzr
; CHECK: bl check_i128_regalign
ret void
diff --git a/test/CodeGen/AArch64/i128-shift.ll b/test/CodeGen/AArch64/i128-shift.ll
new file mode 100644
index 0000000..d786d44
--- /dev/null
+++ b/test/CodeGen/AArch64/i128-shift.ll
@@ -0,0 +1,43 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i128 @test_i128_lsl(i128 %a, i32 %shift) {
+; CHECK-LABEL: test_i128_lsl:
+
+ %sh_prom = zext i32 %shift to i128
+ %shl = shl i128 %a, %sh_prom
+
+; CHECK: movz [[SIXTYFOUR:x[0-9]+]], #64
+; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw
+; CHECK-NEXT: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[REVSHAMT]]
+; CHECK: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[SHAMT:x[0-9]+]]
+; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]]
+; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64
+; CHECK-NEXT: lsl [[TMP3:x[0-9]+]], [[LO]], [[EXTRASHAMT]]
+; CHECK-NEXT: cmp [[EXTRASHAMT]], #0
+; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], [[TMP3]], [[FALSEVAL]], ge
+; CHECK-NEXT: lsl [[TMP4:x[0-9]+]], [[LO]], [[SHAMT]]
+; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], xzr, [[TMP4]], ge
+
+ ret i128 %shl
+}
+
+define i128 @test_i128_shr(i128 %a, i32 %shift) {
+; CHECK-LABEL: test_i128_shr:
+
+ %sh_prom = zext i32 %shift to i128
+ %shr = lshr i128 %a, %sh_prom
+
+; CHECK: movz [[SIXTYFOUR]], #64
+; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw
+; CHECK-NEXT: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[REVSHAMT]]
+; CHECK: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[SHAMT:x[0-9]+]]
+; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]]
+; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64
+; CHECK-NEXT: lsr [[TRUEVAL:x[0-9]+]], [[HI]], [[EXTRASHAMT]]
+; CHECK-NEXT: cmp [[EXTRASHAMT]], #0
+; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], [[TRUEVAL]], [[FALSEVAL]], ge
+; CHECK-NEXT: lsr [[TMP3:x[0-9]+]], [[HI]], [[SHAMT]]
+; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], xzr, [[TMP3]], ge
+
+ ret i128 %shr
+}
diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll
index 3ff1c1a..076ae27 100644
--- a/test/CodeGen/AArch64/init-array.ll
+++ b/test/CodeGen/AArch64/init-array.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array < %s | FileCheck %s
define internal void @_GLOBAL__I_a() section ".text.startup" {
ret void
diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll
index 18a3b37..365453c 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints.ll
@@ -1,4 +1,4 @@
-;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -no-integrated-as < %s | FileCheck %s
define i64 @test_inline_constraint_r(i64 %base, i32 %offset) {
; CHECK-LABEL: test_inline_constraint_r:
diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll
index b7f4d3c..cb66335 100644
--- a/test/CodeGen/AArch64/inline-asm-modifiers.ll
+++ b/test/CodeGen/AArch64/inline-asm-modifiers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -no-integrated-as < %s | FileCheck %s
@var_simple = hidden global i32 0
@var_got = global i32 0
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 4bb0942..94717f5 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,5 +1,6 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic <%s | FileCheck --check-prefix=CHECK-PIC %s
define i32 @test_jumptable(i32 %in) {
; CHECK: test_jumptable
@@ -22,6 +23,12 @@ define i32 @test_jumptable(i32 %in) {
; CHECK-LARGE: ldr [[DEST:x[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}, lsl #3]
; CHECK-LARGE: br [[DEST]]
+; CHECK-PIC: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
+; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], #:lo12:.LJTI0_0
+; CHECK-PIC: ldrsw [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #2]
+; CHECK-PIC: add [[TABLE:x[0-9]+]], [[DEST]], x[[JT]]
+; CHECK-PIC: br [[TABLE]]
+
def:
ret i32 0
@@ -47,3 +54,12 @@ lbl4:
; CHECK-NEXT: .xword
; CHECK-NEXT: .xword
; CHECK-NEXT: .xword
+
+; CHECK-PIC-NOT: .data_region
+; CHECK-PIC: .LJTI0_0:
+; CHECK-PIC-NEXT: .word
+; CHECK-PIC-NEXT: .word
+; CHECK-PIC-NEXT: .word
+; CHECK-PIC-NEXT: .word
+; CHECK-PIC-NEXT: .word
+; CHECK-PIC-NOT: .end_data_region
diff --git a/test/CodeGen/AArch64/mature-mc-support.ll b/test/CodeGen/AArch64/mature-mc-support.ll
new file mode 100644
index 0000000..06e3cc7
--- /dev/null
+++ b/test/CodeGen/AArch64/mature-mc-support.ll
@@ -0,0 +1,12 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+
+; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t1
+; RUN: FileCheck %s < %t1
+
+; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/AArch64/misched-basic-A53.ll b/test/CodeGen/AArch64/misched-basic-A53.ll
new file mode 100644
index 0000000..1555c48
--- /dev/null
+++ b/test/CodeGen/AArch64/misched-basic-A53.ll
@@ -0,0 +1,112 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+;
+; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
+; much higher than the ADD instructions in order to hide latency. When not
+; specifying a subtarget, the MADD will remain near the end of the block.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: main
+; CHECK: *** Final schedule for BB#2 ***
+; CHECK: SU(13)
+; CHECK: MADDwwww
+; CHECK: SU(4)
+; CHECK: ADDwwi_lsl0_s
+; CHECK: ********** INTERVALS **********
+@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
+@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+ %retval = alloca i32, align 4
+ %x = alloca [8 x i32], align 4
+ %y = alloca [8 x i32], align 4
+ %i = alloca i32, align 4
+ %xx = alloca i32, align 4
+ %yy = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = bitcast [8 x i32]* %x to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([8 x i32]* @main.x to i8*), i64 32, i32 4, i1 false)
+ %1 = bitcast [8 x i32]* %y to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([8 x i32]* @main.y to i8*), i64 32, i32 4, i1 false)
+ store i32 0, i32* %xx, align 4
+ store i32 0, i32* %yy, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %2 = load i32* %i, align 4
+ %cmp = icmp slt i32 %2, 8
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %3 = load i32* %i, align 4
+ %idxprom = sext i32 %3 to i64
+ %arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom
+ %4 = load i32* %arrayidx, align 4
+ %add = add nsw i32 %4, 1
+ store i32 %add, i32* %xx, align 4
+ %5 = load i32* %xx, align 4
+ %add1 = add nsw i32 %5, 12
+ store i32 %add1, i32* %xx, align 4
+ %6 = load i32* %xx, align 4
+ %add2 = add nsw i32 %6, 23
+ store i32 %add2, i32* %xx, align 4
+ %7 = load i32* %xx, align 4
+ %add3 = add nsw i32 %7, 34
+ store i32 %add3, i32* %xx, align 4
+ %8 = load i32* %i, align 4
+ %idxprom4 = sext i32 %8 to i64
+ %arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4
+ %9 = load i32* %arrayidx5, align 4
+ %10 = load i32* %yy, align 4
+ %mul = mul nsw i32 %10, %9
+ store i32 %mul, i32* %yy, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %11 = load i32* %i, align 4
+ %inc = add nsw i32 %11, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %12 = load i32* %xx, align 4
+ %13 = load i32* %yy, align 4
+ %add6 = add nsw i32 %12, %13
+ ret i32 %add6
+}
+
+
+; The Cortex-A53 machine model will cause the FDIVvvv_4S to be raised to
+; hide latency. Whereas normally there would be only a single FADDvvv_4S
+; after it, this test checks to make sure there is more than one.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: neon4xfloat:BB#0
+; CHECK: *** Final schedule for BB#0 ***
+; CHECK: FDIVvvv_4S
+; CHECK: FADDvvv_4S
+; CHECK: FADDvvv_4S
+; CHECK: ********** INTERVALS **********
+define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
+ %tmp1 = fadd <4 x float> %A, %B;
+ %tmp2 = fadd <4 x float> %A, %tmp1;
+ %tmp3 = fadd <4 x float> %A, %tmp2;
+ %tmp4 = fadd <4 x float> %A, %tmp3;
+ %tmp5 = fadd <4 x float> %A, %tmp4;
+ %tmp6 = fadd <4 x float> %A, %tmp5;
+ %tmp7 = fadd <4 x float> %A, %tmp6;
+ %tmp8 = fadd <4 x float> %A, %tmp7;
+ %tmp9 = fdiv <4 x float> %A, %B;
+ %tmp10 = fadd <4 x float> %tmp8, %tmp9;
+
+ ret <4 x float> %tmp10
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
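The @main body in the hunk above is unoptimized IR for a small fixed-count loop. A minimal C sketch of the source it plausibly corresponds to (an assumption for illustration only, not part of the patch) makes the dependence pattern the scheduler is juggling easier to see: the four chained adds rewrite xx every iteration, while the single multiply updates yy and is presumably what surfaces as the MADDwwww that the A53 model hoists ahead of the ADDs.

static const int x[8] = {1, 1, 1, 1, 1, 1, 1, 1};
static const int y[8] = {2, 2, 2, 2, 2, 2, 2, 2};

int main(void) {
  int xx = 0, yy = 0;
  for (int i = 0; i < 8; i++) {
    xx = x[i] + 1;   /* overwritten, then bumped three more times */
    xx += 12;
    xx += 23;
    xx += 34;
    yy *= y[i];      /* the multiply the A53 model schedules early */
  }
  return xx + yy;
}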
diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll
new file mode 100644
index 0000000..f58c598
--- /dev/null
+++ b/test/CodeGen/AArch64/mul-lohi.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
+
+define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
+; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
+; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
+; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul x0, x0, x2
+
+; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
+; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
+; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul x1, x1, x3
+
+ %prod = mul i128 %lhs, %rhs
+ ret i128 %prod
+}
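The CHECK lines above encode the usual 64x64-to-128 decomposition of a 128-bit multiply: the low half is a plain 64-bit mul, and the high half accumulates the high part of that product (umulh) plus the two cross products via madd. A small C sketch of that decomposition, with hypothetical helper names (u128, umulh64, mul128) and the GCC/Clang unsigned __int128 extension used only to express the high-half multiply; the register comments follow the little-endian CHECK lines, not a guaranteed allocation:

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;   /* hypothetical 128-bit pair */

/* High 64 bits of a 64x64-bit product (GCC/Clang extension). */
static uint64_t umulh64(uint64_t a, uint64_t b) {
  return (uint64_t)(((unsigned __int128)a * b) >> 64);
}

u128 mul128(u128 l, u128 r) {
  u128 p;
  p.lo = l.lo * r.lo;                    /* mul   x0, x0, x2           */
  uint64_t carry = umulh64(l.lo, r.lo);  /* umulh carry, x0, x2        */
  uint64_t part1 = l.lo * r.hi + carry;  /* madd  part1, x0, x3, carry */
  p.hi = l.hi * r.lo + part1;            /* madd  x1, x1, x2, part1    */
  return p;
}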
diff --git a/test/CodeGen/AArch64/neon-2velem.ll b/test/CodeGen/AArch64/neon-2velem.ll
index 9d61842..acffb14 100644
--- a/test/CodeGen/AArch64/neon-2velem.ll
+++ b/test/CodeGen/AArch64/neon-2velem.ll
@@ -45,6 +45,7 @@ declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmla_lane_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %b
@@ -55,6 +56,7 @@ entry:
define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlaq_lane_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %b
@@ -65,6 +67,7 @@ entry:
define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmla_lane_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %b
@@ -75,6 +78,7 @@ entry:
define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlaq_lane_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %b
@@ -85,6 +89,7 @@ entry:
define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmla_laneq_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -95,6 +100,7 @@ entry:
define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlaq_laneq_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %b
@@ -105,6 +111,7 @@ entry:
define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmla_laneq_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -115,6 +122,7 @@ entry:
define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlaq_laneq_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %b
@@ -125,6 +133,7 @@ entry:
define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmls_lane_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %b
@@ -135,6 +144,7 @@ entry:
define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsq_lane_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %b
@@ -145,6 +155,7 @@ entry:
define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmls_lane_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %b
@@ -155,6 +166,7 @@ entry:
define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsq_lane_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %b
@@ -165,6 +177,7 @@ entry:
define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmls_laneq_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -175,6 +188,7 @@ entry:
define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsq_laneq_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %b
@@ -185,6 +199,7 @@ entry:
define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmls_laneq_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -195,6 +210,7 @@ entry:
define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsq_laneq_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %b
@@ -205,6 +221,7 @@ entry:
define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmul_lane_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %a
@@ -214,6 +231,7 @@ entry:
define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmulq_lane_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %a
@@ -223,6 +241,7 @@ entry:
define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmul_lane_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %a
@@ -232,6 +251,7 @@ entry:
define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmulq_lane_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %a
@@ -241,6 +261,7 @@ entry:
define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmul_lane_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %a
@@ -250,6 +271,7 @@ entry:
define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmulq_lane_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %a
@@ -259,6 +281,7 @@ entry:
define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmul_lane_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %a
@@ -268,6 +291,7 @@ entry:
define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmulq_lane_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %a
@@ -277,6 +301,7 @@ entry:
define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmul_laneq_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -286,6 +311,7 @@ entry:
define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmulq_laneq_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %a
@@ -295,6 +321,7 @@ entry:
define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmul_laneq_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -304,6 +331,7 @@ entry:
define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmulq_laneq_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %a
@@ -313,6 +341,7 @@ entry:
define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmul_laneq_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -322,6 +351,7 @@ entry:
define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmulq_laneq_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %a
@@ -331,6 +361,7 @@ entry:
define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmul_laneq_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -340,6 +371,7 @@ entry:
define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmulq_laneq_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %a
@@ -349,6 +381,7 @@ entry:
define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK: test_vfma_lane_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -360,6 +393,7 @@ declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK: test_vfmaq_lane_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -371,6 +405,7 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK: test_vfma_laneq_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -380,6 +415,7 @@ entry:
define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK: test_vfmaq_laneq_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -389,6 +425,7 @@ entry:
define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK: test_vfms_lane_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -399,6 +436,7 @@ entry:
define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK: test_vfmsq_lane_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -409,6 +447,7 @@ entry:
define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK: test_vfms_laneq_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -419,6 +458,7 @@ entry:
define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK: test_vfmsq_laneq_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -429,6 +469,7 @@ entry:
define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
; CHECK: test_vfmaq_lane_f64:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -440,6 +481,7 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK: test_vfmaq_laneq_f64:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -449,6 +491,7 @@ entry:
define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
; CHECK: test_vfmsq_lane_f64:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%sub = fsub <1 x double> <double -0.000000e+00>, %v
%lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -459,6 +502,7 @@ entry:
define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK: test_vfmsq_laneq_f64:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
entry:
%sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
%lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -466,9 +510,57 @@ entry:
ret <2 x double> %0
}
+define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
+; CHECK-LABEL: test_vfmas_laneq_f32
+; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
+entry:
+ %extract = extractelement <4 x float> %v, i32 3
+ %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
+ ret float %0
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
+; CHECK-LABEL: test_vfmsd_lane_f64
+; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
+entry:
+ %extract.rhs = extractelement <1 x double> %v, i32 0
+ %extract = fsub double -0.000000e+00, %extract.rhs
+ %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
+ ret double %0
+}
+
+declare double @llvm.fma.f64(double, double, double)
+
+define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
+; CHECK-LABEL: test_vfmss_laneq_f32
+; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
+entry:
+ %extract.rhs = extractelement <4 x float> %v, i32 3
+ %extract = fsub float -0.000000e+00, %extract.rhs
+ %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
+ ret float %0
+}
+
+define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfmsd_laneq_f64
+; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
+entry:
+ %extract.rhs = extractelement <2 x double> %v, i32 1
+ %extract = fsub double -0.000000e+00, %extract.rhs
+ %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
+ ret double %0
+}
+
define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlal_lane_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -479,6 +571,7 @@ entry:
define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlal_lane_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -489,6 +582,7 @@ entry:
define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlal_laneq_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -499,6 +593,7 @@ entry:
define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlal_laneq_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -509,6 +604,7 @@ entry:
define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlal_high_lane_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -520,6 +616,7 @@ entry:
define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlal_high_lane_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -531,6 +628,7 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlal_high_laneq_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -542,6 +640,7 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlal_high_laneq_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -553,6 +652,7 @@ entry:
define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsl_lane_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -563,6 +663,7 @@ entry:
define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsl_lane_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -573,6 +674,7 @@ entry:
define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsl_laneq_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -583,6 +685,7 @@ entry:
define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsl_laneq_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -593,6 +696,7 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsl_high_lane_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -604,6 +708,7 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsl_high_lane_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -615,6 +720,7 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsl_high_laneq_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -626,6 +732,7 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsl_high_laneq_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -637,6 +744,7 @@ entry:
define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlal_lane_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -647,6 +755,7 @@ entry:
define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlal_lane_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -657,6 +766,7 @@ entry:
define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlal_laneq_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -667,6 +777,7 @@ entry:
define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlal_laneq_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -677,6 +788,7 @@ entry:
define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlal_high_lane_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -688,6 +800,7 @@ entry:
define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlal_high_lane_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -699,6 +812,7 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlal_high_laneq_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -710,6 +824,7 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlal_high_laneq_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -721,6 +836,7 @@ entry:
define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsl_lane_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -731,6 +847,7 @@ entry:
define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsl_lane_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -741,6 +858,7 @@ entry:
define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsl_laneq_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -751,6 +869,7 @@ entry:
define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsl_laneq_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -761,6 +880,7 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsl_high_lane_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -772,6 +892,7 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsl_high_lane_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -783,6 +904,7 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsl_high_laneq_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -794,6 +916,7 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsl_high_laneq_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -805,6 +928,7 @@ entry:
define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmull_lane_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -814,6 +938,7 @@ entry:
define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmull_lane_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -823,6 +948,7 @@ entry:
define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmull_lane_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -832,6 +958,7 @@ entry:
define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmull_lane_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -841,6 +968,7 @@ entry:
define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmull_high_lane_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -851,6 +979,7 @@ entry:
define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmull_high_lane_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -861,6 +990,7 @@ entry:
define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmull_high_lane_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -871,6 +1001,7 @@ entry:
define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmull_high_lane_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -881,6 +1012,7 @@ entry:
define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmull_laneq_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -890,6 +1022,7 @@ entry:
define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmull_laneq_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -899,6 +1032,7 @@ entry:
define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmull_laneq_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -908,6 +1042,7 @@ entry:
define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmull_laneq_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -917,6 +1052,7 @@ entry:
define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmull_high_laneq_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -927,6 +1063,7 @@ entry:
define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmull_high_laneq_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -937,6 +1074,7 @@ entry:
define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmull_high_laneq_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -947,6 +1085,7 @@ entry:
define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmull_high_laneq_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -957,6 +1096,7 @@ entry:
define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vqdmlal_lane_s16:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -967,6 +1107,7 @@ entry:
define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vqdmlal_lane_s32:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -977,6 +1118,7 @@ entry:
define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vqdmlal_high_lane_s16:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -988,6 +1130,7 @@ entry:
define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vqdmlal_high_lane_s32:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -999,6 +1142,7 @@ entry:
define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vqdmlsl_lane_s16:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1009,6 +1153,7 @@ entry:
define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vqdmlsl_lane_s32:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1019,6 +1164,7 @@ entry:
define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vqdmlsl_high_lane_s16:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1030,6 +1176,7 @@ entry:
define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vqdmlsl_high_lane_s32:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1041,6 +1188,7 @@ entry:
define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqdmull_lane_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1050,6 +1198,7 @@ entry:
define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqdmull_lane_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1059,6 +1208,7 @@ entry:
define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vqdmull_laneq_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1068,6 +1218,7 @@ entry:
define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vqdmull_laneq_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1077,6 +1228,7 @@ entry:
define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqdmull_high_lane_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1087,6 +1239,7 @@ entry:
define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqdmull_high_lane_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1097,6 +1250,7 @@ entry:
define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vqdmull_high_laneq_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1107,6 +1261,7 @@ entry:
define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vqdmull_high_laneq_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1117,6 +1272,7 @@ entry:
define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqdmulh_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1126,6 +1282,7 @@ entry:
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqdmulhq_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1135,6 +1292,7 @@ entry:
define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqdmulh_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1144,6 +1302,7 @@ entry:
define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqdmulhq_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1153,6 +1312,7 @@ entry:
define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqrdmulh_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1162,6 +1322,7 @@ entry:
define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqrdmulhq_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1171,6 +1332,7 @@ entry:
define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqrdmulh_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1180,6 +1342,7 @@ entry:
define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqrdmulhq_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1189,6 +1352,7 @@ entry:
define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK: test_vmul_lane_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%mul = fmul <2 x float> %shuffle, %a
@@ -1198,6 +1362,7 @@ entry:
define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
; CHECK: test_vmul_lane_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -1210,6 +1375,7 @@ entry:
define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
; CHECK: test_vmulq_lane_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = fmul <4 x float> %shuffle, %a
@@ -1219,6 +1385,7 @@ entry:
define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
; CHECK: test_vmulq_lane_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x double> %shuffle, %a
@@ -1228,6 +1395,7 @@ entry:
define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
; CHECK: test_vmul_laneq_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%mul = fmul <2 x float> %shuffle, %a
@@ -1237,6 +1405,7 @@ entry:
define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
; CHECK: test_vmul_laneq_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -1249,6 +1418,7 @@ entry:
define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
; CHECK: test_vmulq_laneq_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = fmul <4 x float> %shuffle, %a
@@ -1258,6 +1428,7 @@ entry:
define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
; CHECK: test_vmulq_laneq_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%mul = fmul <2 x double> %shuffle, %a
@@ -1267,6 +1438,7 @@ entry:
define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK: test_vmulx_lane_f32:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1276,6 +1448,7 @@ entry:
define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
; CHECK: test_vmulxq_lane_f32:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1285,6 +1458,7 @@ entry:
define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
; CHECK: test_vmulxq_lane_f64:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1294,6 +1468,7 @@ entry:
define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
; CHECK: test_vmulx_laneq_f32:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1303,6 +1478,7 @@ entry:
define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
; CHECK: test_vmulxq_laneq_f32:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1312,6 +1488,7 @@ entry:
define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
; CHECK: test_vmulxq_laneq_f64:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1321,6 +1498,7 @@ entry:
define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmla_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1331,6 +1509,7 @@ entry:
define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlaq_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1341,6 +1520,7 @@ entry:
define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmla_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1351,6 +1531,7 @@ entry:
define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlaq_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1361,6 +1542,7 @@ entry:
define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmla_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1371,6 +1553,7 @@ entry:
define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlaq_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1381,6 +1564,7 @@ entry:
define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmla_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1391,6 +1575,7 @@ entry:
define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlaq_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1401,6 +1586,7 @@ entry:
define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmls_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1411,6 +1597,7 @@ entry:
define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsq_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1421,6 +1608,7 @@ entry:
define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmls_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1431,6 +1619,7 @@ entry:
define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsq_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1441,6 +1630,7 @@ entry:
define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmls_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1451,6 +1641,7 @@ entry:
define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsq_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1461,6 +1652,7 @@ entry:
define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmls_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1471,6 +1663,7 @@ entry:
define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsq_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1481,6 +1674,7 @@ entry:
define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmul_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1490,6 +1684,7 @@ entry:
define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmulq_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1499,6 +1694,7 @@ entry:
define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmul_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1508,6 +1704,7 @@ entry:
define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmulq_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1517,6 +1714,7 @@ entry:
define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmul_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1526,6 +1724,7 @@ entry:
define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmulq_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1535,6 +1734,7 @@ entry:
define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmul_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1544,6 +1744,7 @@ entry:
define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmulq_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1553,6 +1754,7 @@ entry:
define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmul_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1562,6 +1764,7 @@ entry:
define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmulq_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1571,6 +1774,7 @@ entry:
define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmul_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1580,6 +1784,7 @@ entry:
define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmulq_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1589,6 +1794,7 @@ entry:
define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmul_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1598,6 +1804,7 @@ entry:
define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmulq_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1607,6 +1814,7 @@ entry:
define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmul_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1616,6 +1824,7 @@ entry:
define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmulq_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1625,6 +1834,7 @@ entry:
define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK: test_vfma_lane_f32_0:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1634,6 +1844,7 @@ entry:
define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK: test_vfmaq_lane_f32_0:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1643,6 +1854,7 @@ entry:
define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK: test_vfma_laneq_f32_0:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -1652,6 +1864,7 @@ entry:
define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK: test_vfmaq_laneq_f32_0:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -1661,6 +1874,7 @@ entry:
define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK: test_vfms_lane_f32_0:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -1671,6 +1885,7 @@ entry:
define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK: test_vfmsq_lane_f32_0:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -1681,6 +1896,7 @@ entry:
define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK: test_vfms_laneq_f32_0:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -1691,6 +1907,7 @@ entry:
define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK: test_vfmsq_laneq_f32_0:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -1701,6 +1918,7 @@ entry:
define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK: test_vfmaq_laneq_f64_0:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -1710,6 +1928,7 @@ entry:
define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK: test_vfmsq_laneq_f64_0:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
%lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -1720,6 +1939,7 @@ entry:
define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlal_lane_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1730,6 +1950,7 @@ entry:
define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlal_lane_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1740,6 +1961,7 @@ entry:
define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlal_laneq_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1750,6 +1972,7 @@ entry:
define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlal_laneq_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1760,6 +1983,7 @@ entry:
define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlal_high_lane_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -1771,6 +1995,7 @@ entry:
define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlal_high_lane_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -1782,6 +2007,7 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlal_high_laneq_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -1793,6 +2019,7 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlal_high_laneq_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -1804,6 +2031,7 @@ entry:
define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsl_lane_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1814,6 +2042,7 @@ entry:
define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsl_lane_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1824,6 +2053,7 @@ entry:
define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsl_laneq_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1834,6 +2064,7 @@ entry:
define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsl_laneq_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1844,6 +2075,7 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsl_high_lane_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -1855,6 +2087,7 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsl_high_lane_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -1866,6 +2099,7 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsl_high_laneq_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -1877,6 +2111,7 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsl_high_laneq_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -1888,6 +2123,7 @@ entry:
define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlal_lane_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1898,6 +2134,7 @@ entry:
define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlal_lane_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1908,6 +2145,7 @@ entry:
define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlal_laneq_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1918,6 +2156,7 @@ entry:
define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlal_laneq_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1928,6 +2167,7 @@ entry:
define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlal_high_lane_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -1939,6 +2179,7 @@ entry:
define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlal_high_lane_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -1950,6 +2191,7 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlal_high_laneq_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -1961,6 +2203,7 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlal_high_laneq_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -1972,6 +2215,7 @@ entry:
define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsl_lane_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1982,6 +2226,7 @@ entry:
define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsl_lane_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1992,6 +2237,7 @@ entry:
define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsl_laneq_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2002,6 +2248,7 @@ entry:
define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsl_laneq_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2012,6 +2259,7 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vmlsl_high_lane_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2023,6 +2271,7 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vmlsl_high_lane_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2034,6 +2283,7 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK: test_vmlsl_high_laneq_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2045,6 +2295,7 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK: test_vmlsl_high_laneq_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2056,6 +2307,7 @@ entry:
define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmull_lane_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2065,6 +2317,7 @@ entry:
define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmull_lane_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2074,6 +2327,7 @@ entry:
define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmull_lane_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2083,6 +2337,7 @@ entry:
define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmull_lane_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2092,6 +2347,7 @@ entry:
define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmull_high_lane_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2102,6 +2358,7 @@ entry:
define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmull_high_lane_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2112,6 +2369,7 @@ entry:
define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vmull_high_lane_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2122,6 +2380,7 @@ entry:
define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vmull_high_lane_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2132,6 +2391,7 @@ entry:
define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmull_laneq_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2141,6 +2401,7 @@ entry:
define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmull_laneq_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2150,6 +2411,7 @@ entry:
define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmull_laneq_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2159,6 +2421,7 @@ entry:
define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmull_laneq_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2168,6 +2431,7 @@ entry:
define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmull_high_laneq_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2178,6 +2442,7 @@ entry:
define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmull_high_laneq_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2188,6 +2453,7 @@ entry:
define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vmull_high_laneq_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2198,6 +2464,7 @@ entry:
define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vmull_high_laneq_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2208,6 +2475,7 @@ entry:
define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vqdmlal_lane_s16_0:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2218,6 +2486,7 @@ entry:
define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vqdmlal_lane_s32_0:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2228,6 +2497,7 @@ entry:
define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vqdmlal_high_lane_s16_0:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2239,6 +2509,7 @@ entry:
define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vqdmlal_high_lane_s32_0:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2250,6 +2521,7 @@ entry:
define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK: test_vqdmlsl_lane_s16_0:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2260,6 +2532,7 @@ entry:
define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK: test_vqdmlsl_lane_s32_0:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2270,6 +2543,7 @@ entry:
define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK: test_vqdmlsl_high_lane_s16_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2281,6 +2555,7 @@ entry:
define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK: test_vqdmlsl_high_lane_s32_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2292,6 +2567,7 @@ entry:
define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqdmull_lane_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2301,6 +2577,7 @@ entry:
define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqdmull_lane_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2310,6 +2587,7 @@ entry:
define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK: test_vqdmull_laneq_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2319,6 +2597,7 @@ entry:
define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK: test_vqdmull_laneq_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2328,6 +2607,7 @@ entry:
define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqdmull_high_lane_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2338,6 +2618,7 @@ entry:
define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqdmull_high_lane_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2348,6 +2629,7 @@ entry:
define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK: test_vqdmull_high_laneq_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2358,6 +2640,7 @@ entry:
define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK: test_vqdmull_high_laneq_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2368,6 +2651,7 @@ entry:
define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqdmulh_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2377,6 +2661,7 @@ entry:
define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqdmulhq_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -2386,6 +2671,7 @@ entry:
define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqdmulh_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2395,6 +2681,7 @@ entry:
define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqdmulhq_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -2404,6 +2691,7 @@ entry:
define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqrdmulh_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2413,6 +2701,7 @@ entry:
define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK: test_vqrdmulhq_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -2422,6 +2711,7 @@ entry:
define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqrdmulh_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2431,6 +2721,7 @@ entry:
define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK: test_vqrdmulhq_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -2440,6 +2731,7 @@ entry:
define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
; CHECK: test_vmul_lane_f32_0:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -2449,6 +2741,7 @@ entry:
define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
; CHECK: test_vmulq_lane_f32_0:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%mul = fmul <4 x float> %shuffle, %a
@@ -2458,6 +2751,7 @@ entry:
define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
; CHECK: test_vmul_laneq_f32_0:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -2467,6 +2761,7 @@ entry:
define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
; CHECK: test_vmul_laneq_f64_0:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -2479,6 +2774,7 @@ entry:
define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
; CHECK: test_vmulq_laneq_f32_0:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%mul = fmul <4 x float> %shuffle, %a
@@ -2488,6 +2784,7 @@ entry:
define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
; CHECK: test_vmulq_laneq_f64_0:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x double> %shuffle, %a
@@ -2497,6 +2794,7 @@ entry:
define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
; CHECK: test_vmulx_lane_f32_0:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2506,6 +2804,7 @@ entry:
define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
; CHECK: test_vmulxq_lane_f32_0:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2515,6 +2814,7 @@ entry:
define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
; CHECK: test_vmulxq_lane_f64_0:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -2524,6 +2824,7 @@ entry:
define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
; CHECK: test_vmulx_laneq_f32_0:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2533,6 +2834,7 @@ entry:
define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
; CHECK: test_vmulxq_laneq_f32_0:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -2542,6 +2844,7 @@ entry:
define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
; CHECK: test_vmulxq_laneq_f64_0:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
diff --git a/test/CodeGen/AArch64/neon-3vdiff.ll b/test/CodeGen/AArch64/neon-3vdiff.ll
index 171e2b2..96400eb 100644
--- a/test/CodeGen/AArch64/neon-3vdiff.ll
+++ b/test/CodeGen/AArch64/neon-3vdiff.ll
@@ -1804,3 +1804,30 @@ entry:
ret <8 x i16> %vmull.i.i
}
+define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
+; CHECK: test_vmull_p64
+; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
+entry:
+ %vmull.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vmull1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i, <1 x i64> %vmull1.i) #1
+ %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
+ ret i128 %vmull3.i
+}
+
+define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
+; CHECK: test_vmull_high_p64
+; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %0 = extractelement <2 x i64> %a, i32 1
+ %1 = extractelement <2 x i64> %b, i32 1
+ %vmull.i.i = insertelement <1 x i64> undef, i64 %0, i32 0
+ %vmull1.i.i = insertelement <1 x i64> undef, i64 %1, i32 0
+ %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i.i, <1 x i64> %vmull1.i.i) #1
+ %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
+ ret i128 %vmull3.i.i
+}
+
+declare <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64>, <1 x i64>) #5
+
+
diff --git a/test/CodeGen/AArch64/neon-across.ll b/test/CodeGen/AArch64/neon-across.ll
index 733db97..6d30c95 100644
--- a/test/CodeGen/AArch64/neon-across.ll
+++ b/test/CodeGen/AArch64/neon-across.ll
@@ -1,12 +1,12 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float>)
+declare float @llvm.aarch64.neon.vminnmv(<4 x float>)
-declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float>)
+declare float @llvm.aarch64.neon.vmaxnmv(<4 x float>)
-declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float>)
+declare float @llvm.aarch64.neon.vminv(<4 x float>)
-declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float>)
+declare float @llvm.aarch64.neon.vmaxv(<4 x float>)
declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32>)
@@ -442,8 +442,7 @@ define float @test_vmaxvq_f32(<4 x float> %a) {
; CHECK: test_vmaxvq_f32:
; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %vmaxv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float> %a)
- %0 = extractelement <1 x float> %vmaxv.i, i32 0
+ %0 = call float @llvm.aarch64.neon.vmaxv(<4 x float> %a)
ret float %0
}
@@ -451,8 +450,7 @@ define float @test_vminvq_f32(<4 x float> %a) {
; CHECK: test_vminvq_f32:
; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %vminv.i = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float> %a)
- %0 = extractelement <1 x float> %vminv.i, i32 0
+ %0 = call float @llvm.aarch64.neon.vminv(<4 x float> %a)
ret float %0
}
@@ -460,8 +458,7 @@ define float @test_vmaxnmvq_f32(<4 x float> %a) {
; CHECK: test_vmaxnmvq_f32:
; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %vmaxnmv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float> %a)
- %0 = extractelement <1 x float> %vmaxnmv.i, i32 0
+ %0 = call float @llvm.aarch64.neon.vmaxnmv(<4 x float> %a)
ret float %0
}
@@ -469,8 +466,7 @@ define float @test_vminnmvq_f32(<4 x float> %a) {
; CHECK: test_vminnmvq_f32:
; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %vminnmv.i = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float> %a)
- %0 = extractelement <1 x float> %vminnmv.i, i32 0
+ %0 = call float @llvm.aarch64.neon.vminnmv(<4 x float> %a)
ret float %0
}
diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll
index 1abfed3..32d8222 100644
--- a/test/CodeGen/AArch64/neon-add-pairwise.ll
+++ b/test/CodeGen/AArch64/neon-add-pairwise.ll
@@ -90,3 +90,12 @@ define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
ret <2 x double> %val
}
+define i32 @test_vaddv.v2i32(<2 x i32> %a) {
+; CHECK-LABEL: test_vaddv.v2i32
+; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %1 = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32> %a)
+ %2 = extractelement <1 x i32> %1, i32 0
+ ret i32 %2
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll
index 078ba14..9015237 100644
--- a/test/CodeGen/AArch64/neon-add-sub.ll
+++ b/test/CodeGen/AArch64/neon-add-sub.ll
@@ -1,119 +1,119 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: add {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = add <8 x i8> %A, %B;
ret <8 x i8> %tmp3
}
define <16 x i8> @add16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: add {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = add <16 x i8> %A, %B;
ret <16 x i8> %tmp3
}
define <4 x i16> @add4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: add {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+;CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = add <4 x i16> %A, %B;
ret <4 x i16> %tmp3
}
define <8 x i16> @add8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: add {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+;CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = add <8 x i16> %A, %B;
ret <8 x i16> %tmp3
}
define <2 x i32> @add2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: add {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = add <2 x i32> %A, %B;
ret <2 x i32> %tmp3
}
define <4 x i32> @add4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: add {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = add <4 x i32> %A, %B;
ret <4 x i32> %tmp3
}
define <2 x i64> @add2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = add <2 x i64> %A, %B;
ret <2 x i64> %tmp3
}
define <2 x float> @add2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fadd {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fadd <2 x float> %A, %B;
ret <2 x float> %tmp3
}
define <4 x float> @add4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fadd {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fadd <4 x float> %A, %B;
ret <4 x float> %tmp3
}
define <2 x double> @add2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fadd <2 x double> %A, %B;
ret <2 x double> %tmp3
}
define <8 x i8> @sub8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: sub {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = sub <8 x i8> %A, %B;
ret <8 x i8> %tmp3
}
define <16 x i8> @sub16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: sub {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = sub <16 x i8> %A, %B;
ret <16 x i8> %tmp3
}
define <4 x i16> @sub4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: sub {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+;CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = sub <4 x i16> %A, %B;
ret <4 x i16> %tmp3
}
define <8 x i16> @sub8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: sub {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+;CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = sub <8 x i16> %A, %B;
ret <8 x i16> %tmp3
}
define <2 x i32> @sub2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: sub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = sub <2 x i32> %A, %B;
ret <2 x i32> %tmp3
}
define <4 x i32> @sub4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: sub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = sub <4 x i32> %A, %B;
ret <4 x i32> %tmp3
}
define <2 x i64> @sub2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = sub <2 x i64> %A, %B;
ret <2 x i64> %tmp3
}
define <2 x float> @sub2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fsub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fsub <2 x float> %A, %B;
ret <2 x float> %tmp3
}
define <4 x float> @sub4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fsub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fsub <4 x float> %A, %B;
ret <4 x float> %tmp3
}
define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fsub <2 x double> %A, %B;
ret <2 x double> %tmp3
}
@@ -234,4 +234,46 @@ declare <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double>, <1 x double>)
declare <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double>, <1 x double>)
declare <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double>, <1 x double>)
declare <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
\ No newline at end of file
+declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
+
+define <1 x i8> @test_add_v1i8(<1 x i8> %a, <1 x i8> %b) {
+;CHECK-LABEL: test_add_v1i8:
+;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %c = add <1 x i8> %a, %b
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @test_add_v1i16(<1 x i16> %a, <1 x i16> %b) {
+;CHECK-LABEL: test_add_v1i16:
+;CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %c = add <1 x i16> %a, %b
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @test_add_v1i32(<1 x i32> %a, <1 x i32> %b) {
+;CHECK-LABEL: test_add_v1i32:
+;CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %c = add <1 x i32> %a, %b
+ ret <1 x i32> %c
+}
+
+define <1 x i8> @test_sub_v1i8(<1 x i8> %a, <1 x i8> %b) {
+;CHECK-LABEL: test_sub_v1i8:
+;CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %c = sub <1 x i8> %a, %b
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @test_sub_v1i16(<1 x i16> %a, <1 x i16> %b) {
+;CHECK-LABEL: test_sub_v1i16:
+;CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %c = sub <1 x i16> %a, %b
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @test_sub_v1i32(<1 x i32> %a, <1 x i32> %b) {
+;CHECK-LABEL: test_sub_v1i32:
+;CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %c = sub <1 x i32> %a, %b
+ ret <1 x i32> %c
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-bitcast.ll b/test/CodeGen/AArch64/neon-bitcast.ll
index f9ec704..61099d4 100644
--- a/test/CodeGen/AArch64/neon-bitcast.ll
+++ b/test/CodeGen/AArch64/neon-bitcast.ll
@@ -20,8 +20,8 @@ define <2 x i32> @test_v8i8_to_v2i32(<8 x i8> %in) nounwind {
ret <2 x i32> %val
}
-define <2 x float> @test_v8i8_to_v1f32(<8 x i8> %in) nounwind{
-; CHECK: test_v8i8_to_v1f32:
+define <2 x float> @test_v8i8_to_v2f32(<8 x i8> %in) nounwind{
+; CHECK: test_v8i8_to_v2f32:
; CHECK-NEXT: // BB#0:
; CHECK-NEXT: ret
@@ -67,8 +67,8 @@ define <2 x i32> @test_v4i16_to_v2i32(<4 x i16> %in) nounwind {
ret <2 x i32> %val
}
-define <2 x float> @test_v4i16_to_v1f32(<4 x i16> %in) nounwind{
-; CHECK: test_v4i16_to_v1f32:
+define <2 x float> @test_v4i16_to_v2f32(<4 x i16> %in) nounwind{
+; CHECK: test_v4i16_to_v2f32:
; CHECK-NEXT: // BB#0:
; CHECK-NEXT: ret
@@ -114,8 +114,8 @@ define <2 x i32> @test_v2i32_to_v2i32(<2 x i32> %in) nounwind {
ret <2 x i32> %val
}
-define <2 x float> @test_v2i32_to_v1f32(<2 x i32> %in) nounwind{
-; CHECK: test_v2i32_to_v1f32:
+define <2 x float> @test_v2i32_to_v2f32(<2 x i32> %in) nounwind{
+; CHECK: test_v2i32_to_v2f32:
; CHECK-NEXT: // BB#0:
; CHECK-NEXT: ret
diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 1c43b97..7e5b693 100644
--- a/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1,502 +1,502 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
- %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
- %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 >
+ %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 >
%tmp3 = or <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
- %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
- %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 >
+ %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp3 = or <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = or <8 x i8> %a, %tmp1
ret <8 x i8> %tmp2
}
define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = or <16 x i8> %a, %tmp1
ret <16 x i8> %tmp2
}
define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = and <8 x i8> %a, %tmp1
ret <8 x i8> %tmp2
}
define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = and <16 x i8> %a, %tmp1
ret <16 x i8> %tmp2
}
define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) {
-;CHECK: orr {{v[0-31]+}}.2s, #0xff
+;CHECK: orr {{v[0-9]+}}.2s, #0xff
%tmp1 = or <2 x i32> %a, < i32 255, i32 255>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) {
-;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #8
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
%tmp1 = or <2 x i32> %a, < i32 65280, i32 65280>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) {
-;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #16
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
%tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl24(<2 x i32> %a) {
-;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #24
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
%tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080>
ret <2 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) {
-;CHECK: orr {{v[0-31]+}}.4s, #0xff
+;CHECK: orr {{v[0-9]+}}.4s, #0xff
%tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) {
-;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #8
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
%tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl16(<4 x i32> %a) {
-;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #16
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
%tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) {
-;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #24
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
%tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080>
ret <4 x i32> %tmp1
}
define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) {
-;CHECK: orr {{v[0-31]+}}.4h, #0xff
+;CHECK: orr {{v[0-9]+}}.4h, #0xff
%tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 >
ret <4 x i16> %tmp1
}
define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) {
-;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8
+;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
%tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
ret <4 x i16> %tmp1
}
define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) {
-;CHECK: orr {{v[0-31]+}}.8h, #0xff
+;CHECK: orr {{v[0-9]+}}.8h, #0xff
%tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
ret <8 x i16> %tmp1
}
define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) {
-;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8
+;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
%tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
ret <8 x i16> %tmp1
}
define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) {
-;CHECK: bic {{v[0-31]+}}.2s, #0x10
+;CHECK: bic {{v[0-9]+}}.2s, #0x10
%tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) {
-;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #8
- %tmp1 = and <2 x i32> %a, < i32 18446744073709547519, i32 18446744073709547519 >
+;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #8
+ %tmp1 = and <2 x i32> %a, < i32 4294963199, i32 4294963199 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) {
-;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #16
- %tmp1 = and <2 x i32> %a, < i32 18446744073708503039, i32 18446744073708503039 >
+;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #16
+ %tmp1 = and <2 x i32> %a, < i32 4293918719, i32 4293918719 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl124(<2 x i32> %a) {
-;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #24
- %tmp1 = and <2 x i32> %a, < i32 18446744073441116159, i32 18446744073441116159>
+;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #24
+ %tmp1 = and <2 x i32> %a, < i32 4026531839, i32 4026531839>
ret <2 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) {
-;CHECK: bic {{v[0-31]+}}.4s, #0x10
+;CHECK: bic {{v[0-9]+}}.4s, #0x10
%tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) {
-;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #8
- %tmp1 = and <4 x i32> %a, < i32 18446744073709547519, i32 18446744073709547519, i32 18446744073709547519, i32 18446744073709547519 >
+;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #8
+ %tmp1 = and <4 x i32> %a, < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) {
-;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #16
- %tmp1 = and <4 x i32> %a, < i32 18446744073708503039, i32 18446744073708503039, i32 18446744073708503039, i32 18446744073708503039 >
+;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #16
+ %tmp1 = and <4 x i32> %a, < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl124(<4 x i32> %a) {
-;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #24
- %tmp1 = and <4 x i32> %a, < i32 18446744073441116159, i32 18446744073441116159, i32 18446744073441116159, i32 18446744073441116159>
+;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #24
+ %tmp1 = and <4 x i32> %a, < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839>
ret <4 x i32> %tmp1
}
define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) {
-;CHECK: bic {{v[0-31]+}}.4h, #0x10
- %tmp1 = and <4 x i16> %a, < i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599 >
+;CHECK: bic {{v[0-9]+}}.4h, #0x10
+ %tmp1 = and <4 x i16> %a, < i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279 >
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) {
-;CHECK: bic {{v[0-31]+}}.4h, #0x0
+;CHECK: bic {{v[0-9]+}}.4h, #0xff
%tmp1 = and <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) {
-;CHECK: bic {{v[0-31]+}}.4h, #0x10, lsl #8
- %tmp1 = and <4 x i16> %a, < i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519>
+;CHECK: bic {{v[0-9]+}}.4h, #0x10, lsl #8
+ %tmp1 = and <4 x i16> %a, < i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199>
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) {
-;CHECK: bic {{v[0-31]+}}.4h, #0x0, lsl #8
+;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
%tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255>
ret <4 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) {
-;CHECK: bic {{v[0-31]+}}.8h, #0x10
- %tmp1 = and <8 x i16> %a, < i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599,
- i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599 >
+;CHECK: bic {{v[0-9]+}}.8h, #0x10
+ %tmp1 = and <8 x i16> %a, < i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279,
+ i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279 >
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) {
-;CHECK: bic {{v[0-31]+}}.8h, #0x0
+;CHECK: bic {{v[0-9]+}}.8h, #0xff
%tmp1 = and <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) {
-;CHECK: bic {{v[0-31]+}}.8h, #0x10, lsl #8
- %tmp1 = and <8 x i16> %a, < i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519,
- i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519>
+;CHECK: bic {{v[0-9]+}}.8h, #0x10, lsl #8
+ %tmp1 = and <8 x i16> %a, < i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199,
+ i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199>
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) {
-;CHECK: bic {{v[0-31]+}}.8h, #0x0, lsl #8
+;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
%tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
ret <8 x i16> %tmp1
}
define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
%tmp2 = and <2 x i32> %a, %tmp1
ret <2 x i32> %tmp2
}
define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 >
%tmp2 = and <4 x i16> %a, %tmp1
ret <4 x i16> %tmp2
}
define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %b, < i64 -1>
%tmp2 = and <1 x i64> %a, %tmp1
ret <1 x i64> %tmp2
}
define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
%tmp2 = and <4 x i32> %a, %tmp1
ret <4 x i32> %tmp2
}
define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp2 = and <8 x i16> %a, %tmp1
ret <8 x i16> %tmp2
}
define <2 x i64> @bic2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
%tmp2 = and <2 x i64> %a, %tmp1
ret <2 x i64> %tmp2
}
define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
%tmp2 = or <2 x i32> %a, %tmp1
ret <2 x i32> %tmp2
}
define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 >
%tmp2 = or <4 x i16> %a, %tmp1
ret <4 x i16> %tmp2
}
define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %b, < i64 -1>
%tmp2 = or <1 x i64> %a, %tmp1
ret <1 x i64> %tmp2
}
define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
%tmp2 = or <4 x i32> %a, %tmp1
ret <4 x i32> %tmp2
}
define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp2 = or <8 x i16> %a, %tmp1
ret <8 x i16> %tmp2
}
define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
%tmp2 = or <2 x i64> %a, %tmp1
ret <2 x i64> %tmp2
}
+
define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
- %tmp1 = and <2 x i32> %a, < i32 -1, i32 -1 >
- %tmp2 = and <2 x i32> %b, < i32 0, i32 0 >
+;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 >
+ %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 >
%tmp3 = or <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
- %tmp1 = and <4 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1 >
- %tmp2 = and <4 x i16> %b, < i16 0, i16 0,i16 0, i16 0 >
+;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 >
+ %tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 >
%tmp3 = or <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
- %tmp1 = and <1 x i64> %a, < i64 -1 >
- %tmp2 = and <1 x i64> %b, < i64 0 >
+;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = and <1 x i64> %a, < i64 -16 >
+ %tmp2 = and <1 x i64> %b, < i64 15 >
%tmp3 = or <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
- %tmp1 = and <4 x i32> %a, < i32 -1, i32 -1, i32 -1, i32 -1 >
- %tmp2 = and <4 x i32> %b, < i32 0, i32 0, i32 0, i32 0 >
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 >
+ %tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 >
%tmp3 = or <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
- %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1, i16 -1, i16 -1, i16 -1,i16 -1 >
- %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0 >
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 >
+ %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 >
%tmp3 = or <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
- %tmp1 = and <2 x i64> %a, < i64 -1, i64 -1 >
- %tmp2 = and <2 x i64> %b, < i64 0, i64 0 >
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp1 = and <2 x i64> %a, < i64 -1, i64 0 >
+ %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 >
%tmp3 = or <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <8 x i8> %v1, %v2
%2 = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%3 = and <8 x i8> %2, %v3
@@ -505,7 +505,7 @@ define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
}
define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <4 x i16> %v1, %v2
%2 = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1>
%3 = and <4 x i16> %2, %v3
@@ -514,7 +514,7 @@ define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
}
define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
-;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <2 x i32> %v1, %v2
%2 = xor <2 x i32> %v1, <i32 -1, i32 -1>
%3 = and <2 x i32> %2, %v3
@@ -523,7 +523,7 @@ define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
}
define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
-;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <1 x i64> %v1, %v2
%2 = xor <1 x i64> %v1, <i64 -1>
%3 = and <1 x i64> %2, %v3
@@ -532,7 +532,7 @@ define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
}
define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <16 x i8> %v1, %v2
%2 = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%3 = and <16 x i8> %2, %v3
@@ -541,7 +541,7 @@ define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
}
define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <8 x i16> %v1, %v2
%2 = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
%3 = and <8 x i16> %2, %v3
@@ -550,7 +550,7 @@ define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
}
define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <4 x i32> %v1, %v2
%2 = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1>
%3 = and <4 x i32> %2, %v3
@@ -558,8 +558,65 @@ define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
ret <4 x i32> %4
}
+define <8 x i8> @vselect_v8i8(<8 x i8> %a) {
+;CHECK: movi {{d[0-9]+}}, #0xffff
+;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %b = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i8> %a, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+ ret <8 x i8> %b
+}
+
+define <4 x i16> @vselect_v4i16(<4 x i16> %a) {
+;CHECK: movi {{d[0-9]+}}, #0xffff
+;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i16> %a, <4 x i16> <i16 undef, i16 0, i16 0, i16 0>
+ ret <4 x i16> %b
+}
+
+define <8 x i8> @vselect_cmp_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %cmp = icmp ne <8 x i8> %a, %b
+ %d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @vselect_cmp_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %cmp = icmp eq <8 x i8> %a, %b
+ %d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @vselect_cmpz_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %cmp = icmp ne <8 x i8> %a, zeroinitializer
+ %d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @vselect_cmpz_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %cmp = icmp eq <8 x i8> %a, zeroinitializer
+ %d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
+ ret <8 x i8> %d
+}
+
+define <8 x i8> @vselect_tst(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = and <8 x i8> %a, %b
+ %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
+ %d = select <8 x i1> %tmp4, <8 x i8> %b, <8 x i8> %c
+ ret <8 x i8> %d
+}
+
define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
-;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <2 x i64> %v1, %v2
%2 = xor <2 x i64> %v1, <i64 -1, i64 -1>
%3 = and <2 x i64> %2, %v3
@@ -568,27 +625,459 @@ define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
}
define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) {
-;CHECK: orr {{v[0-31]+}}.4h, #0xff
+;CHECK: orr {{v[0-9]+}}.4h, #0xff
%val = or <8 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <8 x i8> %val
}
define <8 x i8> @orrimm8b_as_orimm4h_lsl8(<8 x i8> %a) {
-;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8
+;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
%val = or <8 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <8 x i8> %val
}
define <16 x i8> @orimm16b_as_orrimm8h_lsl0(<16 x i8> %a) {
-;CHECK: orr {{v[0-31]+}}.8h, #0xff
+;CHECK: orr {{v[0-9]+}}.8h, #0xff
%val = or <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <16 x i8> %val
}
define <16 x i8> @orimm16b_as_orrimm8h_lsl8(<16 x i8> %a) {
-;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8
+;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
%val = or <16 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <16 x i8> %val
}
+define <8 x i8> @and8imm2s_lsl0(<8 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff
+ %tmp1 = and <8 x i8> %a, < i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255>
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @and8imm2s_lsl8(<8 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+ %tmp1 = and <8 x i8> %a, < i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255>
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @and8imm2s_lsl16(<8 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+ %tmp1 = and <8 x i8> %a, < i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255>
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @and8imm2s_lsl24(<8 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+ %tmp1 = and <8 x i8> %a, < i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1>
+ ret <8 x i8> %tmp1
+}
+
+define <4 x i16> @and16imm2s_lsl0(<4 x i16> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff
+ %tmp1 = and <4 x i16> %a, < i16 65280, i16 65535, i16 65280, i16 65535>
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @and16imm2s_lsl8(<4 x i16> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+ %tmp1 = and <4 x i16> %a, < i16 255, i16 65535, i16 255, i16 65535>
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @and16imm2s_lsl16(<4 x i16> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+ %tmp1 = and <4 x i16> %a, < i16 65535, i16 65280, i16 65535, i16 65280>
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @and16imm2s_lsl24(<4 x i16> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+ %tmp1 = and <4 x i16> %a, < i16 65535, i16 511, i16 65535, i16 511>
+ ret <4 x i16> %tmp1
+}
+
+
+define <1 x i64> @and64imm2s_lsl0(<1 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff
+ %tmp1 = and <1 x i64> %a, < i64 -1095216660736>
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @and64imm2s_lsl8(<1 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+ %tmp1 = and <1 x i64> %a, < i64 -280375465148161>
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @and64imm2s_lsl16(<1 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+ %tmp1 = and <1 x i64> %a, < i64 -71776119077928961>
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @and64imm2s_lsl24(<1 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+ %tmp1 = and <1 x i64> %a, < i64 144115183814443007>
+ ret <1 x i64> %tmp1
+}
+
+define <16 x i8> @and8imm4s_lsl0(<16 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff
+ %tmp1 = and <16 x i8> %a, < i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255>
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @and8imm4s_lsl8(<16 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+ %tmp1 = and <16 x i8> %a, < i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255>
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @and8imm4s_lsl16(<16 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+ %tmp1 = and <16 x i8> %a, < i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255>
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @and8imm4s_lsl24(<16 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+ %tmp1 = and <16 x i8> %a, < i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1>
+ ret <16 x i8> %tmp1
+}
+
+define <8 x i16> @and16imm4s_lsl0(<8 x i16> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff
+ %tmp1 = and <8 x i16> %a, < i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535>
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @and16imm4s_lsl8(<8 x i16> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+ %tmp1 = and <8 x i16> %a, < i16 255, i16 65535, i16 255, i16 65535, i16 255, i16 65535, i16 255, i16 65535>
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @and16imm4s_lsl16(<8 x i16> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+ %tmp1 = and <8 x i16> %a, < i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280>
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @and16imm4s_lsl24(<8 x i16> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+ %tmp1 = and <8 x i16> %a, < i16 65535, i16 511, i16 65535, i16 511, i16 65535, i16 511, i16 65535, i16 511>
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @and64imm4s_lsl0(<2 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff
+ %tmp1 = and <2 x i64> %a, < i64 -1095216660736, i64 -1095216660736>
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @and64imm4s_lsl8(<2 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+ %tmp1 = and <2 x i64> %a, < i64 -280375465148161, i64 -280375465148161>
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @and64imm4s_lsl16(<2 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+ %tmp1 = and <2 x i64> %a, < i64 -71776119077928961, i64 -71776119077928961>
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @and64imm4s_lsl24(<2 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+ %tmp1 = and <2 x i64> %a, < i64 144115183814443007, i64 144115183814443007>
+ ret <2 x i64> %tmp1
+}
+
+define <8 x i8> @and8imm4h_lsl0(<8 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.4h, #0xff
+ %tmp1 = and <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @and8imm4h_lsl8(<8 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+ %tmp1 = and <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
+ ret <8 x i8> %tmp1
+}
+
+define <2 x i32> @and16imm4h_lsl0(<2 x i32> %a) {
+;CHECK: bic {{v[0-9]+}}.4h, #0xff
+ %tmp1 = and <2 x i32> %a, < i32 4278255360, i32 4278255360>
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @and16imm4h_lsl8(<2 x i32> %a) {
+;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+ %tmp1 = and <2 x i32> %a, < i32 16711935, i32 16711935>
+ ret <2 x i32> %tmp1
+}
+
+define <1 x i64> @and64imm4h_lsl0(<1 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.4h, #0xff
+ %tmp1 = and <1 x i64> %a, < i64 -71777214294589696>
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @and64imm4h_lsl8(<1 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+ %tmp1 = and <1 x i64> %a, < i64 71777214294589695>
+ ret <1 x i64> %tmp1
+}
+
+define <16 x i8> @and8imm8h_lsl0(<16 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.8h, #0xff
+ %tmp1 = and <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255 >
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @and8imm8h_lsl8(<16 x i8> %a) {
+;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+ %tmp1 = and <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0 >
+ ret <16 x i8> %tmp1
+}
+
+define <4 x i32> @and16imm8h_lsl0(<4 x i32> %a) {
+;CHECK: bic {{v[0-9]+}}.8h, #0xff
+ %tmp1 = and <4 x i32> %a, < i32 4278255360, i32 4278255360, i32 4278255360, i32 4278255360>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @and16imm8h_lsl8(<4 x i32> %a) {
+;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+ %tmp1 = and <4 x i32> %a, < i32 16711935, i32 16711935, i32 16711935, i32 16711935>
+ ret <4 x i32> %tmp1
+}
+
+define <2 x i64> @and64imm8h_lsl0(<2 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.8h, #0xff
+ %tmp1 = and <2 x i64> %a, < i64 -71777214294589696, i64 -71777214294589696>
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @and64imm8h_lsl8(<2 x i64> %a) {
+;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+ %tmp1 = and <2 x i64> %a, < i64 71777214294589695, i64 71777214294589695>
+ ret <2 x i64> %tmp1
+}
+
+define <8 x i8> @orr8imm2s_lsl0(<8 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff
+ %tmp1 = or <8 x i8> %a, < i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0>
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @orr8imm2s_lsl8(<8 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+ %tmp1 = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0>
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @orr8imm2s_lsl16(<8 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+ %tmp1 = or <8 x i8> %a, < i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0>
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @orr8imm2s_lsl24(<8 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+ %tmp1 = or <8 x i8> %a, < i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255>
+ ret <8 x i8> %tmp1
+}
+
+define <4 x i16> @orr16imm2s_lsl0(<4 x i16> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff
+ %tmp1 = or <4 x i16> %a, < i16 255, i16 0, i16 255, i16 0>
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @orr16imm2s_lsl8(<4 x i16> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+ %tmp1 = or <4 x i16> %a, < i16 65280, i16 0, i16 65280, i16 0>
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @orr16imm2s_lsl16(<4 x i16> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+ %tmp1 = or <4 x i16> %a, < i16 0, i16 255, i16 0, i16 255>
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @orr16imm2s_lsl24(<4 x i16> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+ %tmp1 = or <4 x i16> %a, < i16 0, i16 65280, i16 0, i16 65280>
+ ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @orr64imm2s_lsl0(<1 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff
+ %tmp1 = or <1 x i64> %a, < i64 1095216660735>
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @orr64imm2s_lsl8(<1 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+ %tmp1 = or <1 x i64> %a, < i64 280375465148160>
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @orr64imm2s_lsl16(<1 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+ %tmp1 = or <1 x i64> %a, < i64 71776119077928960>
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @orr64imm2s_lsl24(<1 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+ %tmp1 = or <1 x i64> %a, < i64 -72057589759737856>
+ ret <1 x i64> %tmp1
+}
+
+define <16 x i8> @orr8imm4s_lsl0(<16 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff
+ %tmp1 = or <16 x i8> %a, < i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0>
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @orr8imm4s_lsl8(<16 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+ %tmp1 = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0>
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @orr8imm4s_lsl16(<16 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+ %tmp1 = or <16 x i8> %a, < i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0>
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @orr8imm4s_lsl24(<16 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+ %tmp1 = or <16 x i8> %a, < i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255>
+ ret <16 x i8> %tmp1
+}
+
+define <8 x i16> @orr16imm4s_lsl0(<8 x i16> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff
+ %tmp1 = or <8 x i16> %a, < i16 255, i16 0, i16 255, i16 0, i16 255, i16 0, i16 255, i16 0>
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @orr16imm4s_lsl8(<8 x i16> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+ %tmp1 = or <8 x i16> %a, < i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0>
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @orr16imm4s_lsl16(<8 x i16> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+ %tmp1 = or <8 x i16> %a, < i16 0, i16 255, i16 0, i16 255, i16 0, i16 255, i16 0, i16 255>
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @orr16imm4s_lsl24(<8 x i16> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+ %tmp1 = or <8 x i16> %a, < i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280>
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @orr64imm4s_lsl0(<2 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff
+ %tmp1 = or <2 x i64> %a, < i64 1095216660735, i64 1095216660735>
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @orr64imm4s_lsl8(<2 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+ %tmp1 = or <2 x i64> %a, < i64 280375465148160, i64 280375465148160>
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @orr64imm4s_lsl16(<2 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+ %tmp1 = or <2 x i64> %a, < i64 71776119077928960, i64 71776119077928960>
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @orr64imm4s_lsl24(<2 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+ %tmp1 = or <2 x i64> %a, < i64 -72057589759737856, i64 -72057589759737856>
+ ret <2 x i64> %tmp1
+}
+
+define <8 x i8> @orr8imm4h_lsl0(<8 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.4h, #0xff
+ %tmp1 = or <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @orr8imm4h_lsl8(<8 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+ %tmp1 = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
+ ret <8 x i8> %tmp1
+}
+
+define <2 x i32> @orr16imm4h_lsl0(<2 x i32> %a) {
+;CHECK: orr {{v[0-9]+}}.4h, #0xff
+ %tmp1 = or <2 x i32> %a, < i32 16711935, i32 16711935>
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @orr16imm4h_lsl8(<2 x i32> %a) {
+;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+ %tmp1 = or <2 x i32> %a, < i32 4278255360, i32 4278255360>
+ ret <2 x i32> %tmp1
+}
+
+define <1 x i64> @orr64imm4h_lsl0(<1 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.4h, #0xff
+ %tmp1 = or <1 x i64> %a, < i64 71777214294589695>
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @orr64imm4h_lsl8(<1 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+ %tmp1 = or <1 x i64> %a, < i64 -71777214294589696>
+ ret <1 x i64> %tmp1
+}
+
+define <16 x i8> @orr8imm8h_lsl0(<16 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.8h, #0xff
+ %tmp1 = or <16 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @orr8imm8h_lsl8(<16 x i8> %a) {
+;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+ %tmp1 = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
+ ret <16 x i8> %tmp1
+}
+
+define <4 x i32> @orr16imm8h_lsl0(<4 x i32> %a) {
+;CHECK: orr {{v[0-9]+}}.8h, #0xff
+ %tmp1 = or <4 x i32> %a, < i32 16711935, i32 16711935, i32 16711935, i32 16711935>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orr16imm8h_lsl8(<4 x i32> %a) {
+;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+ %tmp1 = or <4 x i32> %a, < i32 4278255360, i32 4278255360, i32 4278255360, i32 4278255360>
+ ret <4 x i32> %tmp1
+}
+
+define <2 x i64> @orr64imm8h_lsl0(<2 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.8h, #0xff
+ %tmp1 = or <2 x i64> %a, < i64 71777214294589695, i64 71777214294589695>
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @orr64imm8h_lsl8(<2 x i64> %a) {
+;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+ %tmp1 = or <2 x i64> %a, < i64 -71777214294589696, i64 -71777214294589696>
+ ret <2 x i64> %tmp1
+}
diff --git a/test/CodeGen/AArch64/neon-bsl.ll b/test/CodeGen/AArch64/neon-bsl.ll
index 6bd923d..c55fd01 100644
--- a/test/CodeGen/AArch64/neon-bsl.ll
+++ b/test/CodeGen/AArch64/neon-bsl.ll
@@ -220,3 +220,16 @@ entry:
ret <2 x double> %vbsl3.i
}
+define <2 x double> @test_bsl_v2f64(<2 x i1> %v1, <2 x double> %v2, <2 x double> %v3) {
+; CHECK-LABEL: test_bsl_v2f64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %1 = select <2 x i1> %v1, <2 x double> %v2, <2 x double> %v3
+ ret <2 x double> %1
+}
+
+define <4 x float> @test_bsl_v4f32(<4 x i1> %v1, <4 x float> %v2, <4 x float> %v3) {
+; CHECK-LABEL: test_bsl_v4f32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %1 = select <4 x i1> %v1, <4 x float> %v2, <4 x float> %v3
+ ret <4 x float> %1
+}
diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/neon-copy.ll
index e18530e..b4d55df 100644
--- a/test/CodeGen/AArch64/neon-copy.ll
+++ b/test/CodeGen/AArch64/neon-copy.ll
@@ -2,269 +2,269 @@
define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
-;CHECK: ins {{v[0-31]+}}.b[15], {{w[0-31]+}}
+;CHECK: ins {{v[0-9]+}}.b[15], {{w[0-9]+}}
%tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15
ret <16 x i8> %tmp3
}
define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) {
-;CHECK: ins {{v[0-31]+}}.h[6], {{w[0-31]+}}
+;CHECK: ins {{v[0-9]+}}.h[6], {{w[0-9]+}}
%tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6
ret <8 x i16> %tmp3
}
define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[2], {{w[0-31]+}}
+;CHECK: ins {{v[0-9]+}}.s[2], {{w[0-9]+}}
%tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2
ret <4 x i32> %tmp3
}
define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[1], {{x[0-31]+}}
+;CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}}
%tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1
ret <2 x i64> %tmp3
}
define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) {
-;CHECK: ins {{v[0-31]+}}.b[5], {{w[0-31]+}}
+;CHECK: ins {{v[0-9]+}}.b[5], {{w[0-9]+}}
%tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5
ret <8 x i8> %tmp3
}
define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) {
-;CHECK: ins {{v[0-31]+}}.h[3], {{w[0-31]+}}
+;CHECK: ins {{v[0-9]+}}.h[3], {{w[0-9]+}}
%tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3
ret <4 x i16> %tmp3
}
define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{w[0-31]+}}
+;CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}}
%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
ret <2 x i32> %tmp3
}
define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2]
+;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <16 x i8> %tmp1, i32 2
%tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
ret <16 x i8> %tmp4
}
define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2]
+;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
ret <8 x i16> %tmp4
}
define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
ret <4 x i32> %tmp4
}
define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
%tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
ret <2 x i64> %tmp4
}
define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x float> %tmp1, i32 2
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4
}
define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4
}
define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2]
+;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <8 x i8> %tmp1, i32 2
%tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
ret <16 x i8> %tmp4
}
define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2]
+;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
ret <8 x i16> %tmp4
}
define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1]
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
%tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
ret <4 x i32> %tmp4
}
define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
%tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
ret <2 x i64> %tmp4
}
define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1]
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x float> %tmp1, i32 1
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4
}
define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4
}
define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.b[7], {{v[0-31]+}}.b[2]
+;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <16 x i8> %tmp1, i32 2
%tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
ret <8 x i8> %tmp4
}
define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2]
+;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
ret <2 x i32> %tmp4
}
define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
%tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
ret <1 x i64> %tmp4
}
define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x float> %tmp1, i32 2
%tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
ret <2 x float> %tmp4
}
define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x double> %tmp1, i32 0
%tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
ret <1 x double> %tmp4
}
define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.b[4], {{v[0-31]+}}.b[2]
+;CHECK: ins {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <8 x i8> %tmp1, i32 2
%tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4
ret <8 x i8> %tmp4
}
define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2]
+;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0]
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
%tmp3 = extractelement <2 x i32> %tmp1, i32 0
%tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
ret <2 x i32> %tmp4
}
define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
%tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
ret <1 x i64> %tmp4
}
define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0]
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
%tmp3 = extractelement <2 x float> %tmp1, i32 0
%tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
ret <2 x float> %tmp4
}
define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) {
-;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
ret <1 x double> %tmp4
}
define i32 @umovw16b(<16 x i8> %tmp1) {
-;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[8]
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = zext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw8h(<8 x i16> %tmp1) {
-;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.h[2]
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = zext i16 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw4s(<4 x i32> %tmp1) {
-;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.s[2]
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
ret i32 %tmp3
}
define i64 @umovx2d(<2 x i64> %tmp1) {
-;CHECK: umov {{x[0-31]+}}, {{v[0-31]+}}.d[0]
+;CHECK: umov {{x[0-9]+}}, {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
ret i64 %tmp3
}
define i32 @umovw8b(<8 x i8> %tmp1) {
-;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[7]
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[7]
%tmp3 = extractelement <8 x i8> %tmp1, i32 7
%tmp4 = zext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw4h(<4 x i16> %tmp1) {
-;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.h[2]
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = zext i16 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw2s(<2 x i32> %tmp1) {
-;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.s[1]
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
ret i32 %tmp3
}
define i64 @umovx1d(<1 x i64> %tmp1) {
-;CHECK: fmov {{x[0-31]+}}, {{d[0-31]+}}
+;CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
ret i64 %tmp3
}
define i32 @smovw16b(<16 x i8> %tmp1) {
-;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.b[8]
+;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = sext i8 %tmp3 to i32
%tmp5 = add i32 5, %tmp4
@@ -272,7 +272,7 @@ define i32 @smovw16b(<16 x i8> %tmp1) {
}
define i32 @smovw8h(<8 x i16> %tmp1) {
-;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.h[2]
+;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
%tmp5 = add i32 5, %tmp4
@@ -280,28 +280,28 @@ define i32 @smovw8h(<8 x i16> %tmp1) {
}
define i32 @smovx16b(<16 x i8> %tmp1) {
-;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.b[8]
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = sext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @smovx8h(<8 x i16> %tmp1) {
-;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.h[2]
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
ret i32 %tmp4
}
define i64 @smovx4s(<4 x i32> %tmp1) {
-;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.s[2]
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = sext i32 %tmp3 to i64
ret i64 %tmp4
}
define i32 @smovw8b(<8 x i8> %tmp1) {
-;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.b[4]
+;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4]
%tmp3 = extractelement <8 x i8> %tmp1, i32 4
%tmp4 = sext i8 %tmp3 to i32
%tmp5 = add i32 5, %tmp4
@@ -309,7 +309,7 @@ define i32 @smovw8b(<8 x i8> %tmp1) {
}
define i32 @smovw4h(<4 x i16> %tmp1) {
-;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.h[2]
+;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
%tmp5 = add i32 5, %tmp4
@@ -317,21 +317,21 @@ define i32 @smovw4h(<4 x i16> %tmp1) {
}
define i32 @smovx8b(<8 x i8> %tmp1) {
-;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.b[6]
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[6]
%tmp3 = extractelement <8 x i8> %tmp1, i32 6
%tmp4 = sext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @smovx4h(<4 x i16> %tmp1) {
-;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.h[2]
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
ret i32 %tmp4
}
define i64 @smovx2s(<2 x i32> %tmp1) {
-;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.s[1]
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
%tmp4 = sext i32 %tmp3 to i64
ret i64 %tmp4
@@ -612,4 +612,791 @@ define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
%res = bitcast i64 %in to <1 x double>
; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
ret <1 x double> %res
-}
\ No newline at end of file
+}
+
+define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
+; CHECK-LABEL: test_bitcastv8i8tov1f64:
+; CHECK: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+ %sub.i = sub <8 x i8> zeroinitializer, %a
+ %1 = bitcast <8 x i8> %sub.i to <1 x double>
+ %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
+ ret <1 x i64> %vcvt.i
+}
+
+define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
+; CHECK-LABEL: test_bitcastv4i16tov1f64:
+; CHECK: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+ %sub.i = sub <4 x i16> zeroinitializer, %a
+ %1 = bitcast <4 x i16> %sub.i to <1 x double>
+ %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
+ ret <1 x i64> %vcvt.i
+}
+
+define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
+; CHECK-LABEL: test_bitcastv2i32tov1f64:
+; CHECK: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+ %sub.i = sub <2 x i32> zeroinitializer, %a
+ %1 = bitcast <2 x i32> %sub.i to <1 x double>
+ %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
+ ret <1 x i64> %vcvt.i
+}
+
+define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
+; CHECK-LABEL: test_bitcastv1i64tov1f64:
+; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+ %sub.i = sub <1 x i64> zeroinitializer, %a
+ %1 = bitcast <1 x i64> %sub.i to <1 x double>
+ %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
+ ret <1 x i64> %vcvt.i
+}
+
+define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 {
+; CHECK-LABEL: test_bitcastv2f32tov1f64:
+; CHECK: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+ %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+ %1 = bitcast <2 x float> %sub.i to <1 x double>
+ %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
+ ret <1 x i64> %vcvt.i
+}
+
+define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
+; CHECK-LABEL: test_bitcastv1f64tov8i8:
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %vcvt.i = sitofp <1 x i64> %a to <1 x double>
+ %1 = bitcast <1 x double> %vcvt.i to <8 x i8>
+ %sub.i = sub <8 x i8> zeroinitializer, %1
+ ret <8 x i8> %sub.i
+}
+
+define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
+; CHECK-LABEL: test_bitcastv1f64tov4i16:
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %vcvt.i = sitofp <1 x i64> %a to <1 x double>
+ %1 = bitcast <1 x double> %vcvt.i to <4 x i16>
+ %sub.i = sub <4 x i16> zeroinitializer, %1
+ ret <4 x i16> %sub.i
+}
+
+define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
+; CHECK-LABEL: test_bitcastv1f64tov2i32:
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %vcvt.i = sitofp <1 x i64> %a to <1 x double>
+ %1 = bitcast <1 x double> %vcvt.i to <2 x i32>
+ %sub.i = sub <2 x i32> zeroinitializer, %1
+ ret <2 x i32> %sub.i
+}
+
+define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
+; CHECK-LABEL: test_bitcastv1f64tov1i64:
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: neg {{d[0-9]+}}, {{d[0-9]+}}
+ %vcvt.i = sitofp <1 x i64> %a to <1 x double>
+ %1 = bitcast <1 x double> %vcvt.i to <1 x i64>
+ %sub.i = sub <1 x i64> zeroinitializer, %1
+ ret <1 x i64> %sub.i
+}
+
+define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
+; CHECK-LABEL: test_bitcastv1f64tov2f32:
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %vcvt.i = sitofp <1 x i64> %a to <1 x double>
+ %1 = bitcast <1 x double> %vcvt.i to <2 x float>
+ %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %1
+ ret <2 x float> %sub.i
+}
+
+; Test insert element into an undef vector
+define <8 x i8> @scalar_to_vector.v8i8(i8 %a) {
+; CHECK-LABEL: scalar_to_vector.v8i8:
+; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}}
+ %b = insertelement <8 x i8> undef, i8 %a, i32 0
+ ret <8 x i8> %b
+}
+
+define <16 x i8> @scalar_to_vector.v16i8(i8 %a) {
+; CHECK-LABEL: scalar_to_vector.v16i8:
+; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}}
+ %b = insertelement <16 x i8> undef, i8 %a, i32 0
+ ret <16 x i8> %b
+}
+
+define <4 x i16> @scalar_to_vector.v4i16(i16 %a) {
+; CHECK-LABEL: scalar_to_vector.v4i16:
+; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}}
+ %b = insertelement <4 x i16> undef, i16 %a, i32 0
+ ret <4 x i16> %b
+}
+
+define <8 x i16> @scalar_to_vector.v8i16(i16 %a) {
+; CHECK-LABEL: scalar_to_vector.v8i16:
+; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}}
+ %b = insertelement <8 x i16> undef, i16 %a, i32 0
+ ret <8 x i16> %b
+}
+
+define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
+; CHECK-LABEL: scalar_to_vector.v2i32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}}
+ %b = insertelement <2 x i32> undef, i32 %a, i32 0
+ ret <2 x i32> %b
+}
+
+define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
+; CHECK-LABEL: scalar_to_vector.v4i32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}}
+ %b = insertelement <4 x i32> undef, i32 %a, i32 0
+ ret <4 x i32> %b
+}
+
+define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
+; CHECK-LABEL: scalar_to_vector.v2i64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{x[0-9]+}}
+ %b = insertelement <2 x i64> undef, i64 %a, i32 0
+ ret <2 x i64> %b
+}
+
+define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
+; CHECK-LABEL: testDUP.v1i8:
+; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+ %b = extractelement <1 x i8> %a, i32 0
+ %c = insertelement <8 x i8> undef, i8 %b, i32 0
+ %d = insertelement <8 x i8> %c, i8 %b, i32 1
+ %e = insertelement <8 x i8> %d, i8 %b, i32 2
+ %f = insertelement <8 x i8> %e, i8 %b, i32 3
+ %g = insertelement <8 x i8> %f, i8 %b, i32 4
+ %h = insertelement <8 x i8> %g, i8 %b, i32 5
+ %i = insertelement <8 x i8> %h, i8 %b, i32 6
+ %j = insertelement <8 x i8> %i, i8 %b, i32 7
+ ret <8 x i8> %j
+}
+
+define <8 x i16> @testDUP.v1i16(<1 x i16> %a) {
+; CHECK-LABEL: testDUP.v1i16:
+; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+ %b = extractelement <1 x i16> %a, i32 0
+ %c = insertelement <8 x i16> undef, i16 %b, i32 0
+ %d = insertelement <8 x i16> %c, i16 %b, i32 1
+ %e = insertelement <8 x i16> %d, i16 %b, i32 2
+ %f = insertelement <8 x i16> %e, i16 %b, i32 3
+ %g = insertelement <8 x i16> %f, i16 %b, i32 4
+ %h = insertelement <8 x i16> %g, i16 %b, i32 5
+ %i = insertelement <8 x i16> %h, i16 %b, i32 6
+ %j = insertelement <8 x i16> %i, i16 %b, i32 7
+ ret <8 x i16> %j
+}
+
+define <4 x i32> @testDUP.v1i32(<1 x i32> %a) {
+; CHECK-LABEL: testDUP.v1i32:
+; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+ %b = extractelement <1 x i32> %a, i32 0
+ %c = insertelement <4 x i32> undef, i32 %b, i32 0
+ %d = insertelement <4 x i32> %c, i32 %b, i32 1
+ %e = insertelement <4 x i32> %d, i32 %b, i32 2
+ %f = insertelement <4 x i32> %e, i32 %b, i32 3
+ ret <4 x i32> %f
+}
+
+define <8 x i8> @getl(<16 x i8> %x) #0 {
+; CHECK-LABEL: getl:
+; CHECK: ret
+ %vecext = extractelement <16 x i8> %x, i32 0
+ %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0
+ %vecext1 = extractelement <16 x i8> %x, i32 1
+ %vecinit2 = insertelement <8 x i8> %vecinit, i8 %vecext1, i32 1
+ %vecext3 = extractelement <16 x i8> %x, i32 2
+ %vecinit4 = insertelement <8 x i8> %vecinit2, i8 %vecext3, i32 2
+ %vecext5 = extractelement <16 x i8> %x, i32 3
+ %vecinit6 = insertelement <8 x i8> %vecinit4, i8 %vecext5, i32 3
+ %vecext7 = extractelement <16 x i8> %x, i32 4
+ %vecinit8 = insertelement <8 x i8> %vecinit6, i8 %vecext7, i32 4
+ %vecext9 = extractelement <16 x i8> %x, i32 5
+ %vecinit10 = insertelement <8 x i8> %vecinit8, i8 %vecext9, i32 5
+ %vecext11 = extractelement <16 x i8> %x, i32 6
+ %vecinit12 = insertelement <8 x i8> %vecinit10, i8 %vecext11, i32 6
+ %vecext13 = extractelement <16 x i8> %x, i32 7
+ %vecinit14 = insertelement <8 x i8> %vecinit12, i8 %vecext13, i32 7
+ ret <8 x i8> %vecinit14
+}
+
+define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) {
+; CHECK-LABEL: test_dup_v2i32_v4i16:
+; CHECK: dup v0.4h, v0.h[2]
+entry:
+ %x = extractelement <2 x i32> %a, i32 1
+ %vget_lane = trunc i32 %x to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) {
+; CHECK-LABEL: test_dup_v4i32_v8i16:
+; CHECK: dup v0.8h, v0.h[6]
+entry:
+ %x = extractelement <4 x i32> %a, i32 3
+ %vget_lane = trunc i32 %x to i16
+ %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
+ %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
+ %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
+ %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
+ ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) {
+; CHECK-LABEL: test_dup_v1i64_v4i16:
+; CHECK: dup v0.4h, v0.h[0]
+entry:
+ %x = extractelement <1 x i64> %a, i32 0
+ %vget_lane = trunc i64 %x to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) {
+; CHECK-LABEL: test_dup_v1i64_v2i32:
+; CHECK: dup v0.2s, v0.s[0]
+entry:
+ %x = extractelement <1 x i64> %a, i32 0
+ %vget_lane = trunc i64 %x to i32
+ %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ ret <2 x i32> %vecinit1.i
+}
+
+define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) {
+; CHECK-LABEL: test_dup_v2i64_v8i16:
+; CHECK: dup v0.8h, v0.h[4]
+entry:
+ %x = extractelement <2 x i64> %a, i32 1
+ %vget_lane = trunc i64 %x to i16
+ %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
+ %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
+ %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
+ %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
+ ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i32> @test_dup_v2i64_v4i32(<2 x i64> %a) {
+; CHECK-LABEL: test_dup_v2i64_v4i32:
+; CHECK: dup v0.4s, v0.s[2]
+entry:
+ %x = extractelement <2 x i64> %a, i32 1
+ %vget_lane = trunc i64 %x to i32
+ %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
+ ret <4 x i32> %vecinit3.i
+}
+
+define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) {
+; CHECK-LABEL: test_dup_v4i32_v4i16:
+; CHECK: dup v0.4h, v0.h[2]
+entry:
+ %x = extractelement <4 x i32> %a, i32 1
+ %vget_lane = trunc i32 %x to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) {
+; CHECK-LABEL: test_dup_v2i64_v4i16:
+; CHECK: dup v0.4h, v0.h[0]
+entry:
+ %x = extractelement <2 x i64> %a, i32 0
+ %vget_lane = trunc i64 %x to i16
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) {
+; CHECK-LABEL: test_dup_v2i64_v2i32:
+; CHECK: dup v0.2s, v0.s[0]
+entry:
+ %x = extractelement <2 x i64> %a, i32 0
+ %vget_lane = trunc i64 %x to i32
+ %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ ret <2 x i32> %vecinit1.i
+}
+
+
+define <2 x float> @test_scalar_to_vector_f32_to_v2f32(<2 x float> %a) {
+; CHECK-LABEL: test_scalar_to_vector_f32_to_v2f32:
+; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
+; CHECK-NEXT: ret
+entry:
+ %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
+ %1 = insertelement <1 x float> undef, float %0, i32 0
+ %2 = extractelement <1 x float> %1, i32 0
+ %vecinit1.i = insertelement <2 x float> undef, float %2, i32 0
+ ret <2 x float> %vecinit1.i
+}
+
+define <4 x float> @test_scalar_to_vector_f32_to_v4f32(<2 x float> %a) {
+; CHECK-LABEL: test_scalar_to_vector_f32_to_v4f32:
+; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
+; CHECK-NEXT: ret
+entry:
+ %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
+ %1 = insertelement <1 x float> undef, float %0, i32 0
+ %2 = extractelement <1 x float> %1, i32 0
+ %vecinit1.i = insertelement <4 x float> undef, float %2, i32 0
+ ret <4 x float> %vecinit1.i
+}
+
+declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>)
+
+define <2 x i32> @test_concat_undef_v1i32(<1 x i32> %a) {
+; CHECK-LABEL: test_concat_undef_v1i32:
+; CHECK: ins v{{[0-9]+}}.s[1], v{{[0-9]+}}.s[0]
+entry:
+ %0 = extractelement <1 x i32> %a, i32 0
+ %vecinit1.i = insertelement <2 x i32> undef, i32 %0, i32 1
+ ret <2 x i32> %vecinit1.i
+}
+
+declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>) #4
+
+define <2 x i32> @test_concat_v1i32_undef(<1 x i32> %a) {
+; CHECK-LABEL: test_concat_v1i32_undef:
+; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: ret
+entry:
+ %b = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a)
+ %0 = extractelement <1 x i32> %b, i32 0
+ %vecinit.i432 = insertelement <2 x i32> undef, i32 %0, i32 0
+ ret <2 x i32> %vecinit.i432
+}
+
+define <2 x i32> @test_concat_same_v1i32_v1i32(<1 x i32> %a) {
+; CHECK-LABEL: test_concat_same_v1i32_v1i32:
+; CHECK: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
+entry:
+ %0 = extractelement <1 x i32> %a, i32 0
+ %vecinit.i = insertelement <2 x i32> undef, i32 %0, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %0, i32 1
+ ret <2 x i32> %vecinit1.i
+}
+
+define <2 x i32> @test_concat_diff_v1i32_v1i32(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
+; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: sqabs s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: ins v0.s[1], v1.s[0]
+entry:
+ %c = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a)
+ %d = extractelement <1 x i32> %c, i32 0
+ %e = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %b)
+ %f = extractelement <1 x i32> %e, i32 0
+ %h = shufflevector <1 x i32> %c, <1 x i32> %e, <2 x i32> <i32 0, i32 1>
+ ret <2 x i32> %h
+}
+
+define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 {
+; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i8> %vecinit30
+}
+
+define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
+; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <8 x i8> %x, i32 0
+ %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
+ %vecext1 = extractelement <8 x i8> %x, i32 1
+ %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1
+ %vecext3 = extractelement <8 x i8> %x, i32 2
+ %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2
+ %vecext5 = extractelement <8 x i8> %x, i32 3
+ %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3
+ %vecext7 = extractelement <8 x i8> %x, i32 4
+ %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4
+ %vecext9 = extractelement <8 x i8> %x, i32 5
+ %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5
+ %vecext11 = extractelement <8 x i8> %x, i32 6
+ %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6
+ %vecext13 = extractelement <8 x i8> %x, i32 7
+ %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7
+ %vecinit30 = shufflevector <16 x i8> %vecinit14, <16 x i8> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i8> %vecinit30
+}
+
+define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 {
+; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <16 x i8> %x, i32 0
+ %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
+ %vecext1 = extractelement <16 x i8> %x, i32 1
+ %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1
+ %vecext3 = extractelement <16 x i8> %x, i32 2
+ %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2
+ %vecext5 = extractelement <16 x i8> %x, i32 3
+ %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3
+ %vecext7 = extractelement <16 x i8> %x, i32 4
+ %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4
+ %vecext9 = extractelement <16 x i8> %x, i32 5
+ %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5
+ %vecext11 = extractelement <16 x i8> %x, i32 6
+ %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6
+ %vecext13 = extractelement <16 x i8> %x, i32 7
+ %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7
+ %vecext15 = extractelement <8 x i8> %y, i32 0
+ %vecinit16 = insertelement <16 x i8> %vecinit14, i8 %vecext15, i32 8
+ %vecext17 = extractelement <8 x i8> %y, i32 1
+ %vecinit18 = insertelement <16 x i8> %vecinit16, i8 %vecext17, i32 9
+ %vecext19 = extractelement <8 x i8> %y, i32 2
+ %vecinit20 = insertelement <16 x i8> %vecinit18, i8 %vecext19, i32 10
+ %vecext21 = extractelement <8 x i8> %y, i32 3
+ %vecinit22 = insertelement <16 x i8> %vecinit20, i8 %vecext21, i32 11
+ %vecext23 = extractelement <8 x i8> %y, i32 4
+ %vecinit24 = insertelement <16 x i8> %vecinit22, i8 %vecext23, i32 12
+ %vecext25 = extractelement <8 x i8> %y, i32 5
+ %vecinit26 = insertelement <16 x i8> %vecinit24, i8 %vecext25, i32 13
+ %vecext27 = extractelement <8 x i8> %y, i32 6
+ %vecinit28 = insertelement <16 x i8> %vecinit26, i8 %vecext27, i32 14
+ %vecext29 = extractelement <8 x i8> %y, i32 7
+ %vecinit30 = insertelement <16 x i8> %vecinit28, i8 %vecext29, i32 15
+ ret <16 x i8> %vecinit30
+}
+
+define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
+; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <8 x i8> %x, i32 0
+ %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
+ %vecext1 = extractelement <8 x i8> %x, i32 1
+ %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1
+ %vecext3 = extractelement <8 x i8> %x, i32 2
+ %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2
+ %vecext5 = extractelement <8 x i8> %x, i32 3
+ %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3
+ %vecext7 = extractelement <8 x i8> %x, i32 4
+ %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4
+ %vecext9 = extractelement <8 x i8> %x, i32 5
+ %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5
+ %vecext11 = extractelement <8 x i8> %x, i32 6
+ %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6
+ %vecext13 = extractelement <8 x i8> %x, i32 7
+ %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7
+ %vecext15 = extractelement <8 x i8> %y, i32 0
+ %vecinit16 = insertelement <16 x i8> %vecinit14, i8 %vecext15, i32 8
+ %vecext17 = extractelement <8 x i8> %y, i32 1
+ %vecinit18 = insertelement <16 x i8> %vecinit16, i8 %vecext17, i32 9
+ %vecext19 = extractelement <8 x i8> %y, i32 2
+ %vecinit20 = insertelement <16 x i8> %vecinit18, i8 %vecext19, i32 10
+ %vecext21 = extractelement <8 x i8> %y, i32 3
+ %vecinit22 = insertelement <16 x i8> %vecinit20, i8 %vecext21, i32 11
+ %vecext23 = extractelement <8 x i8> %y, i32 4
+ %vecinit24 = insertelement <16 x i8> %vecinit22, i8 %vecext23, i32 12
+ %vecext25 = extractelement <8 x i8> %y, i32 5
+ %vecinit26 = insertelement <16 x i8> %vecinit24, i8 %vecext25, i32 13
+ %vecext27 = extractelement <8 x i8> %y, i32 6
+ %vecinit28 = insertelement <16 x i8> %vecinit26, i8 %vecext27, i32 14
+ %vecext29 = extractelement <8 x i8> %y, i32 7
+ %vecinit30 = insertelement <16 x i8> %vecinit28, i8 %vecext29, i32 15
+ ret <16 x i8> %vecinit30
+}
+
+define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 {
+; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %vecinit14
+}
+
+define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
+; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <4 x i16> %x, i32 0
+ %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
+ %vecext1 = extractelement <4 x i16> %x, i32 1
+ %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1
+ %vecext3 = extractelement <4 x i16> %x, i32 2
+ %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2
+ %vecext5 = extractelement <4 x i16> %x, i32 3
+ %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3
+ %vecinit14 = shufflevector <8 x i16> %vecinit6, <8 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %vecinit14
+}
+
+define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 {
+; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <8 x i16> %x, i32 0
+ %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
+ %vecext1 = extractelement <8 x i16> %x, i32 1
+ %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1
+ %vecext3 = extractelement <8 x i16> %x, i32 2
+ %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2
+ %vecext5 = extractelement <8 x i16> %x, i32 3
+ %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3
+ %vecext7 = extractelement <4 x i16> %y, i32 0
+ %vecinit8 = insertelement <8 x i16> %vecinit6, i16 %vecext7, i32 4
+ %vecext9 = extractelement <4 x i16> %y, i32 1
+ %vecinit10 = insertelement <8 x i16> %vecinit8, i16 %vecext9, i32 5
+ %vecext11 = extractelement <4 x i16> %y, i32 2
+ %vecinit12 = insertelement <8 x i16> %vecinit10, i16 %vecext11, i32 6
+ %vecext13 = extractelement <4 x i16> %y, i32 3
+ %vecinit14 = insertelement <8 x i16> %vecinit12, i16 %vecext13, i32 7
+ ret <8 x i16> %vecinit14
+}
+
+define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
+; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <4 x i16> %x, i32 0
+ %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
+ %vecext1 = extractelement <4 x i16> %x, i32 1
+ %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1
+ %vecext3 = extractelement <4 x i16> %x, i32 2
+ %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2
+ %vecext5 = extractelement <4 x i16> %x, i32 3
+ %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3
+ %vecext7 = extractelement <4 x i16> %y, i32 0
+ %vecinit8 = insertelement <8 x i16> %vecinit6, i16 %vecext7, i32 4
+ %vecext9 = extractelement <4 x i16> %y, i32 1
+ %vecinit10 = insertelement <8 x i16> %vecinit8, i16 %vecext9, i32 5
+ %vecext11 = extractelement <4 x i16> %y, i32 2
+ %vecinit12 = insertelement <8 x i16> %vecinit10, i16 %vecext11, i32 6
+ %vecext13 = extractelement <4 x i16> %y, i32 3
+ %vecinit14 = insertelement <8 x i16> %vecinit12, i16 %vecext13, i32 7
+ ret <8 x i16> %vecinit14
+}
+
+define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 {
+; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i32> %vecinit6
+}
+
+define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
+; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <2 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <2 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit6 = shufflevector <4 x i32> %vecinit2, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i32> %vecinit6
+}
+
+define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 {
+; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext3 = extractelement <2 x i32> %y, i32 0
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
+ %vecext5 = extractelement <2 x i32> %y, i32 1
+ %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3
+ ret <4 x i32> %vecinit6
+}
+
+define <4 x i32> @test_concat_v4i32_v2i32_v2i32(<2 x i32> %x, <2 x i32> %y) #0 {
+; CHECK-LABEL: test_concat_v4i32_v2i32_v2i32:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <2 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <2 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext3 = extractelement <2 x i32> %y, i32 0
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
+ %vecext5 = extractelement <2 x i32> %y, i32 1
+ %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3
+ ret <4 x i32> %vecinit6
+}
+
+define <2 x i64> @test_concat_v2i64_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) #0 {
+; CHECK-LABEL: test_concat_v2i64_v2i64_v2i64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecinit2 = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %vecinit2
+}
+
+define <2 x i64> @test_concat_v2i64_v1i64_v2i64(<1 x i64> %x, <2 x i64> %y) #0 {
+; CHECK-LABEL: test_concat_v2i64_v1i64_v2i64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <1 x i64> %x, i32 0
+ %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
+ %vecinit2 = shufflevector <2 x i64> %vecinit, <2 x i64> %y, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %vecinit2
+}
+
+define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 {
+; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <2 x i64> %x, i32 0
+ %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
+ %vecext1 = extractelement <1 x i64> %y, i32 0
+ %vecinit2 = insertelement <2 x i64> %vecinit, i64 %vecext1, i32 1
+ ret <2 x i64> %vecinit2
+}
+
+define <2 x i64> @test_concat_v2i64_v1i64_v1i64(<1 x i64> %x, <1 x i64> %y) #0 {
+; CHECK-LABEL: test_concat_v2i64_v1i64_v1i64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %vecext = extractelement <1 x i64> %x, i32 0
+ %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
+ %vecext1 = extractelement <1 x i64> %y, i32 0
+ %vecinit2 = insertelement <2 x i64> %vecinit, i64 %vecext1, i32 1
+ ret <2 x i64> %vecinit2
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>)
+
+; This case tests the copy of two FPR8 registers, which is implemented by fmov
+; of two FPR32 registers.
+define <1 x i8> @test_copy_FPR8_FPR8(<1 x i8> %a, <1 x i8> %b) {
+; CHECK-LABEL: test_copy_FPR8_FPR8:
+; CHECK: usqadd b1, b0
+; CHECK-NEXT: fmov s0, s1
+entry:
+ %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %b, <1 x i8> %a)
+ ret <1 x i8> %vsqadd2.i
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_copy_FPR16_FPR16(<1 x i16> %a, <1 x i16> %b) {
+; CHECK-LABEL: test_copy_FPR16_FPR16:
+; CHECK: usqadd h1, h0
+; CHECK-NEXT: fmov s0, s1
+entry:
+ %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %b, <1 x i16> %a)
+ ret <1 x i16> %vsqadd2.i
+}
+
+define <4 x i16> @concat_vector_v4i16_const() {
+; CHECK-LABEL: concat_vector_v4i16_const:
+; CHECK: dup {{v[0-9]+}}.4h, wzr
+ %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %r
+}
+
+define <4 x i16> @concat_vector_v4i16_const_one() {
+; CHECK-LABEL: concat_vector_v4i16_const_one:
+; CHECK: movz {{w[0-9]+}}, #1
+; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+ %r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %r
+}
+
+define <4 x i32> @concat_vector_v4i32_const() {
+; CHECK-LABEL: concat_vector_v4i32_const:
+; CHECK: dup {{v[0-9]+}}.4s, wzr
+ %r = shufflevector <1 x i32> zeroinitializer, <1 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %r
+}
+
+define <8 x i8> @concat_vector_v8i8_const() {
+; CHECK-LABEL: concat_vector_v8i8_const:
+; CHECK: dup {{v[0-9]+}}.8b, wzr
+ %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %r
+}
+
+define <8 x i16> @concat_vector_v8i16_const() {
+; CHECK-LABEL: concat_vector_v8i16_const:
+; CHECK: dup {{v[0-9]+}}.8h, wzr
+ %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r
+}
+
+define <8 x i16> @concat_vector_v8i16_const_one() {
+; CHECK-LABEL: concat_vector_v8i16_const_one:
+; CHECK: movz {{w[0-9]+}}, #1
+; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+ %r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r
+}
+
+define <16 x i8> @concat_vector_v16i8_const() {
+; CHECK-LABEL: concat_vector_v16i8_const:
+; CHECK: dup {{v[0-9]+}}.16b, wzr
+ %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %r
+}
+
+define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) {
+; CHECK-LABEL: concat_vector_v4i16:
+; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+ %r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %r
+}
+
+define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) {
+; CHECK-LABEL: concat_vector_v4i32:
+; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+ %r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %r
+}
+
+define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
+; CHECK-LABEL: concat_vector_v8i8:
+; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[0]
+ %r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %r
+}
+
+define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) {
+; CHECK-LABEL: concat_vector_v8i16:
+; CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+ %r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r
+}
+
+define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
+; CHECK-LABEL: concat_vector_v16i8:
+; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[0]
+ %r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %r
+}
diff --git a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
new file mode 100644
index 0000000..4dffcd1
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <4 x i32> @copyTuple.QPair(i8* %a, i8* %b) {
+; CHECK-LABEL: copyTuple.QPair:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i32 0, i32 4)
+ %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 1, i32 4)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QTriple(i8* %a, i8* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QTriple:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
+ %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i32 1, i32 4)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QQuad(i8* %a, i8* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QQuad:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
+ %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 1, i32 4)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-crypto.ll b/test/CodeGen/AArch64/neon-crypto.ll
index 0283e0e..c0014fa 100644
--- a/test/CodeGen/AArch64/neon-crypto.ll
+++ b/test/CodeGen/AArch64/neon-crypto.ll
@@ -1,40 +1,40 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s
; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
-declare <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32>, <4 x i32>, <4 x i32>) #1
-declare <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32>, <4 x i32>, <4 x i32>) #1
-declare <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha256h(<4 x i32>, <4 x i32>, <4 x i32>) #1
-declare <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32>, <4 x i32>, <4 x i32>) #1
-declare <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32>, <1 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha1m(<4 x i32>, i32, <4 x i32>) #1
-declare <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32>, <1 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha1p(<4 x i32>, i32, <4 x i32>) #1
-declare <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32>, <1 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha1c(<4 x i32>, i32, <4 x i32>) #1
-declare <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32>, <4 x i32>) #1
-declare <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32>, <4 x i32>) #1
+declare <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32>, <4 x i32>) #1
-declare <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32>) #1
+declare i32 @llvm.arm.neon.sha1h(i32) #1
-declare <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8>) #1
+declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8>) #1
-declare <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8>) #1
+declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8>) #1
-declare <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>) #1
-declare <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8>, <16 x i8>) #1
+declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>) #1
define <16 x i8> @test_vaeseq_u8(<16 x i8> %data, <16 x i8> %key) {
; CHECK: test_vaeseq_u8:
; CHECK: aese {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK-NO-CRYPTO: Cannot select: intrinsic %llvm.arm.neon.aese
entry:
- %aese.i = tail call <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8> %data, <16 x i8> %key)
+ %aese.i = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data, <16 x i8> %key)
ret <16 x i8> %aese.i
}
@@ -42,7 +42,7 @@ define <16 x i8> @test_vaesdq_u8(<16 x i8> %data, <16 x i8> %key) {
; CHECK: test_vaesdq_u8:
; CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
- %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8> %data, <16 x i8> %key)
+ %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data, <16 x i8> %key)
ret <16 x i8> %aesd.i
}
@@ -50,7 +50,7 @@ define <16 x i8> @test_vaesmcq_u8(<16 x i8> %data) {
; CHECK: test_vaesmcq_u8:
; CHECK: aesmc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
- %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8> %data)
+ %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %data)
ret <16 x i8> %aesmc.i
}
@@ -58,7 +58,7 @@ define <16 x i8> @test_vaesimcq_u8(<16 x i8> %data) {
; CHECK: test_vaesimcq_u8:
; CHECK: aesimc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
- %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8> %data)
+ %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %data)
ret <16 x i8> %aesimc.i
}
@@ -66,17 +66,15 @@ define i32 @test_vsha1h_u32(i32 %hash_e) {
; CHECK: test_vsha1h_u32:
; CHECK: sha1h {{s[0-9]+}}, {{s[0-9]+}}
entry:
- %sha1h.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
- %sha1h1.i = tail call <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32> %sha1h.i)
- %0 = extractelement <1 x i32> %sha1h1.i, i32 0
- ret i32 %0
+ %sha1h1.i = tail call i32 @llvm.arm.neon.sha1h(i32 %hash_e)
+ ret i32 %sha1h1.i
}
define <4 x i32> @test_vsha1su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w12_15) {
; CHECK: test_vsha1su1q_u32:
; CHECK: sha1su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w12_15)
+ %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32> %tw0_3, <4 x i32> %w12_15)
ret <4 x i32> %sha1su12.i
}
@@ -84,7 +82,7 @@ define <4 x i32> @test_vsha256su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7) {
; CHECK: test_vsha256su0q_u32:
; CHECK: sha256su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7)
+ %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
ret <4 x i32> %sha256su02.i
}
@@ -92,8 +90,7 @@ define <4 x i32> @test_vsha1cq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32>
; CHECK: test_vsha1cq_u32:
; CHECK: sha1c {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %sha1c.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
- %sha1c1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32> %hash_abcd, <1 x i32> %sha1c.i, <4 x i32> %wk)
+ %sha1c1.i = tail call <4 x i32> @llvm.arm.neon.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
ret <4 x i32> %sha1c1.i
}
@@ -101,8 +98,7 @@ define <4 x i32> @test_vsha1pq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32>
; CHECK: test_vsha1pq_u32:
; CHECK: sha1p {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %sha1p.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
- %sha1p1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32> %hash_abcd, <1 x i32> %sha1p.i, <4 x i32> %wk)
+ %sha1p1.i = tail call <4 x i32> @llvm.arm.neon.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
ret <4 x i32> %sha1p1.i
}
@@ -110,8 +106,7 @@ define <4 x i32> @test_vsha1mq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32>
; CHECK: test_vsha1mq_u32:
; CHECK: sha1m {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %sha1m.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
- %sha1m1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32> %hash_abcd, <1 x i32> %sha1m.i, <4 x i32> %wk)
+ %sha1m1.i = tail call <4 x i32> @llvm.arm.neon.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
ret <4 x i32> %sha1m1.i
}
@@ -119,7 +114,7 @@ define <4 x i32> @test_vsha1su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32>
; CHECK: test_vsha1su0q_u32:
; CHECK: sha1su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11)
+ %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11)
ret <4 x i32> %sha1su03.i
}
@@ -127,7 +122,7 @@ define <4 x i32> @test_vsha256hq_u32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh,
; CHECK: test_vsha256hq_u32:
; CHECK: sha256h {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+ %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
ret <4 x i32> %sha256h3.i
}
@@ -135,7 +130,7 @@ define <4 x i32> @test_vsha256h2q_u32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd
; CHECK: test_vsha256h2q_u32:
; CHECK: sha256h2 {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
entry:
- %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+ %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
ret <4 x i32> %sha256h23.i
}
@@ -143,7 +138,7 @@ define <4 x i32> @test_vsha256su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x
; CHECK: test_vsha256su1q_u32:
; CHECK: sha256su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+ %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
ret <4 x i32> %sha256su13.i
}
diff --git a/test/CodeGen/AArch64/neon-extract.ll b/test/CodeGen/AArch64/neon-extract.ll
index 5c52cd3..cddc226 100644
--- a/test/CodeGen/AArch64/neon-extract.ll
+++ b/test/CodeGen/AArch64/neon-extract.ll
@@ -188,3 +188,35 @@ entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
}
+
+define <8 x i8> @test_undef_vext_s8(<8 x i8> %a) {
+; CHECK: test_undef_vext_s8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+entry:
+ %vext = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 10, i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+ ret <8 x i8> %vext
+}
+
+define <16 x i8> @test_undef_vextq_s8(<16 x i8> %a) {
+; CHECK: test_undef_vextq_s8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+entry:
+ %vext = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 20, i32 20, i32 20, i32 20, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 20, i32 20, i32 20, i32 20, i32 20>
+ ret <16 x i8> %vext
+}
+
+define <4 x i16> @test_undef_vext_s16(<4 x i16> %a) {
+; CHECK: test_undef_vext_s16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+entry:
+ %vext = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x i16> %vext
+}
+
+define <8 x i16> @test_undef_vextq_s16(<8 x i16> %a) {
+; CHECK: test_undef_vextq_s16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+entry:
+ %vext = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i16> %vext
+}
diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll
index 146256e..28e8212 100644
--- a/test/CodeGen/AArch64/neon-facge-facgt.ll
+++ b/test/CodeGen/AArch64/neon-facge-facgt.ll
@@ -1,20 +1,20 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>)
-declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>)
-declare <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double>, <2 x double>)
+declare <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float>, <2 x float>)
+declare <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float>, <4 x float>)
+declare <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double>, <2 x double>)
define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
; Using registers other than v0, v1 and v2 is possible, but would be odd.
; CHECK: facge_from_intr_v2i32:
- %val = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %A, <2 x float> %B)
+ %val = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %A, <2 x float> %B)
; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
ret <2 x i32> %val
}
define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
; Using registers other than v0, v1 and v2 is possible, but would be odd.
; CHECK: facge_from_intr_v4i32:
- %val = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %A, <4 x float> %B)
+ %val = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %A, <4 x float> %B)
; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
ret <4 x i32> %val
}
@@ -22,26 +22,26 @@ define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
; Using registers other than v0, v1 and v2 is possible, but would be odd.
; CHECK: facge_from_intr_v2i64:
- %val = call <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double> %A, <2 x double> %B)
+ %val = call <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double> %A, <2 x double> %B)
; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
ret <2 x i64> %val
}
-declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>)
-declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>)
-declare <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double>, <2 x double>)
+declare <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float>, <2 x float>)
+declare <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float>, <4 x float>)
+declare <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double>, <2 x double>)
define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
; Using registers other than v0, v1 and v2 is possible, but would be odd.
; CHECK: facgt_from_intr_v2i32:
- %val = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %A, <2 x float> %B)
+ %val = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %A, <2 x float> %B)
; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
ret <2 x i32> %val
}
define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
; Using registers other than v0, v1 and v2 is possible, but would be odd.
; CHECK: facgt_from_intr_v4i32:
- %val = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %A, <4 x float> %B)
+ %val = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %A, <4 x float> %B)
; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
ret <4 x i32> %val
}
@@ -49,7 +49,7 @@ define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
; Using registers other than v0, v1 and v2 is possible, but would be odd.
; CHECK: facgt_from_intr_v2i64:
- %val = call <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double> %A, <2 x double> %B)
+ %val = call <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double> %A, <2 x double> %B)
; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
ret <2 x i64> %val
}
diff --git a/test/CodeGen/AArch64/neon-fma.ll b/test/CodeGen/AArch64/neon-fma.ll
index dcf4e28..af70302 100644
--- a/test/CodeGen/AArch64/neon-fma.ll
+++ b/test/CodeGen/AArch64/neon-fma.ll
@@ -1,21 +1,21 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp1 = fmul <2 x float> %A, %B;
%tmp2 = fadd <2 x float> %C, %tmp1;
ret <2 x float> %tmp2
}
define <4 x float> @fmla4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
-;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp1 = fmul <4 x float> %A, %B;
%tmp2 = fadd <4 x float> %C, %tmp1;
ret <4 x float> %tmp2
}
define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
-;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp1 = fmul <2 x double> %A, %B;
%tmp2 = fadd <2 x double> %C, %tmp1;
ret <2 x double> %tmp2
@@ -23,21 +23,21 @@ define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double>
define <2 x float> @fmls2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-;CHECK: fmls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp1 = fmul <2 x float> %A, %B;
%tmp2 = fsub <2 x float> %C, %tmp1;
ret <2 x float> %tmp2
}
define <4 x float> @fmls4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
-;CHECK: fmls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp1 = fmul <4 x float> %A, %B;
%tmp2 = fsub <4 x float> %C, %tmp1;
ret <4 x float> %tmp2
}
define <2 x double> @fmls2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
-;CHECK: fmls {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp1 = fmul <2 x double> %A, %B;
%tmp2 = fsub <2 x double> %C, %tmp1;
ret <2 x double> %tmp2
@@ -51,39 +51,39 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
define <2 x float> @fmla2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%val = call <2 x float> @llvm.fma.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C)
ret <2 x float> %val
}
define <4 x float> @fmla4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
-;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%val = call <4 x float> @llvm.fma.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C)
ret <4 x float> %val
}
define <2 x double> @fmla2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
-;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%val = call <2 x double> @llvm.fma.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C)
ret <2 x double> %val
}
define <2 x float> @fmls2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-;CHECK: fmls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%negA = fsub <2 x float> <float -0.0, float -0.0>, %A
%val = call <2 x float> @llvm.fma.v2f32(<2 x float> %negA, <2 x float> %B, <2 x float> %C)
ret <2 x float> %val
}
define <4 x float> @fmls4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
-;CHECK: fmls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%negA = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %A
%val = call <4 x float> @llvm.fma.v4f32(<4 x float> %negA, <4 x float> %B, <4 x float> %C)
ret <4 x float> %val
}
define <2 x double> @fmls2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
-;CHECK: fmls {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%negA = fsub <2 x double> <double -0.0, double -0.0>, %A
%val = call <2 x double> @llvm.fma.v2f64(<2 x double> %negA, <2 x double> %B, <2 x double> %C)
ret <2 x double> %val
@@ -94,19 +94,39 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>)
define <2 x float> @fmuladd2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%val = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C)
ret <2 x float> %val
}
define <4 x float> @fmuladd4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
-;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%val = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C)
ret <4 x float> %val
}
define <2 x double> @fmuladd2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
-;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%val = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C)
ret <2 x double> %val
}
+
+
+; Another set of tests that check for a single use of the multiply
+
+define <2 x float> @fmla2xfloati_su(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK-NOT: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = fmul <2 x float> %A, %B;
+ %tmp2 = fadd <2 x float> %C, %tmp1;
+ %tmp3 = fadd <2 x float> %tmp2, %tmp1;
+ ret <2 x float> %tmp3
+}
+
+define <2 x double> @fmls2xdouble_su(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK-NOT: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp1 = fmul <2 x double> %A, %B;
+ %tmp2 = fsub <2 x double> %C, %tmp1;
+ %tmp3 = fsub <2 x double> %tmp2, %tmp1;
+ ret <2 x double> %tmp3
+}
+
diff --git a/test/CodeGen/AArch64/neon-fpround_f128.ll b/test/CodeGen/AArch64/neon-fpround_f128.ll
new file mode 100644
index 0000000..a93f3f2
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-fpround_f128.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define <1 x double> @test_fpround_v1f128(<1 x fp128>* %a) {
+; CHECK-LABEL: test_fpround_v1f128:
+; CHECK: bl __trunctfdf2
+ %b = load <1 x fp128>* %a
+ %c = fptrunc <1 x fp128> %b to <1 x double>
+ ret <1 x double> %c
+}
+
+define <2 x double> @test_fpround_v2f128(<2 x fp128>* %a) {
+; CHECK-LABEL: test_fpround_v2f128:
+; CHECK: bl __trunctfdf2
+; CHECK: bl __trunctfdf2
+ %b = load <2 x fp128>* %a
+ %c = fptrunc <2 x fp128> %b to <2 x double>
+ ret <2 x double> %c
+}
diff --git a/test/CodeGen/AArch64/neon-load-store-v1i32.ll b/test/CodeGen/AArch64/neon-load-store-v1i32.ll
new file mode 100644
index 0000000..92f704d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-load-store-v1i32.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; Test that load/store of v1i8, v1i16, v1i32 types can be selected correctly
+define void @load.store.v1i8(<1 x i8>* %ptr, <1 x i8>* %ptr2) {
+; CHECK-LABEL: load.store.v1i8:
+; CHECK: ldr b{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str b{{[0-9]+}}, [x{{[0-9]+|sp}}]
+ %a = load <1 x i8>* %ptr
+ store <1 x i8> %a, <1 x i8>* %ptr2
+ ret void
+}
+
+define void @load.store.v1i16(<1 x i16>* %ptr, <1 x i16>* %ptr2) {
+; CHECK-LABEL: load.store.v1i16:
+; CHECK: ldr h{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str h{{[0-9]+}}, [x{{[0-9]+|sp}}]
+ %a = load <1 x i16>* %ptr
+ store <1 x i16> %a, <1 x i16>* %ptr2
+ ret void
+}
+
+define void @load.store.v1i32(<1 x i32>* %ptr, <1 x i32>* %ptr2) {
+; CHECK-LABEL: load.store.v1i32:
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+|sp}}]
+ %a = load <1 x i32>* %ptr
+ store <1 x i32> %a, <1 x i32>* %ptr2
+ ret void
+}
diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll
index d757aca..3e18077 100644
--- a/test/CodeGen/AArch64/neon-max-min-pairwise.ll
+++ b/test/CodeGen/AArch64/neon-max-min-pairwise.ll
@@ -308,3 +308,39 @@ define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
ret <2 x double> %val
}
+define i32 @test_vminv_s32(<2 x i32> %a) {
+; CHECK-LABEL: test_vminv_s32
+; CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %1 = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32> %a)
+ %2 = extractelement <1 x i32> %1, i32 0
+ ret i32 %2
+}
+
+define i32 @test_vminv_u32(<2 x i32> %a) {
+; CHECK-LABEL: test_vminv_u32
+; CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %1 = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32> %a)
+ %2 = extractelement <1 x i32> %1, i32 0
+ ret i32 %2
+}
+
+define i32 @test_vmaxv_s32(<2 x i32> %a) {
+; CHECK-LABEL: test_vmaxv_s32
+; CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %1 = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32> %a)
+ %2 = extractelement <1 x i32> %1, i32 0
+ ret i32 %2
+}
+
+define i32 @test_vmaxv_u32(<2 x i32> %a) {
+; CHECK-LABEL: test_vmaxv_u32
+; CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %1 = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32> %a)
+ %2 = extractelement <1 x i32> %1, i32 0
+ ret i32 %2
+}
+
+declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32>)
+declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32>)
+declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32>)
+declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-misc.ll b/test/CodeGen/AArch64/neon-misc.ll
index 9660bf2..7ec36c2 100644
--- a/test/CodeGen/AArch64/neon-misc.ll
+++ b/test/CodeGen/AArch64/neon-misc.ll
@@ -894,13 +894,13 @@ define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 {
; CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvtx_f32_f641.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a) #4
+ %vcvtx_f32_f641.i = call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %a) #4
ret <2 x float> %vcvtx_f32_f641.i
}
define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
; CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vcvtx_f32_f641.i.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b) #4
+ %vcvtx_f32_f641.i.i = tail call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %b) #4
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvtx_f32_f641.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %shuffle.i
}
@@ -1080,147 +1080,255 @@ define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 {
ret <2 x i64> %vcvt.i
}
-define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
+define <2 x i64> @test_vcvt_s64_f32(<2 x float> %a) #0 {
+; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
+; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvt.i = fptosi <2 x float> %a to <2 x i64>
+ ret <2 x i64> %vcvt.i
+}
+
+define <2 x i64> @test_vcvt_u64_f32(<2 x float> %a) #0 {
+; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
+; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvt.i = fptoui <2 x float> %a to <2 x i64>
+ ret <2 x i64> %vcvt.i
+}
+
+define <4 x i16> @test_vcvt_s16_f32(<4 x float> %a) #0 {
+; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+ %vcvt.i = fptosi <4 x float> %a to <4 x i16>
+ ret <4 x i16> %vcvt.i
+}
+
+define <4 x i16> @test_vcvt_u16_f32(<4 x float> %a) #0 {
+; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+ %vcvt.i = fptoui <4 x float> %a to <4 x i16>
+ ret <4 x i16> %vcvt.i
+}
+
+define <2 x i32> @test_vcvt_s32_f64(<2 x double> %a) #0 {
+; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vcvt.i = fptosi <2 x double> %a to <2 x i32>
+ ret <2 x i32> %vcvt.i
+}
+
+define <2 x i32> @test_vcvt_u32_f64(<2 x double> %a) #0 {
+; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vcvt.i = fptoui <2 x double> %a to <2 x i32>
+ ret <2 x i32> %vcvt.i
+}
+
+define <1 x i8> @test_vcvt_s8_f64(<1 x double> %a) #0 {
+; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}}
+ %vcvt.i = fptosi <1 x double> %a to <1 x i8>
+ ret <1 x i8> %vcvt.i
+}
+
+define <1 x i8> @test_vcvt_u8_f64(<1 x double> %a) #0 {
+; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}}
+ %vcvt.i = fptoui <1 x double> %a to <1 x i8>
+ ret <1 x i8> %vcvt.i
+}
+
+define <1 x i16> @test_vcvt_s16_f64(<1 x double> %a) #0 {
+; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}}
+ %vcvt.i = fptosi <1 x double> %a to <1 x i16>
+ ret <1 x i16> %vcvt.i
+}
+
+define <1 x i16> @test_vcvt_u16_f64(<1 x double> %a) #0 {
+; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}}
+ %vcvt.i = fptoui <1 x double> %a to <1 x i16>
+ ret <1 x i16> %vcvt.i
+}
+
+define <1 x i32> @test_vcvt_s32_f64_v1(<1 x double> %a) #0 {
+; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}}
+ %vcvt.i = fptosi <1 x double> %a to <1 x i32>
+ ret <1 x i32> %vcvt.i
+}
+
+define <1 x i32> @test_vcvt_u32_f64_v1(<1 x double> %a) #0 {
+; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}}
+ %vcvt.i = fptoui <1 x double> %a to <1 x i32>
+ ret <1 x i32> %vcvt.i
+}
+
+define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vcvtn_s32_f32
; CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtns_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a) #4
+ %vcvtns_f321.i = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a)
ret <2 x i32> %vcvtns_f321.i
}
-define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
+define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vcvtnq_s32_f32
; CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtns_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a) #4
+ %vcvtns_f321.i = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %vcvtns_f321.i
}
-define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 {
+define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vcvtnq_s64_f64
; CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtns_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a) #4
+ %vcvtns_f641.i = call <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double> %a)
ret <2 x i64> %vcvtns_f641.i
}
-define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
+define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vcvtn_u32_f32
; CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtnu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a) #4
+ %vcvtnu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a)
ret <2 x i32> %vcvtnu_f321.i
}
-define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
+define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vcvtnq_u32_f32
; CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtnu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a) #4
+ %vcvtnu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %vcvtnu_f321.i
}
-define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 {
+define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vcvtnq_u64_f64
; CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtnu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a) #4
+ %vcvtnu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double> %a)
ret <2 x i64> %vcvtnu_f641.i
}
-define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
+define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vcvtp_s32_f32
; CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtps_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a) #4
+ %vcvtps_f321.i = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a)
ret <2 x i32> %vcvtps_f321.i
}
-define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
+define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vcvtpq_s32_f32
; CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtps_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a) #4
+ %vcvtps_f321.i = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %vcvtps_f321.i
}
-define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 {
+define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vcvtpq_s64_f64
; CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtps_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a) #4
+ %vcvtps_f641.i = call <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double> %a)
ret <2 x i64> %vcvtps_f641.i
}
-define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
+define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vcvtp_u32_f32
; CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtpu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a) #4
+ %vcvtpu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a)
ret <2 x i32> %vcvtpu_f321.i
}
-define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
+define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vcvtpq_u32_f32
; CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtpu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a) #4
+ %vcvtpu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %vcvtpu_f321.i
}
-define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 {
+define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vcvtpq_u64_f64
; CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtpu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a) #4
+ %vcvtpu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double> %a)
ret <2 x i64> %vcvtpu_f641.i
}
-define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
+define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vcvtm_s32_f32
; CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtms_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a) #4
+ %vcvtms_f321.i = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a)
ret <2 x i32> %vcvtms_f321.i
}
-define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
+define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vcvtmq_s32_f32
; CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtms_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a) #4
+ %vcvtms_f321.i = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %vcvtms_f321.i
}
-define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 {
+define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vcvtmq_s64_f64
; CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtms_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a) #4
+ %vcvtms_f641.i = call <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double> %a)
ret <2 x i64> %vcvtms_f641.i
}
-define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
+define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vcvtm_u32_f32
; CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtmu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a) #4
+ %vcvtmu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a)
ret <2 x i32> %vcvtmu_f321.i
}
-define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
+define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vcvtmq_u32_f32
; CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtmu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a) #4
+ %vcvtmu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %vcvtmu_f321.i
}
-define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 {
+define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vcvtmq_u64_f64
; CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtmu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a) #4
+ %vcvtmu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double> %a)
ret <2 x i64> %vcvtmu_f641.i
}
-define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
+define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vcvta_s32_f32
; CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtas_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a) #4
+ %vcvtas_f321.i = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a)
ret <2 x i32> %vcvtas_f321.i
}
-define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
+define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vcvtaq_s32_f32
; CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtas_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a) #4
+ %vcvtas_f321.i = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %vcvtas_f321.i
}
-define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 {
+define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vcvtaq_s64_f64
; CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtas_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a) #4
+ %vcvtas_f641.i = call <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double> %a)
ret <2 x i64> %vcvtas_f641.i
}
-define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
+define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vcvta_u32_f32
; CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtau_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a) #4
+ %vcvtau_f321.i = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a)
ret <2 x i32> %vcvtau_f321.i
}
-define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
+define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vcvtaq_u32_f32
; CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtau_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a) #4
+ %vcvtau_f321.i = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %vcvtau_f321.i
}
-define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 {
+define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vcvtaq_u64_f64
; CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtau_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a) #4
+ %vcvtau_f641.i = call <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double> %a)
ret <2 x i64> %vcvtau_f641.i
}
@@ -1326,6 +1434,94 @@ define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 {
ret <2 x double> %vcvt.i
}
+define <2 x float> @test_vcvt_f32_s64(<2 x i64> %a) #0 {
+; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vcvt.i = sitofp <2 x i64> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <2 x float> @test_vcvt_f32_u64(<2 x i64> %a) #0 {
+; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vcvt.i = uitofp <2 x i64> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <4 x float> @test_vcvt_f32_s16(<4 x i16> %a) #0 {
+; CHECK: sshll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0
+; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvt.i = sitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @test_vcvt_f32_u16(<4 x i16> %a) #0 {
+; CHECK: ushll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0
+; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvt.i = uitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <2 x double> @test_vcvt_f64_s32(<2 x i32> %a) #0 {
+; CHECK: sshll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0
+; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvt.i = sitofp <2 x i32> %a to <2 x double>
+ ret <2 x double> %vcvt.i
+}
+
+define <2 x double> @test_vcvt_f64_u32(<2 x i32> %a) #0 {
+; CHECK: ushll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0
+; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvt.i = uitofp <2 x i32> %a to <2 x double>
+ ret <2 x double> %vcvt.i
+}
+
+define <1 x double> @test_vcvt_f64_s8(<1 x i8> %a) #0 {
+; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0]
+; CHECK: sxtb w{{[0-9]+}}, w{{[0-9]+}}
+; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
+ %vcvt.i = sitofp <1 x i8> %a to <1 x double>
+ ret <1 x double> %vcvt.i
+}
+
+define <1 x double> @test_vcvt_f64_u8(<1 x i8> %a) #0 {
+; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0]
+; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xff
+; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
+ %vcvt.i = uitofp <1 x i8> %a to <1 x double>
+ ret <1 x double> %vcvt.i
+}
+
+define <1 x double> @test_vcvt_f64_s16(<1 x i16> %a) #0 {
+; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0]
+; CHECK: sxth w{{[0-9]+}}, w{{[0-9]+}}
+; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
+ %vcvt.i = sitofp <1 x i16> %a to <1 x double>
+ ret <1 x double> %vcvt.i
+}
+
+define <1 x double> @test_vcvt_f64_u16(<1 x i16> %a) #0 {
+; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0]
+; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xffff
+; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
+ %vcvt.i = uitofp <1 x i16> %a to <1 x double>
+ ret <1 x double> %vcvt.i
+}
+
+define <1 x double> @test_vcvt_f64_s32_v1(<1 x i32> %a) #0 {
+; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
+ %vcvt.i = sitofp <1 x i32> %a to <1 x double>
+ ret <1 x double> %vcvt.i
+}
+
+define <1 x double> @test_vcvt_f64_u32_v1(<1 x i32> %a) #0 {
+; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
+ %vcvt.i = uitofp <1 x i32> %a to <1 x double>
+ ret <1 x double> %vcvt.i
+}
+
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #2
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2
@@ -1348,53 +1544,53 @@ declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) #2
declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) #2
-declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) #2
+declare <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double>)
-declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) #2
+declare <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float>)
-declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) #2
+declare <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float>)
-declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) #2
+declare <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double>)
-declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) #2
+declare <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float>)
-declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) #2
+declare <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float>)
-declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) #2
+declare <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double>)
-declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) #2
+declare <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float>)
-declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) #2
+declare <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float>)
-declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) #2
+declare <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double>)
-declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) #2
+declare <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float>)
-declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) #2
+declare <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float>)
-declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) #2
+declare <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double>)
-declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) #2
+declare <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float>)
-declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) #2
+declare <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float>)
-declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) #2
+declare <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double>)
-declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) #2
+declare <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float>)
-declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) #2
+declare <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float>)
-declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) #2
+declare <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double>)
-declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) #2
+declare <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float>)
-declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) #2
+declare <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float>)
-declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) #2
+declare <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double>)
-declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) #2
+declare <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float>)
-declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) #2
+declare <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float>)
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #3
@@ -1438,7 +1634,7 @@ declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) #2
declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) #2
-declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) #2
+declare <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double>) #2
declare <2 x float> @llvm.aarch64.neon.fcvtn.v2f32.v2f64(<2 x double>) #2
@@ -1624,56 +1820,56 @@ define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) {
define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) {
; CHECK-LABEL: test_vcvtn_s64_f64
; CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a)
+ %1 = call <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double> %a)
ret <1 x i64> %1
}
define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) {
; CHECK-LABEL: test_vcvtn_u64_f64
; CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a)
+ %1 = call <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double> %a)
ret <1 x i64> %1
}
define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) {
; CHECK-LABEL: test_vcvtp_s64_f64
; CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a)
+ %1 = call <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double> %a)
ret <1 x i64> %1
}
define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) {
; CHECK-LABEL: test_vcvtp_u64_f64
; CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a)
+ %1 = call <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double> %a)
ret <1 x i64> %1
}
define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) {
; CHECK-LABEL: test_vcvtm_s64_f64
; CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a)
+ %1 = call <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double> %a)
ret <1 x i64> %1
}
define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) {
; CHECK-LABEL: test_vcvtm_u64_f64
; CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a)
+ %1 = call <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double> %a)
ret <1 x i64> %1
}
define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) {
; CHECK-LABEL: test_vcvta_s64_f64
; CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a)
+ %1 = call <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double> %a)
ret <1 x i64> %1
}
define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) {
; CHECK-LABEL: test_vcvta_u64_f64
; CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a)
+ %1 = call <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double> %a)
ret <1 x i64> %1
}
@@ -1691,14 +1887,14 @@ define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) {
ret <1 x double> %1
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double>)
define <1 x double> @test_vrndn_f64(<1 x double> %a) {
; CHECK-LABEL: test_vrndn_f64
@@ -1796,4 +1992,23 @@ declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>)
declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>)
declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>)
-declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
\ No newline at end of file
+declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
+
+define i64 @test_vaddlv_s32(<2 x i32> %a) {
+; CHECK-LABEL: test_vaddlv_s32
+; CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32> %a)
+ %2 = extractelement <1 x i64> %1, i32 0
+ ret i64 %2
+}
+
+define i64 @test_vaddlv_u32(<2 x i32> %a) {
+; CHECK-LABEL: test_vaddlv_u32
+; CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32> %a)
+ %2 = extractelement <1 x i64> %1, i32 0
+ ret i64 %2
+}
+
+declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32>)
+declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-mla-mls.ll b/test/CodeGen/AArch64/neon-mla-mls.ll
index 23e9223..71bb0e7 100644
--- a/test/CodeGen/AArch64/neon-mla-mls.ll
+++ b/test/CodeGen/AArch64/neon-mla-mls.ll
@@ -2,84 +2,84 @@
define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
-;CHECK: mla {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = mul <8 x i8> %A, %B;
%tmp2 = add <8 x i8> %C, %tmp1;
ret <8 x i8> %tmp2
}
define <16 x i8> @mla16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
-;CHECK: mla {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = mul <16 x i8> %A, %B;
%tmp2 = add <16 x i8> %C, %tmp1;
ret <16 x i8> %tmp2
}
define <4 x i16> @mla4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
-;CHECK: mla {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+;CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp1 = mul <4 x i16> %A, %B;
%tmp2 = add <4 x i16> %C, %tmp1;
ret <4 x i16> %tmp2
}
define <8 x i16> @mla8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
-;CHECK: mla {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+;CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp1 = mul <8 x i16> %A, %B;
%tmp2 = add <8 x i16> %C, %tmp1;
ret <8 x i16> %tmp2
}
define <2 x i32> @mla2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
-;CHECK: mla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp1 = mul <2 x i32> %A, %B;
%tmp2 = add <2 x i32> %C, %tmp1;
ret <2 x i32> %tmp2
}
define <4 x i32> @mla4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
-;CHECK: mla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp1 = mul <4 x i32> %A, %B;
%tmp2 = add <4 x i32> %C, %tmp1;
ret <4 x i32> %tmp2
}
define <8 x i8> @mls8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
-;CHECK: mls {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = mul <8 x i8> %A, %B;
%tmp2 = sub <8 x i8> %C, %tmp1;
ret <8 x i8> %tmp2
}
define <16 x i8> @mls16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
-;CHECK: mls {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = mul <16 x i8> %A, %B;
%tmp2 = sub <16 x i8> %C, %tmp1;
ret <16 x i8> %tmp2
}
define <4 x i16> @mls4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
-;CHECK: mls {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+;CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp1 = mul <4 x i16> %A, %B;
%tmp2 = sub <4 x i16> %C, %tmp1;
ret <4 x i16> %tmp2
}
define <8 x i16> @mls8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
-;CHECK: mls {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+;CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp1 = mul <8 x i16> %A, %B;
%tmp2 = sub <8 x i16> %C, %tmp1;
ret <8 x i16> %tmp2
}
define <2 x i32> @mls2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
-;CHECK: mls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp1 = mul <2 x i32> %A, %B;
%tmp2 = sub <2 x i32> %C, %tmp1;
ret <2 x i32> %tmp2
}
define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
-;CHECK: mls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp1 = mul <4 x i32> %A, %B;
%tmp2 = sub <4 x i32> %C, %tmp1;
ret <4 x i32> %tmp2
diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll
index 60b13b8..4035b91 100644
--- a/test/CodeGen/AArch64/neon-mov.ll
+++ b/test/CodeGen/AArch64/neon-mov.ll
@@ -1,204 +1,204 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @movi8b() {
-;CHECK: movi {{v[0-31]+}}.8b, #0x8
+;CHECK: movi {{v[0-9]+}}.8b, #0x8
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <16 x i8> @movi16b() {
-;CHECK: movi {{v[0-31]+}}.16b, #0x8
+;CHECK: movi {{v[0-9]+}}.16b, #0x8
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <2 x i32> @movi2s_lsl0() {
-;CHECK: movi {{v[0-31]+}}.2s, #0xff
+;CHECK: movi {{v[0-9]+}}.2s, #0xff
ret <2 x i32> < i32 255, i32 255 >
}
define <2 x i32> @movi2s_lsl8() {
-;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #8
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #8
ret <2 x i32> < i32 65280, i32 65280 >
}
define <2 x i32> @movi2s_lsl16() {
-;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #16
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #16
ret <2 x i32> < i32 16711680, i32 16711680 >
}
define <2 x i32> @movi2s_lsl24() {
-;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #24
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #24
ret <2 x i32> < i32 4278190080, i32 4278190080 >
}
define <4 x i32> @movi4s_lsl0() {
-;CHECK: movi {{v[0-31]+}}.4s, #0xff
+;CHECK: movi {{v[0-9]+}}.4s, #0xff
ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 >
}
define <4 x i32> @movi4s_lsl8() {
-;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #8
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #8
ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 >
}
define <4 x i32> @movi4s_lsl16() {
-;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #16
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #16
ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 >
}
define <4 x i32> @movi4s_lsl24() {
-;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #24
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #24
ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 >
}
define <4 x i16> @movi4h_lsl0() {
-;CHECK: movi {{v[0-31]+}}.4h, #0xff
+;CHECK: movi {{v[0-9]+}}.4h, #0xff
ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 >
}
define <4 x i16> @movi4h_lsl8() {
-;CHECK: movi {{v[0-31]+}}.4h, #0xff, lsl #8
+;CHECK: movi {{v[0-9]+}}.4h, #0xff, lsl #8
ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 >
}
define <8 x i16> @movi8h_lsl0() {
-;CHECK: movi {{v[0-31]+}}.8h, #0xff
+;CHECK: movi {{v[0-9]+}}.8h, #0xff
ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
}
define <8 x i16> @movi8h_lsl8() {
-;CHECK: movi {{v[0-31]+}}.8h, #0xff, lsl #8
+;CHECK: movi {{v[0-9]+}}.8h, #0xff, lsl #8
ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
}
define <2 x i32> @mvni2s_lsl0() {
-;CHECK: mvni {{v[0-31]+}}.2s, #0x10
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10
ret <2 x i32> < i32 4294967279, i32 4294967279 >
}
define <2 x i32> @mvni2s_lsl8() {
-;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #8
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #8
ret <2 x i32> < i32 4294963199, i32 4294963199 >
}
define <2 x i32> @mvni2s_lsl16() {
-;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #16
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #16
ret <2 x i32> < i32 4293918719, i32 4293918719 >
}
define <2 x i32> @mvni2s_lsl24() {
-;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #24
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #24
ret <2 x i32> < i32 4026531839, i32 4026531839 >
}
define <4 x i32> @mvni4s_lsl0() {
-;CHECK: mvni {{v[0-31]+}}.4s, #0x10
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10
ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
}
define <4 x i32> @mvni4s_lsl8() {
-;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #8
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #8
ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
}
define <4 x i32> @mvni4s_lsl16() {
-;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #16
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #16
ret <4 x i32> < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
}
define <4 x i32> @mvni4s_lsl24() {
-;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #24
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #24
ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 >
}
define <4 x i16> @mvni4h_lsl0() {
-;CHECK: mvni {{v[0-31]+}}.4h, #0x10
+;CHECK: mvni {{v[0-9]+}}.4h, #0x10
ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
}
define <4 x i16> @mvni4h_lsl8() {
-;CHECK: mvni {{v[0-31]+}}.4h, #0x10, lsl #8
+;CHECK: mvni {{v[0-9]+}}.4h, #0x10, lsl #8
ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
}
define <8 x i16> @mvni8h_lsl0() {
-;CHECK: mvni {{v[0-31]+}}.8h, #0x10
+;CHECK: mvni {{v[0-9]+}}.8h, #0x10
ret <8 x i16> < i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 >
}
define <8 x i16> @mvni8h_lsl8() {
-;CHECK: mvni {{v[0-31]+}}.8h, #0x10, lsl #8
+;CHECK: mvni {{v[0-9]+}}.8h, #0x10, lsl #8
ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 >
}
define <2 x i32> @movi2s_msl8(<2 x i32> %a) {
-;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #8
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #8
ret <2 x i32> < i32 65535, i32 65535 >
}
define <2 x i32> @movi2s_msl16() {
-;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #16
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #16
ret <2 x i32> < i32 16777215, i32 16777215 >
}
define <4 x i32> @movi4s_msl8() {
-;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #8
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #8
ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 >
}
define <4 x i32> @movi4s_msl16() {
-;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #16
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #16
ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 >
}
define <2 x i32> @mvni2s_msl8() {
-;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #8
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #8
ret <2 x i32> < i32 18446744073709547264, i32 18446744073709547264>
}
define <2 x i32> @mvni2s_msl16() {
-;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #16
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #16
ret <2 x i32> < i32 18446744073708437504, i32 18446744073708437504>
}
define <4 x i32> @mvni4s_msl8() {
-;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #8
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #8
ret <4 x i32> < i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264>
}
define <4 x i32> @mvni4s_msl16() {
-;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #16
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #16
ret <4 x i32> < i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504>
}
define <2 x i64> @movi2d() {
-;CHECK: movi {{v[0-31]+}}.2d, #0xff0000ff0000ffff
+;CHECK: movi {{v[0-9]+}}.2d, #0xff0000ff0000ffff
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}
define <1 x i64> @movid() {
-;CHECK: movi {{d[0-31]+}}, #0xff0000ff0000ffff
+;CHECK: movi {{d[0-9]+}}, #0xff0000ff0000ffff
ret <1 x i64> < i64 18374687574888349695 >
}
define <2 x float> @fmov2s() {
-;CHECK: fmov {{v[0-31]+}}.2s, #-12.00000000
+;CHECK: fmov {{v[0-9]+}}.2s, #-12.00000000
ret <2 x float> < float -1.2e1, float -1.2e1>
}
define <4 x float> @fmov4s() {
-;CHECK: fmov {{v[0-31]+}}.4s, #-12.00000000
+;CHECK: fmov {{v[0-9]+}}.4s, #-12.00000000
ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1>
}
define <2 x double> @fmov2d() {
-;CHECK: fmov {{v[0-31]+}}.2d, #-12.00000000
+;CHECK: fmov {{v[0-9]+}}.2d, #-12.00000000
ret <2 x double> < double -1.2e1, double -1.2e1>
}
@@ -210,7 +210,9 @@ define <2 x i32> @movi1d_1() {
declare <2 x i32> @test_movi1d(<2 x i32>, <2 x i32>)
define <2 x i32> @movi1d() {
-; CHECK: movi d1, #0xffffffff0000
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+; CHECK-NEXT: movi d1, #0xffffffff0000
%1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>)
ret <2 x i32> %1
}
diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll
index e1be313..da22ce8 100644
--- a/test/CodeGen/AArch64/neon-mul-div.ll
+++ b/test/CodeGen/AArch64/neon-mul-div.ll
@@ -2,76 +2,628 @@
define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: mul {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = mul <8 x i8> %A, %B;
ret <8 x i8> %tmp3
}
define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: mul {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+;CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = mul <16 x i8> %A, %B;
ret <16 x i8> %tmp3
}
define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: mul {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = mul <4 x i16> %A, %B;
ret <4 x i16> %tmp3
}
define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: mul {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+;CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = mul <8 x i16> %A, %B;
ret <8 x i16> %tmp3
}
define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: mul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = mul <2 x i32> %A, %B;
ret <2 x i32> %tmp3
}
define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: mul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = mul <4 x i32> %A, %B;
ret <4 x i32> %tmp3
}
+define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK-LABEL: mul1xi64:
+;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+ %tmp3 = mul <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK-LABEL: mul2xi64:
+;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+ %tmp3 = mul <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fmul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fmul <2 x float> %A, %B;
ret <2 x float> %tmp3
}
define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fmul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fmul <4 x float> %A, %B;
ret <4 x float> %tmp3
}
define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fmul {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fmul <2 x double> %A, %B;
ret <2 x double> %tmp3
}
define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fdiv {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+;CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fdiv <2 x float> %A, %B;
ret <2 x float> %tmp3
}
define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fdiv {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+;CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fdiv <4 x float> %A, %B;
ret <4 x float> %tmp3
}
define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fdiv {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+;CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fdiv <2 x double> %A, %B;
ret <2 x double> %tmp3
}
+define <1 x i8> @sdiv1x8(<1 x i8> %A, <1 x i8> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @sdiv1x16(<1 x i16> %A, <1 x i16> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @sdiv4x16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @sdiv1x32(<1 x i32> %A, <1 x i32> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @sdiv2x32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @sdiv1x64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = sdiv <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = sdiv <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @udiv1x8(<1 x i8> %A, <1 x i8> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @udiv1x16(<1 x i16> %A, <1 x i16> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @udiv4x16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @udiv1x32(<1 x i32> %A, <1 x i32> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @udiv2x32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @udiv1x64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = udiv <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = udiv <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @srem1x8(<1 x i8> %A, <1 x i8> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @srem1x16(<1 x i16> %A, <1 x i16> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @srem1x32(<1 x i32> %A, <1 x i32> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @srem1x64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = srem <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = srem <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @urem1x8(<1 x i8> %A, <1 x i8> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @urem1x16(<1 x i16> %A, <1 x i16> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @urem1x32(<1 x i32> %A, <1 x i32> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @urem1x64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = urem <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = urem <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) {
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+ %tmp3 = frem <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frem4f32(<4 x float> %A, <4 x float> %B) {
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+ %tmp3 = frem <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+
+define <1 x double> @frem1d64(<1 x double> %A, <1 x double> %B) {
+; CHECK: bl fmod
+ %tmp3 = frem <1 x double> %A, %B;
+ ret <1 x double> %tmp3
+}
+
+define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) {
+; CHECK: bl fmod
+; CHECK: bl fmod
+ %tmp3 = frem <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>)
@@ -179,3 +731,24 @@ define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
%val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
ret <2 x double> %val
}
+
+define <1 x i8> @test_mul_v1i8(<1 x i8> %a, <1 x i8> %b) {
+;CHECK-LABEL: test_mul_v1i8:
+;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %c = mul <1 x i8> %a, %b
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @test_mul_v1i16(<1 x i16> %a, <1 x i16> %b) {
+;CHECK-LABEL: test_mul_v1i16:
+;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %c = mul <1 x i16> %a, %b
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @test_mul_v1i32(<1 x i32> %a, <1 x i32> %b) {
+;CHECK-LABEL: test_mul_v1i32:
+;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %c = mul <1 x i32> %a, %b
+ ret <1 x i32> %c
+}
diff --git a/test/CodeGen/AArch64/neon-or-combine.ll b/test/CodeGen/AArch64/neon-or-combine.ll
new file mode 100644
index 0000000..260f693
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-or-combine.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; Check that the DAGCombiner does not crash with an assertion failure
+; when performing a target-specific combine to simplify an 'or' DAG node
+; according to the following rule:
+; (or (and B, A), (and C, ~A)) => (VBSL A, B, C)
+; The assertion failure was caused by an invalid comparison between APInt
+; values with different 'BitWidth'.
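+;
+; Illustrative note (not from the original test): with a lane mask such as
+; A = <i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0>, the term
+; (and B, A) keeps the lanes of B where A is all-ones, (and C, ~A) keeps the
+; lanes of C where A is zero, and their 'or' is therefore a lane-wise select
+; of B and C, which is what a single bitwise-select (BSL) instruction
+; computes. The tests below only check that code generation completes
+; without hitting the assertion.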
+
+define <8 x i8> @test1(<8 x i8> %a, <8 x i8> %b) {
+ %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 >
+ %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+ %tmp3 = or <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+; CHECK-LABEL: test1
+; CHECK: ret
+
+define <16 x i8> @test2(<16 x i8> %a, <16 x i8> %b) {
+ %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp2 = and <16 x i8> %b, < i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+ %tmp3 = or <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+; CHECK-LABEL: test2
+; CHECK: ret
+
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
index fa4d54d..a0b17e1 100644
--- a/test/CodeGen/AArch64/neon-perm.ll
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -1030,6 +1030,1447 @@ entry:
ret <8 x i16> %shuffle.i
}
+define <8 x i8> @test_same_vuzp1_s8(<8 x i8> %a) {
+; CHECK: test_same_vuzp1_s8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vuzp1q_s8(<16 x i8> %a) {
+; CHECK: test_same_vuzp1q_s8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vuzp1_s16(<4 x i16> %a) {
+; CHECK: test_same_vuzp1_s16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vuzp1q_s16(<8 x i16> %a) {
+; CHECK: test_same_vuzp1q_s16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vuzp1q_s32(<4 x i32> %a) {
+; CHECK: test_same_vuzp1q_s32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_same_vuzp1_u8(<8 x i8> %a) {
+; CHECK: test_same_vuzp1_u8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vuzp1q_u8(<16 x i8> %a) {
+; CHECK: test_same_vuzp1q_u8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vuzp1_u16(<4 x i16> %a) {
+; CHECK: test_same_vuzp1_u16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vuzp1q_u16(<8 x i16> %a) {
+; CHECK: test_same_vuzp1q_u16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vuzp1q_u32(<4 x i32> %a) {
+; CHECK: test_same_vuzp1q_u32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_same_vuzp1q_f32(<4 x float> %a) {
+; CHECK: test_same_vuzp1q_f32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_same_vuzp1_p8(<8 x i8> %a) {
+; CHECK: test_same_vuzp1_p8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vuzp1q_p8(<16 x i8> %a) {
+; CHECK: test_same_vuzp1q_p8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vuzp1_p16(<4 x i16> %a) {
+; CHECK: test_same_vuzp1_p16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vuzp1q_p16(<8 x i16> %a) {
+; CHECK: test_same_vuzp1q_p16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_same_vuzp2_s8(<8 x i8> %a) {
+; CHECK: test_same_vuzp2_s8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vuzp2q_s8(<16 x i8> %a) {
+; CHECK: test_same_vuzp2q_s8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vuzp2_s16(<4 x i16> %a) {
+; CHECK: test_same_vuzp2_s16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vuzp2q_s16(<8 x i16> %a) {
+; CHECK: test_same_vuzp2q_s16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vuzp2q_s32(<4 x i32> %a) {
+; CHECK: test_same_vuzp2q_s32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_same_vuzp2_u8(<8 x i8> %a) {
+; CHECK: test_same_vuzp2_u8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vuzp2q_u8(<16 x i8> %a) {
+; CHECK: test_same_vuzp2q_u8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vuzp2_u16(<4 x i16> %a) {
+; CHECK: test_same_vuzp2_u16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vuzp2q_u16(<8 x i16> %a) {
+; CHECK: test_same_vuzp2q_u16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vuzp2q_u32(<4 x i32> %a) {
+; CHECK: test_same_vuzp2q_u32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_same_vuzp2q_f32(<4 x float> %a) {
+; CHECK: test_same_vuzp2q_f32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_same_vuzp2_p8(<8 x i8> %a) {
+; CHECK: test_same_vuzp2_p8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vuzp2q_p8(<16 x i8> %a) {
+; CHECK: test_same_vuzp2q_p8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vuzp2_p16(<4 x i16> %a) {
+; CHECK: test_same_vuzp2_p16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vuzp2q_p16(<8 x i16> %a) {
+; CHECK: test_same_vuzp2q_p16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_same_vzip1_s8(<8 x i8> %a) {
+; CHECK: test_same_vzip1_s8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vzip1q_s8(<16 x i8> %a) {
+; CHECK: test_same_vzip1q_s8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vzip1_s16(<4 x i16> %a) {
+; CHECK: test_same_vzip1_s16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vzip1q_s16(<8 x i16> %a) {
+; CHECK: test_same_vzip1q_s16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vzip1q_s32(<4 x i32> %a) {
+; CHECK: test_same_vzip1q_s32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_same_vzip1_u8(<8 x i8> %a) {
+; CHECK: test_same_vzip1_u8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vzip1q_u8(<16 x i8> %a) {
+; CHECK: test_same_vzip1q_u8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vzip1_u16(<4 x i16> %a) {
+; CHECK: test_same_vzip1_u16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vzip1q_u16(<8 x i16> %a) {
+; CHECK: test_same_vzip1q_u16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vzip1q_u32(<4 x i32> %a) {
+; CHECK: test_same_vzip1q_u32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_same_vzip1q_f32(<4 x float> %a) {
+; CHECK: test_same_vzip1q_f32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_same_vzip1_p8(<8 x i8> %a) {
+; CHECK: test_same_vzip1_p8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vzip1q_p8(<16 x i8> %a) {
+; CHECK: test_same_vzip1q_p8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vzip1_p16(<4 x i16> %a) {
+; CHECK: test_same_vzip1_p16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vzip1q_p16(<8 x i16> %a) {
+; CHECK: test_same_vzip1q_p16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) {
+; CHECK: test_same_vzip2_s8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vzip2q_s8(<16 x i8> %a) {
+; CHECK: test_same_vzip2q_s8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vzip2_s16(<4 x i16> %a) {
+; CHECK: test_same_vzip2_s16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vzip2q_s16(<8 x i16> %a) {
+; CHECK: test_same_vzip2q_s16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vzip2q_s32(<4 x i32> %a) {
+; CHECK: test_same_vzip2q_s32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_same_vzip2_u8(<8 x i8> %a) {
+; CHECK: test_same_vzip2_u8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vzip2q_u8(<16 x i8> %a) {
+; CHECK: test_same_vzip2q_u8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vzip2_u16(<4 x i16> %a) {
+; CHECK: test_same_vzip2_u16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vzip2q_u16(<8 x i16> %a) {
+; CHECK: test_same_vzip2q_u16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vzip2q_u32(<4 x i32> %a) {
+; CHECK: test_same_vzip2q_u32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_same_vzip2q_f32(<4 x float> %a) {
+; CHECK: test_same_vzip2q_f32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_same_vzip2_p8(<8 x i8> %a) {
+; CHECK: test_same_vzip2_p8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vzip2q_p8(<16 x i8> %a) {
+; CHECK: test_same_vzip2q_p8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vzip2_p16(<4 x i16> %a) {
+; CHECK: test_same_vzip2_p16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vzip2q_p16(<8 x i16> %a) {
+; CHECK: test_same_vzip2q_p16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_same_vtrn1_s8(<8 x i8> %a) {
+; CHECK: test_same_vtrn1_s8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vtrn1q_s8(<16 x i8> %a) {
+; CHECK: test_same_vtrn1q_s8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vtrn1_s16(<4 x i16> %a) {
+; CHECK: test_same_vtrn1_s16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vtrn1q_s16(<8 x i16> %a) {
+; CHECK: test_same_vtrn1q_s16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vtrn1q_s32(<4 x i32> %a) {
+; CHECK: test_same_vtrn1q_s32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_same_vtrn1_u8(<8 x i8> %a) {
+; CHECK: test_same_vtrn1_u8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vtrn1q_u8(<16 x i8> %a) {
+; CHECK: test_same_vtrn1q_u8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vtrn1_u16(<4 x i16> %a) {
+; CHECK: test_same_vtrn1_u16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vtrn1q_u16(<8 x i16> %a) {
+; CHECK: test_same_vtrn1q_u16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vtrn1q_u32(<4 x i32> %a) {
+; CHECK: test_same_vtrn1q_u32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_same_vtrn1q_f32(<4 x float> %a) {
+; CHECK: test_same_vtrn1q_f32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_same_vtrn1_p8(<8 x i8> %a) {
+; CHECK: test_same_vtrn1_p8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vtrn1q_p8(<16 x i8> %a) {
+; CHECK: test_same_vtrn1q_p8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vtrn1_p16(<4 x i16> %a) {
+; CHECK: test_same_vtrn1_p16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vtrn1q_p16(<8 x i16> %a) {
+; CHECK: test_same_vtrn1q_p16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_same_vtrn2_s8(<8 x i8> %a) {
+; CHECK: test_same_vtrn2_s8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vtrn2q_s8(<16 x i8> %a) {
+; CHECK: test_same_vtrn2q_s8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vtrn2_s16(<4 x i16> %a) {
+; CHECK: test_same_vtrn2_s16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vtrn2q_s16(<8 x i16> %a) {
+; CHECK: test_same_vtrn2q_s16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vtrn2q_s32(<4 x i32> %a) {
+; CHECK: test_same_vtrn2q_s32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_same_vtrn2_u8(<8 x i8> %a) {
+; CHECK: test_same_vtrn2_u8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vtrn2q_u8(<16 x i8> %a) {
+; CHECK: test_same_vtrn2q_u8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vtrn2_u16(<4 x i16> %a) {
+; CHECK: test_same_vtrn2_u16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vtrn2q_u16(<8 x i16> %a) {
+; CHECK: test_same_vtrn2q_u16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_same_vtrn2q_u32(<4 x i32> %a) {
+; CHECK: test_same_vtrn2q_u32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_same_vtrn2q_f32(<4 x float> %a) {
+; CHECK: test_same_vtrn2q_f32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_same_vtrn2_p8(<8 x i8> %a) {
+; CHECK: test_same_vtrn2_p8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_same_vtrn2q_p8(<16 x i8> %a) {
+; CHECK: test_same_vtrn2q_p8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_same_vtrn2_p16(<4 x i16> %a) {
+; CHECK: test_same_vtrn2_p16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_same_vtrn2q_p16(<8 x i16> %a) {
+; CHECK: test_same_vtrn2q_p16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+
+define <8 x i8> @test_undef_vuzp1_s8(<8 x i8> %a) {
+; CHECK: test_undef_vuzp1_s8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vuzp1q_s8(<16 x i8> %a) {
+; CHECK: test_undef_vuzp1q_s8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vuzp1_s16(<4 x i16> %a) {
+; CHECK: test_undef_vuzp1_s16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vuzp1q_s16(<8 x i16> %a) {
+; CHECK: test_undef_vuzp1q_s16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vuzp1q_s32(<4 x i32> %a) {
+; CHECK: test_undef_vuzp1q_s32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vuzp1_u8(<8 x i8> %a) {
+; CHECK: test_undef_vuzp1_u8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vuzp1q_u8(<16 x i8> %a) {
+; CHECK: test_undef_vuzp1q_u8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vuzp1_u16(<4 x i16> %a) {
+; CHECK: test_undef_vuzp1_u16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vuzp1q_u16(<8 x i16> %a) {
+; CHECK: test_undef_vuzp1q_u16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vuzp1q_u32(<4 x i32> %a) {
+; CHECK: test_undef_vuzp1q_u32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_undef_vuzp1q_f32(<4 x float> %a) {
+; CHECK: test_undef_vuzp1q_f32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vuzp1_p8(<8 x i8> %a) {
+; CHECK: test_undef_vuzp1_p8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vuzp1q_p8(<16 x i8> %a) {
+; CHECK: test_undef_vuzp1q_p8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vuzp1_p16(<4 x i16> %a) {
+; CHECK: test_undef_vuzp1_p16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vuzp1q_p16(<8 x i16> %a) {
+; CHECK: test_undef_vuzp1q_p16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vuzp2_s8(<8 x i8> %a) {
+; CHECK: test_undef_vuzp2_s8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vuzp2q_s8(<16 x i8> %a) {
+; CHECK: test_undef_vuzp2q_s8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vuzp2_s16(<4 x i16> %a) {
+; CHECK: test_undef_vuzp2_s16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vuzp2q_s16(<8 x i16> %a) {
+; CHECK: test_undef_vuzp2q_s16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vuzp2q_s32(<4 x i32> %a) {
+; CHECK: test_undef_vuzp2q_s32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vuzp2_u8(<8 x i8> %a) {
+; CHECK: test_undef_vuzp2_u8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vuzp2q_u8(<16 x i8> %a) {
+; CHECK: test_undef_vuzp2q_u8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vuzp2_u16(<4 x i16> %a) {
+; CHECK: test_undef_vuzp2_u16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vuzp2q_u16(<8 x i16> %a) {
+; CHECK: test_undef_vuzp2q_u16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vuzp2q_u32(<4 x i32> %a) {
+; CHECK: test_undef_vuzp2q_u32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_undef_vuzp2q_f32(<4 x float> %a) {
+; CHECK: test_undef_vuzp2q_f32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vuzp2_p8(<8 x i8> %a) {
+; CHECK: test_undef_vuzp2_p8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vuzp2q_p8(<16 x i8> %a) {
+; CHECK: test_undef_vuzp2q_p8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vuzp2_p16(<4 x i16> %a) {
+; CHECK: test_undef_vuzp2_p16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vuzp2q_p16(<8 x i16> %a) {
+; CHECK: test_undef_vuzp2q_p16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vzip1_s8(<8 x i8> %a) {
+; CHECK: test_undef_vzip1_s8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vzip1q_s8(<16 x i8> %a) {
+; CHECK: test_undef_vzip1q_s8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vzip1_s16(<4 x i16> %a) {
+; CHECK: test_undef_vzip1_s16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vzip1q_s16(<8 x i16> %a) {
+; CHECK: test_undef_vzip1q_s16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vzip1q_s32(<4 x i32> %a) {
+; CHECK: test_undef_vzip1q_s32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vzip1_u8(<8 x i8> %a) {
+; CHECK: test_undef_vzip1_u8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vzip1q_u8(<16 x i8> %a) {
+; CHECK: test_undef_vzip1q_u8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vzip1_u16(<4 x i16> %a) {
+; CHECK: test_undef_vzip1_u16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vzip1q_u16(<8 x i16> %a) {
+; CHECK: test_undef_vzip1q_u16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vzip1q_u32(<4 x i32> %a) {
+; CHECK: test_undef_vzip1q_u32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_undef_vzip1q_f32(<4 x float> %a) {
+; CHECK: test_undef_vzip1q_f32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vzip1_p8(<8 x i8> %a) {
+; CHECK: test_undef_vzip1_p8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vzip1q_p8(<16 x i8> %a) {
+; CHECK: test_undef_vzip1q_p8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vzip1_p16(<4 x i16> %a) {
+; CHECK: test_undef_vzip1_p16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vzip1q_p16(<8 x i16> %a) {
+; CHECK: test_undef_vzip1q_p16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vzip2_s8(<8 x i8> %a) {
+; CHECK: test_undef_vzip2_s8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vzip2q_s8(<16 x i8> %a) {
+; CHECK: test_undef_vzip2q_s8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vzip2_s16(<4 x i16> %a) {
+; CHECK: test_undef_vzip2_s16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vzip2q_s16(<8 x i16> %a) {
+; CHECK: test_undef_vzip2q_s16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vzip2q_s32(<4 x i32> %a) {
+; CHECK: test_undef_vzip2q_s32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vzip2_u8(<8 x i8> %a) {
+; CHECK: test_undef_vzip2_u8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vzip2q_u8(<16 x i8> %a) {
+; CHECK: test_undef_vzip2q_u8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vzip2_u16(<4 x i16> %a) {
+; CHECK: test_undef_vzip2_u16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vzip2q_u16(<8 x i16> %a) {
+; CHECK: test_undef_vzip2q_u16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vzip2q_u32(<4 x i32> %a) {
+; CHECK: test_undef_vzip2q_u32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_undef_vzip2q_f32(<4 x float> %a) {
+; CHECK: test_undef_vzip2q_f32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vzip2_p8(<8 x i8> %a) {
+; CHECK: test_undef_vzip2_p8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vzip2q_p8(<16 x i8> %a) {
+; CHECK: test_undef_vzip2q_p8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vzip2_p16(<4 x i16> %a) {
+; CHECK: test_undef_vzip2_p16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vzip2q_p16(<8 x i16> %a) {
+; CHECK: test_undef_vzip2q_p16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vtrn1_s8(<8 x i8> %a) {
+; CHECK: test_undef_vtrn1_s8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vtrn1q_s8(<16 x i8> %a) {
+; CHECK: test_undef_vtrn1q_s8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vtrn1_s16(<4 x i16> %a) {
+; CHECK: test_undef_vtrn1_s16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vtrn1q_s16(<8 x i16> %a) {
+; CHECK: test_undef_vtrn1q_s16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vtrn1q_s32(<4 x i32> %a) {
+; CHECK: test_undef_vtrn1q_s32:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vtrn1_u8(<8 x i8> %a) {
+; CHECK: test_undef_vtrn1_u8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vtrn1q_u8(<16 x i8> %a) {
+; CHECK: test_undef_vtrn1q_u8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vtrn1_u16(<4 x i16> %a) {
+; CHECK: test_undef_vtrn1_u16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vtrn1q_u16(<8 x i16> %a) {
+; CHECK: test_undef_vtrn1q_u16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vtrn1q_u32(<4 x i32> %a) {
+; CHECK: test_undef_vtrn1q_u32:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_undef_vtrn1q_f32(<4 x float> %a) {
+; CHECK: test_undef_vtrn1q_f32:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vtrn1_p8(<8 x i8> %a) {
+; CHECK: test_undef_vtrn1_p8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vtrn1q_p8(<16 x i8> %a) {
+; CHECK: test_undef_vtrn1q_p8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vtrn1_p16(<4 x i16> %a) {
+; CHECK: test_undef_vtrn1_p16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vtrn1q_p16(<8 x i16> %a) {
+; CHECK: test_undef_vtrn1q_p16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vtrn2_s8(<8 x i8> %a) {
+; CHECK: test_undef_vtrn2_s8:
+; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vtrn2q_s8(<16 x i8> %a) {
+; CHECK: test_undef_vtrn2q_s8:
+; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vtrn2_s16(<4 x i16> %a) {
+; CHECK: test_undef_vtrn2_s16:
+; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vtrn2q_s16(<8 x i16> %a) {
+; CHECK: test_undef_vtrn2q_s16:
+; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vtrn2q_s32(<4 x i32> %a) {
+; CHECK: test_undef_vtrn2q_s32:
+; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vtrn2_u8(<8 x i8> %a) {
+; CHECK: test_undef_vtrn2_u8:
+; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vtrn2q_u8(<16 x i8> %a) {
+; CHECK: test_undef_vtrn2q_u8:
+; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vtrn2_u16(<4 x i16> %a) {
+; CHECK: test_undef_vtrn2_u16:
+; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vtrn2q_u16(<8 x i16> %a) {
+; CHECK: test_undef_vtrn2q_u16:
+; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_undef_vtrn2q_u32(<4 x i32> %a) {
+; CHECK: test_undef_vtrn2q_u32:
+; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_undef_vtrn2q_f32(<4 x float> %a) {
+; CHECK: test_undef_vtrn2q_f32:
+; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <8 x i8> @test_undef_vtrn2_p8(<8 x i8> %a) {
+; CHECK: test_undef_vtrn2_p8:
+; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_undef_vtrn2q_p8(<16 x i8> %a) {
+; CHECK: test_undef_vtrn2q_p8:
+; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_undef_vtrn2_p16(<4 x i16> %a) {
+; CHECK: test_undef_vtrn2_p16:
+; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_undef_vtrn2q_p16(<8 x i16> %a) {
+; CHECK: test_undef_vtrn2q_p16:
+; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK: test_vuzp_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
diff --git a/test/CodeGen/AArch64/neon-scalar-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-add-sub.ll
index 09ca880..4f322e0 100644
--- a/test/CodeGen/AArch64/neon-scalar-add-sub.ll
+++ b/test/CodeGen/AArch64/neon-scalar-add-sub.ll
@@ -1,13 +1,13 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
%tmp3 = add <1 x i64> %A, %B;
ret <1 x i64> %tmp3
}
define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
%tmp3 = sub <1 x i64> %A, %B;
ret <1 x i64> %tmp3
}
@@ -18,14 +18,14 @@ declare <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64>, <1 x i64>)
define <1 x i64> @test_add_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_add_v1i64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+; CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_uadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_uadd_v1i64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -35,14 +35,14 @@ declare <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64>, <1 x i64>)
define <1 x i64> @test_sub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sub_v1i64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+; CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_usub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_usub_v1i64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index 8ce42de..247514c 100644
--- a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -5,7 +5,7 @@ declare double @llvm.fma.f64(double, double, double)
define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
; CHECK: test_fmla_ss4S
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+ ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
ret float %tmp2
@@ -13,7 +13,7 @@ define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
; CHECK: test_fmla_ss4S_swap
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+ ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a)
ret float %tmp2
@@ -21,7 +21,7 @@ define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
; CHECK: test_fmla_ss2S
- ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1]
+ ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
ret float %tmp2
@@ -29,7 +29,7 @@ define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) {
; CHECK: test_fmla_ddD
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0]
+ ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
ret double %tmp2
@@ -37,7 +37,7 @@ define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) {
define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
; CHECK: test_fmla_dd2D
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+ ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
ret double %tmp2
@@ -45,7 +45,7 @@ define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
; CHECK: test_fmla_dd2D_swap
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+ ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
ret double %tmp2
@@ -53,7 +53,7 @@ define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
; CHECK: test_fmls_ss4S
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+ ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fsub float -0.0, %tmp1
%tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
@@ -62,7 +62,7 @@ define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
; CHECK: test_fmls_ss4S_swap
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+ ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fsub float -0.0, %tmp1
%tmp3 = call float @llvm.fma.f32(float %tmp1, float %tmp2, float %a)
@@ -72,7 +72,7 @@ define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
; CHECK: test_fmls_ss2S
- ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1]
+ ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fsub float -0.0, %tmp1
%tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
@@ -81,7 +81,7 @@ define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
; CHECK: test_fmls_ddD
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0]
+ ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = fsub double -0.0, %tmp1
%tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
@@ -90,7 +90,7 @@ define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
; CHECK: test_fmls_dd2D
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+ ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fsub double -0.0, %tmp1
%tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
@@ -99,7 +99,7 @@ define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) {
; CHECK: test_fmls_dd2D_swap
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+ ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fsub double -0.0, %tmp1
%tmp3 = call double @llvm.fma.f64(double %tmp1, double %tmp2, double %a)
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
index 968ad3e..c9128e7 100644
--- a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
@@ -2,7 +2,7 @@
define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
; CHECK: test_fmul_lane_ss2S
- ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fmul float %a, %tmp1;
ret float %tmp2;
@@ -10,7 +10,7 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
; CHECK: test_fmul_lane_ss2S_swap
- ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fmul float %tmp1, %a;
ret float %tmp2;
@@ -19,7 +19,7 @@ define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
; CHECK: test_fmul_lane_ss4S
- ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fmul float %a, %tmp1;
ret float %tmp2;
@@ -27,7 +27,7 @@ define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
; CHECK: test_fmul_lane_ss4S_swap
- ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fmul float %tmp1, %a;
ret float %tmp2;
@@ -36,7 +36,7 @@ define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
; CHECK: test_fmul_lane_ddD
- ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = fmul double %a, %tmp1;
ret double %tmp2;
@@ -46,7 +46,7 @@ define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
; CHECK: test_fmul_lane_dd2D
- ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fmul double %a, %tmp1;
ret double %tmp2;
@@ -55,7 +55,7 @@ define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
; CHECK: test_fmul_lane_dd2D_swap
- ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fmul double %tmp1, %a;
ret double %tmp2;
@@ -65,7 +65,7 @@ declare float @llvm.aarch64.neon.vmulx.f32(float, float)
define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
; CHECK: test_fmulx_lane_f32
- ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
ret float %tmp2;
@@ -73,7 +73,7 @@ define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
; CHECK: test_fmulx_laneq_f32
- ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
ret float %tmp2;
@@ -81,7 +81,7 @@ define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
; CHECK: test_fmulx_laneq_f32_swap
- ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a)
ret float %tmp2;
@@ -91,7 +91,7 @@ declare double @llvm.aarch64.neon.vmulx.f64(double, double)
define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
; CHECK: test_fmulx_lane_f64
- ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
ret double %tmp2;
@@ -99,7 +99,7 @@ define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
; CHECK: test_fmulx_laneq_f64_0
- ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
%tmp1 = extractelement <2 x double> %v, i32 0
%tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
ret double %tmp2;
@@ -108,7 +108,7 @@ define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) {
; CHECK: test_fmulx_laneq_f64_1
- ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
ret double %tmp2;
@@ -116,7 +116,7 @@ define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) {
define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
; CHECK: test_fmulx_laneq_f64_1_swap
- ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a)
ret double %tmp2;
diff --git a/test/CodeGen/AArch64/neon-scalar-compare.ll b/test/CodeGen/AArch64/neon-scalar-compare.ll
index 5f10cbb..e1f3964 100644
--- a/test/CodeGen/AArch64/neon-scalar-compare.ll
+++ b/test/CodeGen/AArch64/neon-scalar-compare.ll
@@ -122,28 +122,28 @@ entry:
define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 {
; CHECK: test_vcage_f64
; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcage2.i = tail call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2
+ %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2
ret <1 x i64> %vcage2.i
}
define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 {
; CHECK: test_vcagt_f64
; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2
+ %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2
ret <1 x i64> %vcagt2.i
}
define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 {
; CHECK: test_vcale_f64
; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcage2.i = tail call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2
+ %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2
ret <1 x i64> %vcage2.i
}
define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 {
; CHECK: test_vcalt_f64
; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2
+ %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2
ret <1 x i64> %vcagt2.i
}
@@ -271,7 +271,7 @@ define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 {
; CHECK: test_vceqz_s64
; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
%1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+ %vceqz.i = sext <1 x i1> %1 to <1 x i64>
ret <1 x i64> %vceqz.i
}
@@ -279,7 +279,7 @@ define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 {
; CHECK: test_vceqz_u64
; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
%1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+ %vceqz.i = sext <1 x i1> %1 to <1 x i64>
ret <1 x i64> %vceqz.i
}
@@ -287,7 +287,7 @@ define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 {
; CHECK: test_vceqz_p64
; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
%1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+ %vceqz.i = sext <1 x i1> %1 to <1 x i64>
ret <1 x i64> %vceqz.i
}
@@ -295,7 +295,7 @@ define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 {
; CHECK: test_vceqzq_p64
; CHECK: cmeq {{v[0-9]}}.2d, {{v[0-9]}}.2d, #0
%1 = icmp eq <2 x i64> %a, zeroinitializer
- %vceqz.i = zext <2 x i1> %1 to <2 x i64>
+ %vceqz.i = sext <2 x i1> %1 to <2 x i64>
ret <2 x i64> %vceqz.i
}
@@ -303,7 +303,7 @@ define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 {
; CHECK: test_vcgez_s64
; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
%1 = icmp sge <1 x i64> %a, zeroinitializer
- %vcgez.i = zext <1 x i1> %1 to <1 x i64>
+ %vcgez.i = sext <1 x i1> %1 to <1 x i64>
ret <1 x i64> %vcgez.i
}
@@ -311,7 +311,7 @@ define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 {
; CHECK: test_vclez_s64
; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
%1 = icmp sle <1 x i64> %a, zeroinitializer
- %vclez.i = zext <1 x i1> %1 to <1 x i64>
+ %vclez.i = sext <1 x i1> %1 to <1 x i64>
ret <1 x i64> %vclez.i
}
@@ -319,7 +319,7 @@ define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 {
; CHECK: test_vcgtz_s64
; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
%1 = icmp sgt <1 x i64> %a, zeroinitializer
- %vcgtz.i = zext <1 x i1> %1 to <1 x i64>
+ %vcgtz.i = sext <1 x i1> %1 to <1 x i64>
ret <1 x i64> %vcgtz.i
}
@@ -327,12 +327,12 @@ define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 {
; CHECK: test_vcltz_s64
; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0
%1 = icmp slt <1 x i64> %a, zeroinitializer
- %vcltz.i = zext <1 x i1> %1 to <1 x i64>
+ %vcltz.i = sext <1 x i1> %1 to <1 x i64>
ret <1 x i64> %vcltz.i
}
-declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double>, <1 x double>)
declare <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.aarch64.neon.vchs.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
index d433ff5..fadd734 100644
--- a/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -2,21 +2,30 @@
define float @test_dup_sv2S(<2 x float> %v) {
;CHECK: test_dup_sv2S
- ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
ret float %tmp1
}
+define float @test_dup_sv2S_0(<2 x float> %v) {
+ ;CHECK-LABEL: test_dup_sv2S_0
+ ;CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[0]
+ ;CHECK: ret
+ %tmp1 = extractelement <2 x float> %v, i32 0
+ ret float %tmp1
+}
+
define float @test_dup_sv4S(<4 x float> %v) {
- ;CHECK: test_dup_sv4S
- ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[0]
+ ;CHECK-LABEL: test_dup_sv4S
+ ;CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[0]
+ ;CHECK: ret
%tmp1 = extractelement <4 x float> %v, i32 0
ret float %tmp1
}
define double @test_dup_dvD(<1 x double> %v) {
;CHECK: test_dup_dvD
- ;CHECK-NOT: dup {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ ;CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
;CHECK: ret
%tmp1 = extractelement <1 x double> %v, i32 0
ret double %tmp1
@@ -24,63 +33,71 @@ define double @test_dup_dvD(<1 x double> %v) {
define double @test_dup_dv2D(<2 x double> %v) {
;CHECK: test_dup_dv2D
- ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ ret double %tmp1
+}
+
+define double @test_dup_dv2D_0(<2 x double> %v) {
+ ;CHECK-LABEL: test_dup_dv2D_0
+ ;CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ;CHECK: ret
+ %tmp1 = extractelement <2 x double> %v, i32 0
ret double %tmp1
}
define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) {
;CHECK: test_vector_dup_bv16B
- ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[14]
+ ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[14]
%shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> <i32 14>
ret <1 x i8> %shuffle.i
}
define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) {
;CHECK: test_vector_dup_bv8B
- ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[7]
+ ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[7]
%shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> <i32 7>
ret <1 x i8> %shuffle.i
}
define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) {
;CHECK: test_vector_dup_hv8H
- ;CHECK: dup {{h[0-31]+}}, {{v[0-31]+}}.h[7]
+ ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[7]
%shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> <i32 7>
ret <1 x i16> %shuffle.i
}
define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) {
;CHECK: test_vector_dup_hv4H
- ;CHECK: dup {{h[0-31]+}}, {{v[0-31]+}}.h[3]
+ ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[3]
%shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> <i32 3>
ret <1 x i16> %shuffle.i
}
define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) {
;CHECK: test_vector_dup_sv4S
- ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> <i32 3>
ret <1 x i32> %shuffle
}
define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) {
;CHECK: test_vector_dup_sv2S
- ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> <i32 1>
ret <1 x i32> %shuffle
}
define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) {
;CHECK: test_vector_dup_dv2D
- ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
}
define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) {
;CHECK: test_vector_copy_dup_dv2D
- ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%vget_lane = extractelement <2 x i64> %c, i32 1
%vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0
ret <1 x i64> %vset_lane
diff --git a/test/CodeGen/AArch64/neon-scalar-cvt.ll b/test/CodeGen/AArch64/neon-scalar-cvt.ll
index a06d5d6..3a19bed 100644
--- a/test/CodeGen/AArch64/neon-scalar-cvt.ll
+++ b/test/CodeGen/AArch64/neon-scalar-cvt.ll
@@ -5,133 +5,129 @@ define float @test_vcvts_f32_s32(i32 %a) {
; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}
entry:
%vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32> %vcvtf.i)
+ %0 = call float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32> %vcvtf.i)
ret float %0
}
-declare float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32>)
+declare float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32>)
define double @test_vcvtd_f64_s64(i64 %a) {
; CHECK: test_vcvtd_f64_s64
; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
entry:
%vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64> %vcvtf.i)
+ %0 = call double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64> %vcvtf.i)
ret double %0
}
-declare double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64>)
+declare double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64>)
define float @test_vcvts_f32_u32(i32 %a) {
; CHECK: test_vcvts_f32_u32
; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}
entry:
%vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32> %vcvtf.i)
+ %0 = call float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32> %vcvtf.i)
ret float %0
}
-declare float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32>)
+declare float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32>)
define double @test_vcvtd_f64_u64(i64 %a) {
; CHECK: test_vcvtd_f64_u64
; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}
entry:
%vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64> %vcvtf.i)
+ %0 = call double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64> %vcvtf.i)
ret double %0
}
-declare double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64>)
+declare double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64>)
define float @test_vcvts_n_f32_s32(i32 %a) {
; CHECK: test_vcvts_n_f32_s32
; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
entry:
%vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32> %vcvtf, i32 1)
+ %0 = call float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1)
ret float %0
}
-declare float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32>, i32)
+declare float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32>, i32)
define double @test_vcvtd_n_f64_s64(i64 %a) {
; CHECK: test_vcvtd_n_f64_s64
; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
entry:
%vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64> %vcvtf, i32 1)
+ %0 = call double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1)
ret double %0
}
-declare double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64>, i32)
+declare double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64>, i32)
define float @test_vcvts_n_f32_u32(i32 %a) {
; CHECK: test_vcvts_n_f32_u32
; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
entry:
%vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32> %vcvtf, i32 1)
+ %0 = call float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1)
ret float %0
}
-declare float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32>, i32)
+declare float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32>, i32)
define double @test_vcvtd_n_f64_u64(i64 %a) {
; CHECK: test_vcvtd_n_f64_u64
; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
entry:
%vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64> %vcvtf, i32 1)
+ %0 = call double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1)
ret double %0
}
-declare double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64>, i32)
+declare double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64>, i32)
define i32 @test_vcvts_n_s32_f32(float %a) {
; CHECK: test_vcvts_n_s32_f32
; CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1
entry:
- %fcvtzs = insertelement <1 x float> undef, float %a, i32 0
- %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float> %fcvtzs, i32 1)
+ %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float %a, i32 1)
%0 = extractelement <1 x i32> %fcvtzs1, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float>, i32)
+declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float, i32)
define i64 @test_vcvtd_n_s64_f64(double %a) {
; CHECK: test_vcvtd_n_s64_f64
; CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1
entry:
- %fcvtzs = insertelement <1 x double> undef, double %a, i32 0
- %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double> %fcvtzs, i32 1)
+ %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double %a, i32 1)
%0 = extractelement <1 x i64> %fcvtzs1, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double>, i32)
+declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double, i32)
define i32 @test_vcvts_n_u32_f32(float %a) {
; CHECK: test_vcvts_n_u32_f32
; CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32
entry:
- %fcvtzu = insertelement <1 x float> undef, float %a, i32 0
- %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float> %fcvtzu, i32 32)
+ %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float %a, i32 32)
%0 = extractelement <1 x i32> %fcvtzu1, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float>, i32)
+declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float, i32)
define i64 @test_vcvtd_n_u64_f64(double %a) {
; CHECK: test_vcvtd_n_u64_f64
; CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64
entry:
- %fcvtzu = insertelement <1 x double> undef, double %a, i32 0
- %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double> %fcvtzu, i32 64)
+ %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double %a, i32 64)
%0 = extractelement <1 x i64> %fcvtzu1, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double>, i32)
+declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-ext.ll b/test/CodeGen/AArch64/neon-scalar-ext.ll
new file mode 100644
index 0000000..51dea06
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-ext.ll
@@ -0,0 +1,113 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define <1 x i64> @test_zext_v1i32_v1i64(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: test_zext_v1i32_v1i64:
+; CHECK: ushll v0.2d, v0.2s, #0
+ %1 = extractelement <2 x i32> %v, i32 0
+ %2 = insertelement <1 x i32> undef, i32 %1, i32 0
+ %3 = zext <1 x i32> %2 to <1 x i64>
+ ret <1 x i64> %3
+}
+
+define <1 x i32> @test_zext_v1i16_v1i32(<4 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_zext_v1i16_v1i32:
+; CHECK: ushll v0.4s, v0.4h, #0
+ %1 = extractelement <4 x i16> %v, i32 0
+ %2 = insertelement <1 x i16> undef, i16 %1, i32 0
+ %3 = zext <1 x i16> %2 to <1 x i32>
+ ret <1 x i32> %3
+}
+
+define <1 x i16> @test_zext_v1i8_v1i16(<8 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_zext_v1i8_v1i16:
+; CHECK: ushll v0.8h, v0.8b, #0
+ %1 = extractelement <8 x i8> %v, i32 0
+ %2 = insertelement <1 x i8> undef, i8 %1, i32 0
+ %3 = zext <1 x i8> %2 to <1 x i16>
+ ret <1 x i16> %3
+}
+
+define <1 x i32> @test_zext_v1i8_v1i32(<8 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_zext_v1i8_v1i32:
+; CHECK: dup b0, v0.b[0]
+ %1 = extractelement <8 x i8> %v, i32 0
+ %2 = insertelement <1 x i8> undef, i8 %1, i32 0
+ %3 = zext <1 x i8> %2 to <1 x i32>
+ ret <1 x i32> %3
+}
+
+define <1 x i64> @test_zext_v1i16_v1i64(<4 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_zext_v1i16_v1i64:
+; CHECK: dup h0, v0.h[0]
+ %1 = extractelement <4 x i16> %v, i32 0
+ %2 = insertelement <1 x i16> undef, i16 %1, i32 0
+ %3 = zext <1 x i16> %2 to <1 x i64>
+ ret <1 x i64> %3
+}
+
+define <1 x i64> @test_zext_v1i8_v1i64(<8 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_zext_v1i8_v1i64:
+; CHECK: dup b0, v0.b[0]
+ %1 = extractelement <8 x i8> %v, i32 0
+ %2 = insertelement <1 x i8> undef, i8 %1, i32 0
+ %3 = zext <1 x i8> %2 to <1 x i64>
+ ret <1 x i64> %3
+}
+
+define <1 x i64> @test_sext_v1i32_v1i64(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: test_sext_v1i32_v1i64:
+; CHECK: sshll v0.2d, v0.2s, #0
+ %1 = extractelement <2 x i32> %v, i32 0
+ %2 = insertelement <1 x i32> undef, i32 %1, i32 0
+ %3 = sext <1 x i32> %2 to <1 x i64>
+ ret <1 x i64> %3
+}
+
+define <1 x i32> @test_sext_v1i16_v1i32(<4 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_sext_v1i16_v1i32:
+; CHECK: sshll v0.4s, v0.4h, #0
+ %1 = extractelement <4 x i16> %v, i32 0
+ %2 = insertelement <1 x i16> undef, i16 %1, i32 0
+ %3 = sext <1 x i16> %2 to <1 x i32>
+ ret <1 x i32> %3
+}
+
+define <1 x i16> @test_sext_v1i8_v1i16(<8 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_sext_v1i8_v1i16:
+; CHECK: sshll v0.8h, v0.8b, #0
+ %1 = extractelement <8 x i8> %v, i32 0
+ %2 = insertelement <1 x i8> undef, i8 %1, i32 0
+ %3 = sext <1 x i8> %2 to <1 x i16>
+ ret <1 x i16> %3
+}
+
+define <1 x i32> @test_sext_v1i8_v1i32(<8 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_sext_v1i8_v1i32:
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK: sshll v0.4s, v0.4h, #0
+ %1 = extractelement <8 x i8> %v, i32 0
+ %2 = insertelement <1 x i8> undef, i8 %1, i32 0
+ %3 = sext <1 x i8> %2 to <1 x i32>
+ ret <1 x i32> %3
+}
+
+define <1 x i64> @test_sext_v1i16_v1i64(<4 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_sext_v1i16_v1i64:
+; CHECK: sshll v0.4s, v0.4h, #0
+; CHECK: sshll v0.2d, v0.2s, #0
+ %1 = extractelement <4 x i16> %v, i32 0
+ %2 = insertelement <1 x i16> undef, i16 %1, i32 0
+ %3 = sext <1 x i16> %2 to <1 x i64>
+ ret <1 x i64> %3
+}
+
+define <1 x i64> @test_sext_v1i8_v1i64(<8 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_sext_v1i8_v1i64:
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK: sshll v0.4s, v0.4h, #0
+; CHECK: sshll v0.2d, v0.2s, #0
+ %1 = extractelement <8 x i8> %v, i32 0
+ %2 = insertelement <1 x i8> undef, i8 %1, i32 0
+ %3 = sext <1 x i8> %2 to <1 x i64>
+ ret <1 x i64> %3
+}
diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll
index 75686d3..6343310 100644
--- a/test/CodeGen/AArch64/neon-scalar-fabd.ll
+++ b/test/CodeGen/AArch64/neon-scalar-fabd.ll
@@ -4,10 +4,7 @@ define float @test_vabds_f32(float %a, float %b) {
; CHECK-LABEL: test_vabds_f32
; CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
entry:
- %vabd.i = insertelement <1 x float> undef, float %a, i32 0
- %vabd1.i = insertelement <1 x float> undef, float %b, i32 0
- %vabd2.i = call <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float> %vabd.i, <1 x float> %vabd1.i)
- %0 = extractelement <1 x float> %vabd2.i, i32 0
+ %0 = call float @llvm.aarch64.neon.vabd.f32(float %a, float %a)
ret float %0
}
@@ -15,12 +12,9 @@ define double @test_vabdd_f64(double %a, double %b) {
; CHECK-LABEL: test_vabdd_f64
; CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
entry:
- %vabd.i = insertelement <1 x double> undef, double %a, i32 0
- %vabd1.i = insertelement <1 x double> undef, double %b, i32 0
- %vabd2.i = call <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double> %vabd.i, <1 x double> %vabd1.i)
- %0 = extractelement <1 x double> %vabd2.i, i32 0
+ %0 = call double @llvm.aarch64.neon.vabd.f64(double %a, double %b)
ret double %0
}
-declare <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double>, <1 x double>)
-declare <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float>, <1 x float>)
+declare double @llvm.aarch64.neon.vabd.f64(double, double)
+declare float @llvm.aarch64.neon.vabd.f32(float, float)
diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll
index d7b84fa..6cf30a7 100644
--- a/test/CodeGen/AArch64/neon-scalar-fcvt.ll
+++ b/test/CodeGen/AArch64/neon-scalar-fcvt.ll
@@ -6,250 +6,228 @@ define float @test_vcvtxn(double %a) {
; CHECK: test_vcvtxn
; CHECK: fcvtxn {{s[0-9]}}, {{d[0-9]}}
entry:
- %vcvtf.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtf1.i = tail call <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double> %vcvtf.i)
- %0 = extractelement <1 x float> %vcvtf1.i, i32 0
- ret float %0
+ %vcvtf = call float @llvm.aarch64.neon.fcvtxn(double %a)
+ ret float %vcvtf
}
-declare <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double>)
+declare float @llvm.aarch64.neon.fcvtxn(double)
define i32 @test_vcvtass(float %a) {
; CHECK: test_vcvtass
; CHECK: fcvtas {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtas.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtas1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float> %vcvtas.i)
+ %vcvtas1.i = call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtas1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float)
define i64 @test_test_vcvtasd(double %a) {
; CHECK: test_test_vcvtasd
; CHECK: fcvtas {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtas.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtas1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %vcvtas.i)
+ %vcvtas1.i = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtas1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double)
define i32 @test_vcvtaus(float %a) {
; CHECK: test_vcvtaus
; CHECK: fcvtau {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtau.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtau1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float> %vcvtau.i)
+ %vcvtau1.i = call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtau1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float)
define i64 @test_vcvtaud(double %a) {
; CHECK: test_vcvtaud
; CHECK: fcvtau {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtau.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtau1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %vcvtau.i)
+ %vcvtau1.i = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtau1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double)
define i32 @test_vcvtmss(float %a) {
; CHECK: test_vcvtmss
; CHECK: fcvtms {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtms.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtms1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float> %vcvtms.i)
+ %vcvtms1.i = call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtms1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float)
define i64 @test_vcvtmd_s64_f64(double %a) {
; CHECK: test_vcvtmd_s64_f64
; CHECK: fcvtms {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtms.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtms1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %vcvtms.i)
+ %vcvtms1.i = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtms1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double)
define i32 @test_vcvtmus(float %a) {
; CHECK: test_vcvtmus
; CHECK: fcvtmu {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtmu.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtmu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float> %vcvtmu.i)
+ %vcvtmu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtmu1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float)
define i64 @test_vcvtmud(double %a) {
; CHECK: test_vcvtmud
; CHECK: fcvtmu {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtmu.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtmu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %vcvtmu.i)
+ %vcvtmu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtmu1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double)
define i32 @test_vcvtnss(float %a) {
; CHECK: test_vcvtnss
; CHECK: fcvtns {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtns.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtns1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float> %vcvtns.i)
+ %vcvtns1.i = call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtns1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float)
define i64 @test_vcvtnd_s64_f64(double %a) {
; CHECK: test_vcvtnd_s64_f64
; CHECK: fcvtns {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtns.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtns1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %vcvtns.i)
+ %vcvtns1.i = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtns1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double)
define i32 @test_vcvtnus(float %a) {
; CHECK: test_vcvtnus
; CHECK: fcvtnu {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtnu.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtnu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float> %vcvtnu.i)
+ %vcvtnu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtnu1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float)
define i64 @test_vcvtnud(double %a) {
; CHECK: test_vcvtnud
; CHECK: fcvtnu {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtnu.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtnu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %vcvtnu.i)
+ %vcvtnu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtnu1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double)
define i32 @test_vcvtpss(float %a) {
; CHECK: test_vcvtpss
; CHECK: fcvtps {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtps.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtps1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float> %vcvtps.i)
+ %vcvtps1.i = call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtps1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float)
define i64 @test_vcvtpd_s64_f64(double %a) {
; CHECK: test_vcvtpd_s64_f64
; CHECK: fcvtps {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtps.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtps1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %vcvtps.i)
+ %vcvtps1.i = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtps1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double)
define i32 @test_vcvtpus(float %a) {
; CHECK: test_vcvtpus
; CHECK: fcvtpu {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtpu.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtpu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float> %vcvtpu.i)
+ %vcvtpu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtpu1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float)
define i64 @test_vcvtpud(double %a) {
; CHECK: test_vcvtpud
; CHECK: fcvtpu {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtpu.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtpu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %vcvtpu.i)
+ %vcvtpu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtpu1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double)
define i32 @test_vcvtss(float %a) {
; CHECK: test_vcvtss
; CHECK: fcvtzs {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtzs.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtzs1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float> %vcvtzs.i)
+ %vcvtzs1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtzs1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float)
define i64 @test_vcvtd_s64_f64(double %a) {
; CHECK: test_vcvtd_s64_f64
; CHECK: fcvtzs {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvzs.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvzs1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %vcvzs.i)
+ %vcvzs1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvzs1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double)
define i32 @test_vcvtus(float %a) {
; CHECK: test_vcvtus
; CHECK: fcvtzu {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcvtzu.i = insertelement <1 x float> undef, float %a, i32 0
- %vcvtzu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float> %vcvtzu.i)
+ %vcvtzu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float %a)
%0 = extractelement <1 x i32> %vcvtzu1.i, i32 0
ret i32 %0
}
-declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float)
define i64 @test_vcvtud(double %a) {
; CHECK: test_vcvtud
; CHECK: fcvtzu {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcvtzu.i = insertelement <1 x double> undef, double %a, i32 0
- %vcvtzu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %vcvtzu.i)
+ %vcvtzu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double %a)
%0 = extractelement <1 x i64> %vcvtzu1.i, i32 0
ret i64 %0
}
-declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double)
diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
index a6e5859..e0dce13 100644
--- a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
+++ b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
@@ -3,326 +3,280 @@
;; Scalar Floating-point Compare
define i32 @test_vceqs_f32(float %a, float %b) {
-; CHECK: test_vceqs_f32
+; CHECK-LABEL: test_vceqs_f32
; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vceq.i = insertelement <1 x float> undef, float %a, i32 0
- %vceq1.i = insertelement <1 x float> undef, float %b, i32 0
- %vceq2.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> %vceq1.i)
- %0 = extractelement <1 x i32> %vceq2.i, i32 0
+ %fceq2.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fceq2.i, i32 0
ret i32 %0
}
define i64 @test_vceqd_f64(double %a, double %b) {
-; CHECK: test_vceqd_f64
+; CHECK-LABEL: test_vceqd_f64
; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vceq.i = insertelement <1 x double> undef, double %a, i32 0
- %vceq1.i = insertelement <1 x double> undef, double %b, i32 0
- %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double> %vceq.i, <1 x double> %vceq1.i)
- %0 = extractelement <1 x i64> %vceq2.i, i32 0
+ %fceq2.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fceq2.i, i32 0
ret i64 %0
}
-define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 {
-; CHECK: test_vceqz_f64
+define <1 x i64> @test_vceqz_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vceqz_f64
; CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0.0
entry:
%0 = fcmp oeq <1 x double> %a, zeroinitializer
- %vceqz.i = zext <1 x i1> %0 to <1 x i64>
+ %vceqz.i = sext <1 x i1> %0 to <1 x i64>
ret <1 x i64> %vceqz.i
}
define i32 @test_vceqzs_f32(float %a) {
-; CHECK: test_vceqzs_f32
+; CHECK-LABEL: test_vceqzs_f32
; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, #0.0
entry:
- %vceq.i = insertelement <1 x float> undef, float %a, i32 0
- %vceq1.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> zeroinitializer)
- %0 = extractelement <1 x i32> %vceq1.i, i32 0
+ %fceq1.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float 0.0)
+ %0 = extractelement <1 x i32> %fceq1.i, i32 0
ret i32 %0
}
define i64 @test_vceqzd_f64(double %a) {
-; CHECK: test_vceqzd_f64
+; CHECK-LABEL: test_vceqzd_f64
; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, #0.0
entry:
- %vceq.i = insertelement <1 x double> undef, double %a, i32 0
- %vceq1.i = tail call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double> %vceq.i, <1 x float> zeroinitializer) #5
- %0 = extractelement <1 x i64> %vceq1.i, i32 0
+ %fceq1.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double %a, float 0.0)
+ %0 = extractelement <1 x i64> %fceq1.i, i32 0
ret i64 %0
}
define i32 @test_vcges_f32(float %a, float %b) {
-; CHECK: test_vcges_f32
+; CHECK-LABEL: test_vcges_f32
; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcge.i = insertelement <1 x float> undef, float %a, i32 0
- %vcge1.i = insertelement <1 x float> undef, float %b, i32 0
- %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i)
- %0 = extractelement <1 x i32> %vcge2.i, i32 0
+ %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fcge2.i, i32 0
ret i32 %0
}
define i64 @test_vcged_f64(double %a, double %b) {
-; CHECK: test_vcged_f64
+; CHECK-LABEL: test_vcged_f64
; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcge.i = insertelement <1 x double> undef, double %a, i32 0
- %vcge1.i = insertelement <1 x double> undef, double %b, i32 0
- %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i)
- %0 = extractelement <1 x i64> %vcge2.i, i32 0
+ %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fcge2.i, i32 0
ret i64 %0
}
define i32 @test_vcgezs_f32(float %a) {
-; CHECK: test_vcgezs_f32
+; CHECK-LABEL: test_vcgezs_f32
; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, #0.0
entry:
- %vcge.i = insertelement <1 x float> undef, float %a, i32 0
- %vcge1.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> zeroinitializer)
- %0 = extractelement <1 x i32> %vcge1.i, i32 0
+ %fcge1.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float 0.0)
+ %0 = extractelement <1 x i32> %fcge1.i, i32 0
ret i32 %0
}
define i64 @test_vcgezd_f64(double %a) {
-; CHECK: test_vcgezd_f64
+; CHECK-LABEL: test_vcgezd_f64
; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, #0.0
entry:
- %vcge.i = insertelement <1 x double> undef, double %a, i32 0
- %vcge1.i = tail call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double> %vcge.i, <1 x float> zeroinitializer) #5
- %0 = extractelement <1 x i64> %vcge1.i, i32 0
+ %fcge1.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double %a, float 0.0)
+ %0 = extractelement <1 x i64> %fcge1.i, i32 0
ret i64 %0
}
define i32 @test_vcgts_f32(float %a, float %b) {
-; CHECK: test_vcgts_f32
+; CHECK-LABEL: test_vcgts_f32
; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcgt.i = insertelement <1 x float> undef, float %a, i32 0
- %vcgt1.i = insertelement <1 x float> undef, float %b, i32 0
- %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i)
- %0 = extractelement <1 x i32> %vcgt2.i, i32 0
+ %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fcgt2.i, i32 0
ret i32 %0
}
define i64 @test_vcgtd_f64(double %a, double %b) {
-; CHECK: test_vcgtd_f64
+; CHECK-LABEL: test_vcgtd_f64
; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcgt.i = insertelement <1 x double> undef, double %a, i32 0
- %vcgt1.i = insertelement <1 x double> undef, double %b, i32 0
- %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i)
- %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+ %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fcgt2.i, i32 0
ret i64 %0
}
define i32 @test_vcgtzs_f32(float %a) {
-; CHECK: test_vcgtzs_f32
+; CHECK-LABEL: test_vcgtzs_f32
; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, #0.0
entry:
- %vcgt.i = insertelement <1 x float> undef, float %a, i32 0
- %vcgt1.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> zeroinitializer)
- %0 = extractelement <1 x i32> %vcgt1.i, i32 0
+ %fcgt1.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float 0.0)
+ %0 = extractelement <1 x i32> %fcgt1.i, i32 0
ret i32 %0
}
define i64 @test_vcgtzd_f64(double %a) {
-; CHECK: test_vcgtzd_f64
+; CHECK-LABEL: test_vcgtzd_f64
; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, #0.0
entry:
- %vcgt.i = insertelement <1 x double> undef, double %a, i32 0
- %vcgt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double> %vcgt.i, <1 x float> zeroinitializer) #5
- %0 = extractelement <1 x i64> %vcgt1.i, i32 0
+ %fcgt1.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double %a, float 0.0)
+ %0 = extractelement <1 x i64> %fcgt1.i, i32 0
ret i64 %0
}
define i32 @test_vcles_f32(float %a, float %b) {
-; CHECK: test_vcles_f32
+; CHECK-LABEL: test_vcles_f32
; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcge.i = insertelement <1 x float> undef, float %a, i32 0
- %vcge1.i = insertelement <1 x float> undef, float %b, i32 0
- %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i)
- %0 = extractelement <1 x i32> %vcge2.i, i32 0
+ %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fcge2.i, i32 0
ret i32 %0
}
define i64 @test_vcled_f64(double %a, double %b) {
-; CHECK: test_vcled_f64
+; CHECK-LABEL: test_vcled_f64
; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcge.i = insertelement <1 x double> undef, double %a, i32 0
- %vcge1.i = insertelement <1 x double> undef, double %b, i32 0
- %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i)
- %0 = extractelement <1 x i64> %vcge2.i, i32 0
+ %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fcge2.i, i32 0
ret i64 %0
}
define i32 @test_vclezs_f32(float %a) {
-; CHECK: test_vclezs_f32
+; CHECK-LABEL: test_vclezs_f32
; CHECK: fcmle {{s[0-9]}}, {{s[0-9]}}, #0.0
entry:
- %vcle.i = insertelement <1 x float> undef, float %a, i32 0
- %vcle1.i = call <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float> %vcle.i, <1 x float> zeroinitializer)
- %0 = extractelement <1 x i32> %vcle1.i, i32 0
+ %fcle1.i = call <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float %a, float 0.0)
+ %0 = extractelement <1 x i32> %fcle1.i, i32 0
ret i32 %0
}
define i64 @test_vclezd_f64(double %a) {
-; CHECK: test_vclezd_f64
+; CHECK-LABEL: test_vclezd_f64
; CHECK: fcmle {{d[0-9]}}, {{d[0-9]}}, #0.0
entry:
- %vcle.i = insertelement <1 x double> undef, double %a, i32 0
- %vcle1.i = tail call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double> %vcle.i, <1 x float> zeroinitializer) #5
- %0 = extractelement <1 x i64> %vcle1.i, i32 0
+ %fcle1.i = call <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double %a, float 0.0)
+ %0 = extractelement <1 x i64> %fcle1.i, i32 0
ret i64 %0
}
define i32 @test_vclts_f32(float %a, float %b) {
-; CHECK: test_vclts_f32
+; CHECK-LABEL: test_vclts_f32
; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcgt.i = insertelement <1 x float> undef, float %b, i32 0
- %vcgt1.i = insertelement <1 x float> undef, float %a, i32 0
- %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i)
- %0 = extractelement <1 x i32> %vcgt2.i, i32 0
+ %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fcgt2.i, i32 0
ret i32 %0
}
define i64 @test_vcltd_f64(double %a, double %b) {
-; CHECK: test_vcltd_f64
+; CHECK-LABEL: test_vcltd_f64
; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcgt.i = insertelement <1 x double> undef, double %b, i32 0
- %vcgt1.i = insertelement <1 x double> undef, double %a, i32 0
- %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i)
- %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+ %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fcgt2.i, i32 0
ret i64 %0
}
define i32 @test_vcltzs_f32(float %a) {
-; CHECK: test_vcltzs_f32
+; CHECK-LABEL: test_vcltzs_f32
; CHECK: fcmlt {{s[0-9]}}, {{s[0-9]}}, #0.0
entry:
- %vclt.i = insertelement <1 x float> undef, float %a, i32 0
- %vclt1.i = call <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float> %vclt.i, <1 x float> zeroinitializer)
- %0 = extractelement <1 x i32> %vclt1.i, i32 0
+ %fclt1.i = call <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float %a, float 0.0)
+ %0 = extractelement <1 x i32> %fclt1.i, i32 0
ret i32 %0
}
define i64 @test_vcltzd_f64(double %a) {
-; CHECK: test_vcltzd_f64
+; CHECK-LABEL: test_vcltzd_f64
; CHECK: fcmlt {{d[0-9]}}, {{d[0-9]}}, #0.0
entry:
- %vclt.i = insertelement <1 x double> undef, double %a, i32 0
- %vclt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double> %vclt.i, <1 x float> zeroinitializer) #5
- %0 = extractelement <1 x i64> %vclt1.i, i32 0
+ %fclt1.i = call <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double %a, float 0.0)
+ %0 = extractelement <1 x i64> %fclt1.i, i32 0
ret i64 %0
}
define i32 @test_vcages_f32(float %a, float %b) {
-; CHECK: test_vcages_f32
+; CHECK-LABEL: test_vcages_f32
; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcage.i = insertelement <1 x float> undef, float %a, i32 0
- %vcage1.i = insertelement <1 x float> undef, float %b, i32 0
- %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i)
- %0 = extractelement <1 x i32> %vcage2.i, i32 0
+ %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fcage2.i, i32 0
ret i32 %0
}
define i64 @test_vcaged_f64(double %a, double %b) {
-; CHECK: test_vcaged_f64
+; CHECK-LABEL: test_vcaged_f64
; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcage.i = insertelement <1 x double> undef, double %a, i32 0
- %vcage1.i = insertelement <1 x double> undef, double %b, i32 0
- %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i)
- %0 = extractelement <1 x i64> %vcage2.i, i32 0
+ %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fcage2.i, i32 0
ret i64 %0
}
define i32 @test_vcagts_f32(float %a, float %b) {
-; CHECK: test_vcagts_f32
+; CHECK-LABEL: test_vcagts_f32
; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcagt.i = insertelement <1 x float> undef, float %a, i32 0
- %vcagt1.i = insertelement <1 x float> undef, float %b, i32 0
- %vcagt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcagt.i, <1 x float> %vcagt1.i)
- %0 = extractelement <1 x i32> %vcagt2.i, i32 0
+ %fcagt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fcagt2.i, i32 0
ret i32 %0
}
define i64 @test_vcagtd_f64(double %a, double %b) {
-; CHECK: test_vcagtd_f64
+; CHECK-LABEL: test_vcagtd_f64
; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcagt.i = insertelement <1 x double> undef, double %a, i32 0
- %vcagt1.i = insertelement <1 x double> undef, double %b, i32 0
- %vcagt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcagt.i, <1 x double> %vcagt1.i)
- %0 = extractelement <1 x i64> %vcagt2.i, i32 0
+ %fcagt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fcagt2.i, i32 0
ret i64 %0
}
define i32 @test_vcales_f32(float %a, float %b) {
-; CHECK: test_vcales_f32
+; CHECK-LABEL: test_vcales_f32
; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcage.i = insertelement <1 x float> undef, float %b, i32 0
- %vcage1.i = insertelement <1 x float> undef, float %a, i32 0
- %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i)
- %0 = extractelement <1 x i32> %vcage2.i, i32 0
+ %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fcage2.i, i32 0
ret i32 %0
}
define i64 @test_vcaled_f64(double %a, double %b) {
-; CHECK: test_vcaled_f64
+; CHECK-LABEL: test_vcaled_f64
; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcage.i = insertelement <1 x double> undef, double %b, i32 0
- %vcage1.i = insertelement <1 x double> undef, double %a, i32 0
- %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i)
- %0 = extractelement <1 x i64> %vcage2.i, i32 0
+ %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fcage2.i, i32 0
ret i64 %0
}
define i32 @test_vcalts_f32(float %a, float %b) {
-; CHECK: test_vcalts_f32
+; CHECK-LABEL: test_vcalts_f32
; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
entry:
- %vcalt.i = insertelement <1 x float> undef, float %b, i32 0
- %vcalt1.i = insertelement <1 x float> undef, float %a, i32 0
- %vcalt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcalt.i, <1 x float> %vcalt1.i)
- %0 = extractelement <1 x i32> %vcalt2.i, i32 0
+ %fcalt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b)
+ %0 = extractelement <1 x i32> %fcalt2.i, i32 0
ret i32 %0
}
define i64 @test_vcaltd_f64(double %a, double %b) {
-; CHECK: test_vcaltd_f64
+; CHECK-LABEL: test_vcaltd_f64
; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
entry:
- %vcalt.i = insertelement <1 x double> undef, double %b, i32 0
- %vcalt1.i = insertelement <1 x double> undef, double %a, i32 0
- %vcalt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcalt.i, <1 x double> %vcalt1.i)
- %0 = extractelement <1 x i64> %vcalt2.i, i32 0
+ %fcalt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b)
+ %0 = extractelement <1 x i64> %fcalt2.i, i32 0
ret i64 %0
}
-declare <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
-declare <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
-declare <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
-declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float, float)
+declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double, float)
+declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double, double)
+declare <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float, float)
+declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double, float)
+declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double, double)
+declare <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float, float)
+declare <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double, float)
+declare <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float, float)
+declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double, float)
+declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double, double)
+declare <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float, float)
+declare <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double, float)
+declare <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float, float)
+declare <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double, double)
+declare <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float, float)
+declare <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double, double)
diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll
index f21c27b..100839b 100644
--- a/test/CodeGen/AArch64/neon-scalar-recip.ll
+++ b/test/CodeGen/AArch64/neon-scalar-recip.ll
@@ -3,56 +3,42 @@
define float @test_vrecpss_f32(float %a, float %b) {
; CHECK: test_vrecpss_f32
; CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = insertelement <1 x float> undef, float %a, i32 0
- %2 = insertelement <1 x float> undef, float %b, i32 0
- %3 = call <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float> %1, <1 x float> %2)
- %4 = extractelement <1 x float> %3, i32 0
- ret float %4
+ %1 = call float @llvm.aarch64.neon.vrecps.f32(float %a, float %b)
+ ret float %1
}
define double @test_vrecpsd_f64(double %a, double %b) {
; CHECK: test_vrecpsd_f64
; CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %1 = insertelement <1 x double> undef, double %a, i32 0
- %2 = insertelement <1 x double> undef, double %b, i32 0
- %3 = call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %1, <1 x double> %2)
- %4 = extractelement <1 x double> %3, i32 0
- ret double %4
+ %1 = call double @llvm.aarch64.neon.vrecps.f64(double %a, double %b)
+ ret double %1
}
-declare <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float>, <1 x float>)
-declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>)
+declare float @llvm.aarch64.neon.vrecps.f32(float, float)
+declare double @llvm.aarch64.neon.vrecps.f64(double, double)
define float @test_vrsqrtss_f32(float %a, float %b) {
; CHECK: test_vrsqrtss_f32
; CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = insertelement <1 x float> undef, float %a, i32 0
- %2 = insertelement <1 x float> undef, float %b, i32 0
- %3 = call <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float> %1, <1 x float> %2)
- %4 = extractelement <1 x float> %3, i32 0
- ret float %4
+ %1 = call float @llvm.aarch64.neon.vrsqrts.f32(float %a, float %b)
+ ret float %1
}
define double @test_vrsqrtsd_f64(double %a, double %b) {
; CHECK: test_vrsqrtsd_f64
; CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %1 = insertelement <1 x double> undef, double %a, i32 0
- %2 = insertelement <1 x double> undef, double %b, i32 0
- %3 = call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %1, <1 x double> %2)
- %4 = extractelement <1 x double> %3, i32 0
- ret double %4
+ %1 = call double @llvm.aarch64.neon.vrsqrts.f64(double %a, double %b)
+ ret double %1
}
-declare <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float>, <1 x float>)
-declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>)
+declare float @llvm.aarch64.neon.vrsqrts.f32(float, float)
+declare double @llvm.aarch64.neon.vrsqrts.f64(double, double)
define float @test_vrecpes_f32(float %a) {
; CHECK: test_vrecpes_f32
; CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
entry:
- %vrecpe.i = insertelement <1 x float> undef, float %a, i32 0
- %vrecpe1.i = tail call <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float> %vrecpe.i)
- %0 = extractelement <1 x float> %vrecpe1.i, i32 0
+ %0 = call float @llvm.aarch64.neon.vrecpe.f32(float %a)
ret float %0
}
@@ -60,22 +46,18 @@ define double @test_vrecped_f64(double %a) {
; CHECK: test_vrecped_f64
; CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
entry:
- %vrecpe.i = insertelement <1 x double> undef, double %a, i32 0
- %vrecpe1.i = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %vrecpe.i)
- %0 = extractelement <1 x double> %vrecpe1.i, i32 0
+ %0 = call double @llvm.aarch64.neon.vrecpe.f64(double %a)
ret double %0
}
-declare <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float>)
-declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>)
+declare float @llvm.aarch64.neon.vrecpe.f32(float)
+declare double @llvm.aarch64.neon.vrecpe.f64(double)
define float @test_vrecpxs_f32(float %a) {
; CHECK: test_vrecpxs_f32
; CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
entry:
- %vrecpx.i = insertelement <1 x float> undef, float %a, i32 0
- %vrecpx1.i = tail call <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float> %vrecpx.i)
- %0 = extractelement <1 x float> %vrecpx1.i, i32 0
+ %0 = call float @llvm.aarch64.neon.vrecpx.f32(float %a)
ret float %0
}
@@ -83,22 +65,18 @@ define double @test_vrecpxd_f64(double %a) {
; CHECK: test_vrecpxd_f64
; CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
entry:
- %vrecpx.i = insertelement <1 x double> undef, double %a, i32 0
- %vrecpx1.i = tail call <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double> %vrecpx.i)
- %0 = extractelement <1 x double> %vrecpx1.i, i32 0
+ %0 = call double @llvm.aarch64.neon.vrecpx.f64(double %a)
ret double %0
}
-declare <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float>)
-declare <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double>)
+declare float @llvm.aarch64.neon.vrecpx.f32(float)
+declare double @llvm.aarch64.neon.vrecpx.f64(double)
define float @test_vrsqrtes_f32(float %a) {
; CHECK: test_vrsqrtes_f32
; CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
entry:
- %vrsqrte.i = insertelement <1 x float> undef, float %a, i32 0
- %vrsqrte1.i = tail call <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float> %vrsqrte.i)
- %0 = extractelement <1 x float> %vrsqrte1.i, i32 0
+ %0 = call float @llvm.aarch64.neon.vrsqrte.f32(float %a)
ret float %0
}
@@ -106,11 +84,9 @@ define double @test_vrsqrted_f64(double %a) {
; CHECK: test_vrsqrted_f64
; CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
entry:
- %vrsqrte.i = insertelement <1 x double> undef, double %a, i32 0
- %vrsqrte1.i = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %vrsqrte.i)
- %0 = extractelement <1 x double> %vrsqrte1.i, i32 0
+ %0 = call double @llvm.aarch64.neon.vrsqrte.f64(double %a)
ret double %0
}
-declare <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float>)
-declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
+declare float @llvm.aarch64.neon.vrsqrte.f32(float)
+declare double @llvm.aarch64.neon.vrsqrte.f64(double)
diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
index 80e8dc3..33ce5cf 100644
--- a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
+++ b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
@@ -4,210 +4,198 @@ declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>)
define <1 x i64> @test_addp_v1i64(<2 x i64> %a) {
; CHECK: test_addp_v1i64:
- %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a)
-; CHECK: addp d0, v0.2d
- ret <1 x i64> %val
+; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a)
+ ret <1 x i64> %val
}
-declare <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float>)
+declare float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float>)
-define <1 x float> @test_faddp_v1f32(<2 x float> %a) {
-; CHECK: test_faddp_v1f32:
- %val = call <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float> %a)
-; CHECK: faddp s0, v0.2s
- ret <1 x float> %val
+define float @test_faddp_f32(<2 x float> %a) {
+; CHECK: test_faddp_f32:
+; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %val = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a)
+ ret float %val
}
-declare <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double>)
+declare double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double>)
-define <1 x double> @test_faddp_v1f64(<2 x double> %a) {
-; CHECK: test_faddp_v1f64:
- %val = call <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double> %a)
-; CHECK: faddp d0, v0.2d
- ret <1 x double> %val
+define double @test_faddp_f64(<2 x double> %a) {
+; CHECK: test_faddp_f64:
+; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %val = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a)
+ ret double %val
}
-declare <1 x float> @llvm.aarch64.neon.vpmax(<2 x float>)
+declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>)
-define <1 x float> @test_fmaxp_v1f32(<2 x float> %a) {
-; CHECK: test_fmaxp_v1f32:
- %val = call <1 x float> @llvm.aarch64.neon.vpmax(<2 x float> %a)
-; CHECK: fmaxp s0, v0.2s
- ret <1 x float> %val
+define float @test_fmaxp_f32(<2 x float> %a) {
+; CHECK: test_fmaxp_f32:
+; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %val = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
+ ret float %val
}
-declare <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double>)
+declare double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double>)
-define <1 x double> @test_fmaxp_v1f64(<2 x double> %a) {
-; CHECK: test_fmaxp_v1f64:
- %val = call <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double> %a)
-; CHECK: fmaxp d0, v0.2d
- ret <1 x double> %val
+define double @test_fmaxp_f64(<2 x double> %a) {
+; CHECK: test_fmaxp_f64:
+; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %val = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a)
+ ret double %val
}
+declare float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float>)
-declare <1 x float> @llvm.aarch64.neon.vpmin(<2 x float>)
-
-define <1 x float> @test_fminp_v1f32(<2 x float> %a) {
-; CHECK: test_fminp_v1f32:
- %val = call <1 x float> @llvm.aarch64.neon.vpmin(<2 x float> %a)
-; CHECK: fminp s0, v0.2s
- ret <1 x float> %val
+define float @test_fminp_f32(<2 x float> %a) {
+; CHECK: test_fminp_f32:
+; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %val = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a)
+ ret float %val
}
-declare <1 x double> @llvm.aarch64.neon.vpminq(<2 x double>)
+declare double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double>)
-define <1 x double> @test_fminp_v1f64(<2 x double> %a) {
-; CHECK: test_fminp_v1f64:
- %val = call <1 x double> @llvm.aarch64.neon.vpminq(<2 x double> %a)
-; CHECK: fminp d0, v0.2d
- ret <1 x double> %val
+define double @test_fminp_f64(<2 x double> %a) {
+; CHECK: test_fminp_f64:
+; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %val = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a)
+ ret double %val
}
-declare <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float>)
+declare float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float>)
-define <1 x float> @test_fmaxnmp_v1f32(<2 x float> %a) {
-; CHECK: test_fmaxnmp_v1f32:
- %val = call <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float> %a)
-; CHECK: fmaxnmp s0, v0.2s
- ret <1 x float> %val
+define float @test_fmaxnmp_f32(<2 x float> %a) {
+; CHECK: test_fmaxnmp_f32:
+; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %val = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a)
+ ret float %val
}
-declare <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double>)
+declare double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double>)
-define <1 x double> @test_fmaxnmp_v1f64(<2 x double> %a) {
-; CHECK: test_fmaxnmp_v1f64:
- %val = call <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double> %a)
-; CHECK: fmaxnmp d0, v0.2d
- ret <1 x double> %val
+define double @test_fmaxnmp_f64(<2 x double> %a) {
+; CHECK: test_fmaxnmp_f64:
+; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %val = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a)
+ ret double %val
}
-declare <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float>)
+declare float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float>)
-define <1 x float> @test_fminnmp_v1f32(<2 x float> %a) {
-; CHECK: test_fminnmp_v1f32:
- %val = call <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float> %a)
-; CHECK: fminnmp s0, v0.2s
- ret <1 x float> %val
+define float @test_fminnmp_f32(<2 x float> %a) {
+; CHECK: test_fminnmp_f32:
+; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %val = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a)
+ ret float %val
}
-declare <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double>)
+declare double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double>)
-define <1 x double> @test_fminnmp_v1f64(<2 x double> %a) {
-; CHECK: test_fminnmp_v1f64:
- %val = call <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double> %a)
-; CHECK: fminnmp d0, v0.2d
- ret <1 x double> %val
+define double @test_fminnmp_f64(<2 x double> %a) {
+; CHECK: test_fminnmp_f64:
+; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %val = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a)
+ ret double %val
}
define float @test_vaddv_f32(<2 x float> %a) {
; CHECK-LABEL: test_vaddv_f32
; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float> %a)
- %2 = extractelement <1 x float> %1, i32 0
- ret float %2
+ %1 = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a)
+ ret float %1
}
define float @test_vaddvq_f32(<4 x float> %a) {
; CHECK-LABEL: test_vaddvq_f32
; CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float> %a)
- %2 = extractelement <1 x float> %1, i32 0
- ret float %2
+ %1 = call float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float> %a)
+ ret float %1
}
define double @test_vaddvq_f64(<2 x double> %a) {
; CHECK-LABEL: test_vaddvq_f64
; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = tail call <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double> %a)
- %2 = extractelement <1 x double> %1, i32 0
- ret double %2
+ %1 = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a)
+ ret double %1
}
define float @test_vmaxv_f32(<2 x float> %a) {
; CHECK-LABEL: test_vmaxv_f32
; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float> %a)
- %2 = extractelement <1 x float> %1, i32 0
- ret float %2
+ %1 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
+ ret float %1
}
define double @test_vmaxvq_f64(<2 x double> %a) {
; CHECK-LABEL: test_vmaxvq_f64
; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double> %a)
- %2 = extractelement <1 x double> %1, i32 0
- ret double %2
+ %1 = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a)
+ ret double %1
}
define float @test_vminv_f32(<2 x float> %a) {
; CHECK-LABEL: test_vminv_f32
; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float> %a)
- %2 = extractelement <1 x float> %1, i32 0
- ret float %2
+ %1 = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a)
+ ret float %1
}
define double @test_vminvq_f64(<2 x double> %a) {
; CHECK-LABEL: test_vminvq_f64
; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = tail call <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double> %a)
- %2 = extractelement <1 x double> %1, i32 0
- ret double %2
+ %1 = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a)
+ ret double %1
}
define double @test_vmaxnmvq_f64(<2 x double> %a) {
; CHECK-LABEL: test_vmaxnmvq_f64
; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double> %a)
- %2 = extractelement <1 x double> %1, i32 0
- ret double %2
+ %1 = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a)
+ ret double %1
}
define float @test_vmaxnmv_f32(<2 x float> %a) {
; CHECK-LABEL: test_vmaxnmv_f32
; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float> %a)
- %2 = extractelement <1 x float> %1, i32 0
- ret float %2
+ %1 = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a)
+ ret float %1
}
define double @test_vminnmvq_f64(<2 x double> %a) {
; CHECK-LABEL: test_vminnmvq_f64
; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = tail call <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double> %a)
- %2 = extractelement <1 x double> %1, i32 0
- ret double %2
+ %1 = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a)
+ ret double %1
}
define float @test_vminnmv_f32(<2 x float> %a) {
; CHECK-LABEL: test_vminnmv_f32
; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float> %a)
- %2 = extractelement <1 x float> %1, i32 0
- ret float %2
+ %1 = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a)
+ ret float %1
}
define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vpaddq_s64
; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
+ %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
ret <2 x i64> %1
}
define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vpaddq_u64
; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
+ %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
ret <2 x i64> %1
}
define i64 @test_vaddvq_s64(<2 x i64> %a) {
; CHECK-LABEL: test_vaddvq_s64
; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
+ %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
%2 = extractelement <1 x i64> %1, i32 0
ret i64 %2
}
@@ -215,7 +203,7 @@ define i64 @test_vaddvq_s64(<2 x i64> %a) {
define i64 @test_vaddvq_u64(<2 x i64> %a) {
; CHECK-LABEL: test_vaddvq_u64
; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
+ %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
%2 = extractelement <1 x i64> %1, i32 0
ret i64 %2
}
@@ -224,24 +212,4 @@ declare <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64>)
declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
-declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float>)
-
-declare <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double>)
-
-declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float>)
-
-declare <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double>)
-
-declare <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double>)
-
-declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float>)
-
-declare <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double>)
-
-declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float>)
-
-declare <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double>)
-
-declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float>)
-
-declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float>)
\ No newline at end of file
+declare float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float>)
diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
index 83ceb4e..7c9ffa0 100644
--- a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
+++ b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
@@ -7,14 +7,14 @@ declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_urshl_v1i64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: urshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_srshl_v1i64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: srshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -24,14 +24,14 @@ declare <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64>, <1 x i64>)
define <1 x i64> @test_urshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_urshl_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: urshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_srshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_srshl_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: srshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
index bd66f80..5c010ef 100644
--- a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
@@ -6,14 +6,14 @@ declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>)
define <1 x i8> @test_uqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
; CHECK: test_uqadd_v1i8_aarch64:
%tmp1 = call <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqadd {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+;CHECK: uqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
ret <1 x i8> %tmp1
}
define <1 x i8> @test_sqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
; CHECK: test_sqadd_v1i8_aarch64:
%tmp1 = call <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqadd {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+;CHECK: sqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
ret <1 x i8> %tmp1
}
@@ -23,14 +23,14 @@ declare <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8>, <1 x i8>)
define <1 x i8> @test_uqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
; CHECK: test_uqsub_v1i8_aarch64:
%tmp1 = call <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqsub {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+;CHECK: uqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
ret <1 x i8> %tmp1
}
define <1 x i8> @test_sqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
; CHECK: test_sqsub_v1i8_aarch64:
%tmp1 = call <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqsub {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+;CHECK: sqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
ret <1 x i8> %tmp1
}
@@ -40,14 +40,14 @@ declare <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16>, <1 x i16>)
define <1 x i16> @test_uqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
; CHECK: test_uqadd_v1i16_aarch64:
%tmp1 = call <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqadd {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+;CHECK: uqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
ret <1 x i16> %tmp1
}
define <1 x i16> @test_sqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
; CHECK: test_sqadd_v1i16_aarch64:
%tmp1 = call <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqadd {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+;CHECK: sqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
ret <1 x i16> %tmp1
}
@@ -57,14 +57,14 @@ declare <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16>, <1 x i16>)
define <1 x i16> @test_uqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
; CHECK: test_uqsub_v1i16_aarch64:
%tmp1 = call <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqsub {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+;CHECK: uqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
ret <1 x i16> %tmp1
}
define <1 x i16> @test_sqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
; CHECK: test_sqsub_v1i16_aarch64:
%tmp1 = call <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqsub {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+;CHECK: sqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
ret <1 x i16> %tmp1
}
@@ -74,14 +74,14 @@ declare <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32>, <1 x i32>)
define <1 x i32> @test_uqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
; CHECK: test_uqadd_v1i32_aarch64:
%tmp1 = call <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqadd {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+;CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret <1 x i32> %tmp1
}
define <1 x i32> @test_sqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
; CHECK: test_sqadd_v1i32_aarch64:
%tmp1 = call <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqadd {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+;CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret <1 x i32> %tmp1
}
@@ -91,7 +91,7 @@ declare <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32>, <1 x i32>)
define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
; CHECK: test_uqsub_v1i32_aarch64:
%tmp1 = call <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqsub {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+;CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret <1 x i32> %tmp1
}
@@ -99,7 +99,7 @@ define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
define <1 x i32> @test_sqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
; CHECK: test_sqsub_v1i32_aarch64:
%tmp1 = call <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqsub {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+;CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret <1 x i32> %tmp1
}
@@ -109,14 +109,14 @@ declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
define <1 x i64> @test_uqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_uqadd_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqadd {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_sqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sqadd_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqadd {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -126,14 +126,14 @@ declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
define <1 x i64> @test_uqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_uqsub_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqsub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_sqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sqsub_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqsub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
index 0fd67df..dbf9669 100644
--- a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
@@ -6,7 +6,7 @@ declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_uqrshl_v1i64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -14,7 +14,7 @@ define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sqrshl_v1i64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -24,7 +24,7 @@ declare <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8>, <1 x i8>)
define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
; CHECK: test_uqrshl_v1i8_aarch64:
%tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqrshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+;CHECK: uqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
ret <1 x i8> %tmp1
}
@@ -32,7 +32,7 @@ define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
define <1 x i8> @test_sqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
; CHECK: test_sqrshl_v1i8_aarch64:
%tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqrshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+;CHECK: sqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
ret <1 x i8> %tmp1
}
@@ -42,7 +42,7 @@ declare <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16>, <1 x i16>)
define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
; CHECK: test_uqrshl_v1i16_aarch64:
%tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqrshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+;CHECK: uqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
ret <1 x i16> %tmp1
}
@@ -50,7 +50,7 @@ define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
define <1 x i16> @test_sqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
; CHECK: test_sqrshl_v1i16_aarch64:
%tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqrshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+;CHECK: sqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
ret <1 x i16> %tmp1
}
@@ -60,7 +60,7 @@ declare <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32>, <1 x i32>)
define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
; CHECK: test_uqrshl_v1i32_aarch64:
%tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqrshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+;CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret <1 x i32> %tmp1
}
@@ -68,7 +68,7 @@ define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
define <1 x i32> @test_sqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
; CHECK: test_sqrshl_v1i32_aarch64:
%tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqrshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+;CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret <1 x i32> %tmp1
}
@@ -78,7 +78,7 @@ declare <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64>, <1 x i64>)
define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_uqrshl_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -86,7 +86,7 @@ define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
define <1 x i64> @test_sqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sqrshl_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
index 8fdea24..0a1f4c9 100644
--- a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
@@ -6,14 +6,14 @@ declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_uqshl_v1i64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sqshl_v1i64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -23,14 +23,14 @@ declare <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8>, <1 x i8>)
define <1 x i8> @test_uqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
; CHECK: test_uqshl_v1i8_aarch64:
%tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+;CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
ret <1 x i8> %tmp1
}
define <1 x i8> @test_sqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
; CHECK: test_sqshl_v1i8_aarch64:
%tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+;CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
ret <1 x i8> %tmp1
}
@@ -40,14 +40,14 @@ declare <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16>, <1 x i16>)
define <1 x i16> @test_uqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
; CHECK: test_uqshl_v1i16_aarch64:
%tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+;CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
ret <1 x i16> %tmp1
}
define <1 x i16> @test_sqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
; CHECK: test_sqshl_v1i16_aarch64:
%tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+;CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
ret <1 x i16> %tmp1
}
@@ -57,14 +57,14 @@ declare <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32>, <1 x i32>)
define <1 x i32> @test_uqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
; CHECK: test_uqshl_v1i32_aarch64:
%tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+;CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret <1 x i32> %tmp1
}
define <1 x i32> @test_sqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
; CHECK: test_sqshl_v1i32_aarch64:
%tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+;CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
ret <1 x i32> %tmp1
}
@@ -74,14 +74,14 @@ declare <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64>, <1 x i64>)
define <1 x i64> @test_uqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_uqshl_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_sqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sqshl_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll
index 1222be5..b712ea4 100644
--- a/test/CodeGen/AArch64/neon-scalar-shift.ll
+++ b/test/CodeGen/AArch64/neon-scalar-shift.ll
@@ -6,7 +6,7 @@ declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_ushl_v1i64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: ushl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -14,7 +14,7 @@ define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sshl_v1i64:
%tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
@@ -24,15 +24,213 @@ declare <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64>, <1 x i64>)
define <1 x i64> @test_ushl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_ushl_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: ushl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
define <1 x i64> @test_sshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
; CHECK: test_sshl_v1i64_aarch64:
%tmp1 = call <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
ret <1 x i64> %tmp1
}
+define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vtst_s64
+; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %0 = and <1 x i64> %a, %b
+ %1 = icmp ne <1 x i64> %0, zeroinitializer
+ %vtst.i = sext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vtst.i
+}
+
+define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vtst_u64
+; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %0 = and <1 x i64> %a, %b
+ %1 = icmp ne <1 x i64> %0, zeroinitializer
+ %vtst.i = sext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vtst.i
+}
+
+define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vsli_n_p64
+; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0
+entry:
+ %vsli_n2 = tail call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %a, <1 x i64> %b, i32 0)
+ ret <1 x i64> %vsli_n2
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32)
+
+define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vsliq_n_p64
+; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+entry:
+ %vsli_n2 = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 0)
+ ret <2 x i64> %vsli_n2
+}
+
+declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32)
+
+define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) {
+; CHECK-LABEL: test_vrsqrte_u32
+; CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vrsqrte1.i = tail call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a)
+ ret <2 x i32> %vrsqrte1.i
+}
+
+define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vrsqrteq_u32
+; CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vrsqrte1.i = tail call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a)
+ ret <4 x i32> %vrsqrte1.i
+}
+
+define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) {
+; CHECK-LABEL: test_vqshl_n_s8
+; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+entry:
+ %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+ ret <8 x i8> %vqshl_n
+}
+
+declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vqshlq_n_s8
+; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+entry:
+ %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+ ret <16 x i8> %vqshl_n
+}
+
+declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) {
+; CHECK-LABEL: test_vqshl_n_s16
+; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+entry:
+ %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer)
+ ret <4 x i16> %vqshl_n1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vqshlq_n_s16
+; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+entry:
+ %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer)
+ ret <8 x i16> %vqshl_n1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) {
+; CHECK-LABEL: test_vqshl_n_s32
+; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+entry:
+ %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer)
+ ret <2 x i32> %vqshl_n1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vqshlq_n_s32
+; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+entry:
+ %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer)
+ ret <4 x i32> %vqshl_n1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vqshlq_n_s64
+; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+entry:
+ %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer)
+ ret <2 x i64> %vqshl_n1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) {
+; CHECK-LABEL: test_vqshl_n_u8
+; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+entry:
+ %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+ ret <8 x i8> %vqshl_n
+}
+
+declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
+
+define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vqshlq_n_u8
+; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+entry:
+ %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+ ret <16 x i8> %vqshl_n
+}
+
+declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
+
+define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) {
+; CHECK-LABEL: test_vqshl_n_u16
+; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+entry:
+ %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer)
+ ret <4 x i16> %vqshl_n1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
+
+define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vqshlq_n_u16
+; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+entry:
+ %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer)
+ ret <8 x i16> %vqshl_n1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
+
+define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) {
+; CHECK-LABEL: test_vqshl_n_u32
+; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+entry:
+ %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer)
+ ret <2 x i32> %vqshl_n1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
+
+define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vqshlq_n_u32
+; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+entry:
+ %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer)
+ ret <4 x i32> %vqshl_n1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
+
+define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vqshlq_n_u64
+; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+entry:
+ %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer)
+ ret <2 x i64> %vqshl_n1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>)
+declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>)
diff --git a/test/CodeGen/AArch64/neon-select_cc.ll b/test/CodeGen/AArch64/neon-select_cc.ll
new file mode 100644
index 0000000..f6b5d3c
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-select_cc.ll
@@ -0,0 +1,202 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_i8:
+; CHECK: and w0, w0, #0xff
+; CHECK-NEXT: cmp w0, w1, uxtb
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.8b, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i8 %a, %b
+ %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+ ret <8x i8> %e
+}
+
+define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_f32:
+; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+ ret <8x i8> %e
+}
+
+define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_f64:
+; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+ %cmp31 = fcmp oeq double %a, %b
+ %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+ ret <8x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_i8:
+; CHECK: and w0, w0, #0xff
+; CHECK-NEXT: cmp w0, w1, uxtb
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.16b, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i8 %a, %b
+ %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+ ret <16x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_f32:
+; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+ ret <16x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_f64:
+; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+ %cmp31 = fcmp oeq double %a, %b
+ %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+ ret <16x i8> %e
+}
+
+define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) {
+; CHECK-LABEL: test_select_cc_v4i16:
+; CHECK: and w0, w0, #0xffff
+; CHECK-NEXT: cmp w0, w1, uxth
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.4h, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i16 %a, %b
+ %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d
+ ret <4x i16> %e
+}
+
+define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) {
+; CHECK-LABEL: test_select_cc_v8i16:
+; CHECK: and w0, w0, #0xffff
+; CHECK-NEXT: cmp w0, w1, uxth
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.8h, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i16 %a, %b
+ %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d
+ ret <8x i16> %e
+}
+
+define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) {
+; CHECK-LABEL: test_select_cc_v2i32:
+; CHECK: cmp w0, w1, uxtw
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.2s, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i32 %a, %b
+ %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d
+ ret <2x i32> %e
+}
+
+define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) {
+; CHECK-LABEL: test_select_cc_v4i32:
+; CHECK: cmp w0, w1, uxtw
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i32 %a, %b
+ %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d
+ ret <4x i32> %e
+}
+
+define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) {
+; CHECK-LABEL: test_select_cc_v1i64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csinv x0, xzr, xzr, ne
+; CHECK-NEXT: fmov d{{[0-9]+}}, x0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i64 %a, %b
+ %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d
+ ret <1x i64> %e
+}
+
+define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) {
+; CHECK-LABEL: test_select_cc_v2i64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csinv x0, xzr, xzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.2d, x0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i64 %a, %b
+ %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d
+ ret <2x i64> %e
+}
+
+define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) {
+; CHECK-LABEL: test_select_cc_v1f32:
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel s0, s2, s3, eq
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <1 x float> %c, <1 x float> %d
+ ret <1 x float> %e
+}
+
+define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2 x float> %d ) {
+; CHECK-LABEL: test_select_cc_v2f32:
+; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <2 x float> %c, <2 x float> %d
+ ret <2 x float> %e
+}
+
+define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x float> %d ) {
+; CHECK-LABEL: test_select_cc_v4f32:
+; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <4x float> %c, <4x float> %d
+ ret <4x float> %e
+}
+
+define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x float> %d ) {
+; CHECK-LABEL: test_select_cc_v4f32_icmp:
+; CHECK: cmp w0, w1, uxtw
+; CHECK: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i32 %a, %b
+ %e = select i1 %cmp31, <4x float> %c, <4x float> %d
+ ret <4x float> %e
+}
+
+define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c, <1 x double> %d ) {
+; CHECK-LABEL: test_select_cc_v1f64:
+; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+ %cmp31 = fcmp oeq double %a, %b
+ %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
+ ret <1 x double> %e
+}
+
+define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c, <1 x double> %d ) {
+; CHECK-LABEL: test_select_cc_v1f64_icmp:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csinv x0, xzr, xzr, ne
+; CHECK-NEXT: fmov d{{[0-9]+}}, x0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i64 %a, %b
+ %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
+ ret <1 x double> %e
+}
+
+define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c, <2 x double> %d ) {
+; CHECK-LABEL: test_select_cc_v2f64:
+; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+ %cmp31 = fcmp oeq double %a, %b
+ %e = select i1 %cmp31, <2 x double> %c, <2 x double> %d
+ ret <2 x double> %e
+}
diff --git a/test/CodeGen/AArch64/neon-shift-left-long.ll b/test/CodeGen/AArch64/neon-shift-left-long.ll
index d45c476..d10d551 100644
--- a/test/CodeGen/AArch64/neon-shift-left-long.ll
+++ b/test/CodeGen/AArch64/neon-shift-left-long.ll
@@ -191,3 +191,13 @@ define <2 x i64> @test_ushll2_shl0_v4i32(<4 x i32> %a) {
%tmp = zext <2 x i32> %1 to <2 x i64>
ret <2 x i64> %tmp
}
+
+define <8 x i16> @test_ushll_cmp(<8 x i8> %a, <8 x i8> %b) #0 {
+; CHECK: test_ushll_cmp:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
+ %cmp.i = icmp eq <8 x i8> %a, %b
+ %vcgtz.i.i = sext <8 x i1> %cmp.i to <8 x i8>
+ %vmovl.i.i.i = zext <8 x i8> %vcgtz.i.i to <8 x i16>
+ ret <8 x i16> %vmovl.i.i.i
+}
diff --git a/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll b/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
new file mode 100644
index 0000000..0b520d7
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
@@ -0,0 +1,333 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @shl.v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: shl.v8i8:
+; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %c = shl <8 x i8> %a, %b
+ ret <8 x i8> %c
+}
+
+define <4 x i16> @shl.v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: shl.v4i16:
+; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %c = shl <4 x i16> %a, %b
+ ret <4 x i16> %c
+}
+
+define <2 x i32> @shl.v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: shl.v2i32:
+; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %c = shl <2 x i32> %a, %b
+ ret <2 x i32> %c
+}
+
+define <1 x i64> @shl.v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: shl.v1i64:
+; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %c = shl <1 x i64> %a, %b
+ ret <1 x i64> %c
+}
+
+define <16 x i8> @shl.v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shl.v16i8:
+; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %c = shl <16 x i8> %a, %b
+ ret <16 x i8> %c
+}
+
+define <8 x i16> @shl.v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shl.v8i16:
+; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %c = shl <8 x i16> %a, %b
+ ret <8 x i16> %c
+}
+
+define <4 x i32> @shl.v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shl.v4i32:
+; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %c = shl <4 x i32> %a, %b
+ ret <4 x i32> %c
+}
+
+define <2 x i64> @shl.v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shl.v2i64:
+; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %c = shl <2 x i64> %a, %b
+ ret <2 x i64> %c
+}
+
+define <8 x i8> @lshr.v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: lshr.v8i8:
+; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %c = lshr <8 x i8> %a, %b
+ ret <8 x i8> %c
+}
+
+define <4 x i16> @lshr.v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: lshr.v4i16:
+; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %c = lshr <4 x i16> %a, %b
+ ret <4 x i16> %c
+}
+
+define <2 x i32> @lshr.v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: lshr.v2i32:
+; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %c = lshr <2 x i32> %a, %b
+ ret <2 x i32> %c
+}
+
+define <1 x i64> @lshr.v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: lshr.v1i64:
+; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %c = lshr <1 x i64> %a, %b
+ ret <1 x i64> %c
+}
+
+define <16 x i8> @lshr.v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: lshr.v16i8:
+; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %c = lshr <16 x i8> %a, %b
+ ret <16 x i8> %c
+}
+
+define <8 x i16> @lshr.v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: lshr.v8i16:
+; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %c = lshr <8 x i16> %a, %b
+ ret <8 x i16> %c
+}
+
+define <4 x i32> @lshr.v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: lshr.v4i32:
+; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %c = lshr <4 x i32> %a, %b
+ ret <4 x i32> %c
+}
+
+define <2 x i64> @lshr.v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: lshr.v2i64:
+; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %c = lshr <2 x i64> %a, %b
+ ret <2 x i64> %c
+}
+
+define <8 x i8> @ashr.v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: ashr.v8i8:
+; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %c = ashr <8 x i8> %a, %b
+ ret <8 x i8> %c
+}
+
+define <4 x i16> @ashr.v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: ashr.v4i16:
+; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %c = ashr <4 x i16> %a, %b
+ ret <4 x i16> %c
+}
+
+define <2 x i32> @ashr.v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: ashr.v2i32:
+; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %c = ashr <2 x i32> %a, %b
+ ret <2 x i32> %c
+}
+
+define <1 x i64> @ashr.v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: ashr.v1i64:
+; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: sshl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %c = ashr <1 x i64> %a, %b
+ ret <1 x i64> %c
+}
+
+define <16 x i8> @ashr.v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: ashr.v16i8:
+; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: sshl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %c = ashr <16 x i8> %a, %b
+ ret <16 x i8> %c
+}
+
+define <8 x i16> @ashr.v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: ashr.v8i16:
+; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+; CHECK: sshl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %c = ashr <8 x i16> %a, %b
+ ret <8 x i16> %c
+}
+
+define <4 x i32> @ashr.v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: ashr.v4i32:
+; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+; CHECK: sshl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %c = ashr <4 x i32> %a, %b
+ ret <4 x i32> %c
+}
+
+define <2 x i64> @ashr.v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: ashr.v2i64:
+; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+; CHECK: sshl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %c = ashr <2 x i64> %a, %b
+ ret <2 x i64> %c
+}
+
+define <1 x i64> @shl.v1i64.0(<1 x i64> %a) {
+; CHECK-LABEL: shl.v1i64.0:
+; CHECK-NOT: shl d{{[0-9]+}}, d{{[0-9]+}}, #0
+ %c = shl <1 x i64> %a, zeroinitializer
+ ret <1 x i64> %c
+}
+
+define <2 x i32> @shl.v2i32.0(<2 x i32> %a) {
+; CHECK-LABEL: shl.v2i32.0:
+; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0
+ %c = shl <2 x i32> %a, zeroinitializer
+ ret <2 x i32> %c
+}
+
+; The following test cases test shl/ashr/lshr with v1i8/v1i16/v1i32 types
+
+define <1 x i8> @shl.v1i8(<1 x i8> %a, <1 x i8> %b) {
+; CHECK-LABEL: shl.v1i8:
+; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %c = shl <1 x i8> %a, %b
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @shl.v1i16(<1 x i16> %a, <1 x i16> %b) {
+; CHECK-LABEL: shl.v1i16:
+; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %c = shl <1 x i16> %a, %b
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @shl.v1i32(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: shl.v1i32:
+; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %c = shl <1 x i32> %a, %b
+ ret <1 x i32> %c
+}
+
+define <1 x i8> @ashr.v1i8(<1 x i8> %a, <1 x i8> %b) {
+; CHECK-LABEL: ashr.v1i8:
+; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %c = ashr <1 x i8> %a, %b
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @ashr.v1i16(<1 x i16> %a, <1 x i16> %b) {
+; CHECK-LABEL: ashr.v1i16:
+; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %c = ashr <1 x i16> %a, %b
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @ashr.v1i32(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: ashr.v1i32:
+; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %c = ashr <1 x i32> %a, %b
+ ret <1 x i32> %c
+}
+
+define <1 x i8> @lshr.v1i8(<1 x i8> %a, <1 x i8> %b) {
+; CHECK-LABEL: lshr.v1i8:
+; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %c = lshr <1 x i8> %a, %b
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @lshr.v1i16(<1 x i16> %a, <1 x i16> %b) {
+; CHECK-LABEL: lshr.v1i16:
+; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %c = lshr <1 x i16> %a, %b
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @lshr.v1i32(<1 x i32> %a, <1 x i32> %b) {
+; CHECK-LABEL: lshr.v1i32:
+; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %c = lshr <1 x i32> %a, %b
+ ret <1 x i32> %c
+}
+
+define <1 x i8> @shl.v1i8.imm(<1 x i8> %a) {
+; CHECK-LABEL: shl.v1i8.imm:
+; CHECK: shl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
+ %c = shl <1 x i8> %a, <i8 3>
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @shl.v1i16.imm(<1 x i16> %a) {
+; CHECK-LABEL: shl.v1i16.imm:
+; CHECK: shl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #5
+ %c = shl <1 x i16> %a, <i16 5>
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @shl.v1i32.imm(<1 x i32> %a) {
+; CHECK-LABEL: shl.v1i32.imm:
+; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0
+ %c = shl <1 x i32> %a, zeroinitializer
+ ret <1 x i32> %c
+}
+
+define <1 x i8> @ashr.v1i8.imm(<1 x i8> %a) {
+; CHECK-LABEL: ashr.v1i8.imm:
+; CHECK: sshr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
+ %c = ashr <1 x i8> %a, <i8 3>
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @ashr.v1i16.imm(<1 x i16> %a) {
+; CHECK-LABEL: ashr.v1i16.imm:
+; CHECK: sshr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10
+ %c = ashr <1 x i16> %a, <i16 10>
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @ashr.v1i32.imm(<1 x i32> %a) {
+; CHECK-LABEL: ashr.v1i32.imm:
+; CHECK: sshr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31
+ %c = ashr <1 x i32> %a, <i32 31>
+ ret <1 x i32> %c
+}
+
+define <1 x i8> @lshr.v1i8.imm(<1 x i8> %a) {
+; CHECK-LABEL: lshr.v1i8.imm:
+; CHECK: ushr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
+ %c = lshr <1 x i8> %a, <i8 3>
+ ret <1 x i8> %c
+}
+
+define <1 x i16> @lshr.v1i16.imm(<1 x i16> %a) {
+; CHECK-LABEL: lshr.v1i16.imm:
+; CHECK: ushr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10
+ %c = lshr <1 x i16> %a, <i16 10>
+ ret <1 x i16> %c
+}
+
+define <1 x i32> @lshr.v1i32.imm(<1 x i32> %a) {
+; CHECK-LABEL: lshr.v1i32.imm:
+; CHECK: ushr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31
+ %c = lshr <1 x i32> %a, <i32 31>
+ ret <1 x i32> %c
+}
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
index 3f28320..927c933 100644
--- a/test/CodeGen/AArch64/neon-simd-ldst-one.ll
+++ b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
@@ -1,5 +1,8 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+%struct.uint8x16x2_t = type { [2 x <16 x i8>] }
+%struct.poly8x16x2_t = type { [2 x <16 x i8>] }
+%struct.uint8x16x3_t = type { [3 x <16 x i8>] }
%struct.int8x16x2_t = type { [2 x <16 x i8>] }
%struct.int16x8x2_t = type { [2 x <8 x i16>] }
%struct.int32x4x2_t = type { [2 x <4 x i32>] }
@@ -37,6 +40,87 @@
%struct.float32x2x4_t = type { [4 x <2 x float>] }
%struct.float64x1x4_t = type { [4 x <1 x double>] }
+define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
+; CHECK-LABEL: test_ld_from_poll_v16i8
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
+ ret <16 x i8> %b
+}
+
+define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: test_ld_from_poll_v8i16
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+ ret <8 x i16> %b
+}
+
+define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4i32
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x i32> %b
+}
+
+define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2i64
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <2 x i64> %a, <i64 1, i64 2>
+ ret <2 x i64> %b
+}
+
+define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4f32
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
+ ret <4 x float> %b
+}
+
+define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2f64
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = fadd <2 x double> %a, <double 1.0, double 2.0>
+ ret <2 x double> %b
+}
+
+define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: test_ld_from_poll_v8i8
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
+ ret <8 x i8> %b
+}
+
+define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4i16
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
+ ret <4 x i16> %b
+}
+
+define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2i32
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <2 x i32> %a, <i32 1, i32 2>
+ ret <2 x i32> %b
+}
+
define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
; CHECK-LABEL: test_vld1q_dup_s8
; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
@@ -155,6 +239,31 @@ entry:
ret <1 x double> %1
}
+define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
+; As there is a store operation depending on %1, the LD1R pattern can't be selected,
+; so LDR and FMOV should be emitted instead.
+; CHECK-LABEL: testDUP.v1i64
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
+ %1 = load i64* %a, align 8
+ store i64 %1, i64* %b, align 8
+ %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
+ ret <1 x i64> %vecinit.i
+}
+
+define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
+; As there is a store operation depending on %1, the LD1R pattern can't be selected,
+; so a plain LDR should be emitted instead.
+; CHECK-LABEL: testDUP.v1f64
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
+ %1 = load double* %a, align 8
+ store double %1, double* %b, align 8
+ %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
+ ret <1 x double> %vecinit.i
+}
+
define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
; CHECK-LABEL: test_vld2q_dup_s8
; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
@@ -2110,4 +2219,81 @@ declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>,
declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) \ No newline at end of file
+declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+
+define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s8
+; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
+entry:
+ %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
+ %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
+ %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
+ %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
+; CHECK-LABEL: test_vld2q_lane_u8
+; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
+entry:
+ %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
+ %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
+ %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
+ %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.uint8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
+; CHECK-LABEL: test_vld2q_lane_p8
+; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
+entry:
+ %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
+ %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
+ %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
+ %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.poly8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s8
+; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
+entry:
+ %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
+ %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
+ %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
+ %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
+ %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int8x16x3_t %.fca.0.2.insert
+}
+
+define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
+; CHECK-LABEL: test_vld3q_lane_u8
+; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
+entry:
+ %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
+ %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
+ %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
+ %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
+ %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.uint8x16x3_t %.fca.0.2.insert
+}
+
diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll
index 8eac1e8..7a51c0f 100644
--- a/test/CodeGen/AArch64/neon-simd-tbl.ll
+++ b/test/CodeGen/AArch64/neon-simd-tbl.ll
@@ -1,45 +1,45 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8>, <16 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8>, <16 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>)
declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8>, <8 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8>, <16 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK: test_vtbl1_s8:
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
%vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
ret <8 x i8> %vtbl11.i
}
@@ -47,7 +47,7 @@ define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) {
; CHECK: test_vqtbl1_s8:
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+ %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
ret <8 x i8> %vtbl1.i
}
@@ -58,7 +58,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
%vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
ret <8 x i8> %vtbl17.i
}
@@ -68,7 +68,7 @@ define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+ %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl2.i
}
@@ -81,7 +81,7 @@ entry:
%__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
%vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
ret <8 x i8> %vtbl212.i
}
@@ -92,7 +92,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+ %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl3.i
}
@@ -106,7 +106,7 @@ entry:
%__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
%vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+ %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
ret <8 x i8> %vtbl216.i
}
@@ -118,7 +118,7 @@ entry:
%__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
%__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+ %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl4.i
}
@@ -126,7 +126,7 @@ define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK: test_vqtbl1q_s8:
; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
ret <16 x i8> %vtbl1.i
}
@@ -136,7 +136,7 @@ define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+ %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl2.i
}
@@ -147,7 +147,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+ %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl3.i
}
@@ -159,7 +159,7 @@ entry:
%__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
%__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+ %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl4.i
}
@@ -168,7 +168,7 @@ define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
%vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
%0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
%1 = sext <8 x i1> %0 to <8 x i8>
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
@@ -182,7 +182,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
%vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+ %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
ret <8 x i8> %vtbx17.i
}
@@ -195,7 +195,7 @@ entry:
%__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
%vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
%0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
%1 = sext <8 x i1> %0 to <8 x i8>
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
@@ -212,7 +212,7 @@ entry:
%__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
%vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+ %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
ret <8 x i8> %vtbx216.i
}
@@ -220,7 +220,7 @@ define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK: test_vqtbx1_s8:
; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+ %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
ret <8 x i8> %vtbx1.i
}
@@ -230,7 +230,7 @@ define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8>
entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+ %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx2.i
}
@@ -241,7 +241,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+ %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx3.i
}
@@ -253,7 +253,7 @@ entry:
%__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
%__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+ %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx4.i
}
@@ -261,7 +261,7 @@ define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK: test_vqtbx1q_s8:
; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+ %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
ret <16 x i8> %vtbx1.i
}
@@ -271,7 +271,7 @@ define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x
entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+ %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx2.i
}
@@ -282,7 +282,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+ %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx3.i
}
@@ -294,7 +294,7 @@ entry:
%__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
%__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+ %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx4.i
}
@@ -303,7 +303,7 @@ define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
%vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
ret <8 x i8> %vtbl11.i
}
@@ -311,7 +311,7 @@ define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) {
; CHECK: test_vqtbl1_u8:
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+ %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
ret <8 x i8> %vtbl1.i
}
@@ -322,7 +322,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
%vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
ret <8 x i8> %vtbl17.i
}
@@ -332,7 +332,7 @@ define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+ %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl2.i
}
@@ -345,7 +345,7 @@ entry:
%__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
%vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
ret <8 x i8> %vtbl212.i
}
@@ -356,7 +356,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+ %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl3.i
}
@@ -370,7 +370,7 @@ entry:
%__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
%vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+ %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
ret <8 x i8> %vtbl216.i
}
@@ -382,7 +382,7 @@ entry:
%__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
%__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+ %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl4.i
}
@@ -390,7 +390,7 @@ define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK: test_vqtbl1q_u8:
; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
ret <16 x i8> %vtbl1.i
}
@@ -400,7 +400,7 @@ define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+ %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl2.i
}
@@ -411,7 +411,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+ %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl3.i
}
@@ -423,7 +423,7 @@ entry:
%__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
%__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+ %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl4.i
}
@@ -432,7 +432,7 @@ define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
%vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
%0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
%1 = sext <8 x i1> %0 to <8 x i8>
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
@@ -446,7 +446,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
%vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+ %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
ret <8 x i8> %vtbx17.i
}
@@ -459,7 +459,7 @@ entry:
%__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
%vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
%0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
%1 = sext <8 x i1> %0 to <8 x i8>
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
@@ -476,7 +476,7 @@ entry:
%__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
%vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+ %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
ret <8 x i8> %vtbx216.i
}
@@ -484,7 +484,7 @@ define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK: test_vqtbx1_u8:
; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+ %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
ret <8 x i8> %vtbx1.i
}
@@ -494,7 +494,7 @@ define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8>
entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+ %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx2.i
}
@@ -505,7 +505,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+ %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx3.i
}
@@ -517,7 +517,7 @@ entry:
%__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
%__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+ %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx4.i
}
@@ -525,7 +525,7 @@ define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK: test_vqtbx1q_u8:
; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+ %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
ret <16 x i8> %vtbx1.i
}
@@ -535,7 +535,7 @@ define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x
entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+ %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx2.i
}
@@ -546,7 +546,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+ %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx3.i
}
@@ -558,7 +558,7 @@ entry:
%__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
%__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+ %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx4.i
}
@@ -567,7 +567,7 @@ define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) {
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
%vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
ret <8 x i8> %vtbl11.i
}
@@ -575,7 +575,7 @@ define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) {
; CHECK: test_vqtbl1_p8:
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+ %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
ret <8 x i8> %vtbl1.i
}
@@ -586,7 +586,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
%vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
ret <8 x i8> %vtbl17.i
}
@@ -596,7 +596,7 @@ define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+ %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl2.i
}
@@ -609,7 +609,7 @@ entry:
%__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
%vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
ret <8 x i8> %vtbl212.i
}
@@ -620,7 +620,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+ %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl3.i
}
@@ -634,7 +634,7 @@ entry:
%__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
%vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+ %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
ret <8 x i8> %vtbl216.i
}
@@ -646,7 +646,7 @@ entry:
%__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
%__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+ %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
ret <8 x i8> %vtbl4.i
}
@@ -654,7 +654,7 @@ define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) {
; CHECK: test_vqtbl1q_p8:
; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+ %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
ret <16 x i8> %vtbl1.i
}
@@ -664,7 +664,7 @@ define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
entry:
%__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+ %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl2.i
}
@@ -675,7 +675,7 @@ entry:
%__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
%__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+ %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl3.i
}
@@ -687,7 +687,7 @@ entry:
%__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
%__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
%__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+ %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
ret <16 x i8> %vtbl4.i
}
@@ -696,7 +696,7 @@ define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
%vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
%0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
%1 = sext <8 x i1> %0 to <8 x i8>
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
@@ -710,7 +710,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
%vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+ %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
ret <8 x i8> %vtbx17.i
}
@@ -723,7 +723,7 @@ entry:
%__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
%vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
%0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
%1 = sext <8 x i1> %0 to <8 x i8>
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
@@ -740,7 +740,7 @@ entry:
%__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
%vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+ %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
ret <8 x i8> %vtbx216.i
}
@@ -748,7 +748,7 @@ define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK: test_vqtbx1_p8:
; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+ %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
ret <8 x i8> %vtbx1.i
}
@@ -758,7 +758,7 @@ define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8>
entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+ %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx2.i
}
@@ -769,7 +769,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+ %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx3.i
}
@@ -781,7 +781,7 @@ entry:
%__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
%__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+ %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
ret <8 x i8> %vtbx4.i
}
@@ -789,7 +789,7 @@ define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK: test_vqtbx1q_p8:
; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+ %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
ret <16 x i8> %vtbx1.i
}
@@ -799,7 +799,7 @@ define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x
entry:
%__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+ %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx2.i
}
@@ -810,7 +810,7 @@ entry:
%__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
%__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+ %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx3.i
}
@@ -822,7 +822,7 @@ entry:
%__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
%__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
%__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+ %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
ret <16 x i8> %vtbx4.i
}
diff --git a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
new file mode 100644
index 0000000..bb3300e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+; This file tests the spill of FPR8/FPR16. The volatile loads/stores force the
+; allocator to keep the value live until it's needed.
+
+%bigtype_v1i8 = type [20 x <1 x i8>]
+
+define void @spill_fpr8(%bigtype_v1i8* %addr) {
+; CHECK-LABEL: spill_fpr8:
+; CHECK: 1-byte Folded Spill
+; CHECK: 1-byte Folded Reload
+ %val1 = load volatile %bigtype_v1i8* %addr
+ %val2 = load volatile %bigtype_v1i8* %addr
+ store volatile %bigtype_v1i8 %val1, %bigtype_v1i8* %addr
+ store volatile %bigtype_v1i8 %val2, %bigtype_v1i8* %addr
+ ret void
+}
+
+%bigtype_v1i16 = type [20 x <1 x i16>]
+
+define void @spill_fpr16(%bigtype_v1i16* %addr) {
+; CHECK-LABEL: spill_fpr16:
+; CHECK: 2-byte Folded Spill
+; CHECK: 2-byte Folded Reload
+ %val1 = load volatile %bigtype_v1i16* %addr
+ %val2 = load volatile %bigtype_v1i16* %addr
+ store volatile %bigtype_v1i16 %val1, %bigtype_v1i16* %addr
+ store volatile %bigtype_v1i16 %val2, %bigtype_v1i16* %addr
+ ret void
+} \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-truncStore-extLoad.ll b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
new file mode 100644
index 0000000..e5b7694
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; A vector TruncStore cannot be selected directly.
+; Test that a trunc IR followed by a vector store IR is selected correctly.
+define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) {
+; CHECK-LABEL: truncStore.v2i64:
+; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+ %b = trunc <2 x i64> %a to <2 x i32>
+ store <2 x i32> %b, <2 x i32>* %result
+ ret void
+}
+
+define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
+; CHECK-LABEL: truncStore.v4i32:
+; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+ %b = trunc <4 x i32> %a to <4 x i16>
+ store <4 x i16> %b, <4 x i16>* %result
+ ret void
+}
+
+define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
+; CHECK-LABEL: truncStore.v8i16:
+; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+ %b = trunc <8 x i16> %a to <8 x i8>
+ store <8 x i8> %b, <8 x i8>* %result
+ ret void
+}
+
+; A vector LoadExt cannot be selected directly.
+; Test that a vector load IR followed by a sext/zext IR is selected correctly.
+define <4 x i32> @loadSExt.v4i8(<4 x i8>* %ref) {
+; CHECK-LABEL: loadSExt.v4i8:
+; CHECK: ldrsb
+ %a = load <4 x i8>* %ref
+ %conv = sext <4 x i8> %a to <4 x i32>
+ ret <4 x i32> %conv
+}
+
+define <4 x i32> @loadZExt.v4i8(<4 x i8>* %ref) {
+; CHECK-LABEL: loadZExt.v4i8:
+; CHECK: ldrb
+ %a = load <4 x i8>* %ref
+ %conv = zext <4 x i8> %a to <4 x i32>
+ ret <4 x i32> %conv
+}
+
+define i32 @loadExt.i32(<4 x i8>* %ref) {
+; CHECK-LABEL: loadExt.i32:
+; CHECK: ldrb
+ %a = load <4 x i8>* %ref
+ %vecext = extractelement <4 x i8> %a, i32 0
+ %conv = zext i8 %vecext to i32
+ ret i32 %conv
+} \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-v1i1-setcc.ll b/test/CodeGen/AArch64/neon-v1i1-setcc.ll
new file mode 100644
index 0000000..6c7d009
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-v1i1-setcc.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+; This file tests DAG nodes like "v1i1 SETCC v1i64, v1i64". As the v1i1 type
+; is illegal in the AArch64 backend, the legalizer tries to scalarize this node.
+; As the v1i64 operands of SETCC are legal types, they will not be scalarized.
+; Currently the type legalizer will hit an assertion failure, as it assumes all
+; operands of SETCC have been legalized.
+; FIXME: If the type scalarization algorithm is improved and can legalize
+; "v1i1 SETCC" correctly, these test cases will no longer be needed.
+
+define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) {
+; CHECK-LABEL: test_sext_extr_cmp_0:
+; CHECK: cmge d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = icmp sge <1 x i64> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ %vget_lane = sext i1 %2 to i64
+ ret i64 %vget_lane
+}
+
+define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) {
+; CHECK-LABEL: test_sext_extr_cmp_1:
+; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fcmp oeq <1 x double> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ %vget_lane = sext i1 %2 to i64
+ ret i64 %vget_lane
+}
+
+define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_select_v1i1_0:
+; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
+ ret <1 x i64> %res
+}
+
+define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_select_v1i1_1:
+; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = fcmp oeq <1 x double> %v1, %v2
+ %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
+ ret <1 x i64> %res
+}
+
+define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) {
+; CHECK-LABEL: test_select_v1i1_2:
+; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3
+ ret <1 x double> %res
+}
+
+define i32 @test_br_extr_cmp(<1 x i64> %v1, <1 x i64> %v2) {
+; CHECK-LABEL: test_br_extr_cmp:
+; CHECK: cmp x{{[0-9]+}}, x{{[0-9]+}}
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ br i1 %2, label %if.end, label %if.then
+
+if.then:
+ ret i32 0;
+
+if.end:
+ ret i32 1;
+}
diff --git a/test/CodeGen/AArch64/neon-vector-list-spill.ll b/test/CodeGen/AArch64/neon-vector-list-spill.ll
new file mode 100644
index 0000000..3ab69c4
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-vector-list-spill.ll
@@ -0,0 +1,175 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast
+
+; FIXME: We should not generate ld/st for such register spill/fill, because this
+; test case is very simple and the register pressure is not high. If the
+; spill/fill algorithm is improved, this test case may no longer be triggered,
+; and it can then be deleted.
+define i32 @spill.DPairReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DPairReg:
+; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+ %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %arg1, i32 4)
+ %cmp = icmp eq i32 %arg2, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @foo()
+ br label %if.end
+
+if.end:
+ %vld.extract = extractvalue { <2 x i32>, <2 x i32> } %vld, 0
+ %res = extractelement <2 x i32> %vld.extract, i32 1
+ ret i32 %res
+}
+
+define i16 @spill.DTripleReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DTripleReg:
+; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %arg1, i32 4)
+ %cmp = icmp eq i32 %arg2, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @foo()
+ br label %if.end
+
+if.end:
+ %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0
+ %res = extractelement <4 x i16> %vld.extract, i32 1
+ ret i16 %res
+}
+
+define i16 @spill.DQuadReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DQuadReg:
+; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %arg1, i32 4)
+ %cmp = icmp eq i32 %arg2, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @foo()
+ br label %if.end
+
+if.end:
+ %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0
+ %res = extractelement <4 x i16> %vld.extract, i32 0
+ ret i16 %res
+}
+
+define i32 @spill.QPairReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QPairReg:
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %arg1, i32 4)
+ %cmp = icmp eq i32 %arg2, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @foo()
+ br label %if.end
+
+if.end:
+ %vld.extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
+ %res = extractelement <4 x i32> %vld.extract, i32 1
+ ret i32 %res
+}
+
+define float @spill.QTripleReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QTripleReg:
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+ %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %arg1, i32 4)
+ %cmp = icmp eq i32 %arg2, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @foo()
+ br label %if.end
+
+if.end:
+ %vld3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
+ %res = extractelement <4 x float> %vld3.extract, i32 1
+ ret float %res
+}
+
+define i8 @spill.QQuadReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QQuadReg:
+; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+entry:
+ %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %arg1, i32 4)
+ %cmp = icmp eq i32 %arg2, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @foo()
+ br label %if.end
+
+if.end:
+ %vld.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld, 0
+ %res = extractelement <16 x i8> %vld.extract, i32 1
+ ret i8 %res
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
+
+declare void @foo()
+
+; FIXME: We should not generate ld/st for such register spill/fill, because this
+; test case is very simple and the register pressure is not high. If the
+; spill/fill algorithm is improved, this test case may no longer be triggered,
+; and it can then be deleted.
+; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
+ tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+ tail call void @foo()
+ %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast <2 x i64> %sv to <8 x i16>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %3 = mul <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
+ tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+ tail call void @foo()
+ %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast <2 x i64> %sv to <8 x i16>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %3 = mul <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
+ tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+ tail call void @foo()
+ %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast <2 x i64> %sv to <8 x i16>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %3 = mul <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) \ No newline at end of file
diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll
index 6ec4b19..3404d3f 100644
--- a/test/CodeGen/AArch64/pic-eh-stubs.ll
+++ b/test/CodeGen/AArch64/pic-eh-stubs.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
; Make sure exception-handling PIC code can be linked correctly. An alternative
; to the sequence described below would have .gcc_except_table itself writable
@@ -10,8 +11,8 @@
; ... referring indirectly to stubs for its typeinfo ...
; CHECK: // @TType Encoding = indirect pcrel sdata8
; ... one of which is "int"'s typeinfo
-; CHECK: .Ltmp9:
-; CHECK-NEXT: .xword .L_ZTIi.DW.stub-.Ltmp9
+; CHECK: .Ltmp7:
+; CHECK-NEXT: .xword .L_ZTIi.DW.stub-.Ltmp7
; .. and which is properly defined (in a writable section for the dynamic loader) later.
; CHECK: .section .data.rel,"aw"
diff --git a/test/CodeGen/AArch64/ragreedy-csr.ll b/test/CodeGen/AArch64/ragreedy-csr.ll
new file mode 100644
index 0000000..18a948b
--- /dev/null
+++ b/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -0,0 +1,297 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -regalloc=greedy -regalloc-csr-first-time-cost=15 | FileCheck %s
+
+; This testing case is reduced from 197.parser prune_match function.
+; We make sure that we do not use callee-saved registers (x19 to x25).
+; rdar://16162005
+
+; CHECK-LABEL: prune_match:
+; CHECK: entry
+; CHECK: str x30, [sp
+; CHECK-NOT: stp x25,
+; CHECK-NOT: stp x23, x24
+; CHECK-NOT: stp x21, x22
+; CHECK-NOT: stp x19, x20
+; CHECK: if.end
+; CHECK: return
+; CHECK: ldr x30, [sp
+; CHECK-NOT: ldp x19, x20
+; CHECK-NOT: ldp x21, x22
+; CHECK-NOT: ldp x23, x24
+; CHECK-NOT: ldp x25,
+
+%struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* }
+%struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }
+%struct._RuneLocale = type { [8 x i8], [32 x i8], i32 (i8*, i64, i8**)*, i32 (i32, i8*, i64, i8**)*, i32, [256 x i32], [256 x i32], [256 x i32], %struct._RuneRange, %struct._RuneRange, %struct._RuneRange, i8*, i32, i32, %struct._RuneCharClass* }
+%struct._RuneRange = type { i32, %struct._RuneEntry* }
+%struct._RuneEntry = type { i32, i32, i32, i32* }
+%struct._RuneCharClass = type { [14 x i8], i32 }
+%struct.Exp_struct = type { i8, i8, i8, i8, %union.anon }
+%union.anon = type { %struct.E_list_struct* }
+%struct.E_list_struct = type { %struct.E_list_struct*, %struct.Exp_struct* }
+%struct.domain_struct = type { i8*, i32, %struct.List_o_links_struct*, i32, i32, %struct.d_tree_leaf_struct*, %struct.domain_struct* }
+%struct.d_tree_leaf_struct = type { %struct.domain_struct*, i32, %struct.d_tree_leaf_struct* }
+@_DefaultRuneLocale = external global %struct._RuneLocale
+declare i32 @__maskrune(i32, i64) #7
+define fastcc i32 @prune_match(%struct.Connector_struct* nocapture readonly %a, %struct.Connector_struct* nocapture readonly %b) #9 {
+entry:
+ %label56 = bitcast %struct.Connector_struct* %a to i16*
+ %0 = load i16* %label56, align 2
+ %label157 = bitcast %struct.Connector_struct* %b to i16*
+ %1 = load i16* %label157, align 2
+ %cmp = icmp eq i16 %0, %1
+ br i1 %cmp, label %if.end, label %return, !prof !988
+if.end:
+ %priority = getelementptr inbounds %struct.Connector_struct* %a, i64 0, i32 2
+ %2 = load i8* %priority, align 1
+ %priority5 = getelementptr inbounds %struct.Connector_struct* %b, i64 0, i32 2
+ %3 = load i8* %priority5, align 1
+ %string = getelementptr inbounds %struct.Connector_struct* %a, i64 0, i32 5
+ %4 = load i8** %string, align 8
+ %string7 = getelementptr inbounds %struct.Connector_struct* %b, i64 0, i32 5
+ %5 = load i8** %string7, align 8
+ br label %while.cond
+while.cond:
+ %lsr.iv27 = phi i64 [ %lsr.iv.next28, %if.end17 ], [ 0, %if.end ]
+ %scevgep55 = getelementptr i8* %4, i64 %lsr.iv27
+ %6 = load i8* %scevgep55, align 1
+ %idxprom.i.i = sext i8 %6 to i64
+ %isascii.i.i224 = icmp sgt i8 %6, -1
+ br i1 %isascii.i.i224, label %cond.true.i.i, label %cond.false.i.i, !prof !181
+cond.true.i.i:
+ %arrayidx.i.i = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i64 0, i32 5, i64 %idxprom.i.i
+ %7 = load i32* %arrayidx.i.i, align 4
+ %and.i.i = and i32 %7, 32768
+ br label %isupper.exit
+cond.false.i.i:
+ %8 = trunc i64 %idxprom.i.i to i8
+ %conv8 = sext i8 %8 to i32
+ %call3.i.i = tail call i32 @__maskrune(i32 %conv8, i64 32768) #3
+ br label %isupper.exit
+isupper.exit:
+ %tobool1.sink.i.in.i = phi i32 [ %and.i.i, %cond.true.i.i ], [ %call3.i.i, %cond.false.i.i ]
+ %tobool1.sink.i.i = icmp eq i32 %tobool1.sink.i.in.i, 0
+ br i1 %tobool1.sink.i.i, label %lor.rhs, label %while.body, !prof !989
+lor.rhs:
+ %sunkaddr = ptrtoint i8* %5 to i64
+ %sunkaddr58 = add i64 %sunkaddr, %lsr.iv27
+ %sunkaddr59 = inttoptr i64 %sunkaddr58 to i8*
+ %9 = load i8* %sunkaddr59, align 1
+ %idxprom.i.i214 = sext i8 %9 to i64
+ %isascii.i.i213225 = icmp sgt i8 %9, -1
+ br i1 %isascii.i.i213225, label %cond.true.i.i217, label %cond.false.i.i219, !prof !181
+cond.true.i.i217:
+ %arrayidx.i.i215 = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i64 0, i32 5, i64 %idxprom.i.i214
+ %10 = load i32* %arrayidx.i.i215, align 4
+ %and.i.i216 = and i32 %10, 32768
+ br label %isupper.exit223
+cond.false.i.i219:
+ %11 = trunc i64 %idxprom.i.i214 to i8
+ %conv9 = sext i8 %11 to i32
+ %call3.i.i218 = tail call i32 @__maskrune(i32 %conv9, i64 32768) #3
+ br label %isupper.exit223
+isupper.exit223:
+ %tobool1.sink.i.in.i220 = phi i32 [ %and.i.i216, %cond.true.i.i217 ], [ %call3.i.i218, %cond.false.i.i219 ]
+ %tobool1.sink.i.i221 = icmp eq i32 %tobool1.sink.i.in.i220, 0
+ br i1 %tobool1.sink.i.i221, label %while.end, label %while.body, !prof !990
+while.body:
+ %sunkaddr60 = ptrtoint i8* %4 to i64
+ %sunkaddr61 = add i64 %sunkaddr60, %lsr.iv27
+ %sunkaddr62 = inttoptr i64 %sunkaddr61 to i8*
+ %12 = load i8* %sunkaddr62, align 1
+ %sunkaddr63 = ptrtoint i8* %5 to i64
+ %sunkaddr64 = add i64 %sunkaddr63, %lsr.iv27
+ %sunkaddr65 = inttoptr i64 %sunkaddr64 to i8*
+ %13 = load i8* %sunkaddr65, align 1
+ %cmp14 = icmp eq i8 %12, %13
+ br i1 %cmp14, label %if.end17, label %return, !prof !991
+if.end17:
+ %lsr.iv.next28 = add i64 %lsr.iv27, 1
+ br label %while.cond
+while.end:
+ %14 = or i8 %3, %2
+ %15 = icmp eq i8 %14, 0
+ br i1 %15, label %if.then23, label %if.else88, !prof !992
+if.then23:
+ %sunkaddr66 = ptrtoint %struct.Connector_struct* %a to i64
+ %sunkaddr67 = add i64 %sunkaddr66, 16
+ %sunkaddr68 = inttoptr i64 %sunkaddr67 to i8**
+ %16 = load i8** %sunkaddr68, align 8
+ %17 = load i8* %16, align 1
+ %cmp26 = icmp eq i8 %17, 83
+ %sunkaddr69 = ptrtoint i8* %4 to i64
+ %sunkaddr70 = add i64 %sunkaddr69, %lsr.iv27
+ %sunkaddr71 = inttoptr i64 %sunkaddr70 to i8*
+ %18 = load i8* %sunkaddr71, align 1
+ br i1 %cmp26, label %land.lhs.true28, label %while.cond59.preheader, !prof !993
+land.lhs.true28:
+ switch i8 %18, label %land.rhs.preheader [
+ i8 112, label %land.lhs.true35
+ i8 0, label %return
+ ], !prof !994
+land.lhs.true35:
+ %sunkaddr72 = ptrtoint i8* %5 to i64
+ %sunkaddr73 = add i64 %sunkaddr72, %lsr.iv27
+ %sunkaddr74 = inttoptr i64 %sunkaddr73 to i8*
+ %19 = load i8* %sunkaddr74, align 1
+ switch i8 %19, label %land.rhs.preheader [
+ i8 112, label %land.lhs.true43
+ ], !prof !995
+land.lhs.true43:
+ %20 = ptrtoint i8* %16 to i64
+ %21 = sub i64 0, %20
+ %scevgep52 = getelementptr i8* %4, i64 %21
+ %scevgep53 = getelementptr i8* %scevgep52, i64 %lsr.iv27
+ %scevgep54 = getelementptr i8* %scevgep53, i64 -1
+ %cmp45 = icmp eq i8* %scevgep54, null
+ br i1 %cmp45, label %return, label %lor.lhs.false47, !prof !996
+lor.lhs.false47:
+ %22 = ptrtoint i8* %16 to i64
+ %23 = sub i64 0, %22
+ %scevgep47 = getelementptr i8* %4, i64 %23
+ %scevgep48 = getelementptr i8* %scevgep47, i64 %lsr.iv27
+ %scevgep49 = getelementptr i8* %scevgep48, i64 -2
+ %cmp50 = icmp eq i8* %scevgep49, null
+ br i1 %cmp50, label %land.lhs.true52, label %while.cond59.preheader, !prof !997
+land.lhs.true52:
+ %sunkaddr75 = ptrtoint i8* %4 to i64
+ %sunkaddr76 = add i64 %sunkaddr75, %lsr.iv27
+ %sunkaddr77 = add i64 %sunkaddr76, -1
+ %sunkaddr78 = inttoptr i64 %sunkaddr77 to i8*
+ %24 = load i8* %sunkaddr78, align 1
+ %cmp55 = icmp eq i8 %24, 73
+ %cmp61233 = icmp eq i8 %18, 0
+ %or.cond265 = or i1 %cmp55, %cmp61233
+ br i1 %or.cond265, label %return, label %land.rhs.preheader, !prof !998
+while.cond59.preheader:
+ %cmp61233.old = icmp eq i8 %18, 0
+ br i1 %cmp61233.old, label %return, label %land.rhs.preheader, !prof !999
+land.rhs.preheader:
+ %scevgep33 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep43 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs
+land.rhs:
+ %lsr.iv = phi i64 [ 0, %land.rhs.preheader ], [ %lsr.iv.next, %if.then83 ]
+ %25 = phi i8 [ %27, %if.then83 ], [ %18, %land.rhs.preheader ]
+ %scevgep34 = getelementptr i8* %scevgep33, i64 %lsr.iv
+ %26 = load i8* %scevgep34, align 1
+ %cmp64 = icmp eq i8 %26, 0
+ br i1 %cmp64, label %return, label %while.body66, !prof !1000
+while.body66:
+ %cmp68 = icmp eq i8 %25, 42
+ %cmp72 = icmp eq i8 %26, 42
+ %or.cond = or i1 %cmp68, %cmp72
+ br i1 %or.cond, label %if.then83, label %lor.lhs.false74, !prof !1001
+lor.lhs.false74:
+ %cmp77 = icmp ne i8 %25, %26
+ %cmp81 = icmp eq i8 %25, 94
+ %or.cond208 = or i1 %cmp77, %cmp81
+ br i1 %or.cond208, label %return, label %if.then83, !prof !1002
+if.then83:
+ %scevgep44 = getelementptr i8* %scevgep43, i64 %lsr.iv
+ %scevgep45 = getelementptr i8* %scevgep44, i64 1
+ %27 = load i8* %scevgep45, align 1
+ %cmp61 = icmp eq i8 %27, 0
+ %lsr.iv.next = add i64 %lsr.iv, 1
+ br i1 %cmp61, label %return, label %land.rhs, !prof !999
+if.else88:
+ %cmp89 = icmp eq i8 %2, 1
+ %cmp92 = icmp eq i8 %3, 2
+ %or.cond159 = and i1 %cmp89, %cmp92
+ br i1 %or.cond159, label %while.cond95.preheader, label %if.else123, !prof !1003
+while.cond95.preheader:
+ %sunkaddr79 = ptrtoint i8* %4 to i64
+ %sunkaddr80 = add i64 %sunkaddr79, %lsr.iv27
+ %sunkaddr81 = inttoptr i64 %sunkaddr80 to i8*
+ %28 = load i8* %sunkaddr81, align 1
+ %cmp97238 = icmp eq i8 %28, 0
+ br i1 %cmp97238, label %return, label %land.rhs99.preheader, !prof !1004
+land.rhs99.preheader:
+ %scevgep31 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep40 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs99
+land.rhs99:
+ %lsr.iv17 = phi i64 [ 0, %land.rhs99.preheader ], [ %lsr.iv.next18, %if.then117 ]
+ %29 = phi i8 [ %31, %if.then117 ], [ %28, %land.rhs99.preheader ]
+ %scevgep32 = getelementptr i8* %scevgep31, i64 %lsr.iv17
+ %30 = load i8* %scevgep32, align 1
+ %cmp101 = icmp eq i8 %30, 0
+ br i1 %cmp101, label %return, label %while.body104, !prof !1005
+while.body104:
+ %cmp107 = icmp eq i8 %29, %30
+ %cmp111 = icmp eq i8 %29, 42
+ %or.cond209 = or i1 %cmp107, %cmp111
+ %cmp115 = icmp eq i8 %30, 94
+ %or.cond210 = or i1 %or.cond209, %cmp115
+ br i1 %or.cond210, label %if.then117, label %return, !prof !1006
+if.then117:
+ %scevgep41 = getelementptr i8* %scevgep40, i64 %lsr.iv17
+ %scevgep42 = getelementptr i8* %scevgep41, i64 1
+ %31 = load i8* %scevgep42, align 1
+ %cmp97 = icmp eq i8 %31, 0
+ %lsr.iv.next18 = add i64 %lsr.iv17, 1
+ br i1 %cmp97, label %return, label %land.rhs99, !prof !1004
+if.else123:
+ %cmp124 = icmp eq i8 %3, 1
+ %cmp127 = icmp eq i8 %2, 2
+ %or.cond160 = and i1 %cmp124, %cmp127
+ br i1 %or.cond160, label %while.cond130.preheader, label %return, !prof !1007
+while.cond130.preheader:
+ %sunkaddr82 = ptrtoint i8* %4 to i64
+ %sunkaddr83 = add i64 %sunkaddr82, %lsr.iv27
+ %sunkaddr84 = inttoptr i64 %sunkaddr83 to i8*
+ %32 = load i8* %sunkaddr84, align 1
+ %cmp132244 = icmp eq i8 %32, 0
+ br i1 %cmp132244, label %return, label %land.rhs134.preheader, !prof !1008
+land.rhs134.preheader:
+ %scevgep29 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep37 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs134
+land.rhs134:
+ %lsr.iv22 = phi i64 [ 0, %land.rhs134.preheader ], [ %lsr.iv.next23, %if.then152 ]
+ %33 = phi i8 [ %35, %if.then152 ], [ %32, %land.rhs134.preheader ]
+ %scevgep30 = getelementptr i8* %scevgep29, i64 %lsr.iv22
+ %34 = load i8* %scevgep30, align 1
+ %cmp136 = icmp eq i8 %34, 0
+ br i1 %cmp136, label %return, label %while.body139, !prof !1009
+while.body139:
+ %cmp142 = icmp eq i8 %33, %34
+ %cmp146 = icmp eq i8 %34, 42
+ %or.cond211 = or i1 %cmp142, %cmp146
+ %cmp150 = icmp eq i8 %33, 94
+ %or.cond212 = or i1 %or.cond211, %cmp150
+ br i1 %or.cond212, label %if.then152, label %return, !prof !1010
+if.then152:
+ %scevgep38 = getelementptr i8* %scevgep37, i64 %lsr.iv22
+ %scevgep39 = getelementptr i8* %scevgep38, i64 1
+ %35 = load i8* %scevgep39, align 1
+ %cmp132 = icmp eq i8 %35, 0
+ %lsr.iv.next23 = add i64 %lsr.iv22, 1
+ br i1 %cmp132, label %return, label %land.rhs134, !prof !1008
+return:
+ %retval.0 = phi i32 [ 0, %entry ], [ 1, %land.lhs.true52 ], [ 1, %land.lhs.true43 ], [ 0, %if.else123 ], [ 1, %while.cond59.preheader ], [ 1, %while.cond95.preheader ], [ 1, %while.cond130.preheader ], [ 1, %land.lhs.true28 ], [ 1, %if.then83 ], [ 0, %lor.lhs.false74 ], [ 1, %land.rhs ], [ 1, %if.then117 ], [ 0, %while.body104 ], [ 1, %land.rhs99 ], [ 1, %if.then152 ], [ 0, %while.body139 ], [ 1, %land.rhs134 ], [ 0, %while.body ]
+ ret i32 %retval.0
+}
+!181 = metadata !{metadata !"branch_weights", i32 662038, i32 1}
+!988 = metadata !{metadata !"branch_weights", i32 12091450, i32 1916}
+!989 = metadata !{metadata !"branch_weights", i32 7564670, i32 4526781}
+!990 = metadata !{metadata !"branch_weights", i32 7484958, i32 13283499}
+!991 = metadata !{metadata !"branch_weights", i32 8677007, i32 4606493}
+!992 = metadata !{metadata !"branch_weights", i32 -1172426948, i32 145094705}
+!993 = metadata !{metadata !"branch_weights", i32 1468914, i32 5683688}
+!994 = metadata !{metadata !"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616}
+!995 = metadata !{metadata !"branch_weights", i32 1853716452, i32 -444717951, i32 932776759}
+!996 = metadata !{metadata !"branch_weights", i32 1004870, i32 20259}
+!997 = metadata !{metadata !"branch_weights", i32 20071, i32 189}
+!998 = metadata !{metadata !"branch_weights", i32 -1020255939, i32 572177766}
+!999 = metadata !{metadata !"branch_weights", i32 2666513, i32 3466431}
+!1000 = metadata !{metadata !"branch_weights", i32 5117635, i32 1859780}
+!1001 = metadata !{metadata !"branch_weights", i32 354902465, i32 -1444604407}
+!1002 = metadata !{metadata !"branch_weights", i32 -1762419279, i32 1592770684}
+!1003 = metadata !{metadata !"branch_weights", i32 1435905930, i32 -1951930624}
+!1004 = metadata !{metadata !"branch_weights", i32 1, i32 504888}
+!1005 = metadata !{metadata !"branch_weights", i32 94662, i32 504888}
+!1006 = metadata !{metadata !"branch_weights", i32 -1897793104, i32 160196332}
+!1007 = metadata !{metadata !"branch_weights", i32 2074643678, i32 -29579071}
+!1008 = metadata !{metadata !"branch_weights", i32 1, i32 226163}
+!1009 = metadata !{metadata !"branch_weights", i32 58357, i32 226163}
+!1010 = metadata !{metadata !"branch_weights", i32 -2072848646, i32 92907517}
diff --git a/test/CodeGen/AArch64/sext_inreg.ll b/test/CodeGen/AArch64/sext_inreg.ll
new file mode 100644
index 0000000..2f76081
--- /dev/null
+++ b/test/CodeGen/AArch64/sext_inreg.ll
@@ -0,0 +1,198 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; For formal arguments, we have the following vector type promotion,
+; v2i8 is promoted to v2i32(f64)
+; v2i16 is promoted to v2i32(f64)
+; v4i8 is promoted to v4i16(f64)
+; v8i1 is promoted to v8i16(f128)
+
+define <2 x i8> @test_sext_inreg_v2i8i16(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i8i16
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+ %1 = sext <2 x i8> %v1 to <2 x i16>
+ %2 = sext <2 x i8> %v2 to <2 x i16>
+ %3 = shufflevector <2 x i16> %1, <2 x i16> %2, <2 x i32> <i32 0, i32 2>
+ %4 = trunc <2 x i16> %3 to <2 x i8>
+ ret <2 x i8> %4
+}
+
+define <2 x i8> @test_sext_inreg_v2i8i16_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i8i16_2
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+ %a1 = shl <2 x i32> %v1, <i32 24, i32 24>
+ %a2 = ashr <2 x i32> %a1, <i32 24, i32 24>
+ %b1 = shl <2 x i32> %v2, <i32 24, i32 24>
+ %b2 = ashr <2 x i32> %b1, <i32 24, i32 24>
+ %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
+ %d = trunc <2 x i32> %c to <2 x i8>
+ ret <2 x i8> %d
+}
+
+define <2 x i8> @test_sext_inreg_v2i8i32(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i8i32
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+ %1 = sext <2 x i8> %v1 to <2 x i32>
+ %2 = sext <2 x i8> %v2 to <2 x i32>
+ %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> <i32 0, i32 2>
+ %4 = trunc <2 x i32> %3 to <2 x i8>
+ ret <2 x i8> %4
+}
+
+define <2 x i8> @test_sext_inreg_v2i8i64(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i8i64
+; CHECK: ushll v1.2d, v1.2s, #0
+; CHECK: ushll v0.2d, v0.2s, #0
+; CHECK: shl v0.2d, v0.2d, #56
+; CHECK: sshr v0.2d, v0.2d, #56
+; CHECK: shl v1.2d, v1.2d, #56
+; CHECK: sshr v1.2d, v1.2d, #56
+ %1 = sext <2 x i8> %v1 to <2 x i64>
+ %2 = sext <2 x i8> %v2 to <2 x i64>
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+ %4 = trunc <2 x i64> %3 to <2 x i8>
+ ret <2 x i8> %4
+}
+
+define <4 x i8> @test_sext_inreg_v4i8i16(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v4i8i16
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+ %1 = sext <4 x i8> %v1 to <4 x i16>
+ %2 = sext <4 x i8> %v2 to <4 x i16>
+ %3 = shufflevector <4 x i16> %1, <4 x i16> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %4 = trunc <4 x i16> %3 to <4 x i8>
+ ret <4 x i8> %4
+}
+
+define <4 x i8> @test_sext_inreg_v4i8i16_2(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v4i8i16_2
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
+ %a1 = shl <4 x i16> %v1, <i16 8, i16 8, i16 8, i16 8>
+ %a2 = ashr <4 x i16> %a1, <i16 8, i16 8, i16 8, i16 8>
+ %b1 = shl <4 x i16> %v2, <i16 8, i16 8, i16 8, i16 8>
+ %b2 = ashr <4 x i16> %b1, <i16 8, i16 8, i16 8, i16 8>
+ %c = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %d = trunc <4 x i16> %c to <4 x i8>
+ ret <4 x i8> %d
+}
+
+define <4 x i8> @test_sext_inreg_v4i8i32(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v4i8i32
+; CHECK: ushll v1.4s, v1.4h, #0
+; CHECK: ushll v0.4s, v0.4h, #0
+; CHECK: shl v0.4s, v0.4s, #24
+; CHECK: sshr v0.4s, v0.4s, #24
+; CHECK: shl v1.4s, v1.4s, #24
+; CHECK: sshr v1.4s, v1.4s, #24
+ %1 = sext <4 x i8> %v1 to <4 x i32>
+ %2 = sext <4 x i8> %v2 to <4 x i32>
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %4 = trunc <4 x i32> %3 to <4 x i8>
+ ret <4 x i8> %4
+}
+
+define <8 x i8> @test_sext_inreg_v8i8i16(<8 x i8> %v1, <8 x i8> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v8i8i16
+; CHECK: sshll v0.8h, v0.8b, #0
+; CHECK: sshll v1.8h, v1.8b, #0
+ %1 = sext <8 x i8> %v1 to <8 x i16>
+ %2 = sext <8 x i8> %v2 to <8 x i16>
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %4 = trunc <8 x i16> %3 to <8 x i8>
+ ret <8 x i8> %4
+}
+
+define <8 x i1> @test_sext_inreg_v8i1i16(<8 x i1> %v1, <8 x i1> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v8i1i16
+; CHECK: ushll v1.8h, v1.8b, #0
+; CHECK: ushll v0.8h, v0.8b, #0
+; CHECK: shl v0.8h, v0.8h, #15
+; CHECK: sshr v0.8h, v0.8h, #15
+; CHECK: shl v1.8h, v1.8h, #15
+; CHECK: sshr v1.8h, v1.8h, #15
+ %1 = sext <8 x i1> %v1 to <8 x i16>
+ %2 = sext <8 x i1> %v2 to <8 x i16>
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %4 = trunc <8 x i16> %3 to <8 x i1>
+ ret <8 x i1> %4
+}
+
+define <2 x i16> @test_sext_inreg_v2i16i32(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i16i32
+; CHECK: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s
+ %1 = sext <2 x i16> %v1 to <2 x i32>
+ %2 = sext <2 x i16> %v2 to <2 x i32>
+ %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> <i32 0, i32 2>
+ %4 = trunc <2 x i32> %3 to <2 x i16>
+ ret <2 x i16> %4
+}
+
+define <2 x i16> @test_sext_inreg_v2i16i32_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i16i32_2
+; CHECK: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s
+ %a1 = shl <2 x i32> %v1, <i32 16, i32 16>
+ %a2 = ashr <2 x i32> %a1, <i32 16, i32 16>
+ %b1 = shl <2 x i32> %v2, <i32 16, i32 16>
+ %b2 = ashr <2 x i32> %b1, <i32 16, i32 16>
+ %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
+ %d = trunc <2 x i32> %c to <2 x i16>
+ ret <2 x i16> %d
+}
+
+define <2 x i16> @test_sext_inreg_v2i16i64(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i16i64
+; CHECK: ushll v1.2d, v1.2s, #0
+; CHECK: ushll v0.2d, v0.2s, #0
+; CHECK: shl v0.2d, v0.2d, #48
+; CHECK: sshr v0.2d, v0.2d, #48
+; CHECK: shl v1.2d, v1.2d, #48
+; CHECK: sshr v1.2d, v1.2d, #48
+ %1 = sext <2 x i16> %v1 to <2 x i64>
+ %2 = sext <2 x i16> %v2 to <2 x i64>
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+ %4 = trunc <2 x i64> %3 to <2 x i16>
+ ret <2 x i16> %4
+}
+
+define <4 x i16> @test_sext_inreg_v4i16i32(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v4i16i32
+; CHECK: sshll v0.4s, v0.4h, #0
+; CHECK: sshll v1.4s, v1.4h, #0
+ %1 = sext <4 x i16> %v1 to <4 x i32>
+ %2 = sext <4 x i16> %v2 to <4 x i32>
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %4 = trunc <4 x i32> %3 to <4 x i16>
+ ret <4 x i16> %4
+}
+
+define <2 x i32> @test_sext_inreg_v2i32i64(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
+; CHECK-LABEL: test_sext_inreg_v2i32i64
+; CHECK: sshll v0.2d, v0.2s, #0
+; CHECK: sshll v1.2d, v1.2s, #0
+ %1 = sext <2 x i32> %v1 to <2 x i64>
+ %2 = sext <2 x i32> %v2 to <2 x i64>
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+ %4 = trunc <2 x i64> %3 to <2 x i32>
+ ret <2 x i32> %4
+}
+
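
The comment at the top of sext_inreg.ll describes how narrow vector element types are promoted for formal arguments, and the CHECK patterns then pin down the shl/sshr pairs used to recover the sign bits. As a rough illustration (not part of the patch; the function name is made up for the sketch), the IR-level idiom all of these tests reduce to is a matching shift-left/arithmetic-shift-right pair on the promoted lanes:

define <2 x i32> @sketch_sext_inreg_i8_in_i32(<2 x i32> %v) {
; Reinsert the sign of an 8-bit value held in a 32-bit lane:
; shift the low byte up to the top, then arithmetic-shift it back down.
  %shl = shl <2 x i32> %v, <i32 24, i32 24>
  %sra = ashr <2 x i32> %shl, <i32 24, i32 24>
  ret <2 x i32> %sra
}
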
diff --git a/test/CodeGen/AArch64/sincospow-vector-expansion.ll b/test/CodeGen/AArch64/sincospow-vector-expansion.ll
new file mode 100644
index 0000000..259a55e
--- /dev/null
+++ b/test/CodeGen/AArch64/sincospow-vector-expansion.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <2 x float> @test_cos_v2f64(<2 x double> %v1) {
+; CHECK-LABEL: test_cos_v2f64:
+; CHECK: bl cos
+; CHECK: bl cos
+ %1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %v1)
+ %2 = fptrunc <2 x double> %1 to <2 x float>
+ ret <2 x float> %2
+}
+
+define <2 x float> @test_sin_v2f64(<2 x double> %v1) {
+; CHECK-LABEL: test_sin_v2f64:
+; CHECK: bl sin
+; CHECK: bl sin
+ %1 = call <2 x double> @llvm.sin.v2f64(<2 x double> %v1)
+ %2 = fptrunc <2 x double> %1 to <2 x float>
+ ret <2 x float> %2
+}
+
+define <2 x float> @test_pow_v2f64(<2 x double> %v1, <2 x double> %v2) {
+; CHECK-LABEL: test_pow_v2f64:
+; CHECK: bl pow
+; CHECK: bl pow
+ %1 = call <2 x double> @llvm.pow.v2f64(<2 x double> %v1, <2 x double> %v2)
+ %2 = fptrunc <2 x double> %1 to <2 x float>
+ ret <2 x float> %2
+}
+
+declare <2 x double> @llvm.cos.v2f64(<2 x double>)
+declare <2 x double> @llvm.sin.v2f64(<2 x double>)
+declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_cos_v2f32(<2 x float> %v1) {
+; CHECK-LABEL: test_cos_v2f32:
+; CHECK: bl cos
+; CHECK: bl cos
+ %1 = call <2 x float> @llvm.cos.v2f32(<2 x float> %v1)
+ ret <2 x float> %1
+}
+
+define <2 x float> @test_sin_v2f32(<2 x float> %v1) {
+; CHECK-LABEL: test_sin_v2f32:
+; CHECK: bl sin
+; CHECK: bl sin
+ %1 = call <2 x float> @llvm.sin.v2f32(<2 x float> %v1)
+ ret <2 x float> %1
+}
+
+define <2 x float> @test_pow_v2f32(<2 x float> %v1, <2 x float> %v2) {
+; CHECK-LABEL: test_pow_v2f32:
+; CHECK: bl pow
+; CHECK: bl pow
+ %1 = call <2 x float> @llvm.pow.v2f32(<2 x float> %v1, <2 x float> %v2)
+ ret <2 x float> %1
+}
+
+declare <2 x float> @llvm.cos.v2f32(<2 x float>)
+declare <2 x float> @llvm.sin.v2f32(<2 x float>)
+declare <2 x float> @llvm.pow.v2f32(<2 x float>, <2 x float>)
+
+define <4 x float> @test_cos_v4f32(<4 x float> %v1) {
+; CHECK-LABEL: test_cos_v4f32:
+; CHECK: bl cos
+; CHECK: bl cos
+; CHECK: bl cos
+; CHECK: bl cos
+ %1 = call <4 x float> @llvm.cos.v4f32(<4 x float> %v1)
+ ret <4 x float> %1
+}
+
+define <4 x float> @test_sin_v4f32(<4 x float> %v1) {
+; CHECK-LABEL: test_sin_v4f32:
+; CHECK: bl sin
+; CHECK: bl sin
+; CHECK: bl sin
+; CHECK: bl sin
+ %1 = call <4 x float> @llvm.sin.v4f32(<4 x float> %v1)
+ ret <4 x float> %1
+}
+
+define <4 x float> @test_pow_v4f32(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_pow_v4f32:
+; CHECK: bl pow
+; CHECK: bl pow
+; CHECK: bl pow
+; CHECK: bl pow
+ %1 = call <4 x float> @llvm.pow.v4f32(<4 x float> %v1, <4 x float> %v2)
+ ret <4 x float> %1
+}
+
+declare <4 x float> @llvm.cos.v4f32(<4 x float>)
+declare <4 x float> @llvm.sin.v4f32(<4 x float>)
+declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
+
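
These RUN/CHECK pairs verify that the vector sin/cos/pow intrinsics are expanded into one scalar libcall per lane, which is why each test expects as many `bl` instructions as the vector has elements. A minimal sketch (illustrative only, assuming the usual libm cos declaration) of the scalarized form the v2f64 case lowers to:

declare double @cos(double)

define <2 x double> @sketch_cos_v2f64(<2 x double> %v) {
; Split the vector into scalars, call cos on each lane, then rebuild the vector.
  %lo  = extractelement <2 x double> %v, i32 0
  %hi  = extractelement <2 x double> %v, i32 1
  %clo = call double @cos(double %lo)
  %chi = call double @cos(double %hi)
  %r0  = insertelement <2 x double> undef, double %clo, i32 0
  %r1  = insertelement <2 x double> %r0, double %chi, i32 1
  ret <2 x double> %r1
}
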
diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll
index f3d376b..1c7f1e0 100644
--- a/test/CodeGen/AArch64/variadic.ll
+++ b/test/CodeGen/AArch64/variadic.ll
@@ -10,14 +10,12 @@ declare void @llvm.va_start(i8*)
define void @test_simple(i32 %n, ...) {
; CHECK-LABEL: test_simple:
; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
; CHECK: mov x[[FPRBASE:[0-9]+]], sp
; CHECK: str q7, [x[[FPRBASE]], #112]
; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
; CHECK: str x7, [x[[GPRBASE]], #48]
; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
; CHECK-NOFP: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
; CHECK-NOFP: str x7, [x[[GPRBASE]], #48]
; CHECK-NOFP-NOT: str q7,
@@ -27,8 +25,10 @@ define void @test_simple(i32 %n, ...) {
; CHECK: str q0, [sp]
; CHECK: str x1, [sp, #[[GPRFROMSP]]]
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
; CHECK-NOFP-NOT: str q0, [sp]
+; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
%addr = bitcast %va_list* @var to i8*
call void @llvm.va_start(i8* %addr)
@@ -179,26 +179,63 @@ define void @test_va_copy() {
; Check beginning and end again:
-; CHECK: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
+; CHECK: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24]
+; CHECK: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list]
+; CHECK: str [[BLOCK2]], [x[[DEST_LIST]], #24]
+
; CHECK-NOFP: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
+; CHECK-NOFP: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24]
+; CHECK-NOFP: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list]
+; CHECK-NOFP: str [[BLOCK2]], [x[[DEST_LIST]], #24]
-; CHECK: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list]
+ ret void
+; CHECK: ret
+; CHECK-NOFP: ret
+}
-; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24]
-; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
+%struct.s_3i = type { i32, i32, i32 }
-; CHECK: str [[BLOCK]], [x[[DEST_LIST]], #24]
+; This checks that, if the last named argument is not a multiple of 8 bytes
+; and is allocated on the stack, __va_list.__stack is initialised to the
+; first 8-byte aligned location above it.
+define void @test_va_odd_struct_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, [1 x i64], %struct.s_3i* byval nocapture readnone align 4 %h, ...) {
+; CHECK-LABEL: test_va_odd_struct_on_stack:
-; CHECK-NOFP: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list]
+; CHECK: sub sp, sp, #128
+; CHECK: mov x[[FPRBASE:[0-9]+]], sp
+; CHECK: str q7, [x[[FPRBASE]], #112]
-; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24]
-; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
+; CHECK-NOT: str x{{[0-9]+}},
+
+; CHECK-NOFP-NOT: str q7,
+; CHECK-NOT: str x7,
-; CHECK-NOFP: str [[BLOCK]], [x[[DEST_LIST]], #24]
+; Omit the middle ones
+
+; CHECK: str q0, [sp]
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK: movn [[VR_OFFS:w[0-9]+]], #127
+; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
+; CHECK: str wzr, [x[[VA_LIST]], #24]
+; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128
+; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
+; This constant would be #140 if it was not 8-byte aligned
+; CHECK: add [[STACK:x[0-9]+]], sp, #144
+; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+
+; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; This constant would be #12 if it was not 8-byte aligned
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #16
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24]
ret void
-; CHECK: ret
-; CHECK-NOFP: ret
}
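
The new CHECK lines in test_va_odd_struct_on_stack store through fixed byte offsets of the va_list object (#16, #24, #28). As a reference sketch only (an assumption based on the AAPCS64 variadic calling convention, not something defined in this hunk; the test's real %va_list type is declared earlier in the file), those offsets line up with the standard AArch64 __va_list layout:

%va_list_sketch = type { i8*,   ; __stack   at offset 0
                         i8*,   ; __gr_top  at offset 8
                         i8*,   ; __vr_top  at offset 16
                         i32,   ; __gr_offs at offset 24
                         i32 }  ; __vr_offs at offset 28
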
diff --git a/test/CodeGen/ARM/2006-11-10-CycleInDAG.ll b/test/CodeGen/ARM/2006-11-10-CycleInDAG.ll
index a0235f7..f8bd886 100644
--- a/test/CodeGen/ARM/2006-11-10-CycleInDAG.ll
+++ b/test/CodeGen/ARM/2006-11-10-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o /dev/null
%struct.layer_data = type { i32, [2048 x i8], i8*, [16 x i8], i32, i8*, i32, i32, [64 x i32], [64 x i32], [64 x i32], [64 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [12 x [64 x i16]] }
@ld = external global %struct.layer_data* ; <%struct.layer_data**> [#uses=1]
diff --git a/test/CodeGen/ARM/2007-04-03-PEIBug.ll b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
index 8d3337c..cf5094f 100644
--- a/test/CodeGen/ARM/2007-04-03-PEIBug.ll
+++ b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | not grep "add.*#0"
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @foo() {
entry:
@@ -10,3 +10,6 @@ entry:
}
declare i32 @bar(...)
+
+; CHECK-NOT: add{{.*}}#0
+
diff --git a/test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll b/test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll
index b3b0769..99e67d5 100644
--- a/test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll
+++ b/test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o /dev/null
define i32 @test3() {
tail call void asm sideeffect "/* number: ${0:c} */", "i"( i32 1 )
diff --git a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
index 670048b..5988c65 100644
--- a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
+++ b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | not grep "str.*\!"
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
%struct.shape_edge_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32 }
%struct.shape_path_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32, i32, i32 }
@@ -32,3 +32,6 @@ bb140: ; preds = %bb140, %cond_false
bb174: ; preds = %bb140, %cond_false
ret %struct.shape_path_t* null
}
+
+; CHECK-NOT: str{{.*}}!
+
diff --git a/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll b/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
index a604c5c..95aa595 100644
--- a/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
+++ b/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6 | not grep 255
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s
define i32 @main(i32 %argc, i8** %argv) {
entry:
@@ -12,3 +12,6 @@ bb2: ; preds = %bb1
bb3: ; preds = %bb1
ret i32 0
}
+
+; CHECK-NOT: 255
+
diff --git a/test/CodeGen/ARM/2008-07-17-Fdiv.ll b/test/CodeGen/ARM/2008-07-17-Fdiv.ll
index 4cb768e..9f50d92 100644
--- a/test/CodeGen/ARM/2008-07-17-Fdiv.ll
+++ b/test/CodeGen/ARM/2008-07-17-Fdiv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define float @f(float %a, float %b) nounwind {
%tmp = fdiv float %a, %b
diff --git a/test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll b/test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll
index 83fde07..e86bc1b 100644
--- a/test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll
+++ b/test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
; PR2589
define void @main({ i32 }*) {
diff --git a/test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll b/test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll
index 601a516..d16ad8c 100644
--- a/test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6,+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o /dev/null
define hidden i64 @__muldi3(i64 %u, i64 %v) nounwind {
entry:
diff --git a/test/CodeGen/ARM/2009-03-09-AddrModeBug.ll b/test/CodeGen/ARM/2009-03-09-AddrModeBug.ll
index a1ce384..7bb1429 100644
--- a/test/CodeGen/ARM/2009-03-09-AddrModeBug.ll
+++ b/test/CodeGen/ARM/2009-03-09-AddrModeBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
%struct.hit_t = type { %struct.v_t, double }
%struct.node_t = type { %struct.hit_t, %struct.hit_t, i32 }
diff --git a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
index 7342f69..e90c5b3 100644
--- a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
+++ b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep "swi 107"
+; RUN: llc -mtriple=arm-eabi -no-integrated-as %s -o - | FileCheck %s
define i32 @_swilseek(i32) nounwind {
entry:
@@ -18,3 +18,6 @@ return: ; preds = %entry
%4 = load i32* %retval ; <i32> [#uses=1]
ret i32 %4
}
+
+; CHECK: swi 107
+
diff --git a/test/CodeGen/ARM/2009-04-08-AggregateAddr.ll b/test/CodeGen/ARM/2009-04-08-AggregateAddr.ll
index f6b3d2c..ade6a10 100644
--- a/test/CodeGen/ARM/2009-04-08-AggregateAddr.ll
+++ b/test/CodeGen/ARM/2009-04-08-AggregateAddr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
; PR3795
define fastcc void @_D3foo3fooFAriZv({ i32, { double, double }* } %d_arg, i32 %x_arg) {
diff --git a/test/CodeGen/ARM/2009-04-08-FREM.ll b/test/CodeGen/ARM/2009-04-08-FREM.ll
index 99907fc..606c6b1 100644
--- a/test/CodeGen/ARM/2009-04-08-FREM.ll
+++ b/test/CodeGen/ARM/2009-04-08-FREM.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/ARM/2009-04-08-FloatUndef.ll b/test/CodeGen/ARM/2009-04-08-FloatUndef.ll
index 05d2f26..9e32e05 100644
--- a/test/CodeGen/ARM/2009-04-08-FloatUndef.ll
+++ b/test/CodeGen/ARM/2009-04-08-FloatUndef.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define void @execute_shader(<4 x float>* %OUT, <4 x float>* %IN, <4 x float>* %CONST) {
entry:
diff --git a/test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll b/test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll
index deb092b..5b17463 100644
--- a/test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll
+++ b/test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
; PR3954
define void @foo(...) nounwind {
diff --git a/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll b/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
index 7046fcc..2bc7df0 100644
--- a/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
+++ b/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
+
%struct.List = type { %struct.List*, i32 }
@Node5 = external constant %struct.List ; <%struct.List*> [#uses=1]
@"\01LC" = external constant [7 x i8] ; <[7 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll b/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
index 1e2707f..5d59fc6 100644
--- a/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
+++ b/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm | FileCheck %s
-; RUN: llc < %s -march=thumb | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
; PR4091
define void @foo(i32 %i, i32* %p) nounwind {
diff --git a/test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll b/test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll
index e1e94b6..3cef0aa 100644
--- a/test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll
+++ b/test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o /dev/null
define void @test(i8* %x) nounwind {
entry:
diff --git a/test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll b/test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll
index 6761687..bc4a95c 100644
--- a/test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll
+++ b/test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
%struct.cli_ac_alt = type { i8, i8*, i16, i16, %struct.cli_ac_alt* }
%struct.cli_ac_node = type { i8, i8, %struct.cli_ac_patt*, %struct.cli_ac_node**, %struct.cli_ac_node* }
diff --git a/test/CodeGen/ARM/2009-08-23-linkerprivate.ll b/test/CodeGen/ARM/2009-08-23-linkerprivate.ll
deleted file mode 100644
index 392c70a..0000000
--- a/test/CodeGen/ARM/2009-08-23-linkerprivate.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin | FileCheck %s
-
-; ModuleID = '/Volumes/MacOS9/tests/WebKit/JavaScriptCore/profiler/ProfilerServer.mm'
-
-@"\01l_objc_msgSend_fixup_alloc" = linker_private_weak hidden global i32 0, section "__DATA, __objc_msgrefs, coalesced", align 16
-
-; CHECK: .globl l_objc_msgSend_fixup_alloc
-; CHECK: .weak_definition l_objc_msgSend_fixup_alloc
diff --git a/test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll b/test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll
index ee99c70..b078ec0 100644
--- a/test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll
+++ b/test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; pr4843
+
define <4 x i16> @v2regbug(<4 x i16>* %B) nounwind {
;CHECK-LABEL: v2regbug:
;CHECK: vzip.16
diff --git a/test/CodeGen/ARM/2009-09-10-postdec.ll b/test/CodeGen/ARM/2009-09-10-postdec.ll
index 10653b5..66ffe6a 100644
--- a/test/CodeGen/ARM/2009-09-10-postdec.ll
+++ b/test/CodeGen/ARM/2009-09-10-postdec.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
; Radar 7213850
define i32 @test(i8* %d, i32 %x, i32 %y) nounwind {
diff --git a/test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll b/test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll
index 758b59a..dd9a6fd 100644
--- a/test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll
+++ b/test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon -mcpu=cortex-a9
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -mcpu=cortex-a9 %s -o /dev/null
define arm_aapcs_vfpcc <4 x float> @foo(i8* nocapture %pBuffer, i32 %numItems) nounwind {
%1 = ptrtoint i8* %pBuffer to i32
diff --git a/test/CodeGen/ARM/2009-09-24-spill-align.ll b/test/CodeGen/ARM/2009-09-24-spill-align.ll
index eb9c2d0..224bd01 100644
--- a/test/CodeGen/ARM/2009-09-24-spill-align.ll
+++ b/test/CodeGen/ARM/2009-09-24-spill-align.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; pr4926
define void @test_vget_lanep16() nounwind {
diff --git a/test/CodeGen/ARM/2010-03-04-stm-undef-addr.ll b/test/CodeGen/ARM/2010-03-04-stm-undef-addr.ll
index b0b4cb3..5e75d46 100644
--- a/test/CodeGen/ARM/2010-03-04-stm-undef-addr.ll
+++ b/test/CodeGen/ARM/2010-03-04-stm-undef-addr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define void @"java.lang.String::getChars"([84 x i8]* %method, i32 %base_pc, [788 x i8]* %thread) {
%1 = sub i32 undef, 48 ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/2010-04-09-NeonSelect.ll b/test/CodeGen/ARM/2010-04-09-NeonSelect.ll
index 89d6a68..ceef083 100644
--- a/test/CodeGen/ARM/2010-04-09-NeonSelect.ll
+++ b/test/CodeGen/ARM/2010-04-09-NeonSelect.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=arm -mattr=+neon < %s
-; Radar 7770501: Don't crash on SELECT and SELECT_CC with NEON vector values.
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o /dev/null
+; rdar://7770501 : Don't crash on SELECT and SELECT_CC with NEON vector values.
define void @vDSP_FFT16_copv(float* nocapture %O, float* nocapture %I, i32 %Direction) nounwind {
entry:
diff --git a/test/CodeGen/ARM/2010-04-14-SplitVector.ll b/test/CodeGen/ARM/2010-04-14-SplitVector.ll
index 5d0c3cf..cb3e042 100644
--- a/test/CodeGen/ARM/2010-04-14-SplitVector.ll
+++ b/test/CodeGen/ARM/2010-04-14-SplitVector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=arm1136jf-s
+; RUN: llc -mtriple=arm-eabi -mcpu=arm1136jf-s %s -o /dev/null
; Radar 7854640
define void @test() nounwind {
diff --git a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
index e0f50c9..cfaffd8 100644
--- a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
+++ b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon -O0 -optimize-regalloc -regalloc=basic
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null
; This test would crash the rewriter when trying to handle a spill after one of
; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register.
diff --git a/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
index a400b7b..5bc08b0 100644
--- a/test/CodeGen/ARM/2010-05-21-BuildVector.ll
+++ b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
; Radar 7872877
define void @test(float* %fltp, i32 %packedValue, float* %table) nounwind {
diff --git a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
index 6f48796..f7ceb6e 100644
--- a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
+++ b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+neon
-; Radar 8084742
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o /dev/null
+; rdar://8084742
%struct.__int8x8x2_t = type { [2 x <8 x i8>] }
diff --git a/test/CodeGen/ARM/2010-06-29-SubregImpDefs.ll b/test/CodeGen/ARM/2010-06-29-SubregImpDefs.ll
index 984583e..fcabc90 100644
--- a/test/CodeGen/ARM/2010-06-29-SubregImpDefs.ll
+++ b/test/CodeGen/ARM/2010-06-29-SubregImpDefs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o /dev/null
@.str271 = external constant [21 x i8], align 4 ; <[21 x i8]*> [#uses=1]
@llvm.used = appending global [1 x i8*] [i8* bitcast (i32 (i32, i8**)* @main to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
diff --git a/test/CodeGen/ARM/2010-07-26-GlobalMerge.ll b/test/CodeGen/ARM/2010-07-26-GlobalMerge.ll
index 2842437..80822c2 100644
--- a/test/CodeGen/ARM/2010-07-26-GlobalMerge.ll
+++ b/test/CodeGen/ARM/2010-07-26-GlobalMerge.ll
@@ -1,4 +1,4 @@
-; RUN: llc -enable-correct-eh-support < %s
+; RUN: llc < %s
; PR7716
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
target triple = "thumbv7-apple-darwin10.0.0"
diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index 7aacd1a..bc4cc98 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
@@ -124,6 +124,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
!45 = metadata !{i32 27, i32 0, metadata !39, null}
!46 = metadata !{metadata !0, metadata !9, metadata !16, metadata !17, metadata !20}
-!47 = metadata !{i32 0}
+!47 = metadata !{}
!48 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2010-12-07-PEIBug.ll b/test/CodeGen/ARM/2010-12-07-PEIBug.ll
index eef6abd..4baee64 100644
--- a/test/CodeGen/ARM/2010-12-07-PEIBug.ll
+++ b/test/CodeGen/ARM/2010-12-07-PEIBug.ll
@@ -5,11 +5,11 @@ define hidden void @foo() nounwind ssp {
entry:
; CHECK-LABEL: foo:
; CHECK: mov r7, sp
-; CHECK-NEXT: vpush {d8}
; CHECK-NEXT: vpush {d10, d11}
+; CHECK-NEXT: vpush {d8}
tail call void asm sideeffect "","~{d8},~{d10},~{d11}"() nounwind
-; CHECK: vpop {d10, d11}
-; CHECK-NEXT: vpop {d8}
+; CHECK: vpop {d8}
+; CHECK-NEXT: vpop {d10, d11}
ret void
}
diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
index f57411b..b1d59aa 100644
--- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
@@ -17,7 +17,7 @@ target triple = "thumbv7-apple-darwin10"
; DW_OP_constu
; offset
-;CHECK: .long Lset6
+;CHECK: .long Lset7
;CHECK-NEXT: @ DW_AT_type
;CHECK-NEXT: @ DW_AT_decl_file
;CHECK-NEXT: @ DW_AT_decl_line
@@ -80,7 +80,7 @@ entry:
!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get1", metadata !"get1", metadata !"get1", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get1, null, null, metadata !42, i32 4} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !47, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41, metadata !41, metadata !""} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{i32 786449, metadata !47, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41, metadata !48, metadata !""} ; [ DW_TAG_compile_unit ]
!3 = metadata !{i32 786453, metadata !47, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5, metadata !5}
!5 = metadata !{i32 786468, metadata !47, metadata !1, metadata !"_Bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
@@ -126,5 +126,5 @@ entry:
!45 = metadata !{metadata !24, metadata !25}
!46 = metadata !{metadata !27, metadata !28}
!47 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
-!48 = metadata !{i32 0}
+!48 = metadata !{}
!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2011-04-12-AlignBug.ll b/test/CodeGen/ARM/2011-04-12-AlignBug.ll
index 317be94..97297f7 100644
--- a/test/CodeGen/ARM/2011-04-12-AlignBug.ll
+++ b/test/CodeGen/ARM/2011-04-12-AlignBug.ll
@@ -3,9 +3,9 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
target triple = "thumbv7-apple-darwin10.0.0"
; CHECK: align 3
-@.v = linker_private unnamed_addr constant <4 x i32> <i32 1, i32 2, i32 3, i32 4>, align 8
+@.v = private unnamed_addr constant <4 x i32> <i32 1, i32 2, i32 3, i32 4>, align 8
; CHECK: align 2
-@.strA = linker_private unnamed_addr constant [4 x i8] c"bar\00"
+@.strA = private unnamed_addr constant [4 x i8] c"bar\00"
; CHECK-NOT: align
-@.strB = linker_private unnamed_addr constant [4 x i8] c"foo\00", align 1
-@.strC = linker_private unnamed_addr constant [4 x i8] c"baz\00", section "__TEXT,__cstring,cstring_literals", align 1
+@.strB = private unnamed_addr constant [4 x i8] c"foo\00", align 1
+@.strC = private unnamed_addr constant [4 x i8] c"baz\00", section "__TEXT,__cstring,cstring_literals", align 1
diff --git a/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
index 7f0f795..12cdd04 100644
--- a/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
+++ b/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=pic -mcpu=cortex-a8 -arm-tail-calls=1 | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -mcpu=cortex-a8 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "thumbv7-apple-darwin10"
diff --git a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
index 101a913..d93cc57 100644
--- a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
+++ b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -arm-tail-calls=1 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
; A tail call inside a function where a byval argument is split between
; registers and the stack is currently unsupported.
; XFAIL: *
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
-target triple = "thumbv7-apple-ios"
+target triple = "thumbv7-apple-ios5.0"
%struct.A = type <{ i16, i16, i32, i16, i16, i32, i16, [8 x %struct.B], [418 x i8], %struct.C }>
%struct.B = type <{ i32, i16, i16 }>
diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
index bb78707..ed2840b 100644
--- a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
@@ -8,7 +8,7 @@
; DW_OP_constu
; offset
-;CHECK: .long Lset8
+;CHECK: .long Lset9
;CHECK-NEXT: @ DW_AT_type
;CHECK-NEXT: @ DW_AT_decl_file
;CHECK-NEXT: @ DW_AT_decl_line
@@ -75,7 +75,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!49}
-!0 = metadata !{i32 786449, metadata !47, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41, metadata !41, null} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{i32 786449, metadata !47, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41, metadata !48, null} ; [ DW_TAG_compile_unit ]
!1 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get1", metadata !"get1", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get1, null, null, metadata !42, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
!3 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
@@ -123,5 +123,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!45 = metadata !{metadata !19, metadata !20}
!46 = metadata !{metadata !27, metadata !28}
!47 = metadata !{metadata !"ss3.c", metadata !"/private/tmp"}
-!48 = metadata !{i32 0}
+!48 = metadata !{}
!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2011-10-26-memset-inline.ll b/test/CodeGen/ARM/2011-10-26-memset-inline.ll
index 03614ed..17bd291 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-inline.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-inline.ll
@@ -6,10 +6,10 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
target triple = "thumbv7-apple-ios5.0.0"
; CHECK-GENERIC: strb
-; CHECK-GENERIT-NEXT: strb
-; CHECK-GENERIT-NEXT: strb
-; CHECK-GENERIT-NEXT: strb
-; CHECK-GENERIT-NEXT: strb
+; CHECK-GENERIC-NEXT: strb
+; CHECK-GENERIC-NEXT: strb
+; CHECK-GENERIC-NEXT: strb
+; CHECK-GENERIC-NEXT: strb
; CHECK-UNALIGNED: strb
; CHECK-UNALIGNED: str
define void @foo(i8* nocapture %c) nounwind optsize {
diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
index 850c511..c8e08c2 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
; Trigger multiple NEON stores.
; CHECK: vst1.64
diff --git a/test/CodeGen/ARM/2011-11-07-PromoteVectorLoadStore.ll b/test/CodeGen/ARM/2011-11-07-PromoteVectorLoadStore.ll
index 8a65f2e..a707a92 100644
--- a/test/CodeGen/ARM/2011-11-07-PromoteVectorLoadStore.ll
+++ b/test/CodeGen/ARM/2011-11-07-PromoteVectorLoadStore.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; PR11319
@i8_res = global <2 x i8> <i8 0, i8 0>
diff --git a/test/CodeGen/ARM/2011-11-09-BitcastVectorDouble.ll b/test/CodeGen/ARM/2011-11-09-BitcastVectorDouble.ll
index 42eb32d..c1554d8 100644
--- a/test/CodeGen/ARM/2011-11-09-BitcastVectorDouble.ll
+++ b/test/CodeGen/ARM/2011-11-09-BitcastVectorDouble.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; PR11319
@src1_v2i16 = global <2 x i16> <i16 0, i16 1>
diff --git a/test/CodeGen/ARM/2011-11-09-IllegalVectorFPIntConvert.ll b/test/CodeGen/ARM/2011-11-09-IllegalVectorFPIntConvert.ll
index 719571b..c50461a 100644
--- a/test/CodeGen/ARM/2011-11-09-IllegalVectorFPIntConvert.ll
+++ b/test/CodeGen/ARM/2011-11-09-IllegalVectorFPIntConvert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <2 x i32> @test1(<2 x double>* %A) {
; CHECK: test1
diff --git a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
index a263c9c..86b58c8 100644
--- a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
+++ b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mcpu=cortex-a9 %s -o - | FileCheck %s
@A = global <4 x float> <float 0., float 1., float 2., float 3.>
diff --git a/test/CodeGen/ARM/2012-04-10-DAGCombine.ll b/test/CodeGen/ARM/2012-04-10-DAGCombine.ll
index 089dc91..9b71be2 100644
--- a/test/CodeGen/ARM/2012-04-10-DAGCombine.ll
+++ b/test/CodeGen/ARM/2012-04-10-DAGCombine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 -enable-unsafe-fp-math
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 -enable-unsafe-fp-math %s -o /dev/null
;target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
;target triple = "armv7-none-linux-gnueabi"
diff --git a/test/CodeGen/ARM/2012-05-04-vmov.ll b/test/CodeGen/ARM/2012-05-04-vmov.ll
index 14dbf7f..c604eed 100644
--- a/test/CodeGen/ARM/2012-05-04-vmov.ll
+++ b/test/CodeGen/ARM/2012-05-04-vmov.ll
@@ -1,5 +1,9 @@
-; RUN: llc -O1 -march=arm -mcpu=cortex-a9 < %s | FileCheck -check-prefix=A9-CHECK %s
-; RUN: llc -O1 -march=arm -mcpu=swift < %s | FileCheck -check-prefix=SWIFT-CHECK %s
+; RUN: llc -O1 -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \
+; RUN: | FileCheck -check-prefix=A9-CHECK %s
+
+; RUN: llc -O1 -mtriple=arm-eabi -mcpu=swift %s -o - \
+; RUN: | FileCheck -check-prefix=SWIFT-CHECK %s
+
; Check that swift doesn't use vmov.32. <rdar://problem/10453003>.
define <2 x i32> @testuvec(<2 x i32> %A, <2 x i32> %B) nounwind {
diff --git a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
index dd67843..7f30ae1 100644
--- a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
+++ b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm -mcpu=swift < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=swift %s -o - | FileCheck %s
; <rdar://problem/10451892>
define void @f(i32 %x, i32* %p) nounwind ssp {
diff --git a/test/CodeGen/ARM/2012-08-23-legalize-vmull.ll b/test/CodeGen/ARM/2012-08-23-legalize-vmull.ll
index 647ebd6..e8d4fb2 100644
--- a/test/CodeGen/ARM/2012-08-23-legalize-vmull.ll
+++ b/test/CodeGen/ARM/2012-08-23-legalize-vmull.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; PR12281
; Test generation of code for the vmull instruction when multiplying 128-bit
diff --git a/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
index 3bdbb3c..8d77763 100644
--- a/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
+++ b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=arm7tdmi | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=arm7tdmi %s -o - | FileCheck %s
; movw is only legal for V6T2 and later.
; rdar://12300648
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
index 38624e0..5235e9c 100644
--- a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
@@ -1,4 +1,4 @@
-; RUN: not llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - 2>&1 | FileCheck %s
; Check for error message:
; CHECK: non-trivial scalar-to-vector conversion, possible invalid constraint for vector type
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
index 7ba693d..d389b5c 100644
--- a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
@@ -1,4 +1,4 @@
-; RUN: not llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - 2>&1 | FileCheck %s
; Check for error message:
; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
diff --git a/test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll b/test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll
index 127429b..c5eba7d 100644
--- a/test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll
+++ b/test/CodeGen/ARM/2013-04-05-Small-ByVal-Structs-PR15293.ll
@@ -4,8 +4,8 @@
;CHECK-LABEL: foo:
;CHECK: sub sp, sp, #8
;CHECK: push {r11, lr}
-;CHECK: str r0, [sp, #8]
-;CHECK: add r0, sp, #8
+;CHECK: str r0, [sp, #12]
+;CHECK: add r0, sp, #12
;CHECK: bl fooUseParam
;CHECK: pop {r11, lr}
;CHECK: add sp, sp, #8
diff --git a/test/CodeGen/ARM/2013-04-16-AAPCS-C4-vs-VFP.ll b/test/CodeGen/ARM/2013-04-16-AAPCS-C4-vs-VFP.ll
index 08bf99b..6bd23b1 100644
--- a/test/CodeGen/ARM/2013-04-16-AAPCS-C4-vs-VFP.ll
+++ b/test/CodeGen/ARM/2013-04-16-AAPCS-C4-vs-VFP.ll
@@ -72,7 +72,7 @@ define void @foo(double %p0, ; --> D0
double %p8, ; --> Stack
i32 %p9) #0 { ; --> R0, not Stack+8
entry:
- tail call void @fooUseI32(i32 %p9)
+ call void @fooUseI32(i32 %p9)
ret void
}
diff --git a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
index 6db71fe..e79a3ba 100644
--- a/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
+++ b/test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
@@ -23,9 +23,9 @@ define void @foo(double %vfp0, ; --> D0, NSAA=SP
entry:
;CHECK: sub sp, #8
;CHECK: push.w {r11, lr}
- ;CHECK: add r0, sp, #16
- ;CHECK: str r2, [sp, #20]
- ;CHECK: str r1, [sp, #16]
+ ;CHECK: add r0, sp, #8
+ ;CHECK: str r2, [sp, #12]
+ ;CHECK: str r1, [sp, #8]
;CHECK: bl fooUseStruct
call void @fooUseStruct(%st_t* %p1)
ret void
diff --git a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
index c4f5f54..480d087 100644
--- a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
+++ b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
@@ -72,6 +72,27 @@ KBBlockZero.exit: ; preds = %bb2.i
indirectbr i8* undef, [label %KBBlockZero_return_1, label %KBBlockZero_return_0]
}
+@foo = global i32 ()* null
+define i32 @t4(i32 %x, i32 ()* %p_foo) {
+entry:
+;CHECK-LABEL: t4:
+;CHECK-V8-LABEL: t4:
+ %cmp = icmp slt i32 %x, 60
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %tmp.2 = call i32 %p_foo()
+ %sub = add nsw i32 %x, -1
+ br label %return
+
+if.else: ; preds = %entry
+ %sub1 = add nsw i32 %x, -120
+ br label %return
+
+return: ; preds = %if.end5, %if.then4, %if.then
+ %retval.0 = phi i32 [ %sub, %if.then ], [ %sub1, %if.else ]
+ ret i32 %retval.0
+}
; If-converter was checking for the wrong predicate subsumes pattern when doing
; nested predicates.
diff --git a/test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll b/test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll
new file mode 100644
index 0000000..6c0fbd0
--- /dev/null
+++ b/test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -print-before=post-RA-sched %s -o - 2>&1 \
+; RUN: | FileCheck %s
+
+define void @vst(i8* %m, [4 x i64] %v) {
+entry:
+; CHECK: vst:
+; CHECK: VST1d64Q %R{{[0-9]+}}<kill>, 8, %D{{[0-9]+}}, pred:14, pred:%noreg, %Q{{[0-9]+}}_Q{{[0-9]+}}<imp-use>
+
+ %v0 = extractvalue [4 x i64] %v, 0
+ %v1 = extractvalue [4 x i64] %v, 1
+ %v2 = extractvalue [4 x i64] %v, 2
+ %v3 = extractvalue [4 x i64] %v, 3
+
+ %t0 = bitcast i64 %v0 to <8 x i8>
+ %t1 = bitcast i64 %v1 to <8 x i8>
+ %t2 = bitcast i64 %v2 to <8 x i8>
+ %t3 = bitcast i64 %v3 to <8 x i8>
+
+ %s0 = bitcast <8 x i8> %t0 to <1 x i64>
+ %s1 = bitcast <8 x i8> %t1 to <1 x i64>
+ %s2 = bitcast <8 x i8> %t2 to <1 x i64>
+ %s3 = bitcast <8 x i8> %t3 to <1 x i64>
+
+ %tmp0 = bitcast <1 x i64> %s2 to i64
+ %tmp1 = bitcast <1 x i64> %s3 to i64
+
+ %n0 = insertelement <2 x i64> undef, i64 %tmp0, i32 0
+ %n1 = insertelement <2 x i64> %n0, i64 %tmp1, i32 1
+
+ call void @llvm.arm.neon.vst4.v1i64(i8* %m, <1 x i64> %s0, <1 x i64> %s1, <1 x i64> %s2, <1 x i64> %s3, i32 8)
+
+ call void @bar(<2 x i64> %n1)
+
+ ret void
+}
+
+%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+define <8 x i8> @vtbx4(<8 x i8>* %A, %struct.__neon_int8x8x4_t* %B, <8 x i8>* %C) nounwind {
+; CHECK: vtbx4:
+; CHECK: VTBX4 {{.*}}, pred:14, pred:%noreg, %Q{{[0-9]+}}_Q{{[0-9]+}}<imp-use>
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load %struct.__neon_int8x8x4_t* %B
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
+ %tmp7 = load <8 x i8>* %C
+ %tmp8 = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %tmp1, <8 x i8> %tmp3, <8 x i8> %tmp4, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7)
+ call void @bar2(%struct.__neon_int8x8x4_t %tmp2, <8 x i8> %tmp8)
+ ret <8 x i8> %tmp8
+}
+
+declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare void @bar2(%struct.__neon_int8x8x4_t, <8 x i8>)
+declare void @bar(<2 x i64> %arg)
diff --git a/test/CodeGen/ARM/2014-02-05-vfp-regs-after-stack.ll b/test/CodeGen/ARM/2014-02-05-vfp-regs-after-stack.ll
new file mode 100644
index 0000000..4c36a2a
--- /dev/null
+++ b/test/CodeGen/ARM/2014-02-05-vfp-regs-after-stack.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -o - -filetype=asm | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv8-none--eabi"
+
+; CHECK-LABEL: fn1:
+define arm_aapcs_vfpcc float @fn1(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i, float %j) {
+ ret float %j
+; CHECK: vldr s0, [sp, #8]
+}
+
+; CHECK-LABEL: fn2:
+define arm_aapcs_vfpcc float @fn2(double %a, double %b, double %c, double %d, double %e, double %f, float %h, <4 x float> %i, float %j) {
+ ret float %j
+; CHECK: vldr s0, [sp, #16]
+}
+
+; CHECK-LABEL: fn3:
+define arm_aapcs_vfpcc float @fn3(float %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, float %j) #0 {
+ ret float %j
+; CHECK: vldr s0, [sp, #8]
+}
diff --git a/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll b/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll
new file mode 100644
index 0000000..33bfa2f
--- /dev/null
+++ b/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll
@@ -0,0 +1,114 @@
+; RUN: llc -mtriple=arm-linux-gnueabihf < %s | FileCheck %s
+
+%struct4bytes = type { i32 }
+%struct8bytes8align = type { i64 }
+%struct12bytes = type { i32, i32, i32 }
+
+declare void @useIntPtr(%struct4bytes*)
+declare void @useLong(i64)
+declare void @usePtr(%struct8bytes8align*)
+
+; a -> r0
+; b -> r1..r3
+; c -> sp+0..sp+7
+define void @foo1(i32 %a, %struct12bytes* byval %b, i64 %c) {
+; CHECK-LABEL: foo1
+; CHECK: sub sp, sp, #16
+; CHECK: push {r11, lr}
+; CHECK: add [[SCRATCH:r[0-9]+]], sp, #12
+; CHECK: stm [[SCRATCH]], {r1, r2, r3}
+; CHECK: ldr r0, [sp, #24]
+; CHECK: ldr r1, [sp, #28]
+; CHECK: bl useLong
+; CHECK: pop {r11, lr}
+; CHECK: add sp, sp, #16
+
+ call void @useLong(i64 %c)
+ ret void
+}
+
+; a -> r0
+; b -> r2..r3
+define void @foo2(i32 %a, %struct8bytes8align* byval %b) {
+; CHECK-LABEL: foo2
+; CHECK: sub sp, sp, #8
+; CHECK: push {r11, lr}
+; CHECK: add r0, sp, #8
+; CHECK: str r3, [sp, #12]
+; CHECK: str r2, [sp, #8]
+; CHECK: bl usePtr
+; CHECK: pop {r11, lr}
+; CHECK: add sp, sp, #8
+
+ call void @usePtr(%struct8bytes8align* %b)
+ ret void
+}
+
+; a -> r0..r1
+; b -> r2
+define void @foo3(%struct8bytes8align* byval %a, %struct4bytes* byval %b) {
+; CHECK-LABEL: foo3
+; CHECK: sub sp, sp, #16
+; CHECK: push {r11, lr}
+; CHECK: add [[SCRATCH:r[0-9]+]], sp, #8
+; CHECK: stm [[SCRATCH]], {r0, r1, r2}
+; CHECK: add r0, sp, #8
+; CHECK: bl usePtr
+; CHECK: pop {r11, lr}
+; CHECK: add sp, sp, #16
+
+ call void @usePtr(%struct8bytes8align* %a)
+ ret void
+}
+
+; a -> r0
+; b -> r2..r3
+define void @foo4(%struct4bytes* byval %a, %struct8bytes8align* byval %b) {
+; CHECK-LABEL: foo4
+; CHECK: sub sp, sp, #16
+; CHECK: push {r11, lr}
+; CHECK: str r0, [sp, #8]
+; CHECK: add r0, sp, #16
+; CHECK: str r3, [sp, #20]
+; CHECK: str r2, [sp, #16]
+; CHECK: bl usePtr
+; CHECK: pop {r11, lr}
+; CHECK: add sp, sp, #16
+; CHECK: mov pc, lr
+
+ call void @usePtr(%struct8bytes8align* %b)
+ ret void
+}
+
+; a -> r0..r1
+; b -> r2
+; c -> r3
+define void @foo5(%struct8bytes8align* byval %a, %struct4bytes* byval %b, %struct4bytes* byval %c) {
+; CHECK-LABEL: foo5
+; CHECK: sub sp, sp, #16
+; CHECK: push {r11, lr}
+; CHECK: add [[SCRATCH:r[0-9]+]], sp, #8
+; CHECK: stm [[SCRATCH]], {r0, r1, r2, r3}
+; CHECK: add r0, sp, #8
+; CHECK: bl usePtr
+; CHECK: pop {r11, lr}
+; CHECK: add sp, sp, #16
+; CHECK: mov pc, lr
+
+ call void @usePtr(%struct8bytes8align* %a)
+ ret void
+}
+
+; a..c -> r0..r2
+; d -> sp+0..sp+7
+define void @foo6(i32 %a, i32 %b, i32 %c, %struct8bytes8align* byval %d) {
+; CHECK-LABEL: foo6
+; CHECK: push {r11, lr}
+; CHECK: add r0, sp, #8
+; CHECK: bl usePtr
+; CHECK: pop {r11, lr}
+; CHECK: mov pc, lr
+
+ call void @usePtr(%struct8bytes8align* %d)
+ ret void
+}
diff --git a/test/CodeGen/ARM/DbgValueOtherTargets.test b/test/CodeGen/ARM/DbgValueOtherTargets.test
index bf90891..9ce2459 100644
--- a/test/CodeGen/ARM/DbgValueOtherTargets.test
+++ b/test/CodeGen/ARM/DbgValueOtherTargets.test
@@ -1 +1 @@
-RUN: llc -O0 -march=arm -asm-verbose < %S/../Inputs/DbgValueOtherTargets.ll | FileCheck %S/../Inputs/DbgValueOtherTargets.ll
+RUN: llc -O0 -mtriple=arm-eabi -asm-verbose %S/../Inputs/DbgValueOtherTargets.ll -o - | FileCheck %S/../Inputs/DbgValueOtherTargets.ll
diff --git a/test/CodeGen/ARM/Windows/aapcs.ll b/test/CodeGen/ARM/Windows/aapcs.ll
new file mode 100644
index 0000000..3f9a09f
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/aapcs.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s | FileCheck %s
+
+; AAPCS mandates an 8-byte stack alignment. The alloca is implicitly aligned,
+; and no bic is required.
+
+declare void @callee(i8 *%i)
+
+define void @caller() {
+ %i = alloca i8, align 8
+ call void @callee(i8* %i)
+ ret void
+}
+
+; CHECK: sub sp, #8
+; CHECK-NOT: bic
+
diff --git a/test/CodeGen/ARM/Windows/hard-float.ll b/test/CodeGen/ARM/Windows/hard-float.ll
new file mode 100644
index 0000000..f7b7ec2
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/hard-float.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s | FileCheck %s
+
+define float @function(float %f, float %g) nounwind {
+entry:
+ %h = fadd float %f, %g
+ ret float %h
+}
+
+; CHECK: vadd.f32 s0, s0, s1
+
diff --git a/test/CodeGen/ARM/Windows/mangling.ll b/test/CodeGen/ARM/Windows/mangling.ll
new file mode 100644
index 0000000..ce1fe2e
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/mangling.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -o - %s | FileCheck %s
+
+define void @function() nounwind {
+entry:
+ ret void
+}
+
+; CHECK-LABEL: function
+
diff --git a/test/CodeGen/ARM/Windows/no-aeabi.ll b/test/CodeGen/ARM/Windows/no-aeabi.ll
new file mode 100644
index 0000000..4c6676f
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/no-aeabi.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s | FileCheck %s
+
+define i32 @divide(i32 %i, i32 %j) nounwind {
+entry:
+ %quotient = sdiv i32 %i, %j
+ ret i32 %quotient
+}
+
+; CHECK-NOT: __aeabi_idiv
+
diff --git a/test/CodeGen/ARM/Windows/no-arm-mode.ll b/test/CodeGen/ARM/Windows/no-arm-mode.ll
new file mode 100644
index 0000000..6db031f
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/no-arm-mode.ll
@@ -0,0 +1,5 @@
+; RUN: not llc -mtriple=armv7-windows-itanium -mcpu=cortex-a9 -o /dev/null %s 2>&1 \
+; RUN: | FileCheck %s
+
+; CHECK: does not support ARM mode execution
+
diff --git a/test/CodeGen/ARM/Windows/no-ehabi.ll b/test/CodeGen/ARM/Windows/no-ehabi.ll
new file mode 100644
index 0000000..4119b6d
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/no-ehabi.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -o - %s | FileCheck %s
+
+declare void @callee(i32 %i)
+
+define i32 @caller(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o,
+ i32 %p) {
+entry:
+ %q = add nsw i32 %j, %i
+ %r = add nsw i32 %q, %k
+ %s = add nsw i32 %r, %l
+ call void @callee(i32 %s)
+ %t = add nsw i32 %n, %m
+ %u = add nsw i32 %t, %o
+ %v = add nsw i32 %u, %p
+ call void @callee(i32 %v)
+ %w = add nsw i32 %v, %s
+ ret i32 %w
+}
+
+; CHECK-NOT: .save {{{.*}}}
+
diff --git a/test/CodeGen/ARM/a15-SD-dep.ll b/test/CodeGen/ARM/a15-SD-dep.ll
index 019ff61..5e5ca4b 100644
--- a/test/CodeGen/ARM/a15-SD-dep.ll
+++ b/test/CodeGen/ARM/a15-SD-dep.ll
@@ -56,3 +56,62 @@ define arm_aapcs_vfpcc <4 x float> @t5(<4 x float> %q, float %f) {
%i2 = fadd <4 x float> %i1, %i1
ret <4 x float> %i2
}
+
+; Test that DPair can be successfully passed as QPR.
+; CHECK-ENABLED-LABEL: test_DPair1:
+; CHECK-DISABLED-LABEL: test_DPair1:
+define void @test_DPair1(i32 %vsout, i8* nocapture %out, float %x, float %y) {
+entry:
+ %0 = insertelement <4 x float> undef, float %x, i32 1
+ %1 = insertelement <4 x float> %0, float %y, i32 0
+ ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d{{[0-9]*}}[0]
+ ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d{{[0-9]*}}[1]
+ ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d{{[0-9]*}}[0]
+ ; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d{{[0-9]*}}[1]
+ ; CHECK-DISABLED-NOT: vdup
+ switch i32 %vsout, label %sw.epilog [
+ i32 1, label %sw.bb
+ i32 0, label %sw.bb6
+ ]
+
+sw.bb: ; preds = %entry
+ %2 = insertelement <4 x float> %1, float 0.000000e+00, i32 0
+ br label %sw.bb6
+
+sw.bb6: ; preds = %sw.bb, %entry
+ %sum.0 = phi <4 x float> [ %1, %entry ], [ %2, %sw.bb ]
+ %3 = extractelement <4 x float> %sum.0, i32 0
+ %conv = fptoui float %3 to i8
+ store i8 %conv, i8* %out, align 1
+ ret void
+
+sw.epilog: ; preds = %entry
+ ret void
+}
+
+; CHECK-ENABLED-LABEL: test_DPair2:
+; CHECK-DISABLED-LABEL: test_DPair2:
+define void @test_DPair2(i32 %vsout, i8* nocapture %out, float %x) {
+entry:
+ %0 = insertelement <4 x float> undef, float %x, i32 0
+ ; CHECK-ENABLED: vdup.32 q{{[0-9]*}}, d{{[0-9]*}}[0]
+ ; CHECK-DISABLED-NOT: vdup
+ switch i32 %vsout, label %sw.epilog [
+ i32 1, label %sw.bb
+ i32 0, label %sw.bb1
+ ]
+
+sw.bb: ; preds = %entry
+ %1 = insertelement <4 x float> %0, float 0.000000e+00, i32 0
+ br label %sw.bb1
+
+sw.bb1: ; preds = %entry, %sw.bb
+ %sum.0 = phi <4 x float> [ %0, %entry ], [ %1, %sw.bb ]
+ %2 = extractelement <4 x float> %sum.0, i32 0
+ %conv = fptoui float %2 to i8
+ store i8 %conv, i8* %out, align 1
+ br label %sw.epilog
+
+sw.epilog: ; preds = %entry, %sw.bb1
+ ret void
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM/a15-mla.ll b/test/CodeGen/ARM/a15-mla.ll
index b233cc2..9867e27 100644
--- a/test/CodeGen/ARM/a15-mla.ll
+++ b/test/CodeGen/ARM/a15-mla.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm -float-abi=hard -mcpu=cortex-a15 -mattr=+neon,+neonfp | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=hard -mcpu=cortex-a15 -mattr=+neon,+neonfp %s -o - \
+; RUN: | FileCheck %s
; This test checks that the VMLxForwarding feature is disabled for A15.
; CHECK: fun_a:
diff --git a/test/CodeGen/ARM/a15.ll b/test/CodeGen/ARM/a15.ll
index 6f816c1..9f0b280 100644
--- a/test/CodeGen/ARM/a15.ll
+++ b/test/CodeGen/ARM/a15.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cortex-a15 | FileCheck %s
+; RUN: llc -mtriple=arm -mcpu=cortex-a15 %s -o - | FileCheck %s
; CHECK: a
define i32 @a(i32 %x) {
diff --git a/test/CodeGen/ARM/addrmode.ll b/test/CodeGen/ARM/addrmode.ll
index 748d258..8fd1da7 100644
--- a/test/CodeGen/ARM/addrmode.ll
+++ b/test/CodeGen/ARM/addrmode.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=arm -stats 2>&1 | grep asm-printer | grep 4
+; RUN: llc -mtriple=arm-eabi -stats %s -o - 2>&1 | FileCheck %s
define i32 @t1(i32 %a) {
%b = mul i32 %a, 9
@@ -14,3 +14,6 @@ define i32 @t2(i32 %a) {
%d = load i32* %c
ret i32 %d
}
+
+; CHECK: 4 asm-printer
+
diff --git a/test/CodeGen/ARM/addrspacecast.ll b/test/CodeGen/ARM/addrspacecast.ll
index 2e98ba5..7b6237d 100644
--- a/test/CodeGen/ARM/addrspacecast.ll
+++ b/test/CodeGen/ARM/addrspacecast.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
; Check that codegen for an addrspace cast succeeds without error.
define <4 x i32 addrspace(1)*> @f (<4 x i32*> %x) {
diff --git a/test/CodeGen/ARM/arm-abi-attr.ll b/test/CodeGen/ARM/arm-abi-attr.ll
new file mode 100644
index 0000000..f3923ae
--- /dev/null
+++ b/test/CodeGen/ARM/arm-abi-attr.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mtriple=arm-linux < %s | FileCheck %s --check-prefix=APCS
+; RUN: llc -mtriple=arm-linux -mattr=apcs < %s | \
+; RUN: FileCheck %s --check-prefix=APCS
+; RUN: llc -mtriple=arm-linux-gnueabi -mattr=apcs < %s | \
+; RUN: FileCheck %s --check-prefix=APCS
+
+; RUN: llc -mtriple=arm-linux-gnueabi < %s | FileCheck %s --check-prefix=AAPCS
+; RUN: llc -mtriple=arm-linux-gnueabi -mattr=aapcs < %s | \
+; RUN: FileCheck %s --check-prefix=AAPCS
+; RUN: llc -mtriple=arm-linux-gnu -mattr=aapcs < %s | \
+; RUN: FileCheck %s --check-prefix=AAPCS
+
+; The stack is 8 byte aligned on AAPCS and 4 on APCS, so we should get a BIC
+; only on APCS.
+
+define void @g() {
+; APCS: sub sp, sp, #8
+; APCS: bic sp, sp, #7
+
+; AAPCS: sub sp, sp, #8
+; AAPCS-NOT: bic
+
+ %c = alloca i8, align 8
+ call void @f(i8* %c)
+ ret void
+}
+
+declare void @f(i8*)
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 88d797e..bf827d6 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=arm | FileCheck -check-prefix=ARM %s
-; RUN: llc < %s -march=thumb | FileCheck -check-prefix=THUMB %s
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck -check-prefix=T2 %s
-; RUN: llc < %s -mtriple=thumbv8 | FileCheck -check-prefix=V8 %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck -check-prefix=ARM %s
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck -check-prefix=THUMB %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
+; RUN: | FileCheck -check-prefix=T2 %s
+; RUN: llc -mtriple=thumbv8-eabi %s -o - | FileCheck -check-prefix=V8 %s
; FIXME: The -march=thumb test doesn't change if -disable-peephole is specified.
diff --git a/test/CodeGen/ARM/arm-asm.ll b/test/CodeGen/ARM/arm-asm.ll
index 2e35e39..e869abe 100644
--- a/test/CodeGen/ARM/arm-asm.ll
+++ b/test/CodeGen/ARM/arm-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define void @frame_dummy() {
entry:
diff --git a/test/CodeGen/ARM/arm-modifier.ll b/test/CodeGen/ARM/arm-modifier.ll
index 8548642..580f7e7 100644
--- a/test/CodeGen/ARM/arm-modifier.ll
+++ b/test/CodeGen/ARM/arm-modifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 -no-integrated-as %s -o - | FileCheck %s
define i32 @foo(float %scale, float %scale2) nounwind {
entry:
diff --git a/test/CodeGen/ARM/arm-negative-stride.ll b/test/CodeGen/ARM/arm-negative-stride.ll
index fb0f8ff..7decb97 100644
--- a/test/CodeGen/ARM/arm-negative-stride.ll
+++ b/test/CodeGen/ARM/arm-negative-stride.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
; This loop is rewritten with an indvar which counts down, which
; frees up a register from holding the trip count.
diff --git a/test/CodeGen/ARM/arm-ttype-target2.ll b/test/CodeGen/ARM/arm-ttype-target2.ll
index 8b5087f..4d61cb5 100644
--- a/test/CodeGen/ARM/arm-ttype-target2.ll
+++ b/test/CodeGen/ARM/arm-ttype-target2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=armv7-none-linux-gnueabi -arm-enable-ehabi -arm-enable-ehabi-descriptors < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-none-linux-gnueabi < %s | FileCheck %s
@_ZTVN10__cxxabiv117__class_type_infoE = external global i8*
@_ZTS3Foo = linkonce_odr constant [5 x i8] c"3Foo\00"
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index 0477d4f..a881d5f 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -55,8 +55,8 @@ define i64 @test3(i64* %ptr, i64 %val) {
; CHECK-LABEL: test3:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: and [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK: and [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-DAG: and [[REG3:(r[0-9]?[02468])]], [[REG1]]
+; CHECK-DAG: and [[REG4:(r[0-9]?[13579])]], [[REG2]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -65,8 +65,8 @@ define i64 @test3(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test3:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: and.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB: and.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-DAG: and.w [[REG3:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-DAG: and.w [[REG4:[a-z0-9]+]], [[REG2]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -80,8 +80,8 @@ define i64 @test4(i64* %ptr, i64 %val) {
; CHECK-LABEL: test4:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: orr [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK: orr [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-DAG: orr [[REG3:(r[0-9]?[02468])]], [[REG1]]
+; CHECK-DAG: orr [[REG4:(r[0-9]?[13579])]], [[REG2]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -90,8 +90,8 @@ define i64 @test4(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test4:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: orr.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB: orr.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-DAG: orr.w [[REG3:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-DAG: orr.w [[REG4:[a-z0-9]+]], [[REG2]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -105,8 +105,8 @@ define i64 @test5(i64* %ptr, i64 %val) {
; CHECK-LABEL: test5:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: eor [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK: eor [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-DAG: eor [[REG3:(r[0-9]?[02468])]], [[REG1]]
+; CHECK-DAG: eor [[REG4:(r[0-9]?[13579])]], [[REG2]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -115,8 +115,8 @@ define i64 @test5(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test5:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: eor.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB: eor.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-DAG: eor.w [[REG3:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-DAG: eor.w [[REG4:[a-z0-9]+]], [[REG2]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -151,8 +151,9 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
; CHECK-LABEL: test7:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: cmp [[REG1]]
-; CHECK: cmpeq [[REG2]]
+; CHECK-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG1]], r1
+; CHECK-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG2]], r2
+; CHECK: orrs {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
; CHECK: bne
; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
; CHECK: cmp
@@ -162,16 +163,16 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
; CHECK-THUMB-LABEL: test7:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: cmp [[REG1]]
-; CHECK-THUMB: it eq
-; CHECK-THUMB: cmpeq [[REG2]]
+; CHECK-THUMB-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2
+; CHECK-THUMB-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3
+; CHECK-THUMB: orrs [[MISMATCH_HI]], [[MISMATCH_LO]]
; CHECK-THUMB: bne
; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
; CHECK-THUMB: dmb {{ish$}}
- %r = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst
+ %r = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst
ret i64 %r
}
@@ -216,9 +217,18 @@ define i64 @test10(i64* %ptr, i64 %val) {
; CHECK-LABEL: test10:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]]
-; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]]
-; CHECK: blt
+; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
+; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
+; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
+; CHECK: cmp [[REG1]], r1
+; CHECK: movwls [[CARRY_LO]], #1
+; CHECK: cmp [[REG2]], r2
+; CHECK: movwle [[CARRY_HI]], #1
+; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
+; CHECK: cmp [[CARRY_HI]], #0
+; CHECK: movne [[OUT_HI]], [[REG2]]
+; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
+; CHECK: movne [[OUT_LO]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -227,9 +237,18 @@ define i64 @test10(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test10:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: subs.w {{[a-z0-9]+}}, [[REG1]], [[REG3:[a-z0-9]+]]
-; CHECK-THUMB: sbcs.w {{[a-z0-9]+}}, [[REG2]], [[REG4:[a-z0-9]+]]
-; CHECK-THUMB: blt
+; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
+; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
+; CHECK-THUMB: cmp [[REG1]], r2
+; CHECK-THUMB: movls.w [[CARRY_LO]], #1
+; CHECK-THUMB: cmp [[REG2]], r3
+; CHECK-THUMB: movle [[CARRY_HI]], #1
+; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
+; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
+; CHECK-THUMB: cmp [[CARRY_HI]], #0
+; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
+; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -243,9 +262,18 @@ define i64 @test11(i64* %ptr, i64 %val) {
; CHECK-LABEL: test11:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]]
-; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]]
-; CHECK: blo
+; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
+; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
+; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
+; CHECK: cmp [[REG1]], r1
+; CHECK: movwls [[CARRY_LO]], #1
+; CHECK: cmp [[REG2]], r2
+; CHECK: movwls [[CARRY_HI]], #1
+; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
+; CHECK: cmp [[CARRY_HI]], #0
+; CHECK: movne [[OUT_HI]], [[REG2]]
+; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
+; CHECK: movne [[OUT_LO]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -255,9 +283,18 @@ define i64 @test11(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test11:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: subs.w {{[a-z0-9]+}}, [[REG1]], [[REG3:[a-z0-9]+]]
-; CHECK-THUMB: sbcs.w {{[a-z0-9]+}}, [[REG2]], [[REG4:[a-z0-9]+]]
-; CHECK-THUMB: blo
+; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
+; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
+; CHECK-THUMB: cmp [[REG1]], r2
+; CHECK-THUMB: movls.w [[CARRY_LO]], #1
+; CHECK-THUMB: cmp [[REG2]], r3
+; CHECK-THUMB: movls [[CARRY_HI]], #1
+; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
+; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
+; CHECK-THUMB: cmp [[CARRY_HI]], #0
+; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
+; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -271,9 +308,18 @@ define i64 @test12(i64* %ptr, i64 %val) {
; CHECK-LABEL: test12:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]]
-; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]]
-; CHECK: bge
+; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
+; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
+; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
+; CHECK: cmp [[REG1]], r1
+; CHECK: movwhi [[CARRY_LO]], #1
+; CHECK: cmp [[REG2]], r2
+; CHECK: movwgt [[CARRY_HI]], #1
+; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
+; CHECK: cmp [[CARRY_HI]], #0
+; CHECK: movne [[OUT_HI]], [[REG2]]
+; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
+; CHECK: movne [[OUT_LO]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -282,9 +328,18 @@ define i64 @test12(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test12:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: subs.w {{[a-z0-9]+}}, [[REG1]], [[REG3:[a-z0-9]+]]
-; CHECK-THUMB: sbcs.w {{[a-z0-9]+}}, [[REG2]], [[REG4:[a-z0-9]+]]
-; CHECK-THUMB: bge
+; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
+; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
+; CHECK-THUMB: cmp [[REG1]], r2
+; CHECK-THUMB: movhi.w [[CARRY_LO]], #1
+; CHECK-THUMB: cmp [[REG2]], r3
+; CHECK-THUMB: movgt [[CARRY_HI]], #1
+; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
+; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
+; CHECK-THUMB: cmp [[CARRY_HI]], #0
+; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
+; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -298,9 +353,18 @@ define i64 @test13(i64* %ptr, i64 %val) {
; CHECK-LABEL: test13:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]]
-; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]]
-; CHECK: bhs
+; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
+; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
+; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
+; CHECK: cmp [[REG1]], r1
+; CHECK: movwhi [[CARRY_LO]], #1
+; CHECK: cmp [[REG2]], r2
+; CHECK: movwhi [[CARRY_HI]], #1
+; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
+; CHECK: cmp [[CARRY_HI]], #0
+; CHECK: movne [[OUT_HI]], [[REG2]]
+; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
+; CHECK: movne [[OUT_LO]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -309,9 +373,18 @@ define i64 @test13(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test13:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: subs.w {{[a-z0-9]+}}, [[REG1]], [[REG3:[a-z0-9]+]]
-; CHECK-THUMB: sbcs.w {{[a-z0-9]+}}, [[REG2]], [[REG4:[a-z0-9]+]]
-; CHECK-THUMB: bhs
+; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
+; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
+; CHECK-THUMB: cmp [[REG1]], r2
+; CHECK-THUMB: movhi.w [[CARRY_LO]], #1
+; CHECK-THUMB: cmp [[REG2]], r3
+; CHECK-THUMB: movhi [[CARRY_HI]], #1
+; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
+; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
+; CHECK-THUMB: cmp [[CARRY_HI]], #0
+; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
+; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
diff --git a/test/CodeGen/ARM/atomic-cmp.ll b/test/CodeGen/ARM/atomic-cmp.ll
index 51ada69..a473807 100644
--- a/test/CodeGen/ARM/atomic-cmp.ll
+++ b/test/CodeGen/ARM/atomic-cmp.ll
@@ -10,6 +10,6 @@ define i8 @t(i8* %a, i8 %b, i8 %c) nounwind {
; T2-LABEL: t:
; T2: ldrexb
; T2: strexb
- %tmp0 = cmpxchg i8* %a, i8 %b, i8 %c monotonic
+ %tmp0 = cmpxchg i8* %a, i8 %b, i8 %c monotonic monotonic
ret i8 %tmp0
}
diff --git a/test/CodeGen/ARM/atomic-load-store.ll b/test/CodeGen/ARM/atomic-load-store.ll
index 53c7184..45a263d 100644
--- a/test/CodeGen/ARM/atomic-load-store.ll
+++ b/test/CodeGen/ARM/atomic-load-store.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=ARM
; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=THUMBTWO
; RUN: llc < %s -mtriple=thumbv6-apple-ios | FileCheck %s -check-prefix=THUMBONE
-; RUN llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4
+; RUN: llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4
define void @test1(i32* %ptr, i32 %val1) {
; ARM: test1
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index 9a79c9f..ac8e949 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -194,3 +194,40 @@ entry:
%0 = atomicrmw add i32* %p, i32 1 monotonic
ret i32 %0
}
+
+define i32 @test_cmpxchg_fail_order(i32 *%addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: test_cmpxchg_fail_order:
+
+ %oldval = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+; CHECK: dmb ish
+; CHECK: [[LOOP_BB:\.?LBB[0-9]+_1]]:
+; CHECK: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]]
+; CHECK: cmp [[OLDVAL]], r1
+; CHECK: bxne lr
+; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]]
+; CHECK: cmp [[SUCCESS]], #0
+; CHECK: bne [[LOOP_BB]]
+; CHECK: dmb ish
+; CHECK: bx lr
+
+ ret i32 %oldval
+}
+
+define i32 @test_cmpxchg_fail_order1(i32 *%addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: test_cmpxchg_fail_order1:
+
+ %oldval = cmpxchg i32* %addr, i32 %desired, i32 %new acquire acquire
+; CHECK-NOT: dmb ish
+; CHECK: [[LOOP_BB:\.?LBB[0-9]+_1]]:
+; CHECK: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]]
+; CHECK: cmp [[OLDVAL]], r1
+; CHECK: bne [[END_BB:\.?LBB[0-9]+_[0-9]+]]
+; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]]
+; CHECK: cmp [[SUCCESS]], #0
+; CHECK: bne [[LOOP_BB]]
+; CHECK: [[END_BB]]:
+; CHECK: dmb ish
+; CHECK: bx lr
+
+ ret i32 %oldval
+}
diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll
index 3f93929..7922e22 100644
--- a/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=armv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=thumbv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=armv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM
+; RUN: llc -mtriple=thumbv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-THUMB
@var8 = global i8 0
@var16 = global i16 0
@@ -15,7 +15,7 @@ define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -38,7 +38,7 @@ define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -61,7 +61,7 @@ define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -75,7 +75,7 @@ define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
+define void @test_atomic_load_add_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_add_i64:
%old = atomicrmw add i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
@@ -84,10 +84,10 @@ define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: adds [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: adds{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
; CHECK-NEXT: adc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
@@ -95,9 +95,9 @@ define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
@@ -109,7 +109,7 @@ define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -132,7 +132,7 @@ define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -155,7 +155,7 @@ define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -169,7 +169,7 @@ define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
+define void @test_atomic_load_sub_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_sub_i64:
%old = atomicrmw sub i64* @var64, i64 %offset seq_cst
; CHECK-NOT: dmb
@@ -178,10 +178,10 @@ define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: subs [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: subs{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
; CHECK-NEXT: sbc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
@@ -189,9 +189,9 @@ define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
@@ -203,7 +203,7 @@ define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -226,7 +226,7 @@ define i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -249,7 +249,7 @@ define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -263,7 +263,7 @@ define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
+define void @test_atomic_load_and_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_and_i64:
%old = atomicrmw and i64* @var64, i64 %offset acquire
; CHECK-NOT: dmb
@@ -272,20 +272,20 @@ define i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: and{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: and{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
-; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-DAG: and{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-DAG: and{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
@@ -297,7 +297,7 @@ define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -320,7 +320,7 @@ define i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -343,7 +343,7 @@ define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -357,7 +357,7 @@ define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
+define void @test_atomic_load_or_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_or_i64:
%old = atomicrmw or i64* @var64, i64 %offset release
; CHECK-NOT: dmb
@@ -366,20 +366,20 @@ define i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: orr{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: orr{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
-; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-DAG: orr{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-DAG: orr{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
@@ -391,7 +391,7 @@ define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -414,7 +414,7 @@ define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -437,7 +437,7 @@ define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
@@ -451,7 +451,7 @@ define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
+define void @test_atomic_load_xor_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_xor_i64:
%old = atomicrmw xor i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
@@ -460,20 +460,20 @@ define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: eor{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: eor{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
-; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-DAG: eor{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-DAG: eor{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK: strd r[[OLD1]], r[[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
@@ -485,7 +485,7 @@ define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
@@ -507,7 +507,7 @@ define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
@@ -529,7 +529,7 @@ define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
@@ -542,7 +542,7 @@ define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
+define void @test_atomic_load_xchg_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_xchg_i64:
%old = atomicrmw xchg i64* @var64, i64 %offset acquire
; CHECK-NOT: dmb
@@ -551,7 +551,7 @@ define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
@@ -560,28 +560,28 @@ define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
-define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
+define i8 @test_atomic_load_min_i8(i8 signext %offset) nounwind {
; CHECK-LABEL: test_atomic_load_min_i8:
%old = atomicrmw min i8* @var8, i8 %offset acquire
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-DAG: movw [[ADDR:r[0-9]+|lr]], :lower16:var8
+; CHECK-DAG: movt [[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexb r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; CHECK-NEXT: sxtb r[[OLDX:[0-9]+]], r[[OLD]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp r[[OLDX]], r0
-; Thumb mode: it ge
-; CHECK: movge r[[OLDX]], r0
-; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; Thumb mode: it le
+; CHECK: movle r[[OLDX]], r[[OLD]]
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[OLDX]], {{.*}}[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -591,23 +591,23 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
ret i8 %old
}
-define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
+define i16 @test_atomic_load_min_i16(i16 signext %offset) nounwind {
; CHECK-LABEL: test_atomic_load_min_i16:
%old = atomicrmw min i16* @var16, i16 %offset release
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
+; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var16
+; CHECK: movt [[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexh r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; CHECK-NEXT: sxth r[[OLDX:[0-9]+]], r[[OLD]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp r[[OLDX]], r0
-; Thumb mode: it ge
-; CHECK: movge r[[OLDX]], r0
-; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; Thumb mode: it le
+; CHECK: movle r[[OLDX]], r[[OLD]]
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r[[OLDX]], {{.*}}[[ADDR]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -626,13 +626,13 @@ define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
; CHECK-NEXT: cmp r[[OLD]], r0
-; Thumb mode: it lt
-; CHECK: movlt r[[NEW]], r[[OLD]]
+; Thumb mode: it le
+; CHECK: movle r[[NEW]], r[[OLD]]
; CHECK-NEXT: strex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
@@ -643,7 +643,7 @@ define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
+define void @test_atomic_load_min_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_min_i64:
%old = atomicrmw min i64* @var64, i64 %offset seq_cst
; CHECK-NOT: dmb
@@ -652,41 +652,50 @@ define i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
-; CHECK-NEXT: blt .LBB{{[0-9]+}}_3
-; CHECK-NEXT: BB#2:
-; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
+; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
+; CHECK-ARM: cmp [[OLD1]], r0
+; CHECK-ARM: movwls [[LOCARRY]], #1
+; CHECK-ARM: cmp [[OLD2]], r1
+; CHECK-ARM: movwle [[HICARRY]], #1
+; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
+; CHECK-ARM: cmp [[HICARRY]], #0
+; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
+; CHECK-ARM: movne [[MINHI]], [[OLD2]]
+; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
+; CHECK-ARM: movne [[MINLO]], [[OLD1]]
+; CHECK-ARM: stlexd [[STATUS:r[0-9]+]], [[MINLO]], [[MINHI]], [r[[ADDR]]]
+; CHECK-THUMB: stlexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
-define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
+define i8 @test_atomic_load_max_i8(i8 signext %offset) nounwind {
; CHECK-LABEL: test_atomic_load_max_i8:
%old = atomicrmw max i8* @var8, i8 %offset seq_cst
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var8
+; CHECK: movt [[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexb r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; CHECK-NEXT: sxtb r[[OLDX:[0-9]+]], r[[OLD]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp r[[OLDX]], r0
-; Thumb mode: it le
-; CHECK: movle r[[OLDX]], r0
-; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; Thumb mode: it gt
+; CHECK: movgt r[[OLDX]], r[[OLD]]
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[OLDX]], {{.*}}[[ADDR]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -696,7 +705,7 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
ret i8 %old
}
-define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
+define i16 @test_atomic_load_max_i16(i16 signext %offset) nounwind {
; CHECK-LABEL: test_atomic_load_max_i16:
%old = atomicrmw max i16* @var16, i16 %offset acquire
; CHECK-NOT: dmb
@@ -705,13 +714,13 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; CHECK-NEXT: sxth r[[OLDX:[0-9]+]], r[[OLD]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp r[[OLDX]], r0
-; Thumb mode: it le
-; CHECK: movle r[[OLDX]], r0
+; Thumb mode: it gt
+; CHECK: movgt r[[OLDX]], r[[OLD]]
; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
@@ -731,7 +740,7 @@ define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
@@ -748,7 +757,7 @@ define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
+define void @test_atomic_load_max_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_max_i64:
%old = atomicrmw max i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
@@ -757,41 +766,50 @@ define i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
-; CHECK-NEXT: bge .LBB{{[0-9]+}}_3
-; CHECK-NEXT: BB#2:
-; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
+; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
+; CHECK-ARM: cmp [[OLD1]], r0
+; CHECK-ARM: movwhi [[LOCARRY]], #1
+; CHECK-ARM: cmp [[OLD2]], r1
+; CHECK-ARM: movwgt [[HICARRY]], #1
+; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
+; CHECK-ARM: cmp [[HICARRY]], #0
+; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
+; CHECK-ARM: movne [[MINHI]], [[OLD2]]
+; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
+; CHECK-ARM: movne [[MINLO]], [[OLD1]]
+; CHECK-ARM: strexd [[STATUS:r[0-9]+]], [[MINLO]], [[MINHI]], [r[[ADDR]]]
+; CHECK-THUMB: strexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
-define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
+define i8 @test_atomic_load_umin_i8(i8 zeroext %offset) nounwind {
; CHECK-LABEL: test_atomic_load_umin_i8:
%old = atomicrmw umin i8* @var8, i8 %offset monotonic
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var8
+; CHECK: movt [[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexb r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
; CHECK-NEXT: cmp r[[OLD]], r0
-; Thumb mode: it lo
-; CHECK: movlo r[[NEW]], r[[OLD]]
-; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; Thumb mode: it ls
+; CHECK: movls r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[NEW]], {{.*}}[[ADDR]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -801,23 +819,23 @@ define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
ret i8 %old
}
-define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
+define i16 @test_atomic_load_umin_i16(i16 zeroext %offset) nounwind {
; CHECK-LABEL: test_atomic_load_umin_i16:
%old = atomicrmw umin i16* @var16, i16 %offset acquire
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
+; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var16
+; CHECK: movt [[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexh r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
; CHECK-NEXT: cmp r[[OLD]], r0
-; Thumb mode: it lo
-; CHECK: movlo r[[NEW]], r[[OLD]]
-; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; Thumb mode: it ls
+; CHECK: movls r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], {{.*}}[[ADDR]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -836,13 +854,13 @@ define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
; CHECK-NEXT: cmp r[[OLD]], r0
-; Thumb mode: it lo
-; CHECK: movlo r[[NEW]], r[[OLD]]
+; Thumb mode: it ls
+; CHECK: movls r[[NEW]], r[[OLD]]
; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
@@ -853,50 +871,59 @@ define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
+define void @test_atomic_load_umin_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_umin_i64:
- %old = atomicrmw umin i64* @var64, i64 %offset acq_rel
+ %old = atomicrmw umin i64* @var64, i64 %offset seq_cst
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
-; CHECK-NEXT: blo .LBB{{[0-9]+}}_3
-; CHECK-NEXT: BB#2:
-; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
+; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
+; CHECK-ARM: cmp [[OLD1]], r0
+; CHECK-ARM: movwls [[LOCARRY]], #1
+; CHECK-ARM: cmp [[OLD2]], r1
+; CHECK-ARM: movwls [[HICARRY]], #1
+; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
+; CHECK-ARM: cmp [[HICARRY]], #0
+; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
+; CHECK-ARM: movne [[MINHI]], [[OLD2]]
+; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
+; CHECK-ARM: movne [[MINLO]], [[OLD1]]
+; CHECK-ARM: stlexd [[STATUS:r[0-9]+]], [[MINLO]], [[MINHI]], [r[[ADDR]]]
+; CHECK-THUMB: stlexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
-define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
+define i8 @test_atomic_load_umax_i8(i8 zeroext %offset) nounwind {
; CHECK-LABEL: test_atomic_load_umax_i8:
%old = atomicrmw umax i8* @var8, i8 %offset acq_rel
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
-; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var8
+; CHECK: movt [[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexb r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it hi
; CHECK: movhi r[[NEW]], r[[OLD]]
-; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[NEW]], {{.*}}[[ADDR]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -906,23 +933,23 @@ define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
ret i8 %old
}
-define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
+define i16 @test_atomic_load_umax_i16(i16 zeroext %offset) nounwind {
; CHECK-LABEL: test_atomic_load_umax_i16:
%old = atomicrmw umax i16* @var16, i16 %offset monotonic
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
-; CHECK: movt r[[ADDR]], :upper16:var16
+; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var16
+; CHECK: movt [[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrexh r[[OLD:[0-9]+]], {{.*}}[[ADDR]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
; CHECK-NEXT: cmp r[[OLD]], r0
; Thumb mode: it hi
; CHECK: movhi r[[NEW]], r[[OLD]]
-; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], {{.*}}[[ADDR]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -941,7 +968,7 @@ define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
@@ -958,50 +985,59 @@ define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
ret i32 %old
}
-define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
+define void @test_atomic_load_umax_i64(i64 %offset) nounwind {
; CHECK-LABEL: test_atomic_load_umax_i64:
- %old = atomicrmw umax i64* @var64, i64 %offset release
+ %old = atomicrmw umax i64* @var64, i64 %offset seq_cst
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
-; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
-; CHECK-NEXT: bhs .LBB{{[0-9]+}}_3
-; CHECK-NEXT: BB#2:
-; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
+; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
+; CHECK-ARM: cmp [[OLD1]], r0
+; CHECK-ARM: movwhi [[LOCARRY]], #1
+; CHECK-ARM: cmp [[OLD2]], r1
+; CHECK-ARM: movwhi [[HICARRY]], #1
+; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
+; CHECK-ARM: cmp [[HICARRY]], #0
+; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
+; CHECK-ARM: movne [[MINHI]], [[OLD2]]
+; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
+; CHECK-ARM: movne [[MINLO]], [[OLD1]]
+; CHECK-ARM: stlexd [[STATUS:r[0-9]+]], [[MINLO]], [[MINHI]], [r[[ADDR]]]
+; CHECK-THUMB: stlexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD1]]
-; CHECK-NEXT: mov r1, r[[OLD2]]
- ret i64 %old
+; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
-define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
+define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i8:
- %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire
+ %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
; CHECK: movt r[[ADDR]], :upper16:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp r[[OLD]], r0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
; CHECK-NEXT: BB#2:
; As above, r1 is a reasonable guess.
-; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK: strexb [[STATUS:r[0-9]+]], r1, {{.*}}[[ADDR]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -1011,23 +1047,23 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
ret i8 %old
}
-define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
+define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i16:
- %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst
+ %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
; CHECK: movt r[[ADDR]], :upper16:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp r[[OLD]], r0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
; CHECK-NEXT: BB#2:
; As above, r1 is a reasonable guess.
-; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -1037,59 +1073,60 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
ret i16 %old
}
-define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
+define void @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i32:
- %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release
+ %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
+ store i32 %old, i32* @var32
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
; CHECK: movt r[[ADDR]], :upper16:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
; r0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp r[[OLD]], r0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
; CHECK-NEXT: BB#2:
; As above, r1 is a reasonable guess.
-; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, r[[OLD]]
- ret i32 %old
+; CHECK: str{{(.w)?}} r[[OLD]],
+ ret void
}
-define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
+define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i64:
- %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic
+ %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
; CHECK: movt r[[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldrexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
+; CHECK: ldrexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp [[OLD1]], r0
-; Thumb mode: it eq
-; CHECK: cmpeq [[OLD2]], r1
+; CHECK-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
+; CHECK-DAG: eor{{(\.w)?}} [[MISMATCH_HI:r[0-9]+|lr]], [[OLD2]], r1
+; CHECK: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
; CHECK-NEXT: BB#2:
; As above, r2, r3 is a reasonable guess.
-; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
+; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: mov r0, [[OLD1]]
-; CHECK-NEXT: mov r1, [[OLD2]]
- ret i64 %old
+; CHECK-ARM: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+ store i64 %old, i64* @var64
+ ret void
}
define i8 @test_atomic_load_monotonic_i8() nounwind {
@@ -1303,13 +1340,13 @@ define void @test_atomic_store_release_i64(i64 %val) nounwind {
store atomic i64 %val, i64* @var64 release, align 8
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
-; CHECK: movt r[[ADDR]], :upper16:var64
+; CHECK: movw [[ADDR:r[0-9]+|lr]], :lower16:var64
+; CHECK: movt [[ADDR]], :upper16:var64
; CHECK: .LBB{{[0-9]+}}_1:
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK: stlexd [[STATUS:r[0-9]+]], r0, r1, {{.*}}[[ADDR]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -1337,7 +1374,7 @@ atomic_ver:
; The key point here is that the second dmb isn't immediately followed by the
; simple_ver basic block, which LLVM attempted to do when DMB had been marked
; with isBarrier. For now, look for something that looks like "somewhere".
-; CHECK-NEXT: mov
+; CHECK-NEXT: {{mov|bx}}
somewhere:
%combined = phi i32 [ %val, %atomic_ver ], [ %newval, %simple_ver]
ret i32 %combined
diff --git a/test/CodeGen/ARM/atomicrmw_minmax.ll b/test/CodeGen/ARM/atomicrmw_minmax.ll
index 5befc22..68bf714 100644
--- a/test/CodeGen/ARM/atomicrmw_minmax.ll
+++ b/test/CodeGen/ARM/atomicrmw_minmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s
; CHECK-LABEL: max:
define i32 @max(i8 %ctx, i32* %ptr, i32 %val)
@@ -15,7 +15,7 @@ define i32 @min(i8 %ctx, i32* %ptr, i32 %val)
{
; CHECK: ldrex
; CHECK: cmp [[old:r[0-9]*]], [[val:r[0-9]*]]
-; CHECK: movlo {{r[0-9]*}}, [[old]]
+; CHECK: movls {{r[0-9]*}}, [[old]]
%old = atomicrmw umin i32* %ptr, i32 %val monotonic
ret i32 %old
}
diff --git a/test/CodeGen/ARM/bfc.ll b/test/CodeGen/ARM/bfc.ll
index 3a17d2b..1162aac 100644
--- a/test/CodeGen/ARM/bfc.ll
+++ b/test/CodeGen/ARM/bfc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s
; 4278190095 = 0xff00000f
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/ARM/bfi.ll b/test/CodeGen/ARM/bfi.ll
index 72a4678..bce09da 100644
--- a/test/CodeGen/ARM/bfi.ll
+++ b/test/CodeGen/ARM/bfi.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm -mattr=+v6t2 < %s | FileCheck %s
+; RUN: llc -mtriple=arm -mattr=+v6t2 %s -o - | FileCheck %s
%struct.F = type { [3 x i8], i8 }
diff --git a/test/CodeGen/ARM/bfx.ll b/test/CodeGen/ARM/bfx.ll
index 394da9e..46f49e9 100644
--- a/test/CodeGen/ARM/bfx.ll
+++ b/test/CodeGen/ARM/bfx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v7 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v7 %s -o - | FileCheck %s
define i32 @sbfx1(i32 %a) {
; CHECK: sbfx1
diff --git a/test/CodeGen/ARM/bic.ll b/test/CodeGen/ARM/bic.ll
index 1dfd627..691f8be 100644
--- a/test/CodeGen/ARM/bic.ll
+++ b/test/CodeGen/ARM/bic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
%tmp = xor i32 %b, 4294967295
diff --git a/test/CodeGen/ARM/bits.ll b/test/CodeGen/ARM/bits.ll
index ce1b2ad..14aa27e 100644
--- a/test/CodeGen/ARM/bits.ll
+++ b/test/CodeGen/ARM/bits.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
entry:
diff --git a/test/CodeGen/ARM/build-attributes-encoding.s b/test/CodeGen/ARM/build-attributes-encoding.s
index 5ad51b2..34a1ad3 100644
--- a/test/CodeGen/ARM/build-attributes-encoding.s
+++ b/test/CodeGen/ARM/build-attributes-encoding.s
@@ -4,7 +4,7 @@
// RUN: | llvm-readobj -s -sd | FileCheck %s
// Tag_CPU_name (=5)
-.cpu Cortex-A8
+.cpu cortex-a8
// Tag_CPU_arch (=6)
.eabi_attribute 6, 10
@@ -61,7 +61,7 @@
.eabi_attribute 110, 160
// Check that tags > 128 are encoded properly
-.eabi_attribute 129, 1
+.eabi_attribute 129, "1"
.eabi_attribute 250, 1
// CHECK: Section {
@@ -71,15 +71,15 @@
// CHECK-NEXT: ]
// CHECK-NEXT: Address: 0x0
// CHECK-NEXT: Offset: 0x34
-// CHECK-NEXT: Size: 70
+// CHECK-NEXT: Size: 71
// CHECK-NEXT: Link: 0
// CHECK-NEXT: Info: 0
// CHECK-NEXT: AddressAlignment: 1
// CHECK-NEXT: EntrySize: 0
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 41450000 00616561 62690001 3B000000
+// CHECK-NEXT: 0000: 41460000 00616561 62690001 3C000000
// CHECK-NEXT: 0010: 05434F52 5445582D 41380006 0A074108
// CHECK-NEXT: 0020: 0109020A 030C0214 01150117 01180119
// CHECK-NEXT: 0030: 011B001C 0124012A 012C0244 036EA001
-// CHECK-NEXT: 0040: 810101FA 0101
+// CHECK-NEXT: 0040: 81013100 FA0101
// CHECK-NEXT: )
diff --git a/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll b/test/CodeGen/ARM/build-attributes.ll
index 3053694..3e825e8 100644
--- a/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -1,6 +1,7 @@
; This tests that MC/asm header conversion is smooth and that the
; build attributes are correct
+; RUN: llc < %s -mtriple=thumbv5-linux-gnueabi -mcpu=xscale | FileCheck %s --check-prefix=XSCALE
; RUN: llc < %s -mtriple=armv6-linux-gnueabi | FileCheck %s --check-prefix=V6
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi | FileCheck %s --check-prefix=V6M
; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s | FileCheck %s --check-prefix=ARM1156T2F-S
@@ -12,16 +13,30 @@
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-fp-armv8,-crypto | FileCheck %s --check-prefix=V8-NEON
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-crypto | FileCheck %s --check-prefix=V8-FPARMv8-NEON
; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8-FPARMv8-NEON-CRYPTO
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-neon,+d16 | FileCheck %s --check-prefix=CORTEX-A5-NONEON
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A5-NOFPU
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A12-NOFPU
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9-mp | FileCheck %s --check-prefix=CORTEX-A9-MP
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=CORTEX-M3
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=CORTEX-A7-CHECK
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2,-vfp3,-vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-NOFPU
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,,+d16,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
+
+; XSCALE: .eabi_attribute 6, 5
+; XSCALE: .eabi_attribute 8, 1
+; XSCALE: .eabi_attribute 9, 1
; V6: .eabi_attribute 6, 6
; V6: .eabi_attribute 8, 1
@@ -34,7 +49,7 @@
; V6-NOT: .eabi_attribute 68
; V6M: .eabi_attribute 6, 12
-; V6M: .eabi_attribute 7, 77
+; V6M-NOT: .eabi_attribute 7
; V6M: .eabi_attribute 8, 0
; V6M: .eabi_attribute 9, 1
; V6M: .eabi_attribute 24, 1
@@ -71,7 +86,7 @@
; V7M-NOT: .eabi_attribute 28
; V7M-NOT: .eabi_attribute 36
; V7M-NOT: .eabi_attribute 42
-; V7M: .eabi_attribute 44, 0
+; V7M-NOT: .eabi_attribute 44
; V7M-NOT: .eabi_attribute 68
; V7: .syntax unified
@@ -112,6 +127,117 @@
; V8-FPARMv8-NEON-CRYPTO: .fpu crypto-neon-fp-armv8
; V8-FPARMv8-NEON-CRYPTO: .eabi_attribute 12, 3
+; Tag_CPU_arch 'ARMv7'
+; CORTEX-A7-CHECK: .eabi_attribute 6, 10
+; CORTEX-A7-NOFPU: .eabi_attribute 6, 10
+; CORTEX-A7-FPUV4: .eabi_attribute 6, 10
+
+; Tag_CPU_arch_profile 'A'
+; CORTEX-A7-CHECK: .eabi_attribute 7, 65
+; CORTEX-A7-NOFPU: .eabi_attribute 7, 65
+; CORTEX-A7-FPUV4: .eabi_attribute 7, 65
+
+; Tag_ARM_ISA_use
+; CORTEX-A7-CHECK: .eabi_attribute 8, 1
+; CORTEX-A7-NOFPU: .eabi_attribute 8, 1
+; CORTEX-A7-FPUV4: .eabi_attribute 8, 1
+
+; Tag_THUMB_ISA_use
+; CORTEX-A7-CHECK: .eabi_attribute 9, 2
+; CORTEX-A7-NOFPU: .eabi_attribute 9, 2
+; CORTEX-A7-FPUV4: .eabi_attribute 9, 2
+
+; CORTEX-A7-CHECK: .fpu neon-vfpv4
+; CORTEX-A7-NOFPU-NOT: .fpu
+; CORTEX-A7-FPUV4: .fpu vfpv4
+
+; Tag_ABI_FP_denormal
+; CORTEX-A7-CHECK: .eabi_attribute 20, 1
+; CORTEX-A7-NOFPU: .eabi_attribute 20, 1
+; CORTEX-A7-FPUV4: .eabi_attribute 20, 1
+
+; Tag_ABI_FP_exceptions
+; CORTEX-A7-CHECK: .eabi_attribute 21, 1
+; CORTEX-A7-NOFPU: .eabi_attribute 21, 1
+; CORTEX-A7-FPUV4: .eabi_attribute 21, 1
+
+; Tag_ABI_FP_number_model
+; CORTEX-A7-CHECK: .eabi_attribute 23, 3
+; CORTEX-A7-NOFPU: .eabi_attribute 23, 3
+; CORTEX-A7-FPUV4: .eabi_attribute 23, 3
+
+; Tag_ABI_align_needed
+; CORTEX-A7-CHECK: .eabi_attribute 24, 1
+; CORTEX-A7-NOFPU: .eabi_attribute 24, 1
+; CORTEX-A7-FPUV4: .eabi_attribute 24, 1
+
+; Tag_ABI_align_preserved
+; CORTEX-A7-CHECK: .eabi_attribute 25, 1
+; CORTEX-A7-NOFPU: .eabi_attribute 25, 1
+; CORTEX-A7-FPUV4: .eabi_attribute 25, 1
+
+; Tag_FP_HP_extension
+; CORTEX-A7-CHECK: .eabi_attribute 36, 1
+; CORTEX-A7-NOFPU: .eabi_attribute 36, 1
+; CORTEX-A7-FPUV4: .eabi_attribute 36, 1
+
+; Tag_MPextension_use
+; CORTEX-A7-CHECK: .eabi_attribute 42, 1
+; CORTEX-A7-NOFPU: .eabi_attribute 42, 1
+; CORTEX-A7-FPUV4: .eabi_attribute 42, 1
+
+; Tag_DIV_use
+; CORTEX-A7-CHECK: .eabi_attribute 44, 2
+; CORTEX-A7-NOFPU: .eabi_attribute 44, 2
+; CORTEX-A7-FPUV4: .eabi_attribute 44, 2
+
+; Tag_Virtualization_use
+; CORTEX-A7-CHECK: .eabi_attribute 68, 3
+; CORTEX-A7-NOFPU: .eabi_attribute 68, 3
+; CORTEX-A7-FPUV4: .eabi_attribute 68, 3
+
+; CORTEX-A5-DEFAULT: .cpu cortex-a5
+; CORTEX-A5-DEFAULT: .eabi_attribute 6, 10
+; CORTEX-A5-DEFAULT: .eabi_attribute 7, 65
+; CORTEX-A5-DEFAULT: .eabi_attribute 8, 1
+; CORTEX-A5-DEFAULT: .eabi_attribute 9, 2
+; CORTEX-A5-DEFAULT: .fpu neon-vfpv4
+; CORTEX-A5-DEFAULT: .eabi_attribute 20, 1
+; CORTEX-A5-DEFAULT: .eabi_attribute 21, 1
+; CORTEX-A5-DEFAULT: .eabi_attribute 23, 3
+; CORTEX-A5-DEFAULT: .eabi_attribute 24, 1
+; CORTEX-A5-DEFAULT: .eabi_attribute 25, 1
+; CORTEX-A5-DEFAULT: .eabi_attribute 42, 1
+; CORTEX-A5-DEFAULT: .eabi_attribute 68, 1
+
+; CORTEX-A5-NONEON: .cpu cortex-a5
+; CORTEX-A5-NONEON: .eabi_attribute 6, 10
+; CORTEX-A5-NONEON: .eabi_attribute 7, 65
+; CORTEX-A5-NONEON: .eabi_attribute 8, 1
+; CORTEX-A5-NONEON: .eabi_attribute 9, 2
+; CORTEX-A5-NONEON: .fpu vfpv4-d16
+; CORTEX-A5-NONEON: .eabi_attribute 20, 1
+; CORTEX-A5-NONEON: .eabi_attribute 21, 1
+; CORTEX-A5-NONEON: .eabi_attribute 23, 3
+; CORTEX-A5-NONEON: .eabi_attribute 24, 1
+; CORTEX-A5-NONEON: .eabi_attribute 25, 1
+; CORTEX-A5-NONEON: .eabi_attribute 42, 1
+; CORTEX-A5-NONEON: .eabi_attribute 68, 1
+
+; CORTEX-A5-NOFPU: .cpu cortex-a5
+; CORTEX-A5-NOFPU: .eabi_attribute 6, 10
+; CORTEX-A5-NOFPU: .eabi_attribute 7, 65
+; CORTEX-A5-NOFPU: .eabi_attribute 8, 1
+; CORTEX-A5-NOFPU: .eabi_attribute 9, 2
+; CORTEX-A5-NOFPU-NOT: .fpu
+; CORTEX-A5-NOFPU: .eabi_attribute 20, 1
+; CORTEX-A5-NOFPU: .eabi_attribute 21, 1
+; CORTEX-A5-NOFPU: .eabi_attribute 23, 3
+; CORTEX-A5-NOFPU: .eabi_attribute 24, 1
+; CORTEX-A5-NOFPU: .eabi_attribute 25, 1
+; CORTEX-A5-NOFPU: .eabi_attribute 42, 1
+; CORTEX-A5-NOFPU: .eabi_attribute 68, 1
+
; CORTEX-A9-SOFT: .cpu cortex-a9
; CORTEX-A9-SOFT: .eabi_attribute 6, 10
; CORTEX-A9-SOFT: .eabi_attribute 7, 65
@@ -157,12 +283,42 @@
; CORTEX-A9-MP: .eabi_attribute 23, 3
; CORTEX-A9-MP: .eabi_attribute 24, 1
; CORTEX-A9-MP: .eabi_attribute 25, 1
-; CORTEX-A9-NOT: .eabi_attribute 27
-; CORTEX-A9-NOT: .eabi_attribute 28
+; CORTEX-A9-MP-NOT: .eabi_attribute 27
+; CORTEX-A9-MP-NOT: .eabi_attribute 28
; CORTEX-A9-MP: .eabi_attribute 36, 1
; CORTEX-A9-MP: .eabi_attribute 42, 1
; CORTEX-A9-MP: .eabi_attribute 68, 1
+; CORTEX-A12-DEFAULT: .cpu cortex-a12
+; CORTEX-A12-DEFAULT: .eabi_attribute 6, 10
+; CORTEX-A12-DEFAULT: .eabi_attribute 7, 65
+; CORTEX-A12-DEFAULT: .eabi_attribute 8, 1
+; CORTEX-A12-DEFAULT: .eabi_attribute 9, 2
+; CORTEX-A12-DEFAULT: .fpu neon-vfpv4
+; CORTEX-A12-DEFAULT: .eabi_attribute 20, 1
+; CORTEX-A12-DEFAULT: .eabi_attribute 21, 1
+; CORTEX-A12-DEFAULT: .eabi_attribute 23, 3
+; CORTEX-A12-DEFAULT: .eabi_attribute 24, 1
+; CORTEX-A12-DEFAULT: .eabi_attribute 25, 1
+; CORTEX-A12-DEFAULT: .eabi_attribute 42, 1
+; CORTEX-A12-DEFAULT: .eabi_attribute 44, 2
+; CORTEX-A12-DEFAULT: .eabi_attribute 68, 3
+
+; CORTEX-A12-NOFPU: .cpu cortex-a12
+; CORTEX-A12-NOFPU: .eabi_attribute 6, 10
+; CORTEX-A12-NOFPU: .eabi_attribute 7, 65
+; CORTEX-A12-NOFPU: .eabi_attribute 8, 1
+; CORTEX-A12-NOFPU: .eabi_attribute 9, 2
+; CORTEX-A12-NOFPU-NOT: .fpu
+; CORTEX-A12-NOFPU: .eabi_attribute 20, 1
+; CORTEX-A12-NOFPU: .eabi_attribute 21, 1
+; CORTEX-A12-NOFPU: .eabi_attribute 23, 3
+; CORTEX-A12-NOFPU: .eabi_attribute 24, 1
+; CORTEX-A12-NOFPU: .eabi_attribute 25, 1
+; CORTEX-A12-NOFPU: .eabi_attribute 42, 1
+; CORTEX-A12-NOFPU: .eabi_attribute 44, 2
+; CORTEX-A12-NOFPU: .eabi_attribute 68, 3
+
; CORTEX-A15: .cpu cortex-a15
; CORTEX-A15: .eabi_attribute 6, 10
; CORTEX-A15: .eabi_attribute 7, 65
@@ -183,7 +339,7 @@
; CORTEX-M0: .cpu cortex-m0
; CORTEX-M0: .eabi_attribute 6, 12
-; CORTEX-M0: .eabi_attribute 7, 77
+; CORTEX-M0-NOT: .eabi_attribute 7
; CORTEX-M0: .eabi_attribute 8, 0
; CORTEX-M0: .eabi_attribute 9, 1
; CORTEX-M0: .eabi_attribute 24, 1
@@ -194,6 +350,23 @@
; CORTEX-M0-NOT: .eabi_attribute 42
; CORTEX-M0-NOT: .eabi_attribute 68
+; CORTEX-M3: .cpu cortex-m3
+; CORTEX-M3: .eabi_attribute 6, 10
+; CORTEX-M3: .eabi_attribute 7, 77
+; CORTEX-M3: .eabi_attribute 8, 0
+; CORTEX-M3: .eabi_attribute 9, 2
+; CORTEX-M3: .eabi_attribute 20, 1
+; CORTEX-M3: .eabi_attribute 21, 1
+; CORTEX-M3: .eabi_attribute 23, 3
+; CORTEX-M3: .eabi_attribute 24, 1
+; CORTEX-M3: .eabi_attribute 25, 1
+; CORTEX-M3-NOT: .eabi_attribute 27
+; CORTEX-M3-NOT: .eabi_attribute 28
+; CORTEX-M3-NOT: .eabi_attribute 36
+; CORTEX-M3-NOT: .eabi_attribute 42
+; CORTEX-M3-NOT: .eabi_attribute 44
+; CORTEX-M3-NOT: .eabi_attribute 68
+
; CORTEX-M4-SOFT: .cpu cortex-m4
; CORTEX-M4-SOFT: .eabi_attribute 6, 13
; CORTEX-M4-SOFT: .eabi_attribute 7, 77
@@ -209,7 +382,7 @@
; CORTEX-M4-SOFT-NOT: .eabi_attribute 28
; CORTEX-M4-SOFT: .eabi_attribute 36, 1
; CORTEX-M4-SOFT-NOT: .eabi_attribute 42
-; CORTEX-M4-SOFT: .eabi_attribute 44, 0
+; CORTEX-M4-SOFT-NOT: .eabi_attribute 44
; CORTEX-M4-SOFT-NOT: .eabi_attribute 68
; CORTEX-M4-HARD: .cpu cortex-m4
@@ -227,8 +400,8 @@
; CORTEX-M4-HARD: .eabi_attribute 28, 1
; CORTEX-M4-HARD: .eabi_attribute 36, 1
; CORTEX-M4-HARD-NOT: .eabi_attribute 42
-; CORTEX-M4-HARD: .eabi_attribute 44, 0
-; CORTEX-M4-HRAD-NOT: .eabi_attribute 68
+; CORTEX-M4-HARD-NOT: .eabi_attribute 44
+; CORTEX-M4-HARD-NOT: .eabi_attribute 68
; CORTEX-R5: .cpu cortex-r5
; CORTEX-R5: .eabi_attribute 6, 10
@@ -261,7 +434,7 @@
; CORTEX-A53-NOT: .eabi_attribute 28
; CORTEX-A53: .eabi_attribute 36, 1
; CORTEX-A53: .eabi_attribute 42, 1
-; CORTEX-A53: .eabi_attribute 44, 2
+; CORTEX-A53-NOT: .eabi_attribute 44
; CORTEX-A53: .eabi_attribute 68, 3
; CORTEX-A57: .cpu cortex-a57
@@ -277,7 +450,7 @@
; CORTEX-A57-NOT: .eabi_attribute 28
; CORTEX-A57: .eabi_attribute 36, 1
; CORTEX-A57: .eabi_attribute 42, 1
-; CORTEX-A57: .eabi_attribute 44, 2
+; CORTEX-A57-NOT: .eabi_attribute 44
; CORTEX-A57: .eabi_attribute 68, 3
define i32 @f(i64 %z) {
diff --git a/test/CodeGen/ARM/cache-intrinsic.ll b/test/CodeGen/ARM/cache-intrinsic.ll
new file mode 100644
index 0000000..6048917
--- /dev/null
+++ b/test/CodeGen/ARM/cache-intrinsic.ll
@@ -0,0 +1,26 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+@buffer = global [32 x i8] c"This is a largely unused buffer\00", align 1
+@.str = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
+@.str1 = private unnamed_addr constant [25 x i8] c"Still, largely unused...\00", align 1
+
+define i32 @main() {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0))
+ %call1 = call i8* @strcpy(i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8]* @.str1, i32 0, i32 0)) #3
+ call void @llvm.clear_cache(i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds (i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i32 32)) #3
+ %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0))
+ ret i32 0
+}
+
+; CHECK: __clear_cache
+
+declare i32 @printf(i8*, ...)
+
+declare i8* @strcpy(i8*, i8*)
+
+declare void @llvm.clear_cache(i8*, i8*)
diff --git a/test/CodeGen/ARM/call-tc.ll b/test/CodeGen/ARM/call-tc.ll
index d463602..40694bf 100644
--- a/test/CodeGen/ARM/call-tc.ll
+++ b/test/CodeGen/ARM/call-tc.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=armv6-apple-ios -mattr=+vfp2 -arm-tail-calls | FileCheck %s -check-prefix=CHECKV6
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 -arm-tail-calls | FileCheck %s -check-prefix=CHECKELF
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-tail-calls | FileCheck %s -check-prefix=CHECKT2D
+; RUN: llc < %s -mtriple=armv6-apple-ios5.0 -mattr=+vfp2 | FileCheck %s -check-prefix=CHECKV6
; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 | FileCheck %s -check-prefix=CHECKT2D
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 \
+; RUN: | FileCheck %s -check-prefix=CHECKELF
; Enable tailcall optimization for iOS 5.0
; rdar://9120031
diff --git a/test/CodeGen/ARM/call.ll b/test/CodeGen/ARM/call.ll
index 107e79a..f6301cf 100644
--- a/test/CodeGen/ARM/call.ll
+++ b/test/CodeGen/ARM/call.ll
@@ -1,7 +1,11 @@
-; RUN: llc < %s -march=arm -mattr=+v4t | FileCheck %s -check-prefix=CHECKV4
-; RUN: llc < %s -march=arm -mattr=+v5t | FileCheck %s -check-prefix=CHECKV5
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi\
-; RUN: -relocation-model=pic | FileCheck %s -check-prefix=CHECKELF
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - \
+; RUN: | FileCheck %s -check-prefix=CHECKV4
+
+; RUN: llc -mtriple=arm-eabi -mattr=+v5t %s -o - \
+; RUN: | FileCheck %s -check-prefix=CHECKV5
+
+; RUN: llc -mtriple=armv6-linux-gnueabi -relocation-model=pic %s -o - \
+; RUN: | FileCheck %s -check-prefix=CHECKELF
@t = weak global i32 ()* null ; <i32 ()**> [#uses=1]
diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll
index f67987f..e344b08 100644
--- a/test/CodeGen/ARM/carry.ll
+++ b/test/CodeGen/ARM/carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM/clz.ll b/test/CodeGen/ARM/clz.ll
index 5b6a584..68e8c7c 100644
--- a/test/CodeGen/ARM/clz.ll
+++ b/test/CodeGen/ARM/clz.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v5t | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v5t %s -o - | FileCheck %s
declare i32 @llvm.ctlz.i32(i32, i1)
diff --git a/test/CodeGen/ARM/coalesce-dbgvalue.ll b/test/CodeGen/ARM/coalesce-dbgvalue.ll
index 86106a0..606c9bc 100644
--- a/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/test/CodeGen/ARM/coalesce-dbgvalue.ll
@@ -81,7 +81,7 @@ attributes #3 = { nounwind }
!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 182024) (llvm/trunk 182023)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
!1 = metadata !{metadata !"pr16110.c", metadata !"/d/b"}
-!2 = metadata !{i32 0}
+!2 = metadata !{}
!3 = metadata !{metadata !4}
!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"pr16110", metadata !"pr16110", metadata !"", i32 7, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @pr16110, null, null, metadata !9, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
diff --git a/test/CodeGen/ARM/compare-call.ll b/test/CodeGen/ARM/compare-call.ll
index fac2bc5..323eb1f 100644
--- a/test/CodeGen/ARM/compare-call.ll
+++ b/test/CodeGen/ARM/compare-call.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 | \
-; RUN: grep vcmpe.f32
+; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o - | FileCheck %s
define void @test3(float* %glob, i32 %X) {
entry:
@@ -18,3 +17,6 @@ UnifiedReturnBlock: ; preds = %entry
}
declare i32 @bar(...)
+
+; CHECK: vcmpe.f32
+
diff --git a/test/CodeGen/ARM/constantfp.ll b/test/CodeGen/ARM/constantfp.ll
index 974bdd7..27b6e9b 100644
--- a/test/CodeGen/ARM/constantfp.ll
+++ b/test/CodeGen/ARM/constantfp.ll
@@ -15,7 +15,7 @@ define arm_aapcs_vfpcc float @test_vmov_imm() {
; CHECK: vmov.i32 d0, #0
; CHECK-NONEON-LABEL: test_vmov_imm:
-; CHECK_NONEON: vldr s0, {{.?LCPI[0-9]+_[0-9]+}}
+; CHECK-NONEON: vldr s0, {{.?LCPI[0-9]+_[0-9]+}}
ret float 0.0
}
@@ -24,7 +24,7 @@ define arm_aapcs_vfpcc float @test_vmvn_imm() {
; CHECK: vmvn.i32 d0, #0xb0000000
; CHECK-NONEON-LABEL: test_vmvn_imm:
-; CHECK_NONEON: vldr s0, {{.?LCPI[0-9]+_[0-9]+}}
+; CHECK-NONEON: vldr s0, {{.?LCPI[0-9]+_[0-9]+}}
ret float 8589934080.0
}
@@ -33,7 +33,7 @@ define arm_aapcs_vfpcc double @test_vmov_f64() {
; CHECK: vmov.f64 d0, #1.0
; CHECK-NONEON-LABEL: test_vmov_f64:
-; CHECK_NONEON: vmov.f64 d0, #1.0
+; CHECK-NONEON: vmov.f64 d0, #1.0
ret double 1.0
}
@@ -43,7 +43,7 @@ define arm_aapcs_vfpcc double @test_vmov_double_imm() {
; CHECK: vmov.i32 d0, #0
; CHECK-NONEON-LABEL: test_vmov_double_imm:
-; CHECK_NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+; CHECK-NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
ret double 0.0
}
@@ -52,7 +52,7 @@ define arm_aapcs_vfpcc double @test_vmvn_double_imm() {
; CHECK: vmvn.i32 d0, #0xb0000000
; CHECK-NONEON-LABEL: test_vmvn_double_imm:
-; CHECK_NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+; CHECK-NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
ret double 0x4fffffff4fffffff
}
@@ -63,6 +63,6 @@ define arm_aapcs_vfpcc double @test_notvmvn_double_imm() {
; CHECK: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
; CHECK-NONEON-LABEL: test_notvmvn_double_imm:
-; CHECK_NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+; CHECK-NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
ret double 0x4fffffffffffffff
}
diff --git a/test/CodeGen/ARM/crash-O0.ll b/test/CodeGen/ARM/crash-O0.ll
index 8bce4e0..8855bb9 100644
--- a/test/CodeGen/ARM/crash-O0.ll
+++ b/test/CodeGen/ARM/crash-O0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -relocation-model=pic -disable-fp-elim
+; RUN: llc < %s -O0 -relocation-model=pic -disable-fp-elim -no-integrated-as
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64-n32"
target triple = "armv6-apple-darwin10"
diff --git a/test/CodeGen/ARM/cse-ldrlit.ll b/test/CodeGen/ARM/cse-ldrlit.ll
new file mode 100644
index 0000000..ea8c0ca
--- /dev/null
+++ b/test/CodeGen/ARM/cse-ldrlit.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=thumbv6m-apple-none-macho -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-THUMB-PIC
+; RUN: llc -mtriple=arm-apple-none-macho -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-ARM-PIC
+; RUN: llc -mtriple=thumbv6m-apple-none-macho -relocation-model=dynamic-no-pic -o - %s | FileCheck %s --check-prefix=CHECK-DYNAMIC
+; RUN: llc -mtriple=arm-apple-none-macho -relocation-model=dynamic-no-pic -o - %s | FileCheck %s --check-prefix=CHECK-DYNAMIC
+; RUN: llc -mtriple=thumbv6m-apple-none-macho -relocation-model=static -o - %s | FileCheck %s --check-prefix=CHECK-STATIC
+; RUN: llc -mtriple=arm-apple-none-macho -relocation-model=static -o - %s | FileCheck %s --check-prefix=CHECK-STATIC
+@var = global [16 x i32] zeroinitializer
+
+declare void @bar(i32*)
+
+define void @foo() {
+ %flag = load i32* getelementptr inbounds([16 x i32]* @var, i32 0, i32 1)
+ %tst = icmp eq i32 %flag, 0
+ br i1 %tst, label %true, label %false
+true:
+ tail call void @bar(i32* getelementptr inbounds([16 x i32]* @var, i32 0, i32 4))
+ ret void
+false:
+ ret void
+}
+
+; CHECK-THUMB-PIC-LABEL: foo:
+; CHECK-THUMB-PIC: ldr r0, LCPI0_0
+; CHECK-THUMB-PIC: LPC0_0:
+; CHECK-THUMB-PIC-NEXT: add r0, pc
+; CHECK-THUMB-PIC: ldr {{r[1-9][0-9]?}}, [r0, #4]
+
+; CHECK-THUMB-PIC: LCPI0_0:
+; CHECK-THUMB-PIC-NEXT: .long _var-(LPC0_0+4)
+; CHECK-THUMB-PIC-NOT: LCPI0_1
+
+
+; CHECK-ARM-PIC-LABEL: foo:
+; CHECK-ARM-PIC: ldr [[VAR_OFFSET:r[0-9]+]], LCPI0_0
+; CHECK-ARM-PIC: LPC0_0:
+; CHECK-ARM-PIC-NEXT: ldr r0, [pc, [[VAR_OFFSET]]]
+; CHECK-ARM-PIC: ldr {{r[1-9][0-9]?}}, [r0, #4]
+
+; CHECK-ARM-PIC: LCPI0_0:
+; CHECK-ARM-PIC-NEXT: .long _var-(LPC0_0+8)
+; CHECK-ARM-PIC-NOT: LCPI0_1
+
+
+; CHECK-DYNAMIC-LABEL: foo:
+; CHECK-DYNAMIC: ldr r0, LCPI0_0
+; CHECK-DYNAMIC: ldr {{r[1-9][0-9]?}}, [r0, #4]
+
+; CHECK-DYNAMIC: LCPI0_0:
+; CHECK-DYNAMIC-NEXT: .long _var
+; CHECK-DYNAMIC-NOT: LCPI0_1
+
+
+; CHECK-STATIC-LABEL: foo:
+; CHECK-STATIC: ldr r0, LCPI0_0
+; CHECK-STATIC: ldr {{r[1-9][0-9]?}}, [r0, #4]
+
+; CHECK-STATIC: LCPI0_0:
+; CHECK-STATIC-NEXT: .long _var{{$}}
+; CHECK-STATIC-NOT: LCPI0_1
+
+
diff --git a/test/CodeGen/ARM/ctz.ll b/test/CodeGen/ARM/ctz.ll
index 2c7efc7..2d88b03 100644
--- a/test/CodeGen/ARM/ctz.ll
+++ b/test/CodeGen/ARM/ctz.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s
declare i32 @llvm.cttz.i32(i32, i1)
diff --git a/test/CodeGen/ARM/debug-frame-large-stack.ll b/test/CodeGen/ARM/debug-frame-large-stack.ll
new file mode 100644
index 0000000..5bafce9
--- /dev/null
+++ b/test/CodeGen/ARM/debug-frame-large-stack.ll
@@ -0,0 +1,99 @@
+; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-none-eabi -disable-fp-elim| FileCheck %s --check-prefix=CHECK-ARM
+; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-none-eabi | FileCheck %s --check-prefix=CHECK-ARM-FP-ELIM
+
+define void @test1() {
+ %tmp = alloca [ 64 x i32 ] , align 4
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/large.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"large.c", metadata !"/tmp"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test1", metadata !"test1", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @test1, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [test1]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/large.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null}
+!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{metadata !"clang version 3.5 "}
+!11 = metadata !{i32 2, i32 0, metadata !4, null}
+
+; CHECK-ARM-LABEL: test1:
+; CHECK-ARM: .cfi_startproc
+; CHECK-ARM: sub sp, sp, #256
+; CHECK-ARM: .cfi_endproc
+
+; CHECK-ARM-FP-ELIM-LABEL: test1:
+; CHECK-ARM-FP-ELIM: .cfi_startproc
+; CHECK-ARM-FP-ELIM: sub sp, sp, #256
+; CHECK-ARM-FP-ELIM: .cfi_endproc
+
+define void @test2() {
+ %tmp = alloca [ 4168 x i8 ] , align 4
+ ret void
+}
+
+; CHECK-ARM-LABEL: test2:
+; CHECK-ARM: .cfi_startproc
+; CHECK-ARM: push {r4, r5}
+; CHECK-ARM: .cfi_def_cfa_offset 8
+; CHECK-ARM: .cfi_offset r5, -4
+; CHECK-ARM: .cfi_offset r4, -8
+; CHECK-ARM: sub sp, sp, #72
+; CHECK-ARM: sub sp, sp, #4096
+; CHECK-ARM: .cfi_def_cfa_offset 4176
+; CHECK-ARM: .cfi_endproc
+
+; CHECK-ARM-FP_ELIM-LABEL: test2:
+; CHECK-ARM-FP_ELIM: .cfi_startproc
+; CHECK-ARM-FP_ELIM: push {r4, r5}
+; CHECK-ARM-FP_ELIM: .cfi_def_cfa_offset 8
+; CHECK-ARM-FP_ELIM: .cfi_offset 54, -4
+; CHECK-ARM-FP_ELIM: .cfi_offset r4, -8
+; CHECK-ARM-FP_ELIM: sub sp, sp, #72
+; CHECK-ARM-FP_ELIM: sub sp, sp, #4096
+; CHECK-ARM-FP_ELIM: .cfi_def_cfa_offset 4176
+; CHECK-ARM-FP_ELIM: .cfi_endproc
+
+define i32 @test3() {
+ %retval = alloca i32, align 4
+ %tmp = alloca i32, align 4
+ %a = alloca [805306369 x i8], align 16
+ store i32 0, i32* %tmp
+ %tmp1 = load i32* %tmp
+ ret i32 %tmp1
+}
+
+; CHECK-ARM-LABEL: test3:
+; CHECK-ARM: .cfi_startproc
+; CHECK-ARM: push {r4, r5, r11}
+; CHECK-ARM: .cfi_def_cfa_offset 12
+; CHECK-ARM: .cfi_offset r11, -4
+; CHECK-ARM: .cfi_offset r5, -8
+; CHECK-ARM: .cfi_offset r4, -12
+; CHECK-ARM: add r11, sp, #8
+; CHECK-ARM: .cfi_def_cfa r11, 4
+; CHECK-ARM: sub sp, sp, #20
+; CHECK-ARM: sub sp, sp, #805306368
+; CHECK-ARM: bic sp, sp, #15
+; CHECK-ARM: .cfi_endproc
+
+; CHECK-ARM-FP-ELIM-LABEL: test3:
+; CHECK-ARM-FP-ELIM: .cfi_startproc
+; CHECK-ARM-FP-ELIM: push {r4, r5, r11}
+; CHECK-ARM-FP-ELIM: .cfi_def_cfa_offset 12
+; CHECK-ARM-FP-ELIM: .cfi_offset r11, -4
+; CHECK-ARM-FP-ELIM: .cfi_offset r5, -8
+; CHECK-ARM-FP-ELIM: .cfi_offset r4, -12
+; CHECK-ARM-FP-ELIM: add r11, sp, #8
+; CHECK-ARM-FP-ELIM: .cfi_def_cfa r11, 4
+; CHECK-ARM-FP-ELIM: sub sp, sp, #20
+; CHECK-ARM-FP-ELIM: sub sp, sp, #805306368
+; CHECK-ARM-FP-ELIM: bic sp, sp, #15
+; CHECK-ARM-FP-ELIM: .cfi_endproc
+
diff --git a/test/CodeGen/ARM/debug-frame-no-debug.ll b/test/CodeGen/ARM/debug-frame-no-debug.ll
new file mode 100644
index 0000000..81702c6
--- /dev/null
+++ b/test/CodeGen/ARM/debug-frame-no-debug.ll
@@ -0,0 +1,97 @@
+; ARM EHABI integrated test
+
+; This test case checks that the ARM DWARF stack frame directives
+; are not generated if compiling with no debug information.
+
+; RUN: llc -mtriple arm-unknown-linux-gnueabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-FP-ELIM
+
+; RUN: llc -mtriple thumb-unknown-linux-gnueabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-FP
+
+;-------------------------------------------------------------------------------
+; Test 1
+;-------------------------------------------------------------------------------
+; This is the LLVM assembly generated from following C++ code:
+;
+; extern void print(int, int, int, int, int);
+; extern void print(double, double, double, double, double);
+;
+; void test(int a, int b, int c, int d, int e,
+; double m, double n, double p, double q, double r) {
+; try {
+; print(a, b, c, d, e);
+; } catch (...) {
+; print(m, n, p, q, r);
+; }
+; }
+
+declare void @_Z5printiiiii(i32, i32, i32, i32, i32)
+
+declare void @_Z5printddddd(double, double, double, double, double)
+
+define void @_Z4testiiiiiddddd(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e,
+ double %m, double %n, double %p,
+ double %q, double %r) {
+entry:
+ invoke void @_Z5printiiiii(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e)
+ to label %try.cont unwind label %lpad
+
+lpad:
+ %0 = landingpad { i8*, i32 }
+ personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1)
+ invoke void @_Z5printddddd(double %m, double %n, double %p,
+ double %q, double %r)
+ to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont:
+ ret void
+
+lpad1:
+ %3 = landingpad { i8*, i32 }
+ personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+eh.resume:
+ resume { i8*, i32 } %3
+
+terminate.lpad:
+ %4 = landingpad { i8*, i32 }
+ personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %5 = extractvalue { i8*, i32 } %4, 0
+ tail call void @__clang_call_terminate(i8* %5)
+ unreachable
+}
+
+declare void @__clang_call_terminate(i8*)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+declare void @_ZSt9terminatev()
+
+; CHECK-FP-ELIM-LABEL: _Z4testiiiiiddddd:
+; CHECK-FP-ELIM-NOT: .cfi_startproc
+; CHECK-FP-ELIM: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-FP-ELIM-NOT: .cfi_def_cfa_offset 36
+
+; CHECK-THUMB-FP-LABEL: _Z4testiiiiiddddd:
+; CHECK-THUMB-FP-NOT: .cfi_startproc
+; CHECK-THUMB-FP: push {r4, r5, r6, r7, lr}
+; CHECK-THUMB-FP-NOT: .cfi_def_cfa_offset 20
+
diff --git a/test/CodeGen/ARM/debug-frame-vararg.ll b/test/CodeGen/ARM/debug-frame-vararg.ll
new file mode 100644
index 0000000..9b39525
--- /dev/null
+++ b/test/CodeGen/ARM/debug-frame-vararg.ll
@@ -0,0 +1,141 @@
+; RUN: llc -mtriple arm-unknown-linux-gnueabi -filetype asm -o - %s | FileCheck %s --check-prefix=CHECK-FP
+; RUN: llc -mtriple arm-unknown-linux-gnueabi -filetype asm -o - %s -disable-fp-elim | FileCheck %s --check-prefix=CHECK-FP-ELIM
+; RUN: llc -mtriple thumb-unknown-linux-gnueabi -filetype asm -o - %s | FileCheck %s --check-prefix=CHECK-THUMB-FP
+; RUN: llc -mtriple thumb-unknown-linux-gnueabi -filetype asm -o - %s -disable-fp-elim | FileCheck %s --check-prefix=CHECK-THUMB-FP-ELIM
+
+; Tests that the initial space allocated to the varargs on the stack is
+; taken into account in the .cfi_ directives.
+
+; Generated from the C program:
+; #include <stdarg.h>
+;
+; extern int foo(int);
+;
+; int sum(int count, ...) {
+; va_list vl;
+; va_start(vl, count);
+; int sum = 0;
+; for (int i = 0; i < count; i++) {
+; sum += foo(va_arg(vl, int));
+; }
+; va_end(vl);
+; }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"var.c", metadata !"/tmp"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"sum", metadata !"sum", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, ...)* @sum, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/var.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{metadata !"clang version 3.5 "}
+!12 = metadata !{i32 786689, metadata !4, metadata !"count", metadata !5, i32 16777221, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [count] [line 5]
+!13 = metadata !{i32 5, i32 0, metadata !4, null}
+!14 = metadata !{i32 786688, metadata !4, metadata !"vl", metadata !5, i32 6, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vl] [line 6]
+!15 = metadata !{i32 786454, metadata !16, null, metadata !"va_list", i32 30, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
+!16 = metadata !{metadata !"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", metadata !"/tmp"}
+!17 = metadata !{i32 786454, metadata !1, null, metadata !"__builtin_va_list", i32 6, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
+!18 = metadata !{i32 786451, metadata !1, null, metadata !"__va_list", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
+!19 = metadata !{metadata !20}
+!20 = metadata !{i32 786445, metadata !1, metadata !18, metadata !"__ap", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
+!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
+!22 = metadata !{i32 6, i32 0, metadata !4, null}
+!23 = metadata !{i32 7, i32 0, metadata !4, null}
+!24 = metadata !{i32 786688, metadata !4, metadata !"sum", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 8]
+!25 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!26 = metadata !{i32 786688, metadata !27, metadata !"i", metadata !5, i32 9, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 9]
+!27 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!28 = metadata !{i32 9, i32 0, metadata !27, null}
+!29 = metadata !{i32 10, i32 0, metadata !30, null}
+!30 = metadata !{i32 786443, metadata !1, metadata !27, i32 9, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!31 = metadata !{i32 11, i32 0, metadata !30, null}
+!32 = metadata !{i32 12, i32 0, metadata !4, null}
+!33 = metadata !{i32 13, i32 0, metadata !4, null}
+
+; CHECK-FP-LABEL: sum
+; CHECK-FP: .cfi_startproc
+; CHECK-FP: sub sp, sp, #16
+; CHECK-FP: .cfi_def_cfa_offset 16
+; CHECK-FP: push {r4, lr}
+; CHECK-FP: .cfi_def_cfa_offset 24
+; CHECK-FP: .cfi_offset lr, -20
+; CHECK-FP: .cfi_offset r4, -24
+; CHECK-FP: sub sp, sp, #8
+; CHECK-FP: .cfi_def_cfa_offset 32
+
+; CHECK-FP-ELIM-LABEL: sum
+; CHECK-FP-ELIM: .cfi_startproc
+; CHECK-FP-ELIM: sub sp, sp, #16
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 16
+; CHECK-FP-ELIM: push {r4, r11, lr}
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 28
+; CHECK-FP-ELIM: .cfi_offset lr, -20
+; CHECK-FP-ELIM: .cfi_offset r11, -24
+; CHECK-FP-ELIM: .cfi_offset r4, -28
+; CHECK-FP-ELIM: add r11, sp, #4
+; CHECK-FP-ELIM: .cfi_def_cfa r11, 24
+
+; CHECK-THUMB-FP-LABEL: sum
+; CHECK-THUMB-FP: .cfi_startproc
+; CHECK-THUMB-FP: sub sp, #16
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 16
+; CHECK-THUMB-FP: push {r4, r5, r7, lr}
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 32
+; CHECK-THUMB-FP: .cfi_offset lr, -20
+; CHECK-THUMB-FP: .cfi_offset r7, -24
+; CHECK-THUMB-FP: .cfi_offset r5, -28
+; CHECK-THUMB-FP: .cfi_offset r4, -32
+; CHECK-THUMB-FP: sub sp, #8
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 40
+
+; CHECK-THUMB-FP-ELIM-LABEL: sum
+; CHECK-THUMB-FP-ELIM: .cfi_startproc
+; CHECK-THUMB-FP-ELIM: sub sp, #16
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 16
+; CHECK-THUMB-FP-ELIM: push {r4, r5, r7, lr}
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 32
+; CHECK-THUMB-FP-ELIM: .cfi_offset lr, -20
+; CHECK-THUMB-FP-ELIM: .cfi_offset r7, -24
+; CHECK-THUMB-FP-ELIM: .cfi_offset r5, -28
+; CHECK-THUMB-FP-ELIM: .cfi_offset r4, -32
+; CHECK-THUMB-FP-ELIM: add r7, sp, #8
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa r7, 24
+
+define i32 @sum(i32 %count, ...) {
+entry:
+ %vl = alloca i8*, align 4
+ %vl1 = bitcast i8** %vl to i8*
+ call void @llvm.va_start(i8* %vl1)
+ %cmp4 = icmp sgt i32 %count, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.05 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %ap.cur = load i8** %vl, align 4
+ %ap.next = getelementptr i8* %ap.cur, i32 4
+ store i8* %ap.next, i8** %vl, align 4
+ %0 = bitcast i8* %ap.cur to i32*
+ %1 = load i32* %0, align 4
+ %call = call i32 @foo(i32 %1) #1
+ %inc = add nsw i32 %i.05, 1
+ %exitcond = icmp eq i32 %inc, %count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ call void @llvm.va_end(i8* %vl1)
+ ret i32 undef
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare i32 @foo(i32)
+
+declare void @llvm.va_end(i8*) nounwind
diff --git a/test/CodeGen/ARM/debug-frame.ll b/test/CodeGen/ARM/debug-frame.ll
new file mode 100644
index 0000000..cf68767
--- /dev/null
+++ b/test/CodeGen/ARM/debug-frame.ll
@@ -0,0 +1,574 @@
+; ARM EHABI integrated test
+
+; This test case checks whether the ARM DWARF stack frame directives
+; are properly generated or not.
+
+; We have to check several cases:
+; (1) arm with -disable-fp-elim
+; (2) arm without -disable-fp-elim
+; (3) armv7 with -disable-fp-elim
+; (4) armv7 without -disable-fp-elim
+; (5) thumb with -disable-fp-elim
+; (6) thumb without -disable-fp-elim
+; (7) thumbv7 with -disable-fp-elim
+; (8) thumbv7 without -disable-fp-elim
+; (9) thumbv7 with -no-integrated-as
+
+; RUN: llc -mtriple arm-unknown-linux-gnueabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-FP
+
+; RUN: llc -mtriple arm-unknown-linux-gnueabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-FP-ELIM
+
+; RUN: llc -mtriple armv7-unknown-linux-gnueabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP
+
+; RUN: llc -mtriple armv7-unknown-linux-gnueabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP-ELIM
+
+; RUN: llc -mtriple thumb-unknown-linux-gnueabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-FP
+
+; RUN: llc -mtriple thumb-unknown-linux-gnueabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-FP-ELIM
+
+; RUN: llc -mtriple thumbv7-unknown-linux-gnueabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-V7-FP
+
+; RUN: llc -mtriple thumbv7-unknown-linux-gnueabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-V7-FP-ELIM
+
+; RUN: llc -mtriple thumbv7-unknown-linux-gnueabi \
+; RUN: -disable-fp-elim -no-integrated-as -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-V7-FP-NOIAS
+
+;-------------------------------------------------------------------------------
+; Test 1
+;-------------------------------------------------------------------------------
+; This is the LLVM assembly generated from following C++ code:
+;
+; extern void print(int, int, int, int, int);
+; extern void print(double, double, double, double, double);
+;
+; void test(int a, int b, int c, int d, int e,
+; double m, double n, double p, double q, double r) {
+; try {
+; print(a, b, c, d, e);
+; } catch (...) {
+; print(m, n, p, q, r);
+; }
+; }
+
+declare void @_Z5printiiiii(i32, i32, i32, i32, i32)
+
+declare void @_Z5printddddd(double, double, double, double, double)
+
+define void @_Z4testiiiiiddddd(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e,
+ double %m, double %n, double %p,
+ double %q, double %r) {
+entry:
+ invoke void @_Z5printiiiii(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e)
+ to label %try.cont unwind label %lpad
+
+lpad:
+ %0 = landingpad { i8*, i32 }
+ personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1)
+ invoke void @_Z5printddddd(double %m, double %n, double %p,
+ double %q, double %r)
+ to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont:
+ ret void
+
+lpad1:
+ %3 = landingpad { i8*, i32 }
+ personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+eh.resume:
+ resume { i8*, i32 } %3
+
+terminate.lpad:
+ %4 = landingpad { i8*, i32 }
+ personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %5 = extractvalue { i8*, i32 } %4, 0
+ tail call void @__clang_call_terminate(i8* %5)
+ unreachable
+}
+
+declare void @__clang_call_terminate(i8*)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+declare void @_ZSt9terminatev()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11}
+!llvm.ident = !{!12}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/exp.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"exp.cpp", metadata !"/tmp"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"_Z4testiiiiiddddd", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32, i32, i32, i32, i32, double, double, double, double, double)* @_Z4testiiiiiddddd, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 5] [test]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/exp.cpp]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null, metadata !8, metadata !8, metadata !8, metadata !8, metadata !8, metadata !9, metadata !9, metadata !9, metadata !9, metadata !9}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{metadata !"clang version 3.5 "}
+!13 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!14 = metadata !{i32 4, i32 0, metadata !4, null}
+!15 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 33554436, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 4]
+!16 = metadata !{i32 786689, metadata !4, metadata !"c", metadata !5, i32 50331652, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 4]
+!17 = metadata !{i32 786689, metadata !4, metadata !"d", metadata !5, i32 67108868, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 4]
+!18 = metadata !{i32 786689, metadata !4, metadata !"e", metadata !5, i32 83886084, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [e] [line 4]
+!19 = metadata !{i32 786689, metadata !4, metadata !"m", metadata !5, i32 100663301, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [m] [line 5]
+!20 = metadata !{i32 5, i32 0, metadata !4, null}
+!21 = metadata !{i32 786689, metadata !4, metadata !"n", metadata !5, i32 117440517, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [n] [line 5]
+!22 = metadata !{i32 786689, metadata !4, metadata !"p", metadata !5, i32 134217733, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 5]
+!23 = metadata !{i32 786689, metadata !4, metadata !"q", metadata !5, i32 150994949, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [q] [line 5]
+!24 = metadata !{i32 786689, metadata !4, metadata !"r", metadata !5, i32 167772165, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 5]
+!25 = metadata !{i32 7, i32 0, metadata !26, null}
+!26 = metadata !{i32 786443, metadata !1, metadata !4, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
+!27 = metadata !{i32 8, i32 0, metadata !26, null} ; [ DW_TAG_imported_declaration ]
+!28 = metadata !{i32 11, i32 0, metadata !26, null}
+!29 = metadata !{i32 9, i32 0, metadata !30, null}
+!30 = metadata !{i32 786443, metadata !1, metadata !4, i32 8, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
+!31 = metadata !{i32 10, i32 0, metadata !30, null}
+!32 = metadata !{i32 10, i32 0, metadata !4, null}
+!33 = metadata !{i32 11, i32 0, metadata !4, null}
+!34 = metadata !{i32 11, i32 0, metadata !30, null}
+
+; CHECK-FP-LABEL: _Z4testiiiiiddddd:
+; CHECK-FP: .cfi_startproc
+; CHECK-FP: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-FP: .cfi_def_cfa_offset 36
+; CHECK-FP: .cfi_offset lr, -4
+; CHECK-FP: .cfi_offset r11, -8
+; CHECK-FP: .cfi_offset r10, -12
+; CHECK-FP: .cfi_offset r9, -16
+; CHECK-FP: .cfi_offset r8, -20
+; CHECK-FP: .cfi_offset r7, -24
+; CHECK-FP: .cfi_offset r6, -28
+; CHECK-FP: .cfi_offset r5, -32
+; CHECK-FP: .cfi_offset r4, -36
+; CHECK-FP: add r11, sp, #28
+; CHECK-FP: .cfi_def_cfa r11, 8
+; CHECK-FP: sub sp, sp, #28
+; CHECK-FP: .cfi_endproc
+
+; CHECK-FP-ELIM-LABEL: _Z4testiiiiiddddd:
+; CHECK-FP-ELIM: .cfi_startproc
+; CHECK-FP-ELIM: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 36
+; CHECK-FP-ELIM: .cfi_offset lr, -4
+; CHECK-FP-ELIM: .cfi_offset r11, -8
+; CHECK-FP-ELIM: .cfi_offset r10, -12
+; CHECK-FP-ELIM: .cfi_offset r9, -16
+; CHECK-FP-ELIM: .cfi_offset r8, -20
+; CHECK-FP-ELIM: .cfi_offset r7, -24
+; CHECK-FP-ELIM: .cfi_offset r6, -28
+; CHECK-FP-ELIM: .cfi_offset r5, -32
+; CHECK-FP-ELIM: .cfi_offset r4, -36
+; CHECK-FP-ELIM: sub sp, sp, #28
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 64
+; CHECK-FP-ELIM: .cfi_endproc
+
+; CHECK-V7-FP-LABEL: _Z4testiiiiiddddd:
+; CHECK-V7-FP: .cfi_startproc
+; CHECK-V7-FP: push {r4, r11, lr}
+; CHECK-V7-FP: .cfi_def_cfa_offset 12
+; CHECK-V7-FP: .cfi_offset lr, -4
+; CHECK-V7-FP: .cfi_offset r11, -8
+; CHECK-V7-FP: .cfi_offset r4, -12
+; CHECK-V7-FP: add r11, sp, #4
+; CHECK-V7-FP: .cfi_def_cfa r11, 8
+; CHECK-V7-FP: vpush {d8, d9, d10, d11, d12}
+; CHECK-V7-FP: .cfi_offset d12, -24
+; CHECK-V7-FP: .cfi_offset d11, -32
+; CHECK-V7-FP: .cfi_offset d10, -40
+; CHECK-V7-FP: .cfi_offset d9, -48
+; CHECK-V7-FP: .cfi_offset d8, -56
+; CHECK-V7-FP: sub sp, sp, #28
+; CHECK-V7-FP: .cfi_endproc
+
+; CHECK-V7-FP-ELIM-LABEL: _Z4testiiiiiddddd:
+; CHECK-V7-FP-ELIM: .cfi_startproc
+; CHECK-V7-FP-ELIM: push {r4, lr}
+; CHECK-V7-FP-ELIM: .cfi_def_cfa_offset 8
+; CHECK-V7-FP-ELIM: .cfi_offset lr, -4
+; CHECK-V7-FP-ELIM: .cfi_offset r4, -8
+; CHECK-V7-FP-ELIM: vpush {d8, d9, d10, d11, d12}
+; CHECK-V7-FP-ELIM: .cfi_def_cfa_offset 48
+; CHECK-V7-FP-ELIM: .cfi_offset d12, -16
+; CHECK-V7-FP-ELIM: .cfi_offset d11, -24
+; CHECK-V7-FP-ELIM: .cfi_offset d10, -32
+; CHECK-V7-FP-ELIM: .cfi_offset d9, -40
+; CHECK-V7-FP-ELIM: .cfi_offset d8, -48
+; CHECK-V7-FP-ELIM: sub sp, sp, #24
+; CHECK-V7-FP-ELIM: .cfi_def_cfa_offset 72
+; CHECK-V7-FP-ELIM: .cfi_endproc
+
+; CHECK-THUMB-FP-LABEL: _Z4testiiiiiddddd:
+; CHECK-THUMB-FP: .cfi_startproc
+; CHECK-THUMB-FP: push {r4, r5, r6, r7, lr}
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 20
+; CHECK-THUMB-FP: .cfi_offset lr, -4
+; CHECK-THUMB-FP: .cfi_offset r7, -8
+; CHECK-THUMB-FP: .cfi_offset r6, -12
+; CHECK-THUMB-FP: .cfi_offset r5, -16
+; CHECK-THUMB-FP: .cfi_offset r4, -20
+; CHECK-THUMB-FP: add r7, sp, #12
+; CHECK-THUMB-FP: .cfi_def_cfa r7, 8
+; CHECK-THUMB-FP: sub sp, #60
+; CHECK-THUMB-FP: .cfi_endproc
+
+; CHECK-THUMB-FP-ELIM-LABEL: _Z4testiiiiiddddd:
+; CHECK-THUMB-FP-ELIM: .cfi_startproc
+; CHECK-THUMB-FP-ELIM: push {r4, r5, r6, r7, lr}
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 20
+; CHECK-THUMB-FP-ELIM: .cfi_offset lr, -4
+; CHECK-THUMB-FP-ELIM: .cfi_offset r7, -8
+; CHECK-THUMB-FP-ELIM: .cfi_offset r6, -12
+; CHECK-THUMB-FP-ELIM: .cfi_offset r5, -16
+; CHECK-THUMB-FP-ELIM: .cfi_offset r4, -20
+; CHECK-THUMB-FP-ELIM: sub sp, #60
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 80
+; CHECK-THUMB-FP-ELIM: .cfi_endproc
+
+; CHECK-THUMB-V7-FP-LABEL: _Z4testiiiiiddddd:
+; CHECK-THUMB-V7-FP: .cfi_startproc
+; CHECK-THUMB-V7-FP: push.w {r4, r7, r11, lr}
+; CHECK-THUMB-V7-FP: .cfi_def_cfa_offset 16
+; CHECK-THUMB-V7-FP: .cfi_offset lr, -4
+; CHECK-THUMB-V7-FP: .cfi_offset r11, -8
+; CHECK-THUMB-V7-FP: .cfi_offset r7, -12
+; CHECK-THUMB-V7-FP: .cfi_offset r4, -16
+; CHECK-THUMB-V7-FP: add r7, sp, #4
+; CHECK-THUMB-V7-FP: .cfi_def_cfa r7, 12
+; CHECK-THUMB-V7-FP: vpush {d8, d9, d10, d11, d12}
+; CHECK-THUMB-V7-FP: .cfi_offset d12, -24
+; CHECK-THUMB-V7-FP: .cfi_offset d11, -32
+; CHECK-THUMB-V7-FP: .cfi_offset d10, -40
+; CHECK-THUMB-V7-FP: .cfi_offset d9, -48
+; CHECK-THUMB-V7-FP: .cfi_offset d8, -56
+; CHECK-THUMB-V7-FP: sub sp, #24
+; CHECK-THUMB-V7-FP: .cfi_endproc
+
+; CHECK-THUMB-V7-FP-ELIM-LABEL: _Z4testiiiiiddddd:
+; CHECK-THUMB-V7-FP-ELIM: .cfi_startproc
+; CHECK-THUMB-V7-FP-ELIM: push {r4, lr}
+; CHECK-THUMB-V7-FP-ELIM: .cfi_def_cfa_offset 8
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset lr, -4
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset r4, -8
+; CHECK-THUMB-V7-FP-ELIM: vpush {d8, d9, d10, d11, d12}
+; CHECK-THUMB-V7-FP-ELIM: .cfi_def_cfa_offset 48
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset d12, -16
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset d11, -24
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset d10, -32
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset d9, -40
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset d8, -48
+; CHECK-THUMB-V7-FP-ELIM: sub sp, #24
+; CHECK-THUMB-V7-FP-ELIM: .cfi_def_cfa_offset 72
+; CHECK-THUMB-V7-FP-ELIM: .cfi_endproc
+
+; CHECK-THUMB-V7-FP-NOIAS-LABEL: _Z4testiiiiiddddd:
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_startproc
+; CHECK-THUMB-V7-FP-NOIAS: push.w {r4, r7, r11, lr}
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_def_cfa_offset 16
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 14, -4
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 11, -8
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 7, -12
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 4, -16
+; CHECK-THUMB-V7-FP-NOIAS: add r7, sp, #4
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_def_cfa 7, 12
+; CHECK-THUMB-V7-FP-NOIAS: vpush {d8, d9, d10, d11, d12}
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 268, -24
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 267, -32
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 266, -40
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 265, -48
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_offset 264, -56
+; CHECK-THUMB-V7-FP-NOIAS: sub sp, #24
+; CHECK-THUMB-V7-FP-NOIAS: .cfi_endproc
+
+;-------------------------------------------------------------------------------
+; Test 2
+;-------------------------------------------------------------------------------
+
+declare void @throw_exception_2()
+
+define void @test2() {
+entry:
+ call void @throw_exception_2()
+ ret void
+}
+
+; CHECK-FP-LABEL: test2:
+; CHECK-FP: .cfi_startproc
+; CHECK-FP: push {r11, lr}
+; CHECK-FP: .cfi_def_cfa_offset 8
+; CHECK-FP: .cfi_offset lr, -4
+; CHECK-FP: .cfi_offset r11, -8
+; CHECK-FP: mov r11, sp
+; CHECK-FP: .cfi_def_cfa_register r11
+; CHECK-FP: pop {r11, lr}
+; CHECK-FP: mov pc, lr
+; CHECK-FP: .cfi_endproc
+
+; CHECK-FP-ELIM-LABEL: test2:
+; CHECK-FP-ELIM: .cfi_startproc
+; CHECK-FP-ELIM: push {r11, lr}
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 8
+; CHECK-FP-ELIM: .cfi_offset lr, -4
+; CHECK-FP-ELIM: .cfi_offset r11, -8
+; CHECK-FP-ELIM: pop {r11, lr}
+; CHECK-FP-ELIM: mov pc, lr
+; CHECK-FP-ELIM: .cfi_endproc
+
+; CHECK-V7-FP-LABEL: test2:
+; CHECK-V7-FP: .cfi_startproc
+; CHECK-V7-FP: push {r11, lr}
+; CHECK-V7-FP: .cfi_def_cfa_offset 8
+; CHECK-V7-FP: .cfi_offset lr, -4
+; CHECK-V7-FP: .cfi_offset r11, -8
+; CHECK-V7-FP: mov r11, sp
+; CHECK-V7-FP: .cfi_def_cfa_register r11
+; CHECK-V7-FP: pop {r11, pc}
+; CHECK-V7-FP: .cfi_endproc
+
+; CHECK-V7-FP-ELIM-LABEL: test2:
+; CHECK-V7-FP-ELIM: .cfi_startproc
+; CHECK-V7-FP-ELIM: push {r11, lr}
+; CHECK-V7-FP-ELIM: .cfi_def_cfa_offset 8
+; CHECK-V7-FP-ELIM: .cfi_offset lr, -4
+; CHECK-V7-FP-ELIM: .cfi_offset r11, -8
+; CHECK-V7-FP-ELIM: pop {r11, pc}
+; CHECK-V7-FP-ELIM: .cfi_endproc
+
+; CHECK-THUMB-FP-LABEL: test2:
+; CHECK-THUMB-FP: .cfi_startproc
+; CHECK-THUMB-FP: push {r7, lr}
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 8
+; CHECK-THUMB-FP: .cfi_offset lr, -4
+; CHECK-THUMB-FP: .cfi_offset r7, -8
+; CHECK-THUMB-FP: add r7, sp, #0
+; CHECK-THUMB-FP: .cfi_def_cfa_register r7
+; CHECK-THUMB-FP: pop {r7, pc}
+; CHECK-THUMB-FP: .cfi_endproc
+
+; CHECK-THUMB-FP-ELIM-LABEL: test2:
+; CHECK-THUMB-FP-ELIM: .cfi_startproc
+; CHECK-THUMB-FP-ELIM: push {r7, lr}
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 8
+; CHECK-THUMB-FP-ELIM: .cfi_offset lr, -4
+; CHECK-THUMB-FP-ELIM: .cfi_offset r7, -8
+; CHECK-THUMB-FP-ELIM: pop {r7, pc}
+; CHECK-THUMB-FP-ELIM: .cfi_endproc
+
+; CHECK-THUMB-V7-FP-LABEL: test2:
+; CHECK-THUMB-V7-FP: .cfi_startproc
+; CHECK-THUMB-V7-FP: push {r7, lr}
+; CHECK-THUMB-V7-FP: .cfi_def_cfa_offset 8
+; CHECK-THUMB-V7-FP: .cfi_offset lr, -4
+; CHECK-THUMB-V7-FP: .cfi_offset r7, -8
+; CHECK-THUMB-V7-FP: mov r7, sp
+; CHECK-THUMB-V7-FP: .cfi_def_cfa_register r7
+; CHECK-THUMB-V7-FP: pop {r7, pc}
+; CHECK-THUMB-V7-FP: .cfi_endproc
+
+; CHECK-THUMB-V7-FP-ELIM-LABEL: test2:
+; CHECK-THUMB-V7-FP-ELIM: .cfi_startproc
+; CHECK-THUMB-V7-FP-ELIM: push.w {r11, lr}
+; CHECK-THUMB-V7-FP-ELIM: .cfi_def_cfa_offset 8
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset lr, -4
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset r11, -8
+; CHECK-THUMB-V7-FP-ELIM: pop.w {r11, pc}
+; CHECK-THUMB-V7-FP-ELIM: .cfi_endproc
+
+
+;-------------------------------------------------------------------------------
+; Test 3
+;-------------------------------------------------------------------------------
+
+declare void @throw_exception_3(i32)
+
+define i32 @test3(i32 %a, i32 %b, i32 %c, i32 %d,
+ i32 %e, i32 %f, i32 %g, i32 %h) {
+entry:
+ %add = add nsw i32 %b, %a
+ %add1 = add nsw i32 %add, %c
+ %add2 = add nsw i32 %add1, %d
+ tail call void @throw_exception_3(i32 %add2)
+ %add3 = add nsw i32 %f, %e
+ %add4 = add nsw i32 %add3, %g
+ %add5 = add nsw i32 %add4, %h
+ tail call void @throw_exception_3(i32 %add5)
+ %add6 = add nsw i32 %add5, %add2
+ ret i32 %add6
+}
+
+; CHECK-FP-LABEL: test3:
+; CHECK-FP: .cfi_startproc
+; CHECK-FP: push {r4, r5, r11, lr}
+; CHECK-FP: .cfi_def_cfa_offset 16
+; CHECK-FP: .cfi_offset lr, -4
+; CHECK-FP: .cfi_offset r11, -8
+; CHECK-FP: .cfi_offset r5, -12
+; CHECK-FP: .cfi_offset r4, -16
+; CHECK-FP: add r11, sp, #8
+; CHECK-FP: .cfi_def_cfa r11, 8
+; CHECK-FP: pop {r4, r5, r11, lr}
+; CHECK-FP: mov pc, lr
+; CHECK-FP: .cfi_endproc
+
+; CHECK-FP-ELIM-LABEL: test3:
+; CHECK-FP-ELIM: .cfi_startproc
+; CHECK-FP-ELIM: push {r4, r5, r11, lr}
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 16
+; CHECK-FP-ELIM: .cfi_offset lr, -4
+; CHECK-FP-ELIM: .cfi_offset r11, -8
+; CHECK-FP-ELIM: .cfi_offset r5, -12
+; CHECK-FP-ELIM: .cfi_offset r4, -16
+; CHECK-FP-ELIM: pop {r4, r5, r11, lr}
+; CHECK-FP-ELIM: mov pc, lr
+; CHECK-FP-ELIM: .cfi_endproc
+
+; CHECK-V7-FP-LABEL: test3:
+; CHECK-V7-FP: .cfi_startproc
+; CHECK-V7-FP: push {r4, r5, r11, lr}
+; CHECK-V7-FP: .cfi_def_cfa_offset 16
+; CHECK-V7-FP: .cfi_offset lr, -4
+; CHECK-V7-FP: .cfi_offset r11, -8
+; CHECK-V7-FP: .cfi_offset r5, -12
+; CHECK-V7-FP: .cfi_offset r4, -16
+; CHECK-V7-FP: add r11, sp, #8
+; CHECK-V7-FP: .cfi_def_cfa r11, 8
+; CHECK-V7-FP: pop {r4, r5, r11, pc}
+; CHECK-V7-FP: .cfi_endproc
+
+; CHECK-V7-FP-ELIM-LABEL: test3:
+; CHECK-V7-FP-ELIM: .cfi_startproc
+; CHECK-V7-FP-ELIM: push {r4, r5, r11, lr}
+; CHECK-V7-FP-ELIM: .cfi_def_cfa_offset 16
+; CHECK-V7-FP-ELIM: .cfi_offset lr, -4
+; CHECK-V7-FP-ELIM: .cfi_offset r11, -8
+; CHECK-V7-FP-ELIM: .cfi_offset r5, -12
+; CHECK-V7-FP-ELIM: .cfi_offset r4, -16
+; CHECK-V7-FP-ELIM: pop {r4, r5, r11, pc}
+; CHECK-V7-FP-ELIM: .cfi_endproc
+
+; CHECK-THUMB-FP-LABEL: test3:
+; CHECK-THUMB-FP: .cfi_startproc
+; CHECK-THUMB-FP: push {r4, r5, r7, lr}
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 16
+; CHECK-THUMB-FP: .cfi_offset lr, -4
+; CHECK-THUMB-FP: .cfi_offset r7, -8
+; CHECK-THUMB-FP: .cfi_offset r5, -12
+; CHECK-THUMB-FP: .cfi_offset r4, -16
+; CHECK-THUMB-FP: add r7, sp, #8
+; CHECK-THUMB-FP: .cfi_def_cfa r7, 8
+; CHECK-THUMB-FP: pop {r4, r5, r7, pc}
+; CHECK-THUMB-FP: .cfi_endproc
+
+; CHECK-THUMB-FP-ELIM-LABEL: test3:
+; CHECK-THUMB-FP-ELIM: .cfi_startproc
+; CHECK-THUMB-FP-ELIM: push {r4, r5, r7, lr}
+; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 16
+; CHECK-THUMB-FP-ELIM: .cfi_offset lr, -4
+; CHECK-THUMB-FP-ELIM: .cfi_offset r7, -8
+; CHECK-THUMB-FP-ELIM: .cfi_offset r5, -12
+; CHECK-THUMB-FP-ELIM: .cfi_offset r4, -16
+; CHECK-THUMB-FP-ELIM: pop {r4, r5, r7, pc}
+; CHECK-THUMB-FP-ELIM: .cfi_endproc
+
+; CHECK-THUMB-V7-FP-LABEL: test3:
+; CHECK-THUMB-V7-FP: .cfi_startproc
+; CHECK-THUMB-V7-FP: push {r4, r5, r7, lr}
+; CHECK-THUMB-V7-FP: .cfi_def_cfa_offset 16
+; CHECK-THUMB-V7-FP: .cfi_offset lr, -4
+; CHECK-THUMB-V7-FP: .cfi_offset r7, -8
+; CHECK-THUMB-V7-FP: .cfi_offset r5, -12
+; CHECK-THUMB-V7-FP: .cfi_offset r4, -16
+; CHECK-THUMB-V7-FP: add r7, sp, #8
+; CHECK-THUMB-V7-FP: .cfi_def_cfa r7, 8
+; CHECK-THUMB-V7-FP: pop {r4, r5, r7, pc}
+; CHECK-THUMB-V7-FP: .cfi_endproc
+
+; CHECK-THUMB-V7-FP-ELIM-LABEL: test3:
+; CHECK-THUMB-V7-FP-ELIM: .cfi_startproc
+; CHECK-THUMB-V7-FP-ELIM: push.w {r4, r5, r11, lr}
+; CHECK-THUMB-V7-FP-ELIM: .cfi_def_cfa_offset 16
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset lr, -4
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset r11, -8
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset r5, -12
+; CHECK-THUMB-V7-FP-ELIM: .cfi_offset r4, -16
+; CHECK-THUMB-V7-FP-ELIM: pop.w {r4, r5, r11, pc}
+; CHECK-THUMB-V7-FP-ELIM: .cfi_endproc
+
+
+;-------------------------------------------------------------------------------
+; Test 4
+;-------------------------------------------------------------------------------
+
+define void @test4() nounwind {
+entry:
+ ret void
+}
+
+; CHECK-FP-LABEL: test4:
+; CHECK-FP: mov pc, lr
+; CHECK-FP-NOT: .cfi_def_cfa_offset
+
+; CHECK-FP-ELIM-LABEL: test4:
+; CHECK-FP-ELIM: mov pc, lr
+; CHECK-FP-ELIM-NOT: .cfi_def_cfa_offset
+
+; CHECK-V7-FP-LABEL: test4:
+; CHECK-V7-FP: bx lr
+; CHECK-V7-FP-NOT: .cfi_def_cfa_offset
+
+; CHECK-V7-FP-ELIM-LABEL: test4:
+; CHECK-V7-FP-ELIM: bx lr
+; CHECK-V7-FP-ELIM-NOT: .cfi_def_cfa_offset
+
+; CHECK-THUMB-FP-LABEL: test4:
+; CHECK-THUMB-FP: bx lr
+; CHECK-THUMB-FP-NOT: .cfi_def_cfa_offset
+
+; CHECK-THUMB-FP-ELIM-LABEL: test4:
+; CHECK-THUMB-FP-ELIM: bx lr
+; CHECK-THUMB-FP-ELIM-NOT: .cfi_def_cfa_offset
+
+; CHECK-THUMB-V7-FP-LABEL: test4:
+; CHECK-THUMB-V7-FP: bx lr
+; CHECK-THUMB-V7-FP-NOT: .cfi_def_cfa_offset
+
+; CHECK-THUMB-V7-FP-ELIM-LABEL: test4:
+; CHECK-THUMB-V7-FP-ELIM: bx lr
+; CHECK-THUMB-V7-FP-ELIM-NOT: .cfi_def_cfa_offset
+
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index ee515fd5..03ce312 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -2,13 +2,15 @@
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "thumbv7-apple-macosx10.6.7"
-;CHECK: DW_OP_regx for Q register: D1
+;CHECK: sub-register
+;CHECK-NEXT: DW_OP_regx
;CHECK-NEXT: ascii
-;CHECK-NEXT: DW_OP_piece 8
+;CHECK-NEXT: DW_OP_piece
;CHECK-NEXT: byte 8
-;CHECK-NEXT: DW_OP_regx for Q register: D2
+;CHECK-NEXT: sub-register
+;CHECK-NEXT: DW_OP_regx
;CHECK-NEXT: ascii
-;CHECK-NEXT: DW_OP_piece 8
+;CHECK-NEXT: DW_OP_piece
;CHECK-NEXT: byte 8
@.str = external constant [13 x i8]
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index e92d977..ee9faf8 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -1,9 +1,11 @@
; RUN: llc < %s - | FileCheck %s
; Radar 9309221
; Test dwarf reg no for s16
-;CHECK: DW_OP_regx for S register
+;CHECK: super-register
+;CHECK-NEXT: DW_OP_regx
;CHECK-NEXT: ascii
-;CHECK-NEXT: DW_OP_bit_piece 32 0
+;CHECK-NEXT: DW_OP_piece
+;CHECK-NEXT: 4
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "thumbv7-apple-macosx10.6.7"
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index 854fcab..71a696a 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -3,13 +3,19 @@
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "thumbv7-apple-macosx10.6.7"
-;CHECK: Ldebug_loc0:
-;CHECK-NEXT: .long Ltmp0
-;CHECK-NEXT: .long Ltmp1
+;CHECK-LABEL: Lfunc_begin0:
+;CHECK: Ltmp[[K:[0-9]+]]:
+;CHECK: Ltmp[[L:[0-9]+]]:
+;CHECK-LABEL: Ldebug_loc0:
+;CHECK-NEXT: .long Ltmp[[K]]
+;CHECK-NEXT: .long Ltmp[[L]]
;CHECK-NEXT: Lset[[N:[0-9]+]] = Ltmp{{[0-9]+}}-Ltmp[[M:[0-9]+]] @ Loc expr size
;CHECK-NEXT: .short Lset[[N]]
;CHECK-NEXT: Ltmp[[M]]:
-;CHECK-NEXT: .byte 144 @ DW_OP_regx for S register
+;CHECK-NEXT: .byte 144 @ super-register
+;CHECK-NEXT: @ DW_OP_regx
+;CHECK-NEXT: .ascii
+;CHECK-NEXT: .byte {{[0-9]+}} @ DW_OP_{{.*}}piece
define void @_Z3foov() optsize ssp {
entry:
diff --git a/test/CodeGen/ARM/debug-segmented-stacks.ll b/test/CodeGen/ARM/debug-segmented-stacks.ll
new file mode 100644
index 0000000..b0dc467
--- /dev/null
+++ b/test/CodeGen/ARM/debug-segmented-stacks.ll
@@ -0,0 +1,80 @@
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs -filetype=asm | FileCheck %s -check-prefix=ARM-linux
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+define void @test_basic() {
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
+ ret void
+
+; ARM-linux: test_basic:
+
+; ARM-linux: push {r4, r5}
+; ARM-linux: .cfi_def_cfa_offset 8
+; ARM-linux: .cfi_offset r5, -4
+; ARM-linux: .cfi_offset r4, -8
+; ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-linux-NEXT: mov r5, sp
+; ARM-linux-NEXT: ldr r4, [r4, #4]
+; ARM-linux-NEXT: cmp r4, r5
+; ARM-linux-NEXT: blo .LBB0_2
+
+; ARM-linux: mov r4, #48
+; ARM-linux-NEXT: mov r5, #0
+; ARM-linux-NEXT: stmdb sp!, {lr}
+; ARM-linux: .cfi_def_cfa_offset 12
+; ARM-linux: .cfi_offset lr, -12
+; ARM-linux-NEXT: bl __morestack
+; ARM-linux-NEXT: ldm sp!, {lr}
+; ARM-linux-NEXT: pop {r4, r5}
+; ARM-linux: .cfi_def_cfa_offset 0
+; ARM-linux-NEXT: bx lr
+
+; ARM-linux: pop {r4, r5}
+; ARM-linux: .cfi_def_cfa_offset 0
+; ARM-linux: .cfi_same_value r4
+; ARM-linux: .cfi_same_value r5
+}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"var.c", metadata !"/tmp"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_basic",
+ metadata !"test_basic", metadata !"", i32 5, metadata !6, i1 false, i1 true,
+ i32 0, i32 0, null, i32 256, i1 false, void ()* @test_basic, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/var.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{metadata !"clang version 3.5 "}
+!12 = metadata !{i32 786689, metadata !4, metadata !"count", metadata !5, i32 16777221, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [count] [line 5]
+!13 = metadata !{i32 5, i32 0, metadata !4, null}
+!14 = metadata !{i32 786688, metadata !4, metadata !"vl", metadata !5, i32 6, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vl] [line 6]
+!15 = metadata !{i32 786454, metadata !16, null, metadata !"va_list", i32 30, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
+!16 = metadata !{metadata !"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", metadata !"/tmp"}
+!17 = metadata !{i32 786454, metadata !1, null, metadata !"__builtin_va_list", i32 6, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
+!18 = metadata !{i32 786451, metadata !1, null, metadata !"__va_list", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
+!19 = metadata !{metadata !20}
+!20 = metadata !{i32 786445, metadata !1, metadata !18, metadata !"__ap", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
+!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
+!22 = metadata !{i32 6, i32 0, metadata !4, null}
+!23 = metadata !{i32 7, i32 0, metadata !4, null}
+!24 = metadata !{i32 786688, metadata !4, metadata !"test_basic", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 8]
+!25 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!26 = metadata !{i32 786688, metadata !27, metadata !"i", metadata !5, i32 9, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 9]
+!27 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!28 = metadata !{i32 9, i32 0, metadata !27, null}
+!29 = metadata !{i32 10, i32 0, metadata !30, null}
+!30 = metadata !{i32 786443, metadata !1, metadata !27, i32 9, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!31 = metadata !{i32 11, i32 0, metadata !30, null}
+!32 = metadata !{i32 12, i32 0, metadata !4, null}
+!33 = metadata !{i32 13, i32 0, metadata !4, null}
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
diff --git a/test/CodeGen/ARM/default-float-abi.ll b/test/CodeGen/ARM/default-float-abi.ll
new file mode 100644
index 0000000..1b26bbd
--- /dev/null
+++ b/test/CodeGen/ARM/default-float-abi.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s --check-prefix=CHECK-HARD
+; RUN: llc -mtriple=armv7-linux-eabihf %s -o - | FileCheck %s --check-prefix=CHECK-HARD
+; RUN: llc -mtriple=armv7-linux-gnueabihf -float-abi=soft %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
+; RUN: llc -mtriple=armv7-linux-gnueabi %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
+; RUN: llc -mtriple=armv7-linux-eabi -float-abi=hard %s -o - | FileCheck %s --check-prefix=CHECK-HARD
+; RUN: llc -mtriple=thumbv7-apple-ios6.0 %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
+
+define float @test_abi(float %lhs, float %rhs) {
+ %sum = fadd float %lhs, %rhs
+ ret float %sum
+
+; CHECK-HARD-LABEL: test_abi:
+; CHECK-HARD-NOT: vmov
+; CHECK-HARD: vadd.f32 s0, s0, s1
+; CHECK-HARD-NOT: vmov
+
+; CHECK-SOFT-LABEL: test_abi:
+; CHECK-SOFT-DAG: vmov [[LHS:s[0-9]+]], r0
+; CHECK-SOFT-DAG: vmov [[RHS:s[0-9]+]], r1
+; CHECK-SOFT: vadd.f32 [[DEST:s[0-9]+]], [[LHS]], [[RHS]]
+; CHECK-SOFT: vmov r0, [[DEST]]
+}
diff --git a/test/CodeGen/ARM/divmod-eabi.ll b/test/CodeGen/ARM/divmod-eabi.ll
index 404cae0..7f72048 100644
--- a/test/CodeGen/ARM/divmod-eabi.ll
+++ b/test/CodeGen/ARM/divmod-eabi.ll
@@ -1,6 +1,9 @@
; RUN: llc -mtriple armv7-none-eabi %s -o - | FileCheck %s --check-prefix=EABI
+; RUN: llc -mtriple armv7-none-eabihf %s -o - | FileCheck %s --check-prefix=EABI
; RUN: llc -mtriple armv7-linux-gnueabi %s -o - | FileCheck %s --check-prefix=GNU
; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefix=DARWIN
+; FIXME: long-term, we will use "-apple-macho" and won't need this exception:
+; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - | FileCheck %s --check-prefix=DARWIN
define signext i16 @f16(i16 signext %a, i16 signext %b) {
; EABI-LABEL: f16:
@@ -186,7 +189,7 @@ entry:
%div = sdiv i32 %a, %b
; EABI: __aeabi_idivmod
; EABI: mov [[div:r[0-9]+]], r0
-; GNU __aeabi_idiv
+; GNU: __aeabi_idiv
; GNU: mov [[sum:r[0-9]+]], r0
; DARWIN: ___divsi3
; DARWIN: mov [[sum:r[0-9]+]], r0
diff --git a/test/CodeGen/ARM/dyn-stackalloc.ll b/test/CodeGen/ARM/dyn-stackalloc.ll
index de2820e..4ac5b8a 100644
--- a/test/CodeGen/ARM/dyn-stackalloc.ll
+++ b/test/CodeGen/ARM/dyn-stackalloc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
%struct.comment = type { i8**, i32*, i32, i8* }
%struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
diff --git a/test/CodeGen/ARM/ehabi-filters.ll b/test/CodeGen/ARM/ehabi-filters.ll
index cb5291b..f86b66c 100644
--- a/test/CodeGen/ARM/ehabi-filters.ll
+++ b/test/CodeGen/ARM/ehabi-filters.ll
@@ -1,4 +1,4 @@
-; RUN: llc -arm-enable-ehabi -arm-enable-ehabi-descriptors < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv7-none-linux-gnueabi"
diff --git a/test/CodeGen/ARM/ehabi-no-landingpad.ll b/test/CodeGen/ARM/ehabi-no-landingpad.ll
index ac0dff4..d5c74c5 100644
--- a/test/CodeGen/ARM/ehabi-no-landingpad.ll
+++ b/test/CodeGen/ARM/ehabi-no-landingpad.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -mtriple=armv7-unknown-linux-gnueabi \
-; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-unknown-linux-gnueabi | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv7-unknown-linux-gnueabi"
diff --git a/test/CodeGen/ARM/ehabi-unwind.ll b/test/CodeGen/ARM/ehabi-unwind.ll
index fd7d0e6..a86f340 100644
--- a/test/CodeGen/ARM/ehabi-unwind.ll
+++ b/test/CodeGen/ARM/ehabi-unwind.ll
@@ -1,8 +1,7 @@
; Test that the EHABI unwind instruction generator does not encounter any
; unfamiliar instructions.
-; RUN: llc < %s -mtriple=thumbv7 -arm-enable-ehabi -disable-fp-elim
-; RUN: llc < %s -mtriple=thumbv7 -arm-enable-ehabi
-; RUN: llc < %s -mtriple=thumbv7 -arm-enable-ehabi -arm-enable-ehabi-descriptors
+; RUN: llc < %s -mtriple=thumbv7 -disable-fp-elim
+; RUN: llc < %s -mtriple=thumbv7
define void @_Z1fv() nounwind {
entry:
diff --git a/test/CodeGen/ARM/ehabi.ll b/test/CodeGen/ARM/ehabi.ll
index 6644652..720cc3c 100644
--- a/test/CodeGen/ARM/ehabi.ll
+++ b/test/CodeGen/ARM/ehabi.ll
@@ -19,22 +19,34 @@
; (4) armv7 without -disable-fp-elim
; RUN: llc -mtriple arm-unknown-linux-gnueabi \
-; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \
; RUN: -disable-fp-elim -filetype=asm -o - %s \
; RUN: | FileCheck %s --check-prefix=CHECK-FP
; RUN: llc -mtriple arm-unknown-linux-gnueabi \
-; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \
; RUN: -filetype=asm -o - %s \
; RUN: | FileCheck %s --check-prefix=CHECK-FP-ELIM
; RUN: llc -mtriple armv7-unknown-linux-gnueabi \
-; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \
; RUN: -disable-fp-elim -filetype=asm -o - %s \
; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP
; RUN: llc -mtriple armv7-unknown-linux-gnueabi \
-; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP-ELIM
+
+; RUN: llc -mtriple arm-unknown-linux-androideabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-FP
+
+; RUN: llc -mtriple arm-unknown-linux-androideabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-FP-ELIM
+
+; RUN: llc -mtriple armv7-unknown-linux-androideabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP
+
+; RUN: llc -mtriple armv7-unknown-linux-androideabi \
; RUN: -filetype=asm -o - %s \
; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP-ELIM
@@ -169,7 +181,7 @@ declare void @throw_exception_2()
define void @test2() {
entry:
- tail call void @throw_exception_2()
+ call void @throw_exception_2()
ret void
}
diff --git a/test/CodeGen/ARM/extload-knownzero.ll b/test/CodeGen/ARM/extload-knownzero.ll
index 8ccf58c..f55b951 100644
--- a/test/CodeGen/ARM/extload-knownzero.ll
+++ b/test/CodeGen/ARM/extload-knownzero.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
; rdar://12771555
define void @foo(i16* %ptr, i32 %a) nounwind {
diff --git a/test/CodeGen/ARM/extloadi1.ll b/test/CodeGen/ARM/extloadi1.ll
index dc45ce7..2504c6c 100644
--- a/test/CodeGen/ARM/extloadi1.ll
+++ b/test/CodeGen/ARM/extloadi1.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
+
@handler_installed.6144.b = external global i1 ; <i1*> [#uses=1]
define void @__mf_sigusr1_respond() {
diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll
index 21219ce..b5d3bda 100644
--- a/test/CodeGen/ARM/fadds.ll
+++ b/test/CodeGen/ARM/fadds.ll
@@ -1,9 +1,20 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CORTEXA8U
-; RUN: llc < %s -mtriple=arm-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8U
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
+
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \
+; RUN: | FileCheck %s -check-prefix=NFP0
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8U
+
+; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8U
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA9
define float @test(float %a, float %b) {
entry:
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index 917a15d..2d7378e 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -8,8 +8,6 @@
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=THUMB-NOVFP
-; XFAIL: vg_leak
-
; Note that some of these tests assume that relocations are either
; movw/movt or constant pool loads. Different platforms will select
; different approaches.
diff --git a/test/CodeGen/ARM/fast-isel-crash2.ll b/test/CodeGen/ARM/fast-isel-crash2.ll
index d606877..cccd9eb 100644
--- a/test/CodeGen/ARM/fast-isel-crash2.ll
+++ b/test/CodeGen/ARM/fast-isel-crash2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=thumbv7-apple-darwin
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=thumbv7-linux-gnueabi
+; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=thumbv7-apple-darwin
+; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=armv7-linux-gnueabi
; rdar://9515076
; (Make sure this doesn't crash.)
diff --git a/test/CodeGen/ARM/fast-isel-frameaddr.ll b/test/CodeGen/ARM/fast-isel-frameaddr.ll
index 8542bb5..93cdbbb 100644
--- a/test/CodeGen/ARM/fast-isel-frameaddr.ll
+++ b/test/CodeGen/ARM/fast-isel-frameaddr.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=armv7-apple-darwin | FileCheck %s --check-prefix=DARWIN-ARM
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=DARWIN-ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=LINUX-ARM
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=thumbv7-apple-darwin | FileCheck %s --check-prefix=DARWIN-THUMB2
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=DARWIN-THUMB2
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=LINUX-THUMB2
define i8* @frameaddr_index0() nounwind {
@@ -34,14 +34,12 @@ entry:
; DARWIN-ARM-LABEL: frameaddr_index1:
; DARWIN-ARM: push {r7}
; DARWIN-ARM: mov r7, sp
-; DARWIN-ARM: mov r0, r7
-; DARWIN-ARM: ldr r0, [r0]
+; DARWIN-ARM: ldr r0, [r7]
; DARWIN-THUMB2-LABEL: frameaddr_index1:
; DARWIN-THUMB2: str r7, [sp, #-4]!
; DARWIN-THUMB2: mov r7, sp
-; DARWIN-THUMB2: mov r0, r7
-; DARWIN-THUMB2: ldr r0, [r0]
+; DARWIN-THUMB2: ldr r0, [r7]
; LINUX-ARM-LABEL: frameaddr_index1:
; LINUX-ARM: push {r11}
@@ -63,16 +61,14 @@ entry:
; DARWIN-ARM-LABEL: frameaddr_index3:
; DARWIN-ARM: push {r7}
; DARWIN-ARM: mov r7, sp
-; DARWIN-ARM: mov r0, r7
-; DARWIN-ARM: ldr r0, [r0]
+; DARWIN-ARM: ldr r0, [r7]
; DARWIN-ARM: ldr r0, [r0]
; DARWIN-ARM: ldr r0, [r0]
; DARWIN-THUMB2-LABEL: frameaddr_index3:
; DARWIN-THUMB2: str r7, [sp, #-4]!
; DARWIN-THUMB2: mov r7, sp
-; DARWIN-THUMB2: mov r0, r7
-; DARWIN-THUMB2: ldr r0, [r0]
+; DARWIN-THUMB2: ldr r0, [r7]
; DARWIN-THUMB2: ldr r0, [r0]
; DARWIN-THUMB2: ldr r0, [r0]
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index b08b72b..089209e 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -5,8 +5,6 @@
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-LONG
-; XFAIL: vg_leak
-
; Note that some of these tests assume that relocations are either
; movw/movt or constant pool loads. Different platforms will select
; different approaches.
@@ -15,7 +13,7 @@
@temp = common global [60 x i8] zeroinitializer, align 1
define void @t1() nounwind ssp {
-; ARM: t1
+; ARM-LABEL: t1:
; ARM: {{(movw r0, :lower16:_?message1)|(ldr r0, .LCPI)}}
; ARM: {{(movt r0, :upper16:_?message1)|(ldr r0, \[r0\])}}
; ARM: add r0, r0, #5
@@ -23,12 +21,12 @@ define void @t1() nounwind ssp {
; ARM: movw r2, #10
; ARM: and r1, r1, #255
; ARM: bl {{_?}}memset
-; ARM-LONG: t1
+; ARM-LONG-LABEL: t1:
; ARM-LONG: {{(movw r3, :lower16:L_memset\$non_lazy_ptr)|(ldr r3, .LCPI)}}
; ARM-LONG: {{(movt r3, :upper16:L_memset\$non_lazy_ptr)?}}
; ARM-LONG: ldr r3, [r3]
; ARM-LONG: blx r3
-; THUMB: t1
+; THUMB-LABEL: t1:
; THUMB: {{(movw r0, :lower16:_?message1)|(ldr.n r0, .LCPI)}}
; THUMB: {{(movt r0, :upper16:_?message1)|(ldr r0, \[r0\])}}
; THUMB: adds r0, #5
@@ -38,7 +36,7 @@ define void @t1() nounwind ssp {
; THUMB: movt r2, #0
; THUMB: and r1, r1, #255
; THUMB: bl {{_?}}memset
-; THUMB-LONG: t1
+; THUMB-LONG-LABEL: t1:
; THUMB-LONG: movw r3, :lower16:L_memset$non_lazy_ptr
; THUMB-LONG: movt r3, :upper16:L_memset$non_lazy_ptr
; THUMB-LONG: ldr r3, [r3]
@@ -50,7 +48,7 @@ define void @t1() nounwind ssp {
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
define void @t2() nounwind ssp {
-; ARM: t2
+; ARM-LABEL: t2:
; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; ARM: ldr r0, [r0]
@@ -61,12 +59,12 @@ define void @t2() nounwind ssp {
; ARM: mov r0, r1
; ARM: ldr r1, [sp[[SLOT]]] @ 4-byte Reload
; ARM: bl {{_?}}memcpy
-; ARM-LONG: t2
+; ARM-LONG-LABEL: t2:
; ARM-LONG: {{(movw r3, :lower16:L_memcpy\$non_lazy_ptr)|(ldr r3, .LCPI)}}
; ARM-LONG: {{(movt r3, :upper16:L_memcpy\$non_lazy_ptr)?}}
; ARM-LONG: ldr r3, [r3]
; ARM-LONG: blx r3
-; THUMB: t2
+; THUMB-LABEL: t2:
; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
; THUMB: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; THUMB: ldr r0, [r0]
@@ -78,7 +76,7 @@ define void @t2() nounwind ssp {
; THUMB: mov r0, r1
; THUMB: ldr r1, [sp[[SLOT]]] @ 4-byte Reload
; THUMB: bl {{_?}}memcpy
-; THUMB-LONG: t2
+; THUMB-LONG-LABEL: t2:
; THUMB-LONG: movw r3, :lower16:L_memcpy$non_lazy_ptr
; THUMB-LONG: movt r3, :upper16:L_memcpy$non_lazy_ptr
; THUMB-LONG: ldr r3, [r3]
@@ -90,7 +88,7 @@ define void @t2() nounwind ssp {
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
define void @t3() nounwind ssp {
-; ARM: t3
+; ARM-LABEL: t3:
; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; ARM: ldr r0, [r0]
@@ -99,12 +97,12 @@ define void @t3() nounwind ssp {
; ARM: movw r2, #10
; ARM: mov r0, r1
; ARM: bl {{_?}}memmove
-; ARM-LONG: t3
+; ARM-LONG-LABEL: t3:
; ARM-LONG: {{(movw r3, :lower16:L_memmove\$non_lazy_ptr)|(ldr r3, .LCPI)}}
; ARM-LONG: {{(movt r3, :upper16:L_memmove\$non_lazy_ptr)?}}
; ARM-LONG: ldr r3, [r3]
; ARM-LONG: blx r3
-; THUMB: t3
+; THUMB-LABEL: t3:
; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
; THUMB: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; THUMB: ldr r0, [r0]
@@ -116,7 +114,7 @@ define void @t3() nounwind ssp {
; THUMB: mov r0, r1
; THUMB: ldr r1, [sp[[SLOT]]] @ 4-byte Reload
; THUMB: bl {{_?}}memmove
-; THUMB-LONG: t3
+; THUMB-LONG-LABEL: t3:
; THUMB-LONG: movw r3, :lower16:L_memmove$non_lazy_ptr
; THUMB-LONG: movt r3, :upper16:L_memmove$non_lazy_ptr
; THUMB-LONG: ldr r3, [r3]
@@ -126,7 +124,7 @@ define void @t3() nounwind ssp {
}
define void @t4() nounwind ssp {
-; ARM: t4
+; ARM-LABEL: t4:
; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; ARM: ldr r0, [r0]
@@ -137,7 +135,7 @@ define void @t4() nounwind ssp {
; ARM: ldrh r1, [r0, #24]
; ARM: strh r1, [r0, #12]
; ARM: bx lr
-; THUMB: t4
+; THUMB-LABEL: t4:
; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
; THUMB: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; THUMB: ldr r0, [r0]
@@ -155,7 +153,7 @@ define void @t4() nounwind ssp {
declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
define void @t5() nounwind ssp {
-; ARM: t5
+; ARM-LABEL: t5:
; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; ARM: ldr r0, [r0]
@@ -170,7 +168,7 @@ define void @t5() nounwind ssp {
; ARM: ldrh r1, [r0, #24]
; ARM: strh r1, [r0, #12]
; ARM: bx lr
-; THUMB: t5
+; THUMB-LABEL: t5:
; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
; THUMB: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; THUMB: ldr r0, [r0]
@@ -190,7 +188,7 @@ define void @t5() nounwind ssp {
}
define void @t6() nounwind ssp {
-; ARM: t6
+; ARM-LABEL: t6:
; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; ARM: ldr r0, [r0]
@@ -215,7 +213,7 @@ define void @t6() nounwind ssp {
; ARM: ldrb r1, [r0, #25]
; ARM: strb r1, [r0, #13]
; ARM: bx lr
-; THUMB: t6
+; THUMB-LABEL: t6:
; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
; THUMB: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
; THUMB: ldr r0, [r0]
@@ -253,9 +251,9 @@ define void @t7() nounwind ssp {
define i32 @t8(i32 %x) nounwind {
entry:
-; ARM: t8
+; ARM-LABEL: t8:
; ARM-NOT: FastISel missed call: %expval = call i32 @llvm.expect.i32(i32 %x, i32 1)
-; THUMB: t8
+; THUMB-LABEL: t8:
; THUMB-NOT: FastISel missed call: %expval = call i32 @llvm.expect.i32(i32 %x, i32 1)
%expval = call i32 @llvm.expect.i32(i32 %x, i32 1)
ret i32 %expval
diff --git a/test/CodeGen/ARM/fast-isel-static.ll b/test/CodeGen/ARM/fast-isel-static.ll
index 93c14a0..9bd0a51 100644
--- a/test/CodeGen/ARM/fast-isel-static.ll
+++ b/test/CodeGen/ARM/fast-isel-static.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
-; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
-; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
define void @myadd(float* %sum, float* %addend) nounwind {
entry:
diff --git a/test/CodeGen/ARM/fast-tail-call.ll b/test/CodeGen/ARM/fast-tail-call.ll
index 9fbdc9d..6472016 100644
--- a/test/CodeGen/ARM/fast-tail-call.ll
+++ b/test/CodeGen/ARM/fast-tail-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-linux-gnueabi -O0 -arm-tail-calls < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -O0 < %s | FileCheck %s
; Primarily a non-crash test: Thumbv7 Linux does not have FastISel support,
; which led (via a convoluted route) to DAG nodes after a TC_RETURN that
diff --git a/test/CodeGen/ARM/fastcc-vfp.ll b/test/CodeGen/ARM/fastcc-vfp.ll
new file mode 100644
index 0000000..4c98150
--- /dev/null
+++ b/test/CodeGen/ARM/fastcc-vfp.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -mattr=+vfp2 | FileCheck %s
+
+define fastcc double @t1(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, float %a, float %b) {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: vmov
+; CHECK: vldr
+ %add = fadd float %a, %b
+ %conv = fpext float %add to double
+ ret double %conv
+}
+
+define fastcc double @t2(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %a, float %b, double %c) {
+entry:
+; CHECK-LABEL: t2:
+; CHECK-NOT: vmov
+; CHECK: vldr
+ %add = fadd double %a, %c
+ ret double %add
+}
+
+define fastcc float @t3(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, float %a, double %b, float %c) {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: vldr
+ %add = fadd float %a, %c
+ ret float %add
+}
+
+define fastcc double @t4(double %a, double %b) #0 {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: vstr
+ %add = fadd double %a, %b
+ %sub = fsub double %a, %b
+ %call = tail call fastcc double @x(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double %add, float 0.000000e+00, double %sub) #2
+ ret double %call
+}
+
+declare fastcc double @x(double, double, double, double, double, double, double, float, double)
diff --git a/test/CodeGen/ARM/fastisel-thumb-litpool.ll b/test/CodeGen/ARM/fastisel-thumb-litpool.ll
new file mode 100644
index 0000000..aa9e726
--- /dev/null
+++ b/test/CodeGen/ARM/fastisel-thumb-litpool.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -O0 -o - %s | FileCheck %s
+
+; We used to accidentally create both an ARM and a Thumb ldr here. It led to an
+; assertion failure at the time, but could go all the way through to emission,
+; hence the CHECK-NOT.
+
+define i32 @test_thumb_ldrlit() minsize {
+; CHECK: ldr r0, LCPI0_0
+; CHECK-NOT: ldr
+ ret i32 12345678
+}
diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll
index a4fecfe..7cab766 100644
--- a/test/CodeGen/ARM/fdivs.ll
+++ b/test/CodeGen/ARM/fdivs.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - | FileCheck %s -check-prefix=VFP2
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s -check-prefix=NFP0
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=CORTEXA8
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=CORTEXA9
define float @test(float %a, float %b) {
entry:
diff --git a/test/CodeGen/ARM/fixunsdfdi.ll b/test/CodeGen/ARM/fixunsdfdi.ll
index 6db2385..f3406cc 100644
--- a/test/CodeGen/ARM/fixunsdfdi.ll
+++ b/test/CodeGen/ARM/fixunsdfdi.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
-; RUN: llc < %s -march=arm -mattr=vfp2 | not grep vstr.64
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
+; RUN: llc -mtriple=arm-eabi -mattr=vfp2 %s -o - | FileCheck %s
define hidden i64 @__fixunsdfdi(double %x) nounwind readnone {
entry:
@@ -27,3 +27,6 @@ bb7: ; preds = %bb3
bb10: ; preds = %entry
ret i64 0
}
+
+; CHECK-NOT: vstr.64
+
diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll
index f2486c6..6f8c0fe 100644
--- a/test/CodeGen/ARM/fmacs.ll
+++ b/test/CodeGen/ARM/fmacs.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=A9
-; RUN: llc < %s -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s -check-prefix=HARD
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - | FileCheck %s -check-prefix=VFP2
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s -check-prefix=NEON
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=A8
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9
+; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard %s -o - | FileCheck %s -check-prefix=HARD
define float @t1(float %acc, float %a, float %b) {
entry:
diff --git a/test/CodeGen/ARM/fmdrr-fmrrd.ll b/test/CodeGen/ARM/fmdrr-fmrrd.ll
index eb72faf..a3669b4 100644
--- a/test/CodeGen/ARM/fmdrr-fmrrd.ll
+++ b/test/CodeGen/ARM/fmdrr-fmrrd.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=vfp2 | not grep fmdrr
-; RUN: llc < %s -march=arm -mattr=vfp2 | not grep fmrrd
+; RUN: llc -mtriple=arm-eabi -mattr=vfp2 %s -o - | FileCheck %s
; naive codegen for this is:
; _i:
@@ -11,3 +10,8 @@ define i64 @test(double %X) {
%Y = bitcast double %X to i64
ret i64 %Y
}
+
+; CHECK-LABEL: test:
+; CHECK-NOT: fmdrr
+; CHECK-NOT: fmrrd
+
diff --git a/test/CodeGen/ARM/fmscs.ll b/test/CodeGen/ARM/fmscs.ll
index f16ec17..5aff74c 100644
--- a/test/CodeGen/ARM/fmscs.ll
+++ b/test/CodeGen/ARM/fmscs.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - | FileCheck %s -check-prefix=VFP2
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s -check-prefix=NEON
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=A8
define float @t1(float %acc, float %a, float %b) {
entry:
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index d11f6bd..b24d867 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -1,9 +1,20 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CORTEXA8U
-; RUN: llc < %s -mtriple=arm-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8U
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
+
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \
+; RUN: | FileCheck %s -check-prefix=NFP0
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8U
+
+; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8U
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA9
define float @test(float %a, float %b) {
entry:
diff --git a/test/CodeGen/ARM/fnegs.ll b/test/CodeGen/ARM/fnegs.ll
index dc4c2e3..36af835 100644
--- a/test/CodeGen/ARM/fnegs.ll
+++ b/test/CodeGen/ARM/fnegs.ll
@@ -1,9 +1,20 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CORTEXA8U
-; RUN: llc < %s -mtriple=arm-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8U
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
+
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \
+; RUN: | FileCheck %s -check-prefix=NFP0
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8U
+
+; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA8U
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \
+; RUN: | FileCheck %s -check-prefix=CORTEXA9
define float @test1(float* %a) {
entry:
diff --git a/test/CodeGen/ARM/fnmacs.ll b/test/CodeGen/ARM/fnmacs.ll
index 825feaa..ab35a97 100644
--- a/test/CodeGen/ARM/fnmacs.ll
+++ b/test/CodeGen/ARM/fnmacs.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - | FileCheck %s -check-prefix=VFP2
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s -check-prefix=NEON
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=A8
define float @t1(float %acc, float %a, float %b) {
entry:
diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll
index 78ccb60..5fa6b21 100644
--- a/test/CodeGen/ARM/fnmscs.ll
+++ b/test/CodeGen/ARM/fnmscs.ll
@@ -1,9 +1,20 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=A8
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=A8U
-; RUN: llc < %s -mtriple=arm-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8U
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
+
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \
+; RUN: | FileCheck %s -check-prefix=NEON
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=A8
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 -regalloc=basic %s -o - \
+; RUN: | FileCheck %s -check-prefix=A8
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: | FileCheck %s -check-prefix=A8U
+
+; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=A8U
define float @t1(float %acc, float %a, float %b) nounwind {
entry:
diff --git a/test/CodeGen/ARM/fnmul.ll b/test/CodeGen/ARM/fnmul.ll
index 6d7bc05..e14e5ba 100644
--- a/test/CodeGen/ARM/fnmul.ll
+++ b/test/CodeGen/ARM/fnmul.ll
@@ -1,5 +1,8 @@
-; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 | grep vnmul.f64
-; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 -enable-sign-dependent-rounding-fp-math | grep vmul.f64
+; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o - | FileCheck %s
+
+; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 -enable-sign-dependent-rounding-fp-math %s -o - \
+; RUN: | FileCheck %s -check-prefix CHECK-ROUNDING
+
define double @t1(double %a, double %b) {
@@ -9,3 +12,6 @@ entry:
ret double %tmp4
}
+; CHECK: vnmul.f64
+; CHECK-ROUNDING: vmul.f64
+
diff --git a/test/CodeGen/ARM/fnmuls.ll b/test/CodeGen/ARM/fnmuls.ll
index 3223885..de3b053 100644
--- a/test/CodeGen/ARM/fnmuls.ll
+++ b/test/CodeGen/ARM/fnmuls.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s
define arm_aapcs_vfpcc float @test1(float %a, float %b) nounwind {
; CHECK: vnmul.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/fold-const.ll b/test/CodeGen/ARM/fold-const.ll
index 1ba561d..dc5419f 100644
--- a/test/CodeGen/ARM/fold-const.ll
+++ b/test/CodeGen/ARM/fold-const.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v7 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v7 %s -o - | FileCheck %s
define i32 @f(i32 %a) nounwind readnone optsize ssp {
entry:
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
index 67fd129..695a20b 100644
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -1,6 +1,7 @@
-; RUN: llc -mtriple=thumbv7-apple-darwin-eabi < %s | FileCheck %s
-; RUN: llc -mtriple=thumbv6m-apple-darwin-eabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1
+; RUN: llc -mtriple=thumbv7-apple-none-macho < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv6m-apple-none-macho -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1
; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS
+; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-LINUX
declare void @bar(i8*)
@@ -11,11 +12,11 @@ declare void @bar(i8*)
define void @check_simple() minsize {
; CHECK-LABEL: check_simple:
-; CHECK: push.w {r7, r8, r9, r10, r11, lr}
+; CHECK: push {r3, r4, r5, r6, r7, lr}
; CHECK-NOT: sub sp, sp,
; ...
; CHECK-NOT: add sp, sp,
-; CHECK: pop.w {r0, r1, r2, r3, r11, pc}
+; CHECK: pop {r0, r1, r2, r3, r7, pc}
; CHECK-T1-LABEL: check_simple:
; CHECK-T1: push {r3, r4, r5, r6, r7, lr}
@@ -43,11 +44,11 @@ define void @check_simple() minsize {
define void @check_simple_too_big() minsize {
; CHECK-LABEL: check_simple_too_big:
-; CHECK: push.w {r11, lr}
+; CHECK: push {r7, lr}
; CHECK: sub sp,
; ...
; CHECK: add sp,
-; CHECK: pop.w {r11, pc}
+; CHECK: pop {r7, pc}
%var = alloca i8, i32 64
call void @bar(i8* %var)
ret void
@@ -92,16 +93,16 @@ define void @check_vfp_fold() minsize {
; folded in except that doing so would clobber the value being returned.
define i64 @check_no_return_clobber() minsize {
; CHECK-LABEL: check_no_return_clobber:
-; CHECK: push.w {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK: push {r1, r2, r3, r4, r5, r6, r7, lr}
; CHECK-NOT: sub sp,
; ...
-; CHECK: add sp, #40
-; CHECK: pop.w {r11, pc}
+; CHECK: add sp, #24
+; CHECK: pop {r7, pc}
; Just to keep iOS FileCheck within previous function:
; CHECK-IOS-LABEL: check_no_return_clobber:
- %var = alloca i8, i32 40
+ %var = alloca i8, i32 20
call void @bar(i8* %var)
ret i64 0
}
@@ -161,4 +162,57 @@ end:
; We want the epilogue to be the only thing in a basic block so that we hit
; the correct edge-case (first inst in block is correct one to adjust).
ret void
-} \ No newline at end of file
+}
+
+define void @test_varsize(...) minsize {
+; CHECK-T1-LABEL: test_varsize:
+; CHECK-T1: sub sp, #16
+; CHECK-T1: push {r2, r3, r4, r5, r7, lr}
+; ...
+; CHECK-T1: pop {r2, r3, r4, r5, r7}
+; CHECK-T1: pop {r3}
+; CHECK-T1: add sp, #16
+; CHECK-T1: bx r3
+
+; CHECK-LABEL: test_varsize:
+; CHECK: sub sp, #16
+; CHECK: push {r5, r6, r7, lr}
+; ...
+; CHECK: pop.w {r2, r3, r7, lr}
+; CHECK: add sp, #16
+; CHECK: bx lr
+
+ %var = alloca i8, i32 8
+ call void @bar(i8* %var)
+ ret void
+}
+
+%"MyClass" = type { i8*, i32, i32, float, float, float, [2 x i8], i32, i32* }
+
+declare float @foo()
+
+declare void @bar3()
+
+declare %"MyClass"* @bar2(%"MyClass"* returned, i16*, i32, float, float, i32, i32, i1 zeroext, i1 zeroext, i32)
+
+define fastcc float @check_vfp_no_return_clobber2(i16* %r, i16* %chars, i32 %length, i1 zeroext %flag) minsize {
+entry:
+; CHECK-LINUX-LABEL: check_vfp_no_return_clobber2
+; CHECK-LINUX: vpush {d0, d1, d2, d3, d4, d5, d6, d7, d8}
+; CHECK-NOT: sub sp,
+; ...
+; CHECK-LINUX: add sp
+; CHECK-LINUX: vpop {d8}
+ %run = alloca %"MyClass", align 4
+ %call = call %"MyClass"* @bar2(%"MyClass"* %run, i16* %chars, i32 %length, float 0.000000e+00, float 0.000000e+00, i32 1, i32 1, i1 zeroext false, i1 zeroext true, i32 3)
+ %call1 = call float @foo()
+ %cmp = icmp eq %"MyClass"* %run, null
+ br i1 %cmp, label %exit, label %if.then
+
+if.then: ; preds = %entry
+ call void @bar3()
+ br label %exit
+
+exit: ; preds = %if.then, %entry
+ ret float %call1
+}
diff --git a/test/CodeGen/ARM/formal.ll b/test/CodeGen/ARM/formal.ll
index 4ac10ba..05a6be1 100644
--- a/test/CodeGen/ARM/formal.ll
+++ b/test/CodeGen/ARM/formal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
declare void @bar(i64 %x, i64 %y)
diff --git a/test/CodeGen/ARM/fp-arg-shuffle.ll b/test/CodeGen/ARM/fp-arg-shuffle.ll
index ae02b79..4996cc8 100644
--- a/test/CodeGen/ARM/fp-arg-shuffle.ll
+++ b/test/CodeGen/ARM/fp-arg-shuffle.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon -float-abi=soft | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -float-abi=soft %s -o - | FileCheck %s
; CHECK: function1
; CHECK-NOT: vmov
diff --git a/test/CodeGen/ARM/fp-fast.ll b/test/CodeGen/ARM/fp-fast.ll
index ec57187..7d95a5e 100644
--- a/test/CodeGen/ARM/fp-fast.ll
+++ b/test/CodeGen/ARM/fp-fast.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=arm -mcpu=cortex-a9 -mattr=+vfp4 -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 -mattr=+vfp4 -enable-unsafe-fp-math %s -o - \
+; RUN: | FileCheck %s
; CHECK: test1
define float @test1(float %x) {
diff --git a/test/CodeGen/ARM/fp.ll b/test/CodeGen/ARM/fp.ll
index fbf3a4a..7e1f000 100644
--- a/test/CodeGen/ARM/fp.ll
+++ b/test/CodeGen/ARM/fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+vfp2 %s -o - | FileCheck %s
define float @f(i32 %a) {
;CHECK-LABEL: f:
diff --git a/test/CodeGen/ARM/fp16.ll b/test/CodeGen/ARM/fp16.ll
index a5c1aed..fba7946 100644
--- a/test/CodeGen/ARM/fp16.ll
+++ b/test/CodeGen/ARM/fp16.ll
@@ -9,7 +9,7 @@ target triple = "armv7-eabi"
define arm_aapcs_vfpcc void @foo() nounwind {
; CHECK-LABEL: foo:
-; CHECK-FP6-LABEL: foo:
+; CHECK-FP16-LABEL: foo:
entry:
%0 = load i16* @x, align 2
%1 = load i16* @y, align 2
diff --git a/test/CodeGen/ARM/fp_convert.ll b/test/CodeGen/ARM/fp_convert.ll
index f0d9100..6f47075 100644
--- a/test/CodeGen/ARM/fp_convert.ll
+++ b/test/CodeGen/ARM/fp_convert.ll
@@ -1,9 +1,20 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=NEON
-; RUN: llc < %s -mtriple=arm-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=NEON
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=VFP2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
+
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: | FileCheck %s -check-prefix=NEON
+
+; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=NEON
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
define i32 @test1(float %a, float %b) {
; VFP2-LABEL: test1:
diff --git a/test/CodeGen/ARM/fpcmp-opt.ll b/test/CodeGen/ARM/fpcmp-opt.ll
index 3a0af16..eab5988 100644
--- a/test/CodeGen/ARM/fpcmp-opt.ll
+++ b/test/CodeGen/ARM/fpcmp-opt.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 -mattr=+vfp2 -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 -mattr=+vfp2 -enable-unsafe-fp-math %s -o - \
+; RUN: | FileCheck %s
+
; rdar://7461510
; rdar://10964603
diff --git a/test/CodeGen/ARM/fpcmp.ll b/test/CodeGen/ARM/fpcmp.ll
index 916a1ae..e3ffd45 100644
--- a/test/CodeGen/ARM/fpcmp.ll
+++ b/test/CodeGen/ARM/fpcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - | FileCheck %s
define i32 @f1(float %a) {
;CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM/fpconsts.ll b/test/CodeGen/ARM/fpconsts.ll
index 0679a47..5a45a9b 100644
--- a/test/CodeGen/ARM/fpconsts.ll
+++ b/test/CodeGen/ARM/fpconsts.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp3 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp3 %s -o - | FileCheck %s
define float @t1(float %x) nounwind readnone optsize {
entry:
diff --git a/test/CodeGen/ARM/fpconv.ll b/test/CodeGen/ARM/fpconv.ll
index 326e062..eadf9af 100644
--- a/test/CodeGen/ARM/fpconv.ll
+++ b/test/CodeGen/ARM/fpconv.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s --check-prefix=CHECK-VFP
-; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - | FileCheck %s --check-prefix=CHECK-VFP
+; RUN: llc -mtriple=arm-apple-darwin %s -o - | FileCheck %s
define float @f1(double %x) {
;CHECK-VFP-LABEL: f1:
diff --git a/test/CodeGen/ARM/fpmem.ll b/test/CodeGen/ARM/fpmem.ll
index 8fbd1d8..3a454ed 100644
--- a/test/CodeGen/ARM/fpmem.ll
+++ b/test/CodeGen/ARM/fpmem.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+; RUN: llc -mtriple=arm -float-abi=soft -mattr=+vfp2 %s -o - | FileCheck %s
define float @f1(float %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM/fpow.ll b/test/CodeGen/ARM/fpow.ll
index 6d48792..3e37724 100644
--- a/test/CodeGen/ARM/fpow.ll
+++ b/test/CodeGen/ARM/fpow.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define double @t(double %x, double %y) nounwind optsize {
entry:
diff --git a/test/CodeGen/ARM/fptoint.ll b/test/CodeGen/ARM/fptoint.ll
index 7408687..c721756 100644
--- a/test/CodeGen/ARM/fptoint.ll
+++ b/test/CodeGen/ARM/fptoint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o - | FileCheck %s
@i = weak global i32 0 ; <i32*> [#uses=2]
@u = weak global i32 0 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/ARM/fsubs.ll b/test/CodeGen/ARM/fsubs.ll
index 617b018..baff34a 100644
--- a/test/CodeGen/ARM/fsubs.ll
+++ b/test/CodeGen/ARM/fsubs.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 | FileCheck %s -check-prefix=NFP1
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=NFP1U
-; RUN: llc < %s -mtriple=arm-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=NFP1U
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
+; RUN: | FileCheck %s -check-prefix=VFP2
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=NFP1
+
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: | FileCheck %s -check-prefix=NFP1U
+
+; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=NFP1U
+
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \
+; RUN: | FileCheck %s -check-prefix=NFP0
define float @test(float %a, float %b) {
entry:
diff --git a/test/CodeGen/ARM/hello.ll b/test/CodeGen/ARM/hello.ll
index 893b426..d268585 100644
--- a/test/CodeGen/ARM/hello.ll
+++ b/test/CodeGen/ARM/hello.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -march=arm
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi | grep mov | count 1
-; RUN: llc < %s -mtriple=armv6-linux-gnu --disable-fp-elim | \
-; RUN: grep mov | count 2
-; RUN: llc < %s -mtriple=armv6-apple-ios | grep mov | count 2
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
+; RUN: llc -mtriple=armv6-linux-gnueabi %s -o - | FileCheck %s
+
+; RUN: llc -mtriple=armv6-linux-gnu --disable-fp-elim %s -o - \
+; RUN: | FileCheck %s -check-prefix CHECK-FP-ELIM
+
+; RUN: llc -mtriple=armv6-apple-ios %s -o - \
+; RUN: | FileCheck %s -check-prefix CHECK-FP-ELIM
@str = internal constant [12 x i8] c"Hello World\00"
@@ -12,3 +15,11 @@ define i32 @main() {
}
declare i32 @puts(i8*)
+
+; CHECK: mov
+; CHECK-NOT: mov
+
+; CHECK-FP-ELIM: mov
+; CHECK-FP-ELIM: mov
+; CHECK-FP-ELIM-NOT: mov
+
diff --git a/test/CodeGen/ARM/iabs.ll b/test/CodeGen/ARM/iabs.ll
index 600a8c2..c52caf6 100644
--- a/test/CodeGen/ARM/iabs.ll
+++ b/test/CodeGen/ARM/iabs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v4t | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
;; Integer absolute value, should produce something as good as: ARM:
;; movs r0, r0
diff --git a/test/CodeGen/ARM/ifconv-kills.ll b/test/CodeGen/ARM/ifconv-kills.ll
index bf54ba2..de80c92 100644
--- a/test/CodeGen/ARM/ifconv-kills.ll
+++ b/test/CodeGen/ARM/ifconv-kills.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march arm -mcpu swift -verify-machineinstrs
+; RUN: llc -mtriple arm-eabi -mcpu swift -verify-machineinstrs %s -o /dev/null
declare i32 @f(i32 %p0, i32 %p1)
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
new file mode 100644
index 0000000..86ed5b2
--- /dev/null
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=armv4t--linux-androideabi -print-machineinstrs=if-converter -o /dev/null 2>&1 | FileCheck %s
+; Fix a bug triggered in IfConverterTriangle when CvtBB has multiple
+; predecessors.
+; PR18752
+
+%classK = type { i8, %classF }
+%classF = type { i8 }
+%classL = type { %classG, i32, i32 }
+%classG = type { %classL* }
+%classM2 = type { %classL }
+
+define zeroext i1 @test(%classK* %this, %classM2* nocapture readnone %p1, %classM2* nocapture readnone %p2) align 2 {
+entry:
+ br i1 undef, label %for.end, label %for.body
+
+; Before if conversion, we have
+; for.body -> lor.lhs.false.i (62)
+; -> for.cond.backedge (62)
+; lor.lhs.false.i -> for.cond.backedge (1048575)
+; -> cond.false.i (1)
+; After if conversion, we have
+; for.body -> for.cond.backedge (130023362)
+; -> cond.false.i (62)
+; CHECK: BB#1: derived from LLVM BB %for.body
+; CHECK: Successors according to CFG: BB#2(130023362) BB#4(62)
+for.body:
+ br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i
+
+for.cond.backedge:
+ %tobool = icmp eq %classL* undef, null
+ br i1 %tobool, label %for.end, label %for.body
+
+lor.lhs.false.i:
+ %tobool.i.i7 = icmp eq i32 undef, 0
+ br i1 %tobool.i.i7, label %for.cond.backedge, label %cond.false.i
+
+cond.false.i:
+ call void @_Z3fn1v()
+ unreachable
+
+for.end:
+ br i1 undef, label %if.else.i.i, label %if.then.i.i
+
+if.then.i.i:
+ store %classL* null, %classL** undef, align 4
+ br label %_ZN1M6spliceEv.exit
+
+if.else.i.i:
+ store %classL* null, %classL** null, align 4
+ br label %_ZN1M6spliceEv.exit
+
+_ZN1M6spliceEv.exit:
+ %LIS = getelementptr inbounds %classK* %this, i32 0, i32 1
+ call void @_ZN1F10handleMoveEb(%classF* %LIS, i1 zeroext false)
+ unreachable
+}
+
+declare %classL* @_ZN1M1JI1LS1_EcvPS1_Ev(%classM2*)
+declare void @_ZN1F10handleMoveEb(%classF*, i1 zeroext)
+declare void @_Z3fn1v()
+
+!0 = metadata !{metadata !"clang version 3.5"}
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll
new file mode 100644
index 0000000..cd8a561
--- /dev/null
+++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=thumbv8 -print-machineinstrs=if-converter -o /dev/null 2>&1 | FileCheck %s
+
+%struct.S = type { i8* (i8*)*, [1 x i8] }
+define internal zeroext i8 @bar(%struct.S* %x, %struct.S* nocapture %y) nounwind readonly {
+entry:
+ %0 = getelementptr inbounds %struct.S* %x, i32 0, i32 1, i32 0
+ %1 = load i8* %0, align 1
+ %2 = zext i8 %1 to i32
+ %3 = and i32 %2, 112
+ %4 = icmp eq i32 %3, 0
+ br i1 %4, label %return, label %bb
+
+bb:
+ %5 = getelementptr inbounds %struct.S* %y, i32 0, i32 1, i32 0
+ %6 = load i8* %5, align 1
+ %7 = zext i8 %6 to i32
+ %8 = and i32 %7, 112
+ %9 = icmp eq i32 %8, 0
+ br i1 %9, label %return, label %bb2
+
+; CHECK: BB#2: derived from LLVM BB %bb2
+; CHECK: Successors according to CFG: BB#3(192) BB#4(192)
+
+bb2:
+ %v10 = icmp eq i32 %3, 16
+ br i1 %v10, label %bb4, label %bb3, !prof !0
+
+bb3:
+ %v11 = icmp eq i32 %8, 16
+ br i1 %v11, label %bb4, label %return, !prof !1
+
+bb4:
+ %v12 = ptrtoint %struct.S* %x to i32
+ %phitmp = trunc i32 %v12 to i8
+ ret i8 %phitmp
+
+return:
+ ret i8 1
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 4, i32 12}
+!1 = metadata !{metadata !"branch_weights", i32 8, i32 16}
diff --git a/test/CodeGen/ARM/ifcvt1.ll b/test/CodeGen/ARM/ifcvt1.ll
index 5a55653..cae2399 100644
--- a/test/CodeGen/ARM/ifcvt1.ll
+++ b/test/CodeGen/ARM/ifcvt1.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
-; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s -check-prefix=SWIFT
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=A8
+; RUN: llc -mtriple=arm-eabi -mcpu=swift %s -o - | FileCheck %s -check-prefix=SWIFT
define i32 @t1(i32 %a, i32 %b) {
; A8-LABEL: t1:
diff --git a/test/CodeGen/ARM/ifcvt2.ll b/test/CodeGen/ARM/ifcvt2.ll
index e34edec..e445416 100644
--- a/test/CodeGen/ARM/ifcvt2.ll
+++ b/test/CodeGen/ARM/ifcvt2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v4t | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: t1:
diff --git a/test/CodeGen/ARM/ifcvt3.ll b/test/CodeGen/ARM/ifcvt3.ll
index fa7d618..5da63dc 100644
--- a/test/CodeGen/ARM/ifcvt3.ll
+++ b/test/CodeGen/ARM/ifcvt3.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+v4t | grep cmpne | count 1
-; RUN: llc < %s -march=arm -mattr=+v4t | grep bx | count 2
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s -check-prefix CHECK-V4-CMP
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s -check-prefix CHECK-V4-BX
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: t1:
@@ -22,3 +22,11 @@ cond_next:
%tmp15 = add i32 %b, %a
ret i32 %tmp15
}
+
+; CHECK-V4-CMP: cmpne
+; CHECK-V4-CMP-NOT: cmpne
+
+; CHECK-V4-BX: bx
+; CHECK-V4-BX: bx
+; CHECK-V4-BX-NOT: bx
+
diff --git a/test/CodeGen/ARM/ifcvt4.ll b/test/CodeGen/ARM/ifcvt4.ll
index 53c789d..8c6825a 100644
--- a/test/CodeGen/ARM/ifcvt4.ll
+++ b/test/CodeGen/ARM/ifcvt4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
; Do not if-convert when branches go to the different loops.
; CHECK-LABEL: t:
diff --git a/test/CodeGen/ARM/ifcvt9.ll b/test/CodeGen/ARM/ifcvt9.ll
index 05bdc45..1191716 100644
--- a/test/CodeGen/ARM/ifcvt9.ll
+++ b/test/CodeGen/ARM/ifcvt9.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define fastcc void @t() nounwind {
entry:
diff --git a/test/CodeGen/ARM/illegal-vector-bitcast.ll b/test/CodeGen/ARM/illegal-vector-bitcast.ll
index febe6f5..7208fff 100644
--- a/test/CodeGen/ARM/illegal-vector-bitcast.ll
+++ b/test/CodeGen/ARM/illegal-vector-bitcast.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm
-; RUN: llc < %s -mtriple=arm-linux
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
+; RUN: llc -mtriple=arm-linux %s -o /dev/null
define void @foo(<8 x float>* %f, <8 x float>* %g, <4 x i64>* %y)
{
diff --git a/test/CodeGen/ARM/imm.ll b/test/CodeGen/ARM/imm.ll
index 6f25f9d..e7bc0af 100644
--- a/test/CodeGen/ARM/imm.ll
+++ b/test/CodeGen/ARM/imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | not grep CPI
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @test1(i32 %A) {
%B = add i32 %A, -268435441 ; <i32> [#uses=1]
@@ -14,3 +14,6 @@ define i32 @test3(i32 %A) {
ret i32 %B
}
+; CHECK-NOT: CPI
+
+
diff --git a/test/CodeGen/ARM/indirect-reg-input.ll b/test/CodeGen/ARM/indirect-reg-input.ll
index b936455..17f6a9c 100644
--- a/test/CodeGen/ARM/indirect-reg-input.ll
+++ b/test/CodeGen/ARM/indirect-reg-input.ll
@@ -1,4 +1,4 @@
-; RUN: not llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - 2>&1 | FileCheck %s
; Check for error message:
; CHECK: error: inline asm not supported yet: don't know how to handle tied indirect register inputs
diff --git a/test/CodeGen/ARM/indirectbr.ll b/test/CodeGen/ARM/indirectbr.ll
index 1aeeb91..7c49cb3 100644
--- a/test/CodeGen/ARM/indirectbr.ll
+++ b/test/CodeGen/ARM/indirectbr.ll
@@ -11,6 +11,11 @@ define internal i32 @foo(i32 %i) nounwind {
; THUMB-LABEL: foo:
; THUMB2-LABEL: foo:
entry:
+ ; _nextaddr gets CSEed for use later on.
+; THUMB: ldr r[[NEXTADDR_REG:[0-9]+]], [[NEXTADDR_CPI:LCPI0_[0-9]+]]
+; THUMB: [[NEXTADDR_PCBASE:LPC0_[0-9]]]:
+; THUMB: add r[[NEXTADDR_REG]], pc
+
%0 = load i8** @nextaddr, align 4 ; <i8*> [#uses=2]
%1 = icmp eq i8* %0, null ; <i1> [#uses=1]
; indirect branch gets duplicated here
@@ -53,12 +58,11 @@ L1: ; preds = %L2, %bb2
; ARM: ldr [[R1:r[0-9]+]], LCPI
; ARM: add [[R1b:r[0-9]+]], pc, [[R1]]
; ARM: str [[R1b]]
+
; THUMB-LABEL: %L1
-; THUMB: ldr
-; THUMB: add
; THUMB: ldr [[R2:r[0-9]+]], LCPI
; THUMB: add [[R2]], pc
-; THUMB: str [[R2]]
+; THUMB: str [[R2]], [r[[NEXTADDR_REG]]]
; THUMB2-LABEL: %L1
; THUMB2: ldr [[R2:r[0-9]+]], LCPI
; THUMB2-NEXT: str{{(.w)?}} [[R2]]
@@ -67,4 +71,5 @@ L1: ; preds = %L2, %bb2
}
; ARM: .long Ltmp0-(LPC{{.*}}+8)
; THUMB: .long Ltmp0-(LPC{{.*}}+4)
+; THUMB: .long _nextaddr-([[NEXTADDR_PCBASE]]+4)
; THUMB2: .long Ltmp0
diff --git a/test/CodeGen/ARM/inline-diagnostics.ll b/test/CodeGen/ARM/inline-diagnostics.ll
new file mode 100644
index 0000000..7b77da2
--- /dev/null
+++ b/test/CodeGen/ARM/inline-diagnostics.ll
@@ -0,0 +1,16 @@
+; RUN: not llc < %s -verify-machineinstrs -mtriple=armv7-none-linux-gnu -mattr=+neon 2>&1 | FileCheck %s
+
+%struct.float4 = type { float, float, float, float }
+
+; CHECK: error: Don't know how to handle indirect register inputs yet for constraint 'w'
+define float @inline_func(float %f1, float %f2) #0 {
+ %c1 = alloca %struct.float4, align 4
+ %c2 = alloca %struct.float4, align 4
+ %c3 = alloca %struct.float4, align 4
+ call void asm sideeffect "vmul.f32 ${2:q}, ${0:q}, ${1:q}", "=*r,=*r,*w"(%struct.float4* %c1, %struct.float4* %c2, %struct.float4* %c3) #1, !srcloc !1
+ %x = getelementptr inbounds %struct.float4* %c3, i32 0, i32 0
+ %1 = load float* %x, align 4
+ ret float %1
+}
+
+!1 = metadata !{i32 271, i32 305}
diff --git a/test/CodeGen/ARM/inlineasm-64bit.ll b/test/CodeGen/ARM/inlineasm-64bit.ll
index 683a0c4..d098a43 100644
--- a/test/CodeGen/ARM/inlineasm-64bit.ll
+++ b/test/CodeGen/ARM/inlineasm-64bit.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O3 -mtriple=arm-linux-gnueabi | FileCheck %s
-; RUN: llc -mtriple=thumbv7-none-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=arm-linux-gnueabi -no-integrated-as | FileCheck %s
+; RUN: llc -mtriple=thumbv7-none-linux-gnueabi -verify-machineinstrs -no-integrated-as < %s | FileCheck %s
; check if regs are passing correctly
define void @i64_write(i64* %p, i64 %val) nounwind {
; CHECK-LABEL: i64_write:
diff --git a/test/CodeGen/ARM/inlineasm-imm-arm.ll b/test/CodeGen/ARM/inlineasm-imm-arm.ll
index 45dfcf0..603e52d 100644
--- a/test/CodeGen/ARM/inlineasm-imm-arm.ll
+++ b/test/CodeGen/ARM/inlineasm-imm-arm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi -no-integrated-as %s -o /dev/null
; Test ARM-mode "I" constraint, for any Data Processing immediate.
define i32 @testI(i32 %x) {
diff --git a/test/CodeGen/ARM/inlineasm-ldr-pseudo.ll b/test/CodeGen/ARM/inlineasm-ldr-pseudo.ll
new file mode 100644
index 0000000..f63e4b0
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-ldr-pseudo.ll
@@ -0,0 +1,17 @@
+; PR18354
+; We actually need to use -filetype=obj in this test because if we output
+; assembly, the current code path will bypass the parser and just write the
+; raw text out to the Streamer. We need to actually parse the inlineasm to
+; demonstrate the bug. Going the asm->obj route does not show the issue.
+; RUN: llc -mtriple=arm-none-linux < %s -filetype=obj | llvm-objdump -d - | FileCheck %s
+; RUN: llc -mtriple=arm-apple-darwin < %s -filetype=obj | llvm-objdump -d - | FileCheck %s
+; CHECK-LABEL: foo:
+; CHECK: 0: 00 00 9f e5 ldr r0, [pc]
+; CHECK: 4: 0e f0 a0 e1 mov pc, lr
+; Make sure the constant pool entry comes after the return
+; CHECK: 8: 01 00 00 00
+define i32 @foo() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "ldr $0,=1", "=r"() nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM/inlineasm-switch-mode-oneway-from-arm.ll b/test/CodeGen/ARM/inlineasm-switch-mode-oneway-from-arm.ll
new file mode 100644
index 0000000..3be378d
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-switch-mode-oneway-from-arm.ll
@@ -0,0 +1,18 @@
+;RUN: llc -mtriple=armv7-linux-gnueabi < %s | llvm-mc -triple=armv7-linux-gnueabi -filetype=obj | llvm-objdump -triple=armv7 -d - | FileCheck %s
+;RUN: llc -mtriple=armv7-linux-gnueabi < %s | FileCheck %s -check-prefix=ASM
+;RUN: llc -mtriple=armv7-apple-darwin < %s | FileCheck %s -check-prefix=ASM
+
+define hidden i32 @bah(i8* %start) #0 align 2 {
+ %1 = ptrtoint i8* %start to i32
+ %2 = tail call i32 asm sideeffect "@ Enter THUMB Mode\0A\09adr r3, 2f+1 \0A\09bx r3 \0A\09.code 16 \0A2: push {r7} \0A\09mov r7, $4 \0A\09svc 0x0 \0A\09pop {r7} \0A\09", "={r0},{r0},{r1},{r2},r,~{r3}"(i32 %1, i32 %1, i32 0, i32 983042) #3
+ %3 = add i32 %1, 1
+ ret i32 %3
+}
+; CHECK: $t
+; CHECK: $a
+; CHECK: 01 00 81 e2 add r0, r1, #1
+
+; .code 32 is implicit
+; ASM-LABEL: bah:
+; ASM: .code 16
+; ASM: .code 32
diff --git a/test/CodeGen/ARM/inlineasm-switch-mode-oneway-from-thumb.ll b/test/CodeGen/ARM/inlineasm-switch-mode-oneway-from-thumb.ll
new file mode 100644
index 0000000..b9bd4c2
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-switch-mode-oneway-from-thumb.ll
@@ -0,0 +1,18 @@
+;RUN: llc -mtriple=thumbv7-linux-gnueabi < %s | llvm-mc -triple=thumbv7-linux-gnueabi -filetype=obj | llvm-objdump -triple=thumbv7 -d - | FileCheck %s
+;RUN: llc -mtriple=thumbv7-linux-gnueabi < %s | FileCheck %s -check-prefix=ASM
+;RUN: llc -mtriple=thumbv7-apple-darwin < %s | FileCheck %s -check-prefix=ASM
+
+define hidden i32 @bah(i8* %start) #0 align 2 {
+ %1 = ptrtoint i8* %start to i32
+ %2 = tail call i32 asm sideeffect "@ Enter ARM Mode \0A\09adr r3, 1f \0A\09bx r3 \0A\09.align 2 \0A\09.code 32 \0A1: push {r7} \0A\09mov r7, $4 \0A\09svc 0x0 \0A\09pop {r7} \0A\09", "={r0},{r0},{r1},{r2},r,~{r3}"(i32 %1, i32 %1, i32 0, i32 983042) #3
+ %3 = add i32 %1, 1
+ ret i32 %3
+}
+; CHECK: $a
+; CHECK: $t
+; CHECK: 48 1c adds r0, r1, #1
+
+; ASM: .code 16
+; ASM-LABEL: bah:
+; ASM: .code 32
+; ASM: .code 16
diff --git a/test/CodeGen/ARM/inlineasm-switch-mode.ll b/test/CodeGen/ARM/inlineasm-switch-mode.ll
new file mode 100644
index 0000000..65fea11
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-switch-mode.ll
@@ -0,0 +1,22 @@
+;RUN: llc -mtriple=thumbv7-linux-gnueabi < %s | llvm-mc -triple=thumbv7-linux-gnueabi -filetype=obj > %t
+; Two-pass decoding is needed because llvm-objdump does not respect mapping symbols
+;RUN: llvm-objdump -triple=armv7 -d %t | FileCheck %s --check-prefix=ARM
+;RUN: llvm-objdump -triple=thumbv7 -d %t | FileCheck %s --check-prefix=THUMB
+
+define hidden i32 @bah(i8* %start) #0 align 2 {
+ %1 = ptrtoint i8* %start to i32
+ %2 = tail call i32 asm sideeffect "@ Enter ARM Mode \0A\09adr r3, 1f \0A\09bx r3 \0A\09.align 2 \0A\09.code 32 \0A1: push {r7} \0A\09mov r7, $4 \0A\09svc 0x0 \0A\09pop {r7} \0A\09@ Enter THUMB Mode\0A\09adr r3, 2f+1 \0A\09bx r3 \0A\09.code 16 \0A2: \0A\09", "={r0},{r0},{r1},{r2},r,~{r3}"(i32 %1, i32 %1, i32 0, i32 983042) #3
+ %3 = add i32 %1, 1
+ ret i32 %3
+}
+
+; ARM: $a
+; ARM-NEXT: 04 70 2d e5 str r7, [sp, #-4]!
+; ARM: $t
+; ARM-NEXT: 48 1c
+
+; THUMB: $a
+; THUMB-NEXT: 04 70
+; THUMB-NEXT: 2d e5
+; THUMB: $t
+; THUMB-NEXT: 48 1c adds r0, r1, #1
diff --git a/test/CodeGen/ARM/inlineasm.ll b/test/CodeGen/ARM/inlineasm.ll
index cca3c69..39962e0 100644
--- a/test/CodeGen/ARM/inlineasm.ll
+++ b/test/CodeGen/ARM/inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o /dev/null
define i32 @test1(i32 %tmp54) {
%tmp56 = tail call i32 asm "uxtb16 $0,$1", "=r,r"( i32 %tmp54 ) ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/inlineasm2.ll b/test/CodeGen/ARM/inlineasm2.ll
index a99bccf..5918738 100644
--- a/test/CodeGen/ARM/inlineasm2.ll
+++ b/test/CodeGen/ARM/inlineasm2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define double @__ieee754_sqrt(double %x) {
%tmp2 = tail call double asm "fsqrtd ${0:P}, ${1:P}", "=w,w"( double %x )
diff --git a/test/CodeGen/ARM/inlineasm3.ll b/test/CodeGen/ARM/inlineasm3.ll
index 390a44e..eb7ba59 100644
--- a/test/CodeGen/ARM/inlineasm3.ll
+++ b/test/CodeGen/ARM/inlineasm3.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+neon,+v6t2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon,+v6t2 -no-integrated-as %s -o - \
+; RUN: | FileCheck %s
; Radar 7449043
%struct.int32x4_t = type { <4 x i32> }
diff --git a/test/CodeGen/ARM/inlineasm4.ll b/test/CodeGen/ARM/inlineasm4.ll
index 4a1bcca..a117cd2 100644
--- a/test/CodeGen/ARM/inlineasm4.ll
+++ b/test/CodeGen/ARM/inlineasm4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define double @f(double %x) {
entry:
diff --git a/test/CodeGen/ARM/insn-sched1.ll b/test/CodeGen/ARM/insn-sched1.ll
index d188fae..2749a8e 100644
--- a/test/CodeGen/ARM/insn-sched1.ll
+++ b/test/CodeGen/ARM/insn-sched1.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+v6
-; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+v6 |\
-; RUN: grep mov | count 3
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o /dev/null
+; RUN: llc -mtriple=arm-apple-ios -mattr=+v6 %s -o - | FileCheck %s
define i32 @test(i32 %x) {
%tmp = trunc i32 %x to i16 ; <i16> [#uses=1]
@@ -9,3 +8,9 @@ define i32 @test(i32 %x) {
}
declare i32 @f(i32, i16)
+
+; CHECK: mov
+; CHECK: mov
+; CHECK: mov
+; CHECK-NOT: mov
+
diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll
index 1d72afe..bf403b9 100644
--- a/test/CodeGen/ARM/integer_insertelement.ll
+++ b/test/CodeGen/ARM/integer_insertelement.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; This test checks that when inserting one (integer) element into a vector,
; the vector is not spuriously copied. "vorr dX, dY, dY" is the way of moving
diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
index 217fd69..9b7b41b 100644
--- a/test/CodeGen/ARM/interrupt-attr.ll
+++ b/test/CodeGen/ARM/interrupt-attr.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=arm-none-none-eabi -mcpu=cortex-a15 -o - %s | FileCheck --check-prefix=CHECK-A %s
; RUN: llc -mtriple=thumb-none-none-eabi -mcpu=cortex-a15 -o - %s | FileCheck --check-prefix=CHECK-A-THUMB %s
-; RUN: llc -mtriple=thumb-apple-darwin -mcpu=cortex-m3 -o - %s | FileCheck --check-prefix=CHECK-M %s
+; RUN: llc -mtriple=thumb-apple-none-macho -mcpu=cortex-m3 -o - %s | FileCheck --check-prefix=CHECK-M %s
declare arm_aapcscc void @bar()
@@ -12,32 +12,33 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
; Also need special function return setting pc and CPSR simultaneously.
; CHECK-A-LABEL: irq_fn:
-; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: push {r0, r1, r2, r3, r11, r12, lr}
; CHECK-A: add r11, sp, #16
; CHECK-A: sub sp, sp, #{{[0-9]+}}
; CHECK-A: bic sp, sp, #7
; CHECK-A: bl bar
; CHECK-A: sub sp, r11, #16
-; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: pop {r0, r1, r2, r3, r11, r12, lr}
; CHECK-A: subs pc, lr, #4
; CHECK-A-THUMB-LABEL: irq_fn:
-; CHECK-A-THUMB: push {r0, r1, r2, r3, r4, r7, lr}
-; CHECK-A-THUMB: mov r4, sp
+; CHECK-A-THUMB: push.w {r0, r1, r2, r3, r4, r7, r12, lr}
; CHECK-A-THUMB: add r7, sp, #20
+; CHECK-A-THUMB: mov r4, sp
; CHECK-A-THUMB: bic r4, r4, #7
; CHECK-A-THUMB: bl bar
; CHECK-A-THUMB: sub.w r4, r7, #20
; CHECK-A-THUMB: mov sp, r4
-; CHECK-A-THUMB: pop.w {r0, r1, r2, r3, r4, r7, lr}
+; CHECK-A-THUMB: pop.w {r0, r1, r2, r3, r4, r7, r12, lr}
; CHECK-A-THUMB: subs pc, lr, #4
; Normal AAPCS function (r0-r3 pushed onto stack by hardware, lr set to
; appropriate sentinel so no special return needed).
+; CHECK-M-LABEL: irq_fn:
; CHECK-M: push {r4, r7, lr}
; CHECK-M: add r7, sp, #4
-; CHECK-M: sub sp, #4
; CHECK-M: mov r4, sp
+; CHECK-M: bic r4, r4, #7
; CHECK-M: mov sp, r4
; CHECK-M: blx _bar
; CHECK-M: subs r4, r7, #4
@@ -48,6 +49,7 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
ret void
}
+; We don't push/pop r12, as it is banked for FIQ
define arm_aapcscc void @fiq_fn() alignstack(8) "interrupt"="FIQ" {
; CHECK-A-LABEL: fiq_fn:
; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r11, lr}
@@ -61,6 +63,8 @@ define arm_aapcscc void @fiq_fn() alignstack(8) "interrupt"="FIQ" {
; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r11, lr}
; CHECK-A: subs pc, lr, #4
+; CHECK-A-THUMB-LABEL: fiq_fn:
+; CHECK-M-LABEL: fiq_fn:
%val = load volatile [16 x i32]* @bigvar
store volatile [16 x i32] %val, [16 x i32]* @bigvar
ret void
@@ -68,13 +72,13 @@ define arm_aapcscc void @fiq_fn() alignstack(8) "interrupt"="FIQ" {
define arm_aapcscc void @swi_fn() alignstack(8) "interrupt"="SWI" {
; CHECK-A-LABEL: swi_fn:
-; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
; CHECK-A: add r11, sp, #44
; CHECK-A: sub sp, sp, #{{[0-9]+}}
; CHECK-A: bic sp, sp, #7
; [...]
; CHECK-A: sub sp, r11, #44
-; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
; CHECK-A: subs pc, lr, #0
%val = load volatile [16 x i32]* @bigvar
@@ -84,13 +88,13 @@ define arm_aapcscc void @swi_fn() alignstack(8) "interrupt"="SWI" {
define arm_aapcscc void @undef_fn() alignstack(8) "interrupt"="UNDEF" {
; CHECK-A-LABEL: undef_fn:
-; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: push {r0, r1, r2, r3, r11, r12, lr}
; CHECK-A: add r11, sp, #16
; CHECK-A: sub sp, sp, #{{[0-9]+}}
; CHECK-A: bic sp, sp, #7
; [...]
; CHECK-A: sub sp, r11, #16
-; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: pop {r0, r1, r2, r3, r11, r12, lr}
; CHECK-A: subs pc, lr, #0
call void @bar()
@@ -99,13 +103,13 @@ define arm_aapcscc void @undef_fn() alignstack(8) "interrupt"="UNDEF" {
define arm_aapcscc void @abort_fn() alignstack(8) "interrupt"="ABORT" {
; CHECK-A-LABEL: abort_fn:
-; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: push {r0, r1, r2, r3, r11, r12, lr}
; CHECK-A: add r11, sp, #16
; CHECK-A: sub sp, sp, #{{[0-9]+}}
; CHECK-A: bic sp, sp, #7
; [...]
; CHECK-A: sub sp, r11, #16
-; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: pop {r0, r1, r2, r3, r11, r12, lr}
; CHECK-A: subs pc, lr, #4
call void @bar()
diff --git a/test/CodeGen/ARM/intrinsics-crypto.ll b/test/CodeGen/ARM/intrinsics-crypto.ll
index c038fe6..96413d3 100644
--- a/test/CodeGen/ARM/intrinsics-crypto.ll
+++ b/test/CodeGen/ARM/intrinsics-crypto.ll
@@ -3,13 +3,13 @@
define arm_aapcs_vfpcc <16 x i8> @test_aesde(<16 x i8>* %a, <16 x i8> *%b) {
%tmp = load <16 x i8>* %a
%tmp2 = load <16 x i8>* %b
- %tmp3 = call <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8> %tmp, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %tmp, <16 x i8> %tmp2)
; CHECK: aesd.8 q{{[0-9]+}}, q{{[0-9]+}}
- %tmp4 = call <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8> %tmp3, <16 x i8> %tmp2)
+ %tmp4 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %tmp3, <16 x i8> %tmp2)
; CHECK: aese.8 q{{[0-9]+}}, q{{[0-9]+}}
- %tmp5 = call <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8> %tmp4)
+ %tmp5 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %tmp4)
; CHECK: aesimc.8 q{{[0-9]+}}, q{{[0-9]+}}
- %tmp6 = call <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8> %tmp5)
+ %tmp6 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %tmp5)
; CHECK: aesmc.8 q{{[0-9]+}}, q{{[0-9]+}}
ret <16 x i8> %tmp6
}
@@ -18,40 +18,42 @@ define arm_aapcs_vfpcc <4 x i32> @test_sha(<4 x i32> *%a, <4 x i32> *%b, <4 x i3
%tmp = load <4 x i32>* %a
%tmp2 = load <4 x i32>* %b
%tmp3 = load <4 x i32>* %c
- %res1 = call <4 x i32> @llvm.arm.neon.sha1h.v4i32(<4 x i32> %tmp)
+ %scalar = extractelement <4 x i32> %tmp, i32 0
+ %resscalar = call i32 @llvm.arm.neon.sha1h(i32 %scalar)
+ %res1 = insertelement <4 x i32> undef, i32 %resscalar, i32 0
; CHECK: sha1h.32 q{{[0-9]+}}, q{{[0-9]+}}
- %res2 = call <4 x i32> @llvm.arm.neon.sha1c.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3, <4 x i32> %res1)
+ %res2 = call <4 x i32> @llvm.arm.neon.sha1c(<4 x i32> %tmp2, i32 %scalar, <4 x i32> %res1)
; CHECK: sha1c.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
- %res3 = call <4 x i32> @llvm.arm.neon.sha1m.v4i32(<4 x i32> %res2, <4 x i32> %tmp3, <4 x i32> %res1)
+ %res3 = call <4 x i32> @llvm.arm.neon.sha1m(<4 x i32> %res2, i32 %scalar, <4 x i32> %res1)
; CHECK: sha1m.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
- %res4 = call <4 x i32> @llvm.arm.neon.sha1p.v4i32(<4 x i32> %res3, <4 x i32> %tmp3, <4 x i32> %res1)
+ %res4 = call <4 x i32> @llvm.arm.neon.sha1p(<4 x i32> %res3, i32 %scalar, <4 x i32> %res1)
; CHECK: sha1p.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
- %res5 = call <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32> %res4, <4 x i32> %tmp3, <4 x i32> %res1)
+ %res5 = call <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32> %res4, <4 x i32> %tmp3, <4 x i32> %res1)
; CHECK: sha1su0.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
- %res6 = call <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32> %res5, <4 x i32> %res1)
+ %res6 = call <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32> %res5, <4 x i32> %res1)
; CHECK: sha1su1.32 q{{[0-9]+}}, q{{[0-9]+}}
- %res7 = call <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32> %res6, <4 x i32> %tmp3, <4 x i32> %res1)
+ %res7 = call <4 x i32> @llvm.arm.neon.sha256h(<4 x i32> %res6, <4 x i32> %tmp3, <4 x i32> %res1)
; CHECK: sha256h.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
- %res8 = call <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32> %res7, <4 x i32> %tmp3, <4 x i32> %res1)
+ %res8 = call <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32> %res7, <4 x i32> %tmp3, <4 x i32> %res1)
; CHECK: sha256h2.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
- %res9 = call <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32> %res8, <4 x i32> %tmp3, <4 x i32> %res1)
+ %res9 = call <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32> %res8, <4 x i32> %tmp3, <4 x i32> %res1)
; CHECK: sha256su1.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
- %res10 = call <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32> %res9, <4 x i32> %tmp3)
+ %res10 = call <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32> %res9, <4 x i32> %tmp3)
; CHECK: sha256su0.32 q{{[0-9]+}}, q{{[0-9]+}}
ret <4 x i32> %res10
}
-declare <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8>)
-declare <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8>)
-declare <4 x i32> @llvm.arm.neon.sha1h.v4i32(<4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha1c.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha1m.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha1p.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32>, <4 x i32>)
+declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8>)
+declare i32 @llvm.arm.neon.sha1h(i32)
+declare <4 x i32> @llvm.arm.neon.sha1c(<4 x i32>, i32, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1m(<4 x i32>, i32, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1p(<4 x i32>, i32, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256h(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32>, <4 x i32>)
diff --git a/test/CodeGen/ARM/ispositive.ll b/test/CodeGen/ARM/ispositive.ll
index 2f1a2cf..3086d79 100644
--- a/test/CodeGen/ARM/ispositive.ll
+++ b/test/CodeGen/ARM/ispositive.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @test1(i32 %X) {
; CHECK: lsr{{.*}}#31
diff --git a/test/CodeGen/ARM/large-stack.ll b/test/CodeGen/ARM/large-stack.ll
index ddf0f0e..1a9a1fa 100644
--- a/test/CodeGen/ARM/large-stack.ll
+++ b/test/CodeGen/ARM/large-stack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define void @test1() {
%tmp = alloca [ 64 x i32 ] , align 4
diff --git a/test/CodeGen/ARM/ldaex-stlex.ll b/test/CodeGen/ARM/ldaex-stlex.ll
new file mode 100644
index 0000000..bfdfea3
--- /dev/null
+++ b/test/CodeGen/ARM/ldaex-stlex.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -mtriple=armv8-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-apple-darwin | FileCheck %s
+
+%0 = type { i32, i32 }
+
+; CHECK-LABEL: f0:
+; CHECK: ldaexd
+define i64 @f0(i8* %p) nounwind readonly {
+entry:
+ %ldaexd = tail call %0 @llvm.arm.ldaexd(i8* %p)
+ %0 = extractvalue %0 %ldaexd, 1
+ %1 = extractvalue %0 %ldaexd, 0
+ %2 = zext i32 %0 to i64
+ %3 = zext i32 %1 to i64
+ %shl = shl nuw i64 %2, 32
+ %4 = or i64 %shl, %3
+ ret i64 %4
+}
+
+; CHECK-LABEL: f1:
+; CHECK: stlexd
+define i32 @f1(i8* %ptr, i64 %val) nounwind {
+entry:
+ %tmp4 = trunc i64 %val to i32
+ %tmp6 = lshr i64 %val, 32
+ %tmp7 = trunc i64 %tmp6 to i32
+ %stlexd = tail call i32 @llvm.arm.stlexd(i32 %tmp4, i32 %tmp7, i8* %ptr)
+ ret i32 %stlexd
+}
+
+declare %0 @llvm.arm.ldaexd(i8*) nounwind readonly
+declare i32 @llvm.arm.stlexd(i32, i32, i8*) nounwind
+
+; CHECK-LABEL: test_load_i8:
+; CHECK: ldaexb r0, [r0]
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+define zeroext i8 @test_load_i8(i8* %addr) {
+ %val = call i32 @llvm.arm.ldaex.p0i8(i8* %addr)
+ %val8 = trunc i32 %val to i8
+ ret i8 %val8
+}
+
+; CHECK-LABEL: test_load_i16:
+; CHECK: ldaexh r0, [r0]
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+define zeroext i16 @test_load_i16(i16* %addr) {
+ %val = call i32 @llvm.arm.ldaex.p0i16(i16* %addr)
+ %val16 = trunc i32 %val to i16
+ ret i16 %val16
+}
+
+; CHECK-LABEL: test_load_i32:
+; CHECK: ldaex r0, [r0]
+define i32 @test_load_i32(i32* %addr) {
+ %val = call i32 @llvm.arm.ldaex.p0i32(i32* %addr)
+ ret i32 %val
+}
+
+declare i32 @llvm.arm.ldaex.p0i8(i8*) nounwind readonly
+declare i32 @llvm.arm.ldaex.p0i16(i16*) nounwind readonly
+declare i32 @llvm.arm.ldaex.p0i32(i32*) nounwind readonly
+
+; CHECK-LABEL: test_store_i8:
+; CHECK-NOT: uxtb
+; CHECK: stlexb r0, r1, [r2]
+define i32 @test_store_i8(i32, i8 %val, i8* %addr) {
+ %extval = zext i8 %val to i32
+ %res = call i32 @llvm.arm.stlex.p0i8(i32 %extval, i8* %addr)
+ ret i32 %res
+}
+
+; CHECK-LABEL: test_store_i16:
+; CHECK-NOT: uxth
+; CHECK: stlexh r0, r1, [r2]
+define i32 @test_store_i16(i32, i16 %val, i16* %addr) {
+ %extval = zext i16 %val to i32
+ %res = call i32 @llvm.arm.stlex.p0i16(i32 %extval, i16* %addr)
+ ret i32 %res
+}
+
+; CHECK-LABEL: test_store_i32:
+; CHECK: stlex r0, r1, [r2]
+define i32 @test_store_i32(i32, i32 %val, i32* %addr) {
+ %res = call i32 @llvm.arm.stlex.p0i32(i32 %val, i32* %addr)
+ ret i32 %res
+}
+
+declare i32 @llvm.arm.stlex.p0i8(i32, i8*) nounwind
+declare i32 @llvm.arm.stlex.p0i16(i32, i16*) nounwind
+declare i32 @llvm.arm.stlex.p0i32(i32, i32*) nounwind
diff --git a/test/CodeGen/ARM/ldm.ll b/test/CodeGen/ARM/ldm.ll
index d5b805c..3977da6 100644
--- a/test/CodeGen/ARM/ldm.ll
+++ b/test/CodeGen/ARM/ldm.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=armv4t-apple-darwin | FileCheck %s -check-prefix=V4T
+; RUN: llc < %s -mtriple=armv7-apple-ios3.0 | FileCheck %s
+; RUN: llc < %s -mtriple=armv4t-apple-ios3.0 | FileCheck %s -check-prefix=V4T
@X = external global [0 x i32] ; <[0 x i32]*> [#uses=5]
diff --git a/test/CodeGen/ARM/ldr.ll b/test/CodeGen/ARM/ldr.ll
index e4c695b..57e9977 100644
--- a/test/CodeGen/ARM/ldr.ll
+++ b/test/CodeGen/ARM/ldr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f1(i32* %v) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM/ldr_ext.ll b/test/CodeGen/ARM/ldr_ext.ll
index d29eb02..31aaba5 100644
--- a/test/CodeGen/ARM/ldr_ext.ll
+++ b/test/CodeGen/ARM/ldr_ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @test1(i8* %t1) nounwind {
; CHECK: ldrb
diff --git a/test/CodeGen/ARM/ldr_frame.ll b/test/CodeGen/ARM/ldr_frame.ll
index f071b89..ed964ec 100644
--- a/test/CodeGen/ARM/ldr_frame.ll
+++ b/test/CodeGen/ARM/ldr_frame.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v4t | not grep mov
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
define i32 @f1() {
%buf = alloca [32 x i32], align 4
@@ -29,3 +29,6 @@ define i32 @f4() {
%tmp2 = zext i8 %tmp1 to i32
ret i32 %tmp2
}
+
+; CHECK-NOT: mov
+
diff --git a/test/CodeGen/ARM/ldr_post.ll b/test/CodeGen/ARM/ldr_post.ll
index f5ff7dd..2558b16 100644
--- a/test/CodeGen/ARM/ldr_post.ll
+++ b/test/CodeGen/ARM/ldr_post.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm | FileCheck %s
-; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=swift %s -o - | FileCheck %s
; CHECK-LABEL: test1:
; CHECK: ldr {{.*, \[.*]}}, -r2
diff --git a/test/CodeGen/ARM/ldr_pre.ll b/test/CodeGen/ARM/ldr_pre.ll
index 8281827..a97927a 100644
--- a/test/CodeGen/ARM/ldr_pre.ll
+++ b/test/CodeGen/ARM/ldr_pre.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm | FileCheck %s
-; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=swift %s -o - | FileCheck %s
; CHECK-LABEL: test1:
; CHECK: ldr {{.*!}}
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index 864d18a..caef2e7 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=A8
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=A8 -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK
; rdar://6949835
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK
; Magic ARM pair hints works best with linearscan / fast.
diff --git a/test/CodeGen/ARM/ldstrex.ll b/test/CodeGen/ARM/ldstrex.ll
index 5eaae53..a40e255 100644
--- a/test/CodeGen/ARM/ldstrex.ll
+++ b/test/CodeGen/ARM/ldstrex.ll
@@ -36,17 +36,21 @@ declare i32 @llvm.arm.strexd(i32, i32, i8*) nounwind
; CHECK-LABEL: test_load_i8:
; CHECK: ldrexb r0, [r0]
; CHECK-NOT: uxtb
-define i32 @test_load_i8(i8* %addr) {
+; CHECK-NOT: and
+define zeroext i8 @test_load_i8(i8* %addr) {
%val = call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
- ret i32 %val
+ %val8 = trunc i32 %val to i8
+ ret i8 %val8
}
; CHECK-LABEL: test_load_i16:
; CHECK: ldrexh r0, [r0]
; CHECK-NOT: uxth
-define i32 @test_load_i16(i16* %addr) {
+; CHECK-NOT: and
+define zeroext i16 @test_load_i16(i16* %addr) {
%val = call i32 @llvm.arm.ldrex.p0i16(i16* %addr)
- ret i32 %val
+ %val16 = trunc i32 %val to i16
+ ret i16 %val16
}
; CHECK-LABEL: test_load_i32:
@@ -137,3 +141,19 @@ define void @excl_addrmode() {
ret void
}
+
+; LLVM should know, even across basic blocks, that ldrex is setting the high
+; bits of its i32 to 0. There should be no zero-extend operation.
+define zeroext i8 @test_cross_block_zext_i8(i1 %tst, i8* %addr) {
+; CHECK: test_cross_block_zext_i8:
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: bx lr
+ %val = call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
+ br i1 %tst, label %end, label %mid
+mid:
+ ret i8 42
+end:
+ %val8 = trunc i32 %val to i8
+ ret i8 %val8
+}
diff --git a/test/CodeGen/ARM/load.ll b/test/CodeGen/ARM/load.ll
index 253b0e1..ca16adc 100644
--- a/test/CodeGen/ARM/load.ll
+++ b/test/CodeGen/ARM/load.ll
@@ -1,9 +1,4 @@
-; RUN: llc < %s -march=arm > %t
-; RUN: grep ldrsb %t
-; RUN: grep ldrb %t
-; RUN: grep ldrsh %t
-; RUN: grep ldrh %t
-
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f1(i8* %p) {
entry:
@@ -32,3 +27,9 @@ entry:
%tmp4 = zext i16 %tmp to i32 ; <i32> [#uses=1]
ret i32 %tmp4
}
+
+; CHECK: ldrsb
+; CHECK: ldrb
+; CHECK: ldrsh
+; CHECK: ldrh
+
diff --git a/test/CodeGen/ARM/long-setcc.ll b/test/CodeGen/ARM/long-setcc.ll
index c76a5e4..f09167e 100644
--- a/test/CodeGen/ARM/long-setcc.ll
+++ b/test/CodeGen/ARM/long-setcc.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm | grep cmp | count 1
-
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i1 @t1(i64 %x) {
%B = icmp slt i64 %x, 0
@@ -15,3 +14,7 @@ define i1 @t3(i32 %x) {
%tmp = icmp ugt i32 %x, -1
ret i1 %tmp
}
+
+; CHECK: cmp
+; CHECK-NOT: cmp
+
diff --git a/test/CodeGen/ARM/long.ll b/test/CodeGen/ARM/long.ll
index 7fffc81..d0bff4a 100644
--- a/test/CodeGen/ARM/long.ll
+++ b/test/CodeGen/ARM/long.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i64 @f1() {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
index 2cf91c3..5636a12 100644
--- a/test/CodeGen/ARM/longMAC.ll
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s --check-prefix=CHECK-V7
; Check generated signed and unsigned multiply accumulate long.
define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
@@ -42,3 +43,28 @@ define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
%add = add nsw i64 %mul, %conv2
ret i64 %add
}
+
+; Two things to check here: the @earlyclobber constraint (on <= v5) and the "$Rd = $R" tie constraints.
+; + Without @earlyclobber the v7 code is natural. With it, the first two
+; registers must be distinct from the third.
+; + Without "$Rd = $R", this can be satisfied without a mov before the umlal
+; by trying to use 6 different registers in the MachineInstr. The natural
+; evolution of this attempt currently leaves only two movs in the final
+; function, both after the umlal. With it, *some* move has to happen
+; before the umlal.
+define i64 @MACLongTest5(i64 %c, i32 %a, i32 %b) {
+; CHECK-V7-LABEL: MACLongTest5:
+; CHECK-V7: umlal r0, r1, r0, r0
+
+; CHECK-LABEL: MACLongTest5:
+; CHECK: mov [[RDLO:r[0-9]+]], r0
+; CHECK: umlal [[RDLO]], r1, r0, r0
+; CHECK: mov r0, [[RDLO]]
+
+ %conv.trunc = trunc i64 %c to i32
+ %conv = zext i32 %conv.trunc to i64
+ %conv1 = zext i32 %b to i64
+ %mul = mul i64 %conv, %conv
+ %add = add i64 %mul, %c
+ ret i64 %add
+}
diff --git a/test/CodeGen/ARM/long_shift.ll b/test/CodeGen/ARM/long_shift.ll
index 3e986d80..48b0ba7 100644
--- a/test/CodeGen/ARM/long_shift.ll
+++ b/test/CodeGen/ARM/long_shift.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i64 @f0(i64 %A, i64 %B) {
; CHECK-LABEL: f0:
diff --git a/test/CodeGen/ARM/lsr-scale-addr-mode.ll b/test/CodeGen/ARM/lsr-scale-addr-mode.ll
index 0c8d387..9480241 100644
--- a/test/CodeGen/ARM/lsr-scale-addr-mode.ll
+++ b/test/CodeGen/ARM/lsr-scale-addr-mode.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep lsl | grep -F "lsl #2]"
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
; Should use scaled addressing mode.
define void @sintzero(i32* %a) nounwind {
@@ -17,3 +17,6 @@ cond_next: ; preds = %cond_next, %entry
return: ; preds = %cond_next
ret void
}
+
+; CHECK: lsl{{.*}}#2]
+
diff --git a/test/CodeGen/ARM/lsr-unfolded-offset.ll b/test/CodeGen/ARM/lsr-unfolded-offset.ll
index 26d4be2..1dafa00 100644
--- a/test/CodeGen/ARM/lsr-unfolded-offset.ll
+++ b/test/CodeGen/ARM/lsr-unfolded-offset.ll
@@ -4,7 +4,7 @@
; register pressure and therefore spilling. There is more room for improvement
; here.
-; CHECK: sub sp, #{{40|32|28|24}}
+; CHECK: sub sp, #{{40|36|32|28|24}}
; CHECK: %for.inc
; CHECK-NOT: ldr
diff --git a/test/CodeGen/ARM/machine-licm.ll b/test/CodeGen/ARM/machine-licm.ll
index fc9b226..ca65501 100644
--- a/test/CodeGen/ARM/machine-licm.ll
+++ b/test/CodeGen/ARM/machine-licm.ll
@@ -5,20 +5,12 @@
; rdar://7354376
; rdar://8887598
-; The generated code is no where near ideal. It's not recognizing the two
-; constantpool entries being loaded can be merged into one.
-
@GV = external global i32 ; <i32*> [#uses=2]
define void @t(i32* nocapture %vals, i32 %c) nounwind {
entry:
; ARM-LABEL: t:
; ARM: ldr [[REGISTER_1:r[0-9]+]], LCPI0_0
-; Unfortunately currently ARM codegen doesn't cse the ldr from constantpool.
-; The issue is it can be read by an "add pc" or a "ldr [pc]" so it's messy
-; to add the pseudo instructions to make sure they are CSE'ed at the same
-; time as the "ldr cp".
-; ARM: ldr r{{[0-9]+}}, LCPI0_1
; ARM: LPC0_0:
; ARM: ldr r{{[0-9]+}}, [pc, [[REGISTER_1]]]
; ARM: ldr r{{[0-9]+}}, [r{{[0-9]+}}]
@@ -36,7 +28,7 @@ entry:
bb.nph: ; preds = %entry
; ARM: LCPI0_0:
-; ARM: LCPI0_1:
+; ARM-NOT: LCPI0_1:
; ARM: .section
; THUMB: BB#1
diff --git a/test/CodeGen/ARM/mature-mc-support.ll b/test/CodeGen/ARM/mature-mc-support.ll
new file mode 100644
index 0000000..0a7e5b9
--- /dev/null
+++ b/test/CodeGen/ARM/mature-mc-support.ll
@@ -0,0 +1,12 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+
+; RUN: not llc -mtriple=arm-pc-linux < %s > /dev/null 2> %t1
+; RUN: FileCheck %s < %t1
+
+; RUN: not llc -mtriple=arm-pc-linux -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/ARM/mem.ll b/test/CodeGen/ARM/mem.ll
index f46c7a5..3c9cd91 100644
--- a/test/CodeGen/ARM/mem.ll
+++ b/test/CodeGen/ARM/mem.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm | grep strb
-; RUN: llc < %s -march=arm | grep strh
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define void @f1() {
entry:
@@ -7,8 +6,13 @@ entry:
ret void
}
+; CHECK: strb
+
define void @f2() {
entry:
store i16 0, i16* null
ret void
}
+
+; CHECK: strh
+
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index 946c63e..14d84de 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -38,7 +38,8 @@ entry:
define void @t2(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t2:
-; CHECK: ldr [[REG2:r[0-9]+]], [r1, #32]
+; CHECK: movw [[REG2:r[0-9]+]], #16716
+; CHECK: movt [[REG2:r[0-9]+]], #72
; CHECK: str [[REG2]], [r0, #32]
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
@@ -79,7 +80,8 @@ entry:
; CHECK: strb [[REG5]], [r0, #6]
; CHECK: movw [[REG6:r[0-9]+]], #21587
; CHECK: strh [[REG6]], [r0, #4]
-; CHECK: ldr [[REG7:r[0-9]+]],
+; CHECK: movw [[REG7:r[0-9]+]], #18500
+; CHECK: movt [[REG7:r[0-9]+]], #22866
; CHECK: str [[REG7]]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll
index fe0056c..8d3800b 100644
--- a/test/CodeGen/ARM/memfunc.ll
+++ b/test/CodeGen/ARM/memfunc.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -o - | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7m-darwin-eabi -o - | FileCheck %s --check-prefix=DARWIN
+; RUN: llc < %s -mtriple=thumbv7m-none-macho -o - | FileCheck %s --check-prefix=DARWIN
; RUN: llc < %s -mtriple=arm-none-eabi -o - | FileCheck --check-prefix=EABI %s
+; RUN: llc < %s -mtriple=arm-none-eabihf -o - | FileCheck --check-prefix=EABI %s
@from = common global [500 x i32] zeroinitializer, align 4
@to = common global [500 x i32] zeroinitializer, align 4
diff --git a/test/CodeGen/ARM/minsize-imms.ll b/test/CodeGen/ARM/minsize-imms.ll
new file mode 100644
index 0000000..4c8ff39
--- /dev/null
+++ b/test/CodeGen/ARM/minsize-imms.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mtriple=thumbv7m-macho -o - -show-mc-encoding %s | FileCheck %s
+; RUN: llc -mtriple=thumbv6m-macho -o - -show-mc-encoding %s | FileCheck %s --check-prefix=CHECK-V6M
+; RUN: llc -mtriple=armv6-macho -o - -show-mc-encoding %s | FileCheck %s --check-prefix=CHECK-ARM
+define i32 @test_mov() minsize {
+; CHECK-LABEL: test_mov:
+; CHECK: movs r0, #255 @ encoding: [0xff,0x20]
+
+ ret i32 255
+}
+
+define i32 @test_mov_mvn() minsize {
+; CHECK-LABEL: test_mov_mvn:
+; CHECK: mvn r0, #203 @ encoding: [0x6f,0xf0,0xcb,0x00]
+
+; CHECK-V6M-LABEL: test_mov_mvn:
+; CHECK-V6M: movs [[TMP:r[0-7]]], #203 @ encoding: [0xcb,0x20]
+; CHECK-V6M: mvns r0, [[TMP]] @ encoding: [0xc0,0x43]
+
+; CHECK-ARM-LABEL: test_mov_mvn:
+; CHECK-ARM: mvn r0, #203 @ encoding: [0xcb,0x00,0xe0,0xe3]
+ ret i32 4294967092
+}
+
+define i32 @test_mov_lsl() minsize {
+; CHECK-LABEL: test_mov_lsl:
+; CHECK: mov.w r0, #589824 @ encoding: [0x4f,0xf4,0x10,0x20]
+
+; CHECK-V6M-LABEL: test_mov_lsl:
+; CHECK-V6M: movs [[TMP:r[0-7]]], #9 @ encoding: [0x09,0x20]
+; CHECK-V6M: lsls r0, [[TMP]], #16 @ encoding: [0x00,0x04]
+
+; CHECK-ARM-LABEL: test_mov_lsl:
+; CHECK-ARM: mov r0, #589824 @ encoding: [0x09,0x08,0xa0,0xe3]
+ ret i32 589824
+}
+
+define i32 @test_movw() minsize {
+; CHECK-LABEL: test_movw:
+; CHECK: movw r0, #65535
+
+; CHECK-V6M-LABEL: test_movw:
+; CHECK-V6M: ldr r0, [[CONSTPOOL:LCPI[0-9]+_[0-9]+]] @ encoding: [A,0x48]
+; CHECK-V6M: [[CONSTPOOL]]:
+; CHECK-V6M-NEXT: .long 65535
+
+; CHECK-ARM-LABEL: test_movw:
+; CHECK-ARM: mov r0, #255 @ encoding: [0xff,0x00,0xa0,0xe3]
+; CHECK-ARM: orr r0, r0, #65280 @ encoding: [0xff,0x0c,0x80,0xe3]
+ ret i32 65535
+}
+
+define i32 @test_regress1() {
+; CHECK-ARM-LABEL: test_regress1:
+; CHECK-ARM: mov r0, #248 @ encoding: [0xf8,0x00,0xa0,0xe3]
+; CHECK-ARM: orr r0, r0, #16252928 @ encoding: [0x3e,0x07,0x80,0xe3]
+ ret i32 16253176
+}
diff --git a/test/CodeGen/ARM/minsize-litpools.ll b/test/CodeGen/ARM/minsize-litpools.ll
new file mode 100644
index 0000000..d5cd2a9
--- /dev/null
+++ b/test/CodeGen/ARM/minsize-litpools.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=thumbv7s %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv7s %s -o - | FileCheck %s
+
+; CodeGen should be able to set and reset the MinSize subtarget-feature, and
+; make use of it in deciding whether to use MOVW/MOVT for global variables or a
+; lit-pool load (saving roughly 2 bytes of code).
+
+@var = global i32 0
+
+define i32 @small_global() minsize {
+; CHECK-LABEL: small_global:
+; CHECK: ldr r[[GLOBDEST:[0-9]+]], {{.?LCPI0_0}}
+; CHECK: ldr r0, [r[[GLOBDEST]]]
+
+ %val = load i32* @var
+ ret i32 %val
+}
+
+define i32 @big_global() {
+; CHECK-LABEL: big_global:
+; CHECK: movw [[GLOBDEST:r[0-9]+]], :lower16:var
+; CHECK: movt [[GLOBDEST]], :upper16:var
+
+ %val = load i32* @var
+ ret i32 %val
+}
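+
+; A minimal illustrative sketch (not exercised by the RUN lines above, and
+; assuming the usual spelling): minsize can equally be attached through an
+; attribute group, which gives the same per-function MinSize toggling that
+; small_global relies on:
+;
+;   define i32 @small_global_grouped() #0 {
+;     %val = load i32* @var
+;     ret i32 %val
+;   }
+;   attributes #0 = { minsize }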
diff --git a/test/CodeGen/ARM/mls.ll b/test/CodeGen/ARM/mls.ll
index 8f0d3a8..6776e63 100644
--- a/test/CodeGen/ARM/mls.ll
+++ b/test/CodeGen/ARM/mls.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+v6t2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
+; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 -arm-use-mulops=false %s -o - \
+; RUN: | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
diff --git a/test/CodeGen/ARM/movt-movw-global.ll b/test/CodeGen/ARM/movt-movw-global.ll
index bbedea1..1e10af1 100644
--- a/test/CodeGen/ARM/movt-movw-global.ll
+++ b/test/CodeGen/ARM/movt-movw-global.ll
@@ -16,8 +16,8 @@ entry:
; IOS-PIC: movw r0, :lower16:(L_foo$non_lazy_ptr-(LPC0_0+8))
; IOS-PIC-NEXT: movt r0, :upper16:(L_foo$non_lazy_ptr-(LPC0_0+8))
-; IOS-STATIC-NOT: movw r0, :lower16:_foo
-; IOS-STATIC-NOT: movt r0, :upper16:_foo
+; IOS-STATIC: movw r0, :lower16:_foo
+; IOS-STATIC-NEXT: movt r0, :upper16:_foo
ret i32* @foo
}
@@ -32,8 +32,8 @@ entry:
; IOS-PIC: movw r1, :lower16:(L_foo$non_lazy_ptr-(LPC1_0+8))
; IOS-PIC-NEXT: movt r1, :upper16:(L_foo$non_lazy_ptr-(LPC1_0+8))
-; IOS-STATIC-NOT: movw r1, :lower16:_foo
-; IOS-STATIC-NOT: movt r1, :upper16:_foo
+; IOS-STATIC: movw r1, :lower16:_foo
+; IOS-STATIC-NEXT: movt r1, :upper16:_foo
store i32 %baz, i32* @foo, align 4
ret void
}
diff --git a/test/CodeGen/ARM/movt.ll b/test/CodeGen/ARM/movt.ll
index 25c1bfe..735d949 100644
--- a/test/CodeGen/ARM/movt.ll
+++ b/test/CodeGen/ARM/movt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; rdar://7317664
define i32 @t(i32 %X) nounwind {
diff --git a/test/CodeGen/ARM/mul_const.ll b/test/CodeGen/ARM/mul_const.ll
index 482d8f2..ada3d4e 100644
--- a/test/CodeGen/ARM/mul_const.ll
+++ b/test/CodeGen/ARM/mul_const.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @t9(i32 %v) nounwind readnone {
entry:
diff --git a/test/CodeGen/ARM/mulhi.ll b/test/CodeGen/ARM/mulhi.ll
index 63705c5..c66a804 100644
--- a/test/CodeGen/ARM/mulhi.ll
+++ b/test/CodeGen/ARM/mulhi.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=arm -mattr=+v6 | FileCheck %s -check-prefix=V6
-; RUN: llc < %s -march=arm | FileCheck %s -check-prefix=V4
-; RUN: llc < %s -march=thumb -mcpu=cortex-m3 | FileCheck %s -check-prefix=M3
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s -check-prefix=V6
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -check-prefix=V4
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s -check-prefix=M3
define i32 @smulhi(i32 %x, i32 %y) nounwind {
; V6-LABEL: smulhi:
diff --git a/test/CodeGen/ARM/mult-alt-generic-arm.ll b/test/CodeGen/ARM/mult-alt-generic-arm.ll
index a8104db..05e9b0f 100644
--- a/test/CodeGen/ARM/mult-alt-generic-arm.ll
+++ b/test/CodeGen/ARM/mult-alt-generic-arm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm -no-integrated-as
; ModuleID = 'mult-alt-generic.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32"
target triple = "arm"
diff --git a/test/CodeGen/ARM/mvn.ll b/test/CodeGen/ARM/mvn.ll
index 2c5ccd7..489f247 100644
--- a/test/CodeGen/ARM/mvn.ll
+++ b/test/CodeGen/ARM/mvn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep mvn | count 9
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f1() {
entry:
@@ -72,3 +72,16 @@ entry:
%tmp102 = icmp eq i32 -2, %a ; <i1> [#uses=1]
ret i1 %tmp102
}
+
+; CHECK-LABEL: f1
+; CHECK: mvn
+; CHECK: mvn
+; CHECK: mvn
+; CHECK: mvn
+; CHECK: mvn
+; CHECK: mvn
+; CHECK: mvn
+; CHECK: mvn
+; CHECK: mvn
+; CHECK-NOT: mvn
+
diff --git a/test/CodeGen/ARM/neon_arith1.ll b/test/CodeGen/ARM/neon_arith1.ll
index 5892737..42e7d82 100644
--- a/test/CodeGen/ARM/neon_arith1.ll
+++ b/test/CodeGen/ARM/neon_arith1.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -march=arm -mattr=+neon | grep vadd
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind {
entry:
%0 = add <8 x i8> %a, %b
ret <8 x i8> %0
}
+
+; CHECK: vadd
+
diff --git a/test/CodeGen/ARM/neon_cmp.ll b/test/CodeGen/ARM/neon_cmp.ll
index 046b5da..e1662c4 100644
--- a/test/CodeGen/ARM/neon_cmp.ll
+++ b/test/CodeGen/ARM/neon_cmp.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s
+
; bug 15283
; radar://13191881
; CHECK: vfcmp
diff --git a/test/CodeGen/ARM/neon_div.ll b/test/CodeGen/ARM/neon_div.ll
index 4a82c36..4f1607e 100644
--- a/test/CodeGen/ARM/neon_div.ll
+++ b/test/CodeGen/ARM/neon_div.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source -disable-post-ra | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -pre-RA-sched=source -disable-post-ra %s -o - \
+; RUN: | FileCheck %s
define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrecpe.f32
diff --git a/test/CodeGen/ARM/neon_fpconv.ll b/test/CodeGen/ARM/neon_fpconv.ll
index 149f4c7..8e37ce7 100644
--- a/test/CodeGen/ARM/neon_fpconv.ll
+++ b/test/CodeGen/ARM/neon_fpconv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; PR12540: ARM backend lowering of FP_ROUND v2f64 to v2f32.
define <2 x float> @vtrunc(<2 x double> %a) {
diff --git a/test/CodeGen/ARM/neon_ld1.ll b/test/CodeGen/ARM/neon_ld1.ll
index b892d2d..9fd3fc5 100644
--- a/test/CodeGen/ARM/neon_ld1.ll
+++ b/test/CodeGen/ARM/neon_ld1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
; CHECK: t1
; CHECK: vldr d
diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll
index 25a670b..571a16a 100644
--- a/test/CodeGen/ARM/neon_ld2.ll
+++ b/test/CodeGen/ARM/neon_ld2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
-; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s --check-prefix=SWIFT
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mcpu=swift %s -o - | FileCheck %s --check-prefix=SWIFT
; CHECK: t1
; CHECK: vld1.64
diff --git a/test/CodeGen/ARM/neon_minmax.ll b/test/CodeGen/ARM/neon_minmax.ll
index 2e45919..84e4b30 100644
--- a/test/CodeGen/ARM/neon_minmax.ll
+++ b/test/CodeGen/ARM/neon_minmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=swift %s -o - | FileCheck %s
define float @fmin_ole(float %x) nounwind {
;CHECK-LABEL: fmin_ole:
diff --git a/test/CodeGen/ARM/neon_shift.ll b/test/CodeGen/ARM/neon_shift.ll
index 340f220..3c09358 100644
--- a/test/CodeGen/ARM/neon_shift.ll
+++ b/test/CodeGen/ARM/neon_shift.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; <rdar://problem/9055897>
define <4 x i16> @t1(<4 x i32> %a) nounwind {
diff --git a/test/CodeGen/ARM/neon_vabs.ll b/test/CodeGen/ARM/neon_vabs.ll
index 76b6044..7a02512 100644
--- a/test/CodeGen/ARM/neon_vabs.ll
+++ b/test/CodeGen/ARM/neon_vabs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <4 x i32> @test1(<4 x i32> %a) nounwind {
; CHECK-LABEL: test1:
diff --git a/test/CodeGen/ARM/none-macho.ll b/test/CodeGen/ARM/none-macho.ll
new file mode 100644
index 0000000..2795b8c
--- /dev/null
+++ b/test/CodeGen/ARM/none-macho.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=thumbv7m-none-macho %s -o - -relocation-model=pic -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NON-FAST
+; RUN: llc -mtriple=thumbv7m-none-macho -O0 %s -o - -relocation-model=pic -disable-fp-elim | FileCheck %s
+; RUN: llc -mtriple=thumbv7m-none-macho -filetype=obj %s -o /dev/null
+
+ ; Bare-metal should probably "declare" segments just like normal MachO
+; CHECK: __picsymbolstub4
+; CHECK: __StaticInit
+; CHECK: __text
+
+@var = external global i32
+
+define i32 @test_litpool() minsize {
+; CHECK-LABEL: test_litpool:
+ %val = load i32* @var
+ ret i32 %val
+
+ ; Lit-pool entries need to produce a "$non_lazy_ptr" version of the symbol.
+; CHECK: LCPI0_0:
+; CHECK-NEXT: .long L_var$non_lazy_ptr-(LPC0_0+4)
+}
+
+define i32 @test_movw_movt() {
+; CHECK-LABEL: test_movw_movt:
+ %val = load i32* @var
+ ret i32 %val
+
+ ; movw/movt should also address their symbols MachO-style
+; CHECK: movw [[RTMP:r[0-9]+]], :lower16:(L_var$non_lazy_ptr-(LPC1_0+4))
+; CHECK: movt [[RTMP]], :upper16:(L_var$non_lazy_ptr-(LPC1_0+4))
+; CHECK: LPC1_0:
+; CHECK: add [[RTMP]], pc
+}
+
+declare void @llvm.trap()
+
+define void @test_trap() {
+; CHECK-LABEL: test_trap:
+
+ ; Bare-metal MachO gets compiled on top of the normal MachO toolchain, which
+ ; understands trap natively.
+ call void @llvm.trap()
+; CHECK: trap
+
+ ret void
+}
+
+define i32 @test_frame_ptr() {
+; CHECK-LABEL: test_frame_ptr:
+ call void @test_trap()
+
+ ; Frame pointer is r7 as for Darwin
+; CHECK: mov r7, sp
+ ret i32 42
+}
+
+%big_arr = type [8 x i32]
+define void @test_two_areas(%big_arr* %addr) {
+; CHECK-LABEL: test_two_areas:
+ %val = load %big_arr* %addr
+ call void @test_trap()
+ store %big_arr %val, %big_arr* %addr
+
+ ; This goes with the choice of r7 as FP (largely). FP and LR have to be stored
+ ; consecutively on the stack for the frame record to be valid, which means we
+ ; need the 2 register-save areas employed by iOS.
+; CHECK-NON-FAST: push {r4, r5, r6, r7, lr}
+; CHECK-NON-FAST: push.w {r8, r9, r10, r11}
+; ...
+; CHECK-NON-FAST: pop.w {r8, r9, r10, r11}
+; CHECK-NON-FAST: pop {r4, r5, r6, r7, pc}
+ ret void
+}
+
+define void @test_tail_call() {
+; CHECK-LABEL: test_tail_call:
+ tail call void @test_trap()
+
+ ; Tail calls should be available and use Thumb2 branch.
+; CHECK: b.w _test_trap
+ ret void
+}
+
+define float @test_softfloat_calls(float %in) {
+; CHECK-LABEL: test_softfloat_calls:
+ %sum = fadd float %in, %in
+
+ ; Soft-float calls should be GNU-style rather than RTABI and should not be the
+ ; *vfp variants used for ARMv6 iOS.
+; CHECK: blx ___addsf3{{$}}
+ ret float %sum
+}
+
+ ; Even bare-metal PIC needs GOT-like behaviour, in principle. Depends a bit on
+ ; the use-case of course, but LLVM doesn't know what that is.
+; CHECK: non_lazy_symbol_pointers
+; CHECK: L_var$non_lazy_ptr:
+; CHECK-NEXT: .indirect_symbol _var
+
+ ; All MachO objects should have this to give the linker leeway in removing
+ ; dead code.
+; CHECK: .subsections_via_symbols
diff --git a/test/CodeGen/ARM/noreturn.ll b/test/CodeGen/ARM/noreturn.ll
index 4c876ce..edc3333 100644
--- a/test/CodeGen/ARM/noreturn.ll
+++ b/test/CodeGen/ARM/noreturn.ll
@@ -43,6 +43,23 @@ entry:
unreachable
}
+; Test case for uwtable
+define i32 @test4() uwtable {
+; CHECK-LABEL: @test4
+; CHECK: push
+entry:
+ tail call void @overflow() #0
+ unreachable
+}
+
+define i32 @test5() uwtable {
+; CHECK-LABEL: @test5
+; CHECK: push
+entry:
+ tail call void @overflow_with_unwind() #1
+ unreachable
+}
+
; Function Attrs: noreturn
declare void @overflow_with_unwind() #1
diff --git a/test/CodeGen/ARM/optimize-dmbs-v7.ll b/test/CodeGen/ARM/optimize-dmbs-v7.ll
new file mode 100644
index 0000000..64f5e20
--- /dev/null
+++ b/test/CodeGen/ARM/optimize-dmbs-v7.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=armv7 -mattr=+db | FileCheck %s
+
+@x1 = global i32 0, align 4
+@x2 = global i32 0, align 4
+
+define void @test() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.013 = phi i32 [ 1, %entry ], [ %inc6, %for.body ]
+ store atomic i32 %i.013, i32* @x1 seq_cst, align 4
+ store atomic i32 %i.013, i32* @x1 seq_cst, align 4
+ store atomic i32 %i.013, i32* @x2 seq_cst, align 4
+ %inc6 = add nsw i32 %i.013, 1
+ %exitcond = icmp eq i32 %inc6, 2
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+
+; The for.body contains 3 seq_cst stores.
+; Hence it should have 3 dmb;str;dmb sequences with the middle dmbs collapsed
+; CHECK: %for.body
+; CHECK-NOT: str
+; CHECK: dmb
+; CHECK-NOT: dmb
+; CHECK: str
+
+; CHECK-NOT: str
+; CHECK: dmb
+; CHECK-NOT: dmb
+; CHECK: str
+
+; CHECK-NOT: str
+; CHECK: dmb
+; CHECK-NOT: dmb
+; CHECK: str
+
+; CHECK-NOT: str
+; CHECK: dmb
+; CHECK-NOT: dmb
+; CHECK-NOT: str
+; CHECK: %for.end
+}
+
+define void @test2() {
+ call void @llvm.arm.dmb(i32 11)
+ tail call void @test()
+ call void @llvm.arm.dmb(i32 11)
+ ret void
+; the call should prevent the two dmbs from collapsing
+; CHECK: test2:
+; CHECK: dmb
+; CHECK-NEXT: bl
+; CHECK-NEXT: dmb
+}
+
+define void @test3() {
+ call void @llvm.arm.dmb(i32 11)
+ call void @llvm.arm.dsb(i32 9)
+ call void @llvm.arm.dmb(i32 11)
+ ret void
+; the dsb should prevent the two dmbs from collapsing
+; CHECK: test3:
+; CHECK: dmb
+; CHECK-NEXT: dsb
+; CHECK-NEXT: dmb
+
+}
+
+
+declare void @llvm.arm.dmb(i32)
+declare void @llvm.arm.dsb(i32)
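+
+; For reference, a minimal sketch of the pattern the tests above rely on
+; (assuming the usual ARMv7 lowering, where each seq_cst store expands to a
+; "dmb ish; str; dmb ish" sequence): after merging, two adjacent stores share
+; the barrier between them:
+;
+;   dmb ish
+;   str r1, [r0]
+;   dmb ish      ; trailing barrier of store 1 doubles as leading barrier of store 2
+;   str r1, [r2]
+;   dmb ish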
diff --git a/test/CodeGen/ARM/optselect-regclass.ll b/test/CodeGen/ARM/optselect-regclass.ll
index 1aa4520..0acb2f2 100644
--- a/test/CodeGen/ARM/optselect-regclass.ll
+++ b/test/CodeGen/ARM/optselect-regclass.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=swift -verify-machineinstrs
+; RUN: llc -mtriple=arm-eabi -mcpu=swift -verify-machineinstrs %s -o /dev/null
+
%union.opcode.0.2.5.8.15.28 = type { i32 }
@opcode = external global %union.opcode.0.2.5.8.15.28, align 4
diff --git a/test/CodeGen/ARM/pack.ll b/test/CodeGen/ARM/pack.ll
index fbc1155..89abe28 100644
--- a/test/CodeGen/ARM/pack.ll
+++ b/test/CodeGen/ARM/pack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s
; CHECK: test1
; CHECK: pkhbt r0, r0, r1, lsl #16
diff --git a/test/CodeGen/ARM/phi.ll b/test/CodeGen/ARM/phi.ll
index dc1a95b..94bced5 100644
--- a/test/CodeGen/ARM/phi.ll
+++ b/test/CodeGen/ARM/phi.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=arm -mattr=+v4t < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
+
; <rdar://problem/8686347>
define i32 @test1(i1 %a, i32* %b) {
diff --git a/test/CodeGen/ARM/popcnt.ll b/test/CodeGen/ARM/popcnt.ll
index bdf793d..7ace640 100644
--- a/test/CodeGen/ARM/popcnt.ll
+++ b/test/CodeGen/ARM/popcnt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; Implement ctpop with vcnt
define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
diff --git a/test/CodeGen/ARM/prefetch-thumb.ll b/test/CodeGen/ARM/prefetch-thumb.ll
deleted file mode 100644
index e6f6ae8..0000000
--- a/test/CodeGen/ARM/prefetch-thumb.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc < %s -march=thumb -mattr=+v7 | FileCheck %s -check-prefix=THUMB2
-; TODO: This test case will be merged back into prefetch.ll when ARM mode issue is solved.
-
-declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
-
-define void @t6() {
-entry:
-;ARM: t6:
-;ARM: pld [sp]
-;ARM: pld [sp, #50]
-
-;THUMB2: t6:
-;THUMB2: pld [sp]
-;THUMB2: pld [sp, #50]
-
-%red = alloca [100 x i8], align 1
-%0 = getelementptr inbounds [100 x i8]* %red, i32 0, i32 0
-%1 = getelementptr inbounds [100 x i8]* %red, i32 0, i32 50
-call void @llvm.prefetch(i8* %0, i32 0, i32 3, i32 1)
-call void @llvm.prefetch(i8* %1, i32 0, i32 3, i32 1)
-ret void
-}
diff --git a/test/CodeGen/ARM/prefetch.ll b/test/CodeGen/ARM/prefetch.ll
index 5badb31..7350e0a 100644
--- a/test/CodeGen/ARM/prefetch.ll
+++ b/test/CodeGen/ARM/prefetch.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=thumb -mattr=-thumb2 | not grep pld
-; RUN: llc < %s -march=thumb -mattr=+v7 | FileCheck %s -check-prefix=THUMB2
-; RUN: llc < %s -march=arm -mattr=+v7 | FileCheck %s -check-prefix=ARM
-; RUN: llc < %s -march=arm -mcpu=cortex-a9-mp | FileCheck %s -check-prefix=ARM-MP
+; RUN: llc -mtriple=thumb-eabi -mattr=-thumb2 %s -o - | FileCheck %s -check-prefix CHECK-T1
+; RUN: llc -mtriple=thumb-eabi -mattr=+v7 %s -o - | FileCheck %s -check-prefix=THUMB2
+; RUN: llc -mtriple=arm-eabi -mattr=+v7 %s -o - | FileCheck %s -check-prefix=ARM
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9-mp %s -o - | FileCheck %s -check-prefix=ARM-MP
; rdar://8601536
+; CHECK-T1-NOT: pld
+
define void @t1(i8* %ptr) nounwind {
entry:
; ARM-LABEL: t1:
@@ -75,3 +77,21 @@ entry:
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 0 )
ret void
}
+
+define void @t6() {
+entry:
+;ARM-LABEL: t6:
+;ARM: pld [sp]
+;ARM: pld [sp, #50]
+
+;THUMB2-LABEL: t6:
+;THUMB2: pld [sp]
+;THUMB2: pld [sp, #50]
+
+%red = alloca [100 x i8], align 1
+%0 = getelementptr inbounds [100 x i8]* %red, i32 0, i32 0
+%1 = getelementptr inbounds [100 x i8]* %red, i32 0, i32 50
+call void @llvm.prefetch(i8* %0, i32 0, i32 3, i32 1)
+call void @llvm.prefetch(i8* %1, i32 0, i32 3, i32 1)
+ret void
+}
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 25484f4..b245674 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -34,9 +34,11 @@ entry:
%12 = sext <4 x i16> %11 to <4 x i32> ; <<4 x i32>> [#uses=1]
%13 = mul <4 x i32> %1, %9 ; <<4 x i32>> [#uses=1]
%14 = mul <4 x i32> %3, %12 ; <<4 x i32>> [#uses=1]
- %15 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %13, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
- %16 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %14, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
- %17 = shufflevector <4 x i16> %15, <4 x i16> %16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
+ %15 = lshr <4 x i32> %13, <i32 12, i32 12, i32 12, i32 12>
+ %trunc_15 = trunc <4 x i32> %15 to <4 x i16>
+ %16 = lshr <4 x i32> %14, <i32 12, i32 12, i32 12, i32 12>
+ %trunc_16 = trunc <4 x i32> %16 to <4 x i16>
+ %17 = shufflevector <4 x i16> %trunc_15, <4 x i16> %trunc_16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
%18 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17, i32 1)
ret void
diff --git a/test/CodeGen/ARM/ret0.ll b/test/CodeGen/ARM/ret0.ll
index 5c312eb..e51067b 100644
--- a/test/CodeGen/ARM/ret0.ll
+++ b/test/CodeGen/ARM/ret0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define i32 @test() {
ret i32 0
diff --git a/test/CodeGen/ARM/ret_arg1.ll b/test/CodeGen/ARM/ret_arg1.ll
index 1ab947b..b7eef20 100644
--- a/test/CodeGen/ARM/ret_arg1.ll
+++ b/test/CodeGen/ARM/ret_arg1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define i32 @test(i32 %a1) {
ret i32 %a1
diff --git a/test/CodeGen/ARM/ret_arg2.ll b/test/CodeGen/ARM/ret_arg2.ll
index 84477d0..bcb379b 100644
--- a/test/CodeGen/ARM/ret_arg2.ll
+++ b/test/CodeGen/ARM/ret_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define i32 @test(i32 %a1, i32 %a2) {
ret i32 %a2
diff --git a/test/CodeGen/ARM/ret_arg3.ll b/test/CodeGen/ARM/ret_arg3.ll
index f7f9057..625162f 100644
--- a/test/CodeGen/ARM/ret_arg3.ll
+++ b/test/CodeGen/ARM/ret_arg3.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
+
define i32 @test(i32 %a1, i32 %a2, i32 %a3) {
ret i32 %a3
}
diff --git a/test/CodeGen/ARM/ret_arg4.ll b/test/CodeGen/ARM/ret_arg4.ll
index f7b3e4a..81b55fe 100644
--- a/test/CodeGen/ARM/ret_arg4.ll
+++ b/test/CodeGen/ARM/ret_arg4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define i32 @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
ret i32 %a4
diff --git a/test/CodeGen/ARM/ret_arg5.ll b/test/CodeGen/ARM/ret_arg5.ll
index c4f9fb5..680e89f 100644
--- a/test/CodeGen/ARM/ret_arg5.ll
+++ b/test/CodeGen/ARM/ret_arg5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define i32 @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5) {
ret i32 %a5
diff --git a/test/CodeGen/ARM/ret_f32_arg2.ll b/test/CodeGen/ARM/ret_f32_arg2.ll
index 2bafea6..0caee0b 100644
--- a/test/CodeGen/ARM/ret_f32_arg2.ll
+++ b/test/CodeGen/ARM/ret_f32_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define float @test_f32(float %a1, float %a2) {
ret float %a2
diff --git a/test/CodeGen/ARM/ret_f32_arg5.ll b/test/CodeGen/ARM/ret_f32_arg5.ll
index c6ce60e..d39dc7e 100644
--- a/test/CodeGen/ARM/ret_f32_arg5.ll
+++ b/test/CodeGen/ARM/ret_f32_arg5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define float @test_f32_arg5(float %a1, float %a2, float %a3, float %a4, float %a5) {
ret float %a5
diff --git a/test/CodeGen/ARM/ret_f64_arg2.ll b/test/CodeGen/ARM/ret_f64_arg2.ll
index 386e85f..c4519ff 100644
--- a/test/CodeGen/ARM/ret_f64_arg2.ll
+++ b/test/CodeGen/ARM/ret_f64_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define double @test_f64(double %a1, double %a2) {
ret double %a2
diff --git a/test/CodeGen/ARM/ret_f64_arg_reg_split.ll b/test/CodeGen/ARM/ret_f64_arg_reg_split.ll
index bdb0a60..ef11250 100644
--- a/test/CodeGen/ARM/ret_f64_arg_reg_split.ll
+++ b/test/CodeGen/ARM/ret_f64_arg_reg_split.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=arm8 -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mcpu=arm8 -mattr=+vfp2 %s -o /dev/null
define double @test_double_arg_reg_split(i32 %a1, double %a2) {
ret double %a2
diff --git a/test/CodeGen/ARM/ret_f64_arg_split.ll b/test/CodeGen/ARM/ret_f64_arg_split.ll
index 4f841a3..1130920 100644
--- a/test/CodeGen/ARM/ret_f64_arg_split.ll
+++ b/test/CodeGen/ARM/ret_f64_arg_split.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define double @test_double_arg_split(i64 %a1, i32 %a2, double %a3) {
ret double %a3
diff --git a/test/CodeGen/ARM/ret_f64_arg_stack.ll b/test/CodeGen/ARM/ret_f64_arg_stack.ll
index 2144317..f45923e 100644
--- a/test/CodeGen/ARM/ret_f64_arg_stack.ll
+++ b/test/CodeGen/ARM/ret_f64_arg_stack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define double @test_double_arg_stack(i64 %a1, i32 %a2, i32 %a3, double %a4) {
ret double %a4
diff --git a/test/CodeGen/ARM/ret_i128_arg2.ll b/test/CodeGen/ARM/ret_i128_arg2.ll
index 908c34f..a87f3f2 100644
--- a/test/CodeGen/ARM/ret_i128_arg2.ll
+++ b/test/CodeGen/ARM/ret_i128_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define i128 @test_i128(i128 %a1, i128 %a2, i128 %a3) {
ret i128 %a3
diff --git a/test/CodeGen/ARM/ret_i64_arg2.ll b/test/CodeGen/ARM/ret_i64_arg2.ll
index b1a1024..c51d2b8 100644
--- a/test/CodeGen/ARM/ret_i64_arg2.ll
+++ b/test/CodeGen/ARM/ret_i64_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -march=arm -mattr=+vfp2 %s -o /dev/null
define i64 @test_i64(i64 %a1, i64 %a2) {
ret i64 %a2
diff --git a/test/CodeGen/ARM/ret_i64_arg3.ll b/test/CodeGen/ARM/ret_i64_arg3.ll
index ffc1d2f..602997e 100644
--- a/test/CodeGen/ARM/ret_i64_arg3.ll
+++ b/test/CodeGen/ARM/ret_i64_arg3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -march=arm -mattr=+vfp2 %s -o /dev/null
define i64 @test_i64_arg3(i64 %a1, i64 %a2, i64 %a3) {
ret i64 %a3
diff --git a/test/CodeGen/ARM/ret_i64_arg_split.ll b/test/CodeGen/ARM/ret_i64_arg_split.ll
index 956bce5..0583b27 100644
--- a/test/CodeGen/ARM/ret_i64_arg_split.ll
+++ b/test/CodeGen/ARM/ret_i64_arg_split.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define i64 @test_i64_arg_split(i64 %a1, i32 %a2, i64 %a3) {
ret i64 %a3
diff --git a/test/CodeGen/ARM/ret_void.ll b/test/CodeGen/ARM/ret_void.ll
index 2b7ae05..93dc5c1 100644
--- a/test/CodeGen/ARM/ret_void.ll
+++ b/test/CodeGen/ARM/ret_void.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define void @test() {
ret void
diff --git a/test/CodeGen/ARM/returned-ext.ll b/test/CodeGen/ARM/returned-ext.ll
index d2cdeb0..925e9e7 100644
--- a/test/CodeGen/ARM/returned-ext.ll
+++ b/test/CodeGen/ARM/returned-ext.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi -arm-tail-calls | FileCheck %s -check-prefix=CHECKELF
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-tail-calls | FileCheck %s -check-prefix=CHECKT2D
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi | FileCheck %s -check-prefix=CHECKELF
+; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 | FileCheck %s -check-prefix=CHECKT2D
declare i16 @identity16(i16 returned %x)
declare i32 @identity32(i32 returned %x)
diff --git a/test/CodeGen/ARM/returned-trunc-tail-calls.ll b/test/CodeGen/ARM/returned-trunc-tail-calls.ll
index 5946727..6051a83 100644
--- a/test/CodeGen/ARM/returned-trunc-tail-calls.ll
+++ b/test/CodeGen/ARM/returned-trunc-tail-calls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv7 -arm-tail-calls | FileCheck %s
+; RUN: llc < %s -mtriple=armv7 | FileCheck %s
declare i16 @ret16(i16 returned)
declare i32 @ret32(i32 returned)
diff --git a/test/CodeGen/ARM/rev.ll b/test/CodeGen/ARM/rev.ll
index 6c380ae..f95f971 100644
--- a/test/CodeGen/ARM/rev.ll
+++ b/test/CodeGen/ARM/rev.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s
define i32 @test1(i32 %X) nounwind {
; CHECK: test1
diff --git a/test/CodeGen/ARM/saxpy10-a9.ll b/test/CodeGen/ARM/saxpy10-a9.ll
new file mode 100644
index 0000000..f8f5e18
--- /dev/null
+++ b/test/CodeGen/ARM/saxpy10-a9.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -march=arm -mtriple=thumbv7-apple-ios7.0.0 -float-abi=hard -mcpu=cortex-a9 -misched-postra -misched-bench -scheditins=false | FileCheck %s
+;
+; Test MI-Sched support for latency-based stalls on an in-order pipeline
+; using the new machine model.
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+; Don't be too strict with the top of the schedule, but most of it
+; should be nicely pipelined.
+;
+; CHECK: saxpy10:
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vmov
+; CHECK-NEXT: bx
+;
+; This accumulates a sum rather than storing each result.
+define float @saxpy10(float* nocapture readonly %data1, float* nocapture readonly %data2, float %a) {
+entry:
+ %0 = load float* %data1, align 4
+ %mul = fmul float %0, %a
+ %1 = load float* %data2, align 4
+ %add = fadd float %mul, %1
+ %add2 = fadd float %add, 0.000000e+00
+ %arrayidx.1 = getelementptr inbounds float* %data1, i32 1
+ %2 = load float* %arrayidx.1, align 4
+ %mul.1 = fmul float %2, %a
+ %arrayidx1.1 = getelementptr inbounds float* %data2, i32 1
+ %3 = load float* %arrayidx1.1, align 4
+ %add.1 = fadd float %mul.1, %3
+ %add2.1 = fadd float %add2, %add.1
+ %arrayidx.2 = getelementptr inbounds float* %data1, i32 2
+ %4 = load float* %arrayidx.2, align 4
+ %mul.2 = fmul float %4, %a
+ %arrayidx1.2 = getelementptr inbounds float* %data2, i32 2
+ %5 = load float* %arrayidx1.2, align 4
+ %add.2 = fadd float %mul.2, %5
+ %add2.2 = fadd float %add2.1, %add.2
+ %arrayidx.3 = getelementptr inbounds float* %data1, i32 3
+ %6 = load float* %arrayidx.3, align 4
+ %mul.3 = fmul float %6, %a
+ %arrayidx1.3 = getelementptr inbounds float* %data2, i32 3
+ %7 = load float* %arrayidx1.3, align 4
+ %add.3 = fadd float %mul.3, %7
+ %add2.3 = fadd float %add2.2, %add.3
+ %arrayidx.4 = getelementptr inbounds float* %data1, i32 4
+ %8 = load float* %arrayidx.4, align 4
+ %mul.4 = fmul float %8, %a
+ %arrayidx1.4 = getelementptr inbounds float* %data2, i32 4
+ %9 = load float* %arrayidx1.4, align 4
+ %add.4 = fadd float %mul.4, %9
+ %add2.4 = fadd float %add2.3, %add.4
+ %arrayidx.5 = getelementptr inbounds float* %data1, i32 5
+ %10 = load float* %arrayidx.5, align 4
+ %mul.5 = fmul float %10, %a
+ %arrayidx1.5 = getelementptr inbounds float* %data2, i32 5
+ %11 = load float* %arrayidx1.5, align 4
+ %add.5 = fadd float %mul.5, %11
+ %add2.5 = fadd float %add2.4, %add.5
+ %arrayidx.6 = getelementptr inbounds float* %data1, i32 6
+ %12 = load float* %arrayidx.6, align 4
+ %mul.6 = fmul float %12, %a
+ %arrayidx1.6 = getelementptr inbounds float* %data2, i32 6
+ %13 = load float* %arrayidx1.6, align 4
+ %add.6 = fadd float %mul.6, %13
+ %add2.6 = fadd float %add2.5, %add.6
+ %arrayidx.7 = getelementptr inbounds float* %data1, i32 7
+ %14 = load float* %arrayidx.7, align 4
+ %mul.7 = fmul float %14, %a
+ %arrayidx1.7 = getelementptr inbounds float* %data2, i32 7
+ %15 = load float* %arrayidx1.7, align 4
+ %add.7 = fadd float %mul.7, %15
+ %add2.7 = fadd float %add2.6, %add.7
+ %arrayidx.8 = getelementptr inbounds float* %data1, i32 8
+ %16 = load float* %arrayidx.8, align 4
+ %mul.8 = fmul float %16, %a
+ %arrayidx1.8 = getelementptr inbounds float* %data2, i32 8
+ %17 = load float* %arrayidx1.8, align 4
+ %add.8 = fadd float %mul.8, %17
+ %add2.8 = fadd float %add2.7, %add.8
+ %arrayidx.9 = getelementptr inbounds float* %data1, i32 9
+ %18 = load float* %arrayidx.9, align 4
+ %mul.9 = fmul float %18, %a
+ %arrayidx1.9 = getelementptr inbounds float* %data2, i32 9
+ %19 = load float* %arrayidx1.9, align 4
+ %add.9 = fadd float %mul.9, %19
+ %add2.9 = fadd float %add2.8, %add.9
+ ret float %add2.9
+}
diff --git a/test/CodeGen/ARM/sbfx.ll b/test/CodeGen/ARM/sbfx.ll
index 36fbd19..3c25edc 100644
--- a/test/CodeGen/ARM/sbfx.ll
+++ b/test/CodeGen/ARM/sbfx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
entry:
diff --git a/test/CodeGen/ARM/segmented-stacks-dynamic.ll b/test/CodeGen/ARM/segmented-stacks-dynamic.ll
new file mode 100644
index 0000000..13b5bcf
--- /dev/null
+++ b/test/CodeGen/ARM/segmented-stacks-dynamic.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=arm-linux-androideabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=ARM-android
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=ARM-linux
+; RUN: llc < %s -mtriple=arm-linux-androideabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define i32 @test_basic(i32 %l) {
+ %mem = alloca i32, i32 %l
+ call void @dummy_use (i32* %mem, i32 %l)
+ %terminate = icmp eq i32 %l, 0
+ br i1 %terminate, label %true, label %false
+
+true:
+ ret i32 0
+
+false:
+ %newlen = sub i32 %l, 1
+ %retvalue = call i32 @test_basic(i32 %newlen)
+ ret i32 %retvalue
+
+; ARM-linux: test_basic:
+
+; ARM-linux: push {r4, r5}
+; ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-linux-NEXT: mov r5, sp
+; ARM-linux-NEXT: ldr r4, [r4, #4]
+; ARM-linux-NEXT: cmp r4, r5
+; ARM-linux-NEXT: blo .LBB0_2
+
+; ARM-linux: mov r4, #24
+; ARM-linux-NEXT: mov r5, #0
+; ARM-linux-NEXT: stmdb sp!, {lr}
+; ARM-linux-NEXT: bl __morestack
+; ARM-linux-NEXT: ldm sp!, {lr}
+; ARM-linux-NEXT: pop {r4, r5}
+; ARM-linux-NEXT: bx lr
+
+; ARM-linux: pop {r4, r5}
+
+
+; ARM-android: test_basic:
+
+; ARM-android: push {r4, r5}
+; ARM-android-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-android-NEXT: mov r5, sp
+; ARM-android-NEXT: ldr r4, [r4, #252]
+; ARM-android-NEXT: cmp r4, r5
+; ARM-android-NEXT: blo .LBB0_2
+
+; ARM-android: mov r4, #24
+; ARM-android-NEXT: mov r5, #0
+; ARM-android-NEXT: stmdb sp!, {lr}
+; ARM-android-NEXT: bl __morestack
+; ARM-android-NEXT: ldm sp!, {lr}
+; ARM-android-NEXT: pop {r4, r5}
+; ARM-android-NEXT: bx lr
+
+; ARM-android: pop {r4, r5}
+
+}
diff --git a/test/CodeGen/ARM/segmented-stacks.ll b/test/CodeGen/ARM/segmented-stacks.ll
new file mode 100644
index 0000000..5eff633
--- /dev/null
+++ b/test/CodeGen/ARM/segmented-stacks.ll
@@ -0,0 +1,235 @@
+; RUN: llc < %s -mtriple=arm-linux-androideabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=ARM-android
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=ARM-linux
+
+; We used to crash with filetype=obj
+; RUN: llc < %s -mtriple=arm-linux-androideabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define void @test_basic() {
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
+ ret void
+
+; ARM-linux: test_basic:
+
+; ARM-linux: push {r4, r5}
+; ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-linux-NEXT: mov r5, sp
+; ARM-linux-NEXT: ldr r4, [r4, #4]
+; ARM-linux-NEXT: cmp r4, r5
+; ARM-linux-NEXT: blo .LBB0_2
+
+; ARM-linux: mov r4, #48
+; ARM-linux-NEXT: mov r5, #0
+; ARM-linux-NEXT: stmdb sp!, {lr}
+; ARM-linux-NEXT: bl __morestack
+; ARM-linux-NEXT: ldm sp!, {lr}
+; ARM-linux-NEXT: pop {r4, r5}
+; ARM-linux-NEXT: bx lr
+
+; ARM-linux: pop {r4, r5}
+
+; ARM-android: test_basic:
+
+; ARM-android: push {r4, r5}
+; ARM-android-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-android-NEXT: mov r5, sp
+; ARM-android-NEXT: ldr r4, [r4, #252]
+; ARM-android-NEXT: cmp r4, r5
+; ARM-android-NEXT: blo .LBB0_2
+
+; ARM-android: mov r4, #48
+; ARM-android-NEXT: mov r5, #0
+; ARM-android-NEXT: stmdb sp!, {lr}
+; ARM-android-NEXT: bl __morestack
+; ARM-android-NEXT: ldm sp!, {lr}
+; ARM-android-NEXT: pop {r4, r5}
+; ARM-android-NEXT: bx lr
+
+; ARM-android: pop {r4, r5}
+
+}
+
+define i32 @test_nested(i32 * nest %closure, i32 %other) {
+ %addend = load i32 * %closure
+ %result = add i32 %other, %addend
+ ret i32 %result
+
+; ARM-linux: test_nested:
+
+; ARM-linux: push {r4, r5}
+; ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-linux-NEXT: mov r5, sp
+; ARM-linux-NEXT: ldr r4, [r4, #4]
+; ARM-linux-NEXT: cmp r4, r5
+; ARM-linux-NEXT: blo .LBB1_2
+
+; ARM-linux: mov r4, #0
+; ARM-linux-NEXT: mov r5, #0
+; ARM-linux-NEXT: stmdb sp!, {lr}
+; ARM-linux-NEXT: bl __morestack
+; ARM-linux-NEXT: ldm sp!, {lr}
+; ARM-linux-NEXT: pop {r4, r5}
+; ARM-linux-NEXT: bx lr
+
+; ARM-linux: pop {r4, r5}
+
+; ARM-android: test_nested:
+
+; ARM-android: push {r4, r5}
+; ARM-android-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-android-NEXT: mov r5, sp
+; ARM-android-NEXT: ldr r4, [r4, #252]
+; ARM-android-NEXT: cmp r4, r5
+; ARM-android-NEXT: blo .LBB1_2
+
+; ARM-android: mov r4, #0
+; ARM-android-NEXT: mov r5, #0
+; ARM-android-NEXT: stmdb sp!, {lr}
+; ARM-android-NEXT: bl __morestack
+; ARM-android-NEXT: ldm sp!, {lr}
+; ARM-android-NEXT: pop {r4, r5}
+; ARM-android-NEXT: bx lr
+
+; ARM-android: pop {r4, r5}
+
+}
+
+define void @test_large() {
+ %mem = alloca i32, i32 10000
+ call void @dummy_use (i32* %mem, i32 0)
+ ret void
+
+; ARM-linux: test_large:
+
+; ARM-linux: push {r4, r5}
+; ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-linux-NEXT: sub r5, sp, #40192
+; ARM-linux-NEXT: ldr r4, [r4, #4]
+; ARM-linux-NEXT: cmp r4, r5
+; ARM-linux-NEXT: blo .LBB2_2
+
+; ARM-linux: mov r4, #40192
+; ARM-linux-NEXT: mov r5, #0
+; ARM-linux-NEXT: stmdb sp!, {lr}
+; ARM-linux-NEXT: bl __morestack
+; ARM-linux-NEXT: ldm sp!, {lr}
+; ARM-linux-NEXT: pop {r4, r5}
+; ARM-linux-NEXT: bx lr
+
+; ARM-linux: pop {r4, r5}
+
+; ARM-android: test_large:
+
+; ARM-android: push {r4, r5}
+; ARM-android-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-android-NEXT: sub r5, sp, #40192
+; ARM-android-NEXT: ldr r4, [r4, #252]
+; ARM-android-NEXT: cmp r4, r5
+; ARM-android-NEXT: blo .LBB2_2
+
+; ARM-android: mov r4, #40192
+; ARM-android-NEXT: mov r5, #0
+; ARM-android-NEXT: stmdb sp!, {lr}
+; ARM-android-NEXT: bl __morestack
+; ARM-android-NEXT: ldm sp!, {lr}
+; ARM-android-NEXT: pop {r4, r5}
+; ARM-android-NEXT: bx lr
+
+; ARM-android: pop {r4, r5}
+
+}
+
+define fastcc void @test_fastcc() {
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
+ ret void
+
+; ARM-linux: test_fastcc:
+
+; ARM-linux: push {r4, r5}
+; ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-linux-NEXT: mov r5, sp
+; ARM-linux-NEXT: ldr r4, [r4, #4]
+; ARM-linux-NEXT: cmp r4, r5
+; ARM-linux-NEXT: blo .LBB3_2
+
+; ARM-linux: mov r4, #48
+; ARM-linux-NEXT: mov r5, #0
+; ARM-linux-NEXT: stmdb sp!, {lr}
+; ARM-linux-NEXT: bl __morestack
+; ARM-linux-NEXT: ldm sp!, {lr}
+; ARM-linux-NEXT: pop {r4, r5}
+; ARM-linux-NEXT: bx lr
+
+; ARM-linux: pop {r4, r5}
+
+; ARM-android: test_fastcc:
+
+; ARM-android: push {r4, r5}
+; ARM-android-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-android-NEXT: mov r5, sp
+; ARM-android-NEXT: ldr r4, [r4, #252]
+; ARM-android-NEXT: cmp r4, r5
+; ARM-android-NEXT: blo .LBB3_2
+
+; ARM-android: mov r4, #48
+; ARM-android-NEXT: mov r5, #0
+; ARM-android-NEXT: stmdb sp!, {lr}
+; ARM-android-NEXT: bl __morestack
+; ARM-android-NEXT: ldm sp!, {lr}
+; ARM-android-NEXT: pop {r4, r5}
+; ARM-android-NEXT: bx lr
+
+; ARM-android: pop {r4, r5}
+
+}
+
+define fastcc void @test_fastcc_large() {
+ %mem = alloca i32, i32 10000
+ call void @dummy_use (i32* %mem, i32 0)
+ ret void
+
+; ARM-linux: test_fastcc_large:
+
+; ARM-linux: push {r4, r5}
+; ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-linux-NEXT: sub r5, sp, #40192
+; ARM-linux-NEXT: ldr r4, [r4, #4]
+; ARM-linux-NEXT: cmp r4, r5
+; ARM-linux-NEXT: blo .LBB4_2
+
+; ARM-linux: mov r4, #40192
+; ARM-linux-NEXT: mov r5, #0
+; ARM-linux-NEXT: stmdb sp!, {lr}
+; ARM-linux-NEXT: bl __morestack
+; ARM-linux-NEXT: ldm sp!, {lr}
+; ARM-linux-NEXT: pop {r4, r5}
+; ARM-linux-NEXT: bx lr
+
+; ARM-linux: pop {r4, r5}
+
+; ARM-android: test_fastcc_large:
+
+; ARM-android: push {r4, r5}
+; ARM-android-NEXT: mrc p15, #0, r4, c13, c0, #3
+; ARM-android-NEXT: sub r5, sp, #40192
+; ARM-android-NEXT: ldr r4, [r4, #252]
+; ARM-android-NEXT: cmp r4, r5
+; ARM-android-NEXT: blo .LBB4_2
+
+; ARM-android: mov r4, #40192
+; ARM-android-NEXT: mov r5, #0
+; ARM-android-NEXT: stmdb sp!, {lr}
+; ARM-android-NEXT: bl __morestack
+; ARM-android-NEXT: ldm sp!, {lr}
+; ARM-android-NEXT: pop {r4, r5}
+; ARM-android-NEXT: bx lr
+
+; ARM-android: pop {r4, r5}
+
+}
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index 6f4bfb8..e2dc554 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -1,6 +1,10 @@
-; RUN: llc < %s -march=arm | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -march=arm -mattr=+thumb2 | FileCheck %s --check-prefix=ARMT2
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s --check-prefix=THUMB2
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s --check-prefix=ARM
+
+; RUN: llc -mtriple=arm-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
+; RUN: | FileCheck %s --check-prefix=ARMT2
+
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
+; RUN: | FileCheck %s --check-prefix=THUMB2
define i32 @t1(i32 %c) nounwind readnone {
entry:
diff --git a/test/CodeGen/ARM/select-undef.ll b/test/CodeGen/ARM/select-undef.ll
index 23f7eb8..bae4d40 100644
--- a/test/CodeGen/ARM/select-undef.ll
+++ b/test/CodeGen/ARM/select-undef.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=swift -verify-machineinstrs
+; RUN: llc -mtriple=arm-eabi -mcpu=swift -verify-machineinstrs %s -o /dev/null
+
define i32 @func(i32 %arg0, i32 %arg1) {
entry:
%cmp = icmp slt i32 %arg0, 10
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index ed006d6..e9394a7 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -1,6 +1,10 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s --check-prefix=CHECK-VFP
-; RUN: llc < %s -mattr=+neon,+thumb2 -mtriple=thumbv7-apple-darwin | FileCheck %s --check-prefix=CHECK-NEON
+; RUN: llc -mtriple=arm-apple-darwin %s -o - | FileCheck %s
+
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
+; RUN: | FileCheck %s --check-prefix=CHECK-VFP
+
+; RUN: llc -mtriple=thumbv7-apple-darwin -mattr=+neon,+thumb2 %s -o - \
+; RUN: | FileCheck %s --check-prefix=CHECK-NEON
define i32 @f1(i32 %a.s) {
;CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM/setcc-sentinals.ll b/test/CodeGen/ARM/setcc-sentinals.ll
index 8878f9b..dc45e0e 100644
--- a/test/CodeGen/ARM/setcc-sentinals.ll
+++ b/test/CodeGen/ARM/setcc-sentinals.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -mcpu=cortex-a8 -march=arm -asm-verbose=false | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 -asm-verbose=false %s -o - | FileCheck %s
define zeroext i1 @test0(i32 %x) nounwind {
; CHECK-LABEL: test0:
-; CHECK-NEXT: add [[REG:(r[0-9]+)|(lr)]], r0, #1
+; CHECK: add [[REG:(r[0-9]+)|(lr)]], r0, #1
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: cmp [[REG]], #1
; CHECK-NEXT: movwhi r0, #1
diff --git a/test/CodeGen/ARM/smul.ll b/test/CodeGen/ARM/smul.ll
index 686d791..b7ddd10 100644
--- a/test/CodeGen/ARM/smul.ll
+++ b/test/CodeGen/ARM/smul.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=generic
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=generic %s -o /dev/null
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
@x = weak global i16 0 ; <i16*> [#uses=1]
@y = weak global i16 0 ; <i16*> [#uses=0]
diff --git a/test/CodeGen/ARM/ssp-data-layout.ll b/test/CodeGen/ARM/ssp-data-layout.ll
new file mode 100644
index 0000000..e7dafac
--- /dev/null
+++ b/test/CodeGen/ARM/ssp-data-layout.ll
@@ -0,0 +1,528 @@
+; RUN: llc < %s -disable-fp-elim -march=arm -mcpu=cortex-a8 -mtriple arm-linux-gnu -o - | FileCheck %s
+; This test is fairly fragile. The goal is to ensure that "large" stack
+; objects are allocated closest to the stack protector (i.e., farthest away
+; from the Stack Pointer.) In standard SSP mode this means that large (>=
+; ssp-buffer-size) arrays and structures containing such arrays are
+; closet to the protector. With sspstrong and sspreq this means large
+; arrays/structures-with-arrays are closest, followed by small (< ssp-buffer-size)
+; arrays/structures-with-arrays, and then addr-taken variables.
+;
+; Ideally, we only want to verify that the objects appear in the correct groups
+; and that the groups have the correct relative stack offset. The ordering
+; within a group is not relevant to this test. Unfortunately, there is not
+; an elegant way to do this, so just match the offset for each object.
+
+%struct.struct_large_char = type { [8 x i8] }
+%struct.struct_large_char2 = type { [2 x i8], [8 x i8] }
+%struct.struct_small_char = type { [2 x i8] }
+%struct.struct_large_nonchar = type { [8 x i32] }
+%struct.struct_small_nonchar = type { [2 x i16] }
+
+define void @layout_ssp() ssp {
+entry:
+; Expected stack layout for ssp is
+; 180 large_char . Group 1, nested arrays, arrays >= ssp-buffer-size
+; 172 struct_large_char .
+; 168 scalar1 | Everything else
+; 164 scalar2
+; 160 scalar3
+; 156 addr-of
+; 152 small_nonchar (84+68)
+; 112 large_nonchar
+; 110 small_char
+; 108 struct_small_char
+; 72 struct_large_nonchar
+; 68 struct_small_nonchar
+
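+; (The "(84+68)" on small_nonchar above is the strh offset used below, #84,
+; relative to r[[SP]], plus the #68 that r[[SP]] is biased from sp by, i.e.
+; an absolute sp offset of 152.)
+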
+; CHECK: layout_ssp:
+; r[[SP]] is used as an offset into the stack later
+; CHECK: add r[[SP:[0-9]+]], sp, #68
+
+; CHECK: bl get_scalar1
+; CHECK: str r0, [sp, #168]
+; CHECK: bl end_scalar1
+
+; CHECK: bl get_scalar2
+; CHECK: str r0, [sp, #164]
+; CHECK: bl end_scalar2
+
+; CHECK: bl get_scalar3
+; CHECK: str r0, [sp, #160]
+; CHECK: bl end_scalar3
+
+; CHECK: bl get_addrof
+; CHECK: str r0, [sp, #156]
+; CHECK: bl end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: strh r0, [r[[SP]], #84]
+; CHECK: bl end_small_nonchar
+
+; CHECK: bl get_large_nonchar
+; CHECK: str r0, [sp, #112]
+; CHECK: bl end_large_nonchar
+
+; CHECK: bl get_small_char
+; CHECK: strb r0, [sp, #110]
+; CHECK: bl end_small_char
+
+; CHECK: bl get_large_char
+; CHECK: strb r0, [sp, #180]
+; CHECK: bl end_large_char
+
+; CHECK: bl get_struct_large_char
+; CHECK: strb r0, [sp, #172]
+; CHECK: bl end_struct_large_char
+
+; CHECK: bl get_struct_small_char
+; CHECK: strb r0, [sp, #108]
+; CHECK: bl end_struct_small_char
+
+; CHECK: bl get_struct_large_nonchar
+; CHECK: str r0, [sp, #72]
+; CHECK: bl end_struct_large_nonchar
+
+; CHECK: bl get_struct_small_nonchar
+; CHECK: strh r0, [r[[SP]]]
+; CHECK: bl end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @layout_sspstrong() sspstrong {
+entry:
+; Expected stack layout for sspstrong is
+; 144 large_nonchar . Group 1, nested arrays,
+; 136 large_char . arrays >= ssp-buffer-size
+; 128 struct_large_char .
+; 96 struct_large_nonchar .
+; 84+8 small_non_char | Group 2, nested arrays,
+; 90 small_char | arrays < ssp-buffer-size
+; 88 struct_small_char |
+; 84 struct_small_nonchar |
+; 80 addrof * Group 3, addr-of local
+; 76 scalar1 + Group 4, everything else
+; 72 scalar2 +
+; 68 scalar3 +
+;
+; CHECK: layout_sspstrong:
+; r[[SP]] is used as an offset into the stack later
+; CHECK: add r[[SP:[0-9]+]], sp, #84
+
+; CHECK: bl get_scalar1
+; CHECK: str r0, [sp, #76]
+; CHECK: bl end_scalar1
+
+; CHECK: bl get_scalar2
+; CHECK: str r0, [sp, #72]
+; CHECK: bl end_scalar2
+
+; CHECK: bl get_scalar3
+; CHECK: str r0, [sp, #68]
+; CHECK: bl end_scalar3
+
+; CHECK: bl get_addrof
+; CHECK: str r0, [sp, #80]
+; CHECK: bl end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: strh r0, [r[[SP]], #8]
+; CHECK: bl end_small_nonchar
+
+; CHECK: bl get_large_nonchar
+; CHECK: str r0, [sp, #144]
+; CHECK: bl end_large_nonchar
+
+; CHECK: bl get_small_char
+; CHECK: strb r0, [sp, #90]
+; CHECK: bl end_small_char
+
+; CHECK: bl get_large_char
+; CHECK: strb r0, [sp, #136]
+; CHECK: bl end_large_char
+
+; CHECK: bl get_struct_large_char
+; CHECK: strb r0, [sp, #128]
+; CHECK: bl end_struct_large_char
+
+; CHECK: bl get_struct_small_char
+; CHECK: strb r0, [sp, #88]
+; CHECK: bl end_struct_small_char
+
+; CHECK: bl get_struct_large_nonchar
+; CHECK: str r0, [sp, #96]
+; CHECK: bl end_struct_large_nonchar
+
+; CHECK: bl get_struct_small_nonchar
+; CHECK: strh r0, [r[[SP]]]
+; CHECK: bl end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @layout_sspreq() sspreq {
+entry:
+; Expected stack layout for sspreq is the same as sspstrong
+;
+; CHECK: layout_sspreq:
+; r[[SP]] is used as an offset into the stack later
+; CHECK: add r[[SP:[0-9]+]], sp, #84
+
+; CHECK: bl get_scalar1
+; CHECK: str r0, [sp, #76]
+; CHECK: bl end_scalar1
+
+; CHECK: bl get_scalar2
+; CHECK: str r0, [sp, #72]
+; CHECK: bl end_scalar2
+
+; CHECK: bl get_scalar3
+; CHECK: str r0, [sp, #68]
+; CHECK: bl end_scalar3
+
+; CHECK: bl get_addrof
+; CHECK: str r0, [sp, #80]
+; CHECK: bl end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: strh r0, [r[[SP]], #8]
+; CHECK: bl end_small_nonchar
+
+; CHECK: bl get_large_nonchar
+; CHECK: str r0, [sp, #144]
+; CHECK: bl end_large_nonchar
+
+; CHECK: bl get_small_char
+; CHECK: strb r0, [sp, #90]
+; CHECK: bl end_small_char
+
+; CHECK: bl get_large_char
+; CHECK: strb r0, [sp, #136]
+; CHECK: bl end_large_char
+
+; CHECK: bl get_struct_large_char
+; CHECK: strb r0, [sp, #128]
+; CHECK: bl end_struct_large_char
+
+; CHECK: bl get_struct_small_char
+; CHECK: strb r0, [sp, #88]
+; CHECK: bl end_struct_small_char
+
+; CHECK: bl get_struct_large_nonchar
+; CHECK: str r0, [sp, #96]
+; CHECK: bl end_struct_large_nonchar
+
+; CHECK: bl get_struct_small_nonchar
+; CHECK: strh r0, [r[[SP]]]
+; CHECK: bl end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @struct_with_protectable_arrays() sspstrong {
+entry:
+; Check to ensure that a structure which contains a small array followed by a
+; large array is assigned to the stack properly as a large object.
+; CHECK: struct_with_protectable_arrays:
+; CHECK: bl get_struct_small_char
+; CHECK: strb r0, [sp, #68]
+; CHECK: bl end_struct_small_char
+; CHECK: bl get_struct_large_char2
+; CHECK: strb r0, [sp, #106]
+; CHECK: bl end_struct_large_char2
+ %a = alloca %struct.struct_small_char, align 1
+ %b = alloca %struct.struct_large_char2, align 1
+ %d1 = alloca %struct.struct_large_nonchar, align 8
+ %d2 = alloca %struct.struct_small_nonchar, align 2
+ %call = call signext i8 @get_struct_small_char()
+ %foo = getelementptr inbounds %struct.struct_small_char* %a, i32 0, i32 0
+ %arrayidx = getelementptr inbounds [2 x i8]* %foo, i32 0, i64 0
+ store i8 %call, i8* %arrayidx, align 1
+ call void @end_struct_small_char()
+ %call1 = call signext i8 @get_struct_large_char2()
+ %foo2 = getelementptr inbounds %struct.struct_large_char2* %b, i32 0, i32 1
+ %arrayidx3 = getelementptr inbounds [8 x i8]* %foo2, i32 0, i64 0
+ store i8 %call1, i8* %arrayidx3, align 1
+ call void @end_struct_large_char2()
+ %0 = bitcast %struct.struct_large_char2* %b to %struct.struct_large_char*
+ %coerce.dive = getelementptr %struct.struct_large_char* %0, i32 0, i32 0
+ %1 = bitcast [8 x i8]* %coerce.dive to i64*
+ %2 = load i64* %1, align 1
+ %coerce.dive4 = getelementptr %struct.struct_small_char* %a, i32 0, i32 0
+ %3 = bitcast [2 x i8]* %coerce.dive4 to i16*
+ %4 = load i16* %3, align 1
+ %coerce.dive5 = getelementptr %struct.struct_small_nonchar* %d2, i32 0, i32 0
+ %5 = bitcast [2 x i16]* %coerce.dive5 to i32*
+ %6 = load i32* %5, align 1
+ call void @takes_all(i64 %2, i16 %4, %struct.struct_large_nonchar* byval align 8 %d1, i32 %6, i8* null, i8* null, i32* null, i16* null, i32* null, i32 0, i32 0, i32 0)
+ ret void
+}
+
+declare i32 @get_scalar1()
+declare void @end_scalar1()
+
+declare i32 @get_scalar2()
+declare void @end_scalar2()
+
+declare i32 @get_scalar3()
+declare void @end_scalar3()
+
+declare i32 @get_addrof()
+declare void @end_addrof()
+
+declare signext i16 @get_small_nonchar()
+declare void @end_small_nonchar()
+
+declare i32 @get_large_nonchar()
+declare void @end_large_nonchar()
+
+declare signext i8 @get_small_char()
+declare void @end_small_char()
+
+declare signext i8 @get_large_char()
+declare void @end_large_char()
+
+declare signext i8 @get_struct_large_char()
+declare void @end_struct_large_char()
+
+declare signext i8 @get_struct_large_char2()
+declare void @end_struct_large_char2()
+
+declare signext i8 @get_struct_small_char()
+declare void @end_struct_small_char()
+
+declare i32 @get_struct_large_nonchar()
+declare void @end_struct_large_nonchar()
+
+declare signext i16 @get_struct_small_nonchar()
+declare void @end_struct_small_nonchar()
+
+declare void @takes_all(i64, i16, %struct.struct_large_nonchar* byval align 8, i32, i8*, i8*, i32*, i16*, i32*, i32, i32, i32)
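Editorial illustration, not part of the patch: the layout comments in the tests above group locals four ways — arrays of at least ssp-buffer-size bytes (Group 1) at the highest sp offsets, smaller arrays (Group 2) below them, the address-taken local (Group 3) next, and plain scalars (Group 4) lowest — the usual rationale being that an overflowing buffer should reach the stack guard before anything else. A rough source-level analogue is a C function with the same kinds of locals compiled with -fstack-protector-strong; the sketch below is hypothetical (names and helper declarations are invented here, only the grouping idea comes from the test comments).

/* Hypothetical C analogue of the locals exercised by the layout tests above.
 * Only the four-group idea is taken from the test comments; the names and
 * helper functions are invented for illustration. */
void use_ptr(int *p);
void use_arrays(char *lc, char *sc, int *ln, short *sn);

void layout_sketch(void) {
  int scalar1 = 0, scalar2 = 0, scalar3 = 0; /* Group 4: plain scalars             */
  int addrof = 0;                            /* Group 3: address-taken local       */
  short small_nonchar[2] = {0};              /* Group 2: arrays < ssp-buffer-size  */
  char  small_char[2]    = {0};
  int   large_nonchar[8] = {0};              /* Group 1: arrays >= ssp-buffer-size */
  char  large_char[8]    = {0};

  use_ptr(&addrof);                          /* taking the address forces Group 3  */
  use_arrays(large_char, small_char, large_nonchar, small_nonchar);
  (void)scalar1; (void)scalar2; (void)scalar3;
}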
diff --git a/test/CodeGen/ARM/stack-frame.ll b/test/CodeGen/ARM/stack-frame.ll
index 1dd57dd..a419074 100644
--- a/test/CodeGen/ARM/stack-frame.ll
+++ b/test/CodeGen/ARM/stack-frame.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm
-; RUN: llc < %s -march=arm | grep add | count 1
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define void @f1() {
%c = alloca i8, align 1
@@ -10,4 +9,6 @@ define i32 @f2() {
ret i32 1
}
+; CHECK: add
+; CHECK-NOT: add
diff --git a/test/CodeGen/ARM/str_post.ll b/test/CodeGen/ARM/str_post.ll
index 32e3b85..a4f8640 100644
--- a/test/CodeGen/ARM/str_post.ll
+++ b/test/CodeGen/ARM/str_post.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i16 @test1(i32* %X, i16* %A) {
; CHECK-LABEL: test1:
diff --git a/test/CodeGen/ARM/str_pre.ll b/test/CodeGen/ARM/str_pre.ll
index d8b3f0e..60e6e9ec 100644
--- a/test/CodeGen/ARM/str_pre.ll
+++ b/test/CodeGen/ARM/str_pre.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm | \
-; RUN: grep "str.*\!" | count 2
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define void @test1(i32* %X, i32* %A, i32** %dest) {
%B = load i32* %A ; <i32> [#uses=1]
@@ -16,3 +15,8 @@ define i16* @test2(i16* %X, i32* %A) {
store i16 %tmp, i16* %Y
ret i16* %Y
}
+
+; CHECK: str{{.*}}!
+; CHECK: str{{.*}}!
+; CHECK-NOT: str{{.*}}!
+
diff --git a/test/CodeGen/ARM/str_trunc.ll b/test/CodeGen/ARM/str_trunc.ll
index 2f1166b..6739684 100644
--- a/test/CodeGen/ARM/str_trunc.ll
+++ b/test/CodeGen/ARM/str_trunc.ll
@@ -1,7 +1,4 @@
-; RUN: llc < %s -march=arm | \
-; RUN: grep strb | count 1
-; RUN: llc < %s -march=arm | \
-; RUN: grep strh | count 1
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define void @test1(i32 %v, i16* %ptr) {
%tmp = trunc i32 %v to i16 ; <i16> [#uses=1]
@@ -14,3 +11,10 @@ define void @test2(i32 %v, i8* %ptr) {
store i8 %tmp, i8* %ptr
ret void
}
+
+; CHECK: strh
+; CHECK-NOT: strh
+
+; CHECK: strb
+; CHECK-NOT: strb
+
diff --git a/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll b/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll
index 1899269..0a9bc3c 100644
--- a/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll
+++ b/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll
@@ -13,7 +13,7 @@
;structs at varying alignments. Each test is run for arm, thumb2 and thumb1.
;We check for the strings in the generated object code using llvm-objdump
;because it provides better assurance that we are generating instructions
-;for the correct architecture. Otherwise we could accidently generate an
+;for the correct architecture. Otherwise we could accidentally generate an
;ARM instruction for THUMB1 and wouldn't detect it because the assembly
;code representation is the same, but the object code would be generated
;incorrectly. For each test we check for the label, a load instruction of the
diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll
index 7f82ca7..67bde2a 100644
--- a/test/CodeGen/ARM/sub.ll
+++ b/test/CodeGen/ARM/sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
; 171 = 0x000000ab
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/ARM/subreg-remat.ll b/test/CodeGen/ARM/subreg-remat.ll
index 1bc0315..d5abfc0 100644
--- a/test/CodeGen/ARM/subreg-remat.ll
+++ b/test/CodeGen/ARM/subreg-remat.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=pic -disable-fp-elim -mcpu=cortex-a8 -pre-RA-sched=source | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -disable-fp-elim -mcpu=cortex-a8 -pre-RA-sched=source -no-integrated-as | FileCheck %s
target triple = "thumbv7-apple-ios"
; <rdar://problem/10032939>
;
diff --git a/test/CodeGen/ARM/sxt_rot.ll b/test/CodeGen/ARM/sxt_rot.ll
index 656cd93..5ddea2e 100644
--- a/test/CodeGen/ARM/sxt_rot.ll
+++ b/test/CodeGen/ARM/sxt_rot.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s
define i32 @test0(i8 %A) {
; CHECK: test0
diff --git a/test/CodeGen/ARM/t2-imm.ll b/test/CodeGen/ARM/t2-imm.ll
index 8b41459..dd75cd1 100644
--- a/test/CodeGen/ARM/t2-imm.ll
+++ b/test/CodeGen/ARM/t2-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f6(i32 %a) {
; CHECK:f6
diff --git a/test/CodeGen/ARM/tail-call.ll b/test/CodeGen/ARM/tail-call.ll
new file mode 100644
index 0000000..7711586
--- /dev/null
+++ b/test/CodeGen/ARM/tail-call.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple armv7 -O0 -o - < %s | FileCheck %s -check-prefix CHECK-TAIL
+; RUN: llc -mtriple armv7 -O0 -disable-tail-calls -o - < %s \
+; RUN: | FileCheck %s -check-prefix CHECK-NO-TAIL
+
+declare i32 @callee(i32 %i)
+
+define i32 @caller(i32 %i) {
+entry:
+ %r = tail call i32 @callee(i32 %i)
+ ret i32 %r
+}
+
+; CHECK-TAIL-LABEL: caller
+; CHECK-TAIL: b callee
+
+; CHECK-NO-TAIL-LABEL: caller
+; CHECK-NO-TAIL: push {lr}
+; CHECK-NO-TAIL: bl callee
+; CHECK-NO-TAIL: pop {lr}
+; CHECK-NO-TAIL: bx lr
+
diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll
new file mode 100644
index 0000000..0a16071
--- /dev/null
+++ b/test/CodeGen/ARM/taildup-branch-weight.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=arm-eabi -print-machineinstrs=tailduplication -tail-dup-size=100 \
+; RUN: -enable-tail-merge=false -disable-cgp %s -o /dev/null 2>&1 \
+; RUN: | FileCheck %s
+
+; CHECK: Machine code for function test0:
+; CHECK: Successors according to CFG: BB#1(4) BB#2(124)
+
+define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) {
+entry:
+ store i32 3, i32* %d
+ br label %B1
+
+B2:
+ store i32 2, i32* %c
+ br label %B4
+
+B3:
+ store i32 2, i32* %c
+ br label %B4
+
+B1:
+ store i32 1, i32* %d
+ %test0 = icmp slt i32 %a, %b
+ br i1 %test0, label %B2, label %B3, !prof !0
+
+B4:
+ ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 4, i32 124}
+
+; CHECK: Machine code for function test1:
+; CHECK: Successors according to CFG: BB#1(8) BB#2(248)
+
+@g0 = common global i32 0, align 4
+
+define void @test1(i32 %a, i32 %b, i32* %c, i32* %d, i32* %e) {
+
+ %test0 = icmp slt i32 %a, %b
+ br i1 %test0, label %B1, label %B2, !prof !1
+
+B1:
+ br label %B3
+
+B2:
+ store i32 2, i32* %c
+ br label %B3
+
+B3:
+ store i32 3, i32* %e
+ ret void
+}
+
+!1 = metadata !{metadata !"branch_weights", i32 248, i32 8}
diff --git a/test/CodeGen/ARM/this-return.ll b/test/CodeGen/ARM/this-return.ll
index cb42de6..c681a1c 100644
--- a/test/CodeGen/ARM/this-return.ll
+++ b/test/CodeGen/ARM/this-return.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi -arm-tail-calls | FileCheck %s -check-prefix=CHECKELF
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-tail-calls | FileCheck %s -check-prefix=CHECKT2D
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi | FileCheck %s -check-prefix=CHECKELF
+; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 | FileCheck %s -check-prefix=CHECKT2D
%struct.A = type { i8 }
%struct.B = type { i32 }
diff --git a/test/CodeGen/ARM/thumb-litpool.ll b/test/CodeGen/ARM/thumb-litpool.ll
new file mode 100644
index 0000000..f68fdb6
--- /dev/null
+++ b/test/CodeGen/ARM/thumb-litpool.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=thumbv6m-apple-macho %s -relocation-model=static -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv6m-apple-macho %s -relocation-model=pic -o - | FileCheck %s
+
+@var = global i8 zeroinitializer
+
+declare void @callee(i8*)
+
+define void @foo() minsize {
+; CHECK-LABEL: foo:
+; CHECK: ldr {{r[0-7]}}, LCPI0_0
+ call void @callee(i8* @var)
+ call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7}"()
+ call void @callee(i8* @var)
+ ret void
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll
index 47c5dcc..d954760 100644
--- a/test/CodeGen/ARM/thumb2-it-block.ll
+++ b/test/CodeGen/ARM/thumb2-it-block.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
; PR11107
diff --git a/test/CodeGen/ARM/tls-models.ll b/test/CodeGen/ARM/tls-models.ll
index ccc9032..42c1ba9 100644
--- a/test/CodeGen/ARM/tls-models.ll
+++ b/test/CodeGen/ARM/tls-models.ll
@@ -22,9 +22,9 @@ entry:
; Non-PIC code can use initial-exec, PIC code has to use general dynamic.
; CHECK-NONPIC-LABEL: f1:
- ; CHECK-NONPIC: external_gd(gottpoff)
+ ; CHECK-NONPIC: external_gd(GOTTPOFF)
; CHECK-PIC-LABEL: f1:
- ; CHECK-PIC: external_gd(tlsgd)
+ ; CHECK-PIC: external_gd(TLSGD)
}
define i32* @f2() {
@@ -34,9 +34,9 @@ entry:
; Non-PIC code can use local exec, PIC code can use local dynamic,
; but that is not implemented, so falls back to general dynamic.
; CHECK-NONPIC-LABEL: f2:
- ; CHECK-NONPIC: internal_gd(tpoff)
+ ; CHECK-NONPIC: internal_gd(TPOFF)
; CHECK-PIC-LABEL: f2:
- ; CHECK-PIC: internal_gd(tlsgd)
+ ; CHECK-PIC: internal_gd(TLSGD)
}
@@ -49,9 +49,9 @@ entry:
; Non-PIC code can use initial exec, PIC should use local dynamic,
; but that is not implemented, so falls back to general dynamic.
; CHECK-NONPIC-LABEL: f3:
- ; CHECK-NONPIC: external_ld(gottpoff)
+ ; CHECK-NONPIC: external_ld(GOTTPOFF)
; CHECK-PIC-LABEL: f3:
- ; CHECK-PIC: external_ld(tlsgd)
+ ; CHECK-PIC: external_ld(TLSGD)
}
define i32* @f4() {
@@ -61,9 +61,9 @@ entry:
; Non-PIC code can use local exec, PIC code can use local dynamic,
; but that is not implemented, so it falls back to general dynamic.
; CHECK-NONPIC-LABEL: f4:
- ; CHECK-NONPIC: internal_ld(tpoff)
+ ; CHECK-NONPIC: internal_ld(TPOFF)
; CHECK-PIC-LABEL: f4:
- ; CHECK-PIC: internal_ld(tlsgd)
+ ; CHECK-PIC: internal_ld(TLSGD)
}
@@ -75,9 +75,9 @@ entry:
; Non-PIC and PIC code will use initial exec as specified.
; CHECK-NONPIC-LABEL: f5:
- ; CHECK-NONPIC: external_ie(gottpoff)
+ ; CHECK-NONPIC: external_ie(GOTTPOFF)
; CHECK-PIC-LABEL: f5:
- ; CHECK-PIC: external_ie(gottpoff)
+ ; CHECK-PIC: external_ie(GOTTPOFF)
}
define i32* @f6() {
@@ -86,9 +86,9 @@ entry:
; Non-PIC code can use local exec, PIC code use initial exec as specified.
; CHECK-NONPIC-LABEL: f6:
- ; CHECK-NONPIC: internal_ie(tpoff)
+ ; CHECK-NONPIC: internal_ie(TPOFF)
; CHECK-PIC-LABEL: f6:
- ; CHECK-PIC: internal_ie(gottpoff)
+ ; CHECK-PIC: internal_ie(GOTTPOFF)
}
@@ -100,9 +100,9 @@ entry:
; Non-PIC and PIC code will use local exec as specified.
; CHECK-NONPIC-LABEL: f7:
- ; CHECK-NONPIC: external_le(tpoff)
+ ; CHECK-NONPIC: external_le(TPOFF)
; CHECK-PIC-LABEL: f7:
- ; CHECK-PIC: external_le(tpoff)
+ ; CHECK-PIC: external_le(TPOFF)
}
define i32* @f8() {
@@ -111,7 +111,7 @@ entry:
; Non-PIC and PIC code will use local exec as specified.
; CHECK-NONPIC-LABEL: f8:
- ; CHECK-NONPIC: internal_le(tpoff)
+ ; CHECK-NONPIC: internal_le(TPOFF)
; CHECK-PIC-LABEL: f8:
- ; CHECK-PIC: internal_le(tpoff)
+ ; CHECK-PIC: internal_le(TPOFF)
}
diff --git a/test/CodeGen/ARM/tls1.ll b/test/CodeGen/ARM/tls1.ll
index ec4278c..a1ca0b7 100644
--- a/test/CodeGen/ARM/tls1.ll
+++ b/test/CodeGen/ARM/tls1.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN: grep "i(tpoff)"
+; RUN: grep "i(TPOFF)"
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
; RUN: grep "__aeabi_read_tp"
; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi \
diff --git a/test/CodeGen/ARM/tls2.ll b/test/CodeGen/ARM/tls2.ll
index f048125..24b4794 100644
--- a/test/CodeGen/ARM/tls2.ll
+++ b/test/CodeGen/ARM/tls2.ll
@@ -8,7 +8,7 @@
define i32 @f() {
; CHECK-NONPIC-LABEL: f:
; CHECK-NONPIC: ldr {{r.}}, [pc, {{r.}}]
-; CHECK-NONPIC: i(gottpoff)
+; CHECK-NONPIC: i(GOTTPOFF)
; CHECK-PIC-LABEL: f:
; CHECK-PIC: __tls_get_addr
entry:
@@ -19,7 +19,7 @@ entry:
define i32* @g() {
; CHECK-NONPIC-LABEL: g:
; CHECK-NONPIC: ldr {{r.}}, [pc, {{r.}}]
-; CHECK-NONPIC: i(gottpoff)
+; CHECK-NONPIC: i(GOTTPOFF)
; CHECK-PIC-LABEL: g:
; CHECK-PIC: __tls_get_addr
entry:
diff --git a/test/CodeGen/ARM/trunc_ldr.ll b/test/CodeGen/ARM/trunc_ldr.ll
index 3033c2b..2ce9b89 100644
--- a/test/CodeGen/ARM/trunc_ldr.ll
+++ b/test/CodeGen/ARM/trunc_ldr.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm | grep ldrb.*7 | count 1
-; RUN: llc < %s -march=arm | grep ldrsb.*7 | count 1
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
%struct.A = type { i8, i8, i8, i8, i16, i8, i8, %struct.B** }
%struct.B = type { float, float, i32, i32, i32, [0 x i8] }
@@ -22,3 +21,10 @@ define i32 @f2(%struct.A* %d) {
%tmp57 = sext i8 %tmp56 to i32
ret i32 %tmp57
}
+
+; CHECK: ldrb{{.*}}7
+; CHECK-NOT: ldrb{{.*}}7
+
+; CHECK: ldrsb{{.*}}7
+; CHECK-NOT: ldrsb{{.*}}7
+
diff --git a/test/CodeGen/ARM/truncstore-dag-combine.ll b/test/CodeGen/ARM/truncstore-dag-combine.ll
index 5665440..360e3e1 100644
--- a/test/CodeGen/ARM/truncstore-dag-combine.ll
+++ b/test/CodeGen/ARM/truncstore-dag-combine.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v4t | not grep orr
-; RUN: llc < %s -march=arm -mattr=+v4t | not grep mov
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
define void @bar(i8* %P, i16* %Q) {
entry:
@@ -16,3 +15,7 @@ entry:
store i32 %tmp, i32* %P1, align 1
ret void
}
+
+; CHECK-NOT: orr
+; CHECK-NOT: mov
+
diff --git a/test/CodeGen/ARM/tst_teq.ll b/test/CodeGen/ARM/tst_teq.ll
index c83111e..bac4fd9 100644
--- a/test/CodeGen/ARM/tst_teq.ll
+++ b/test/CodeGen/ARM/tst_teq.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm | grep tst
-; RUN: llc < %s -march=arm | grep teq
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f(i32 %a) {
entry:
@@ -16,3 +15,7 @@ entry:
%retval = select i1 %0, i32 20, i32 10 ; <i32> [#uses=1]
ret i32 %retval
}
+
+; CHECK: tst
+; CHECK: teq
+
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
index 2172f6b..8da875f 100644
--- a/test/CodeGen/ARM/twoaddrinstr.ll
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -1,5 +1,5 @@
; Tests for the two-address instruction pass.
-; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s
define void @PR13378() nounwind {
; This was orriginally a crasher trying to schedule the instructions.
diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll
index e7ff63f..72163ae 100644
--- a/test/CodeGen/ARM/unaligned_load_store.ll
+++ b/test/CodeGen/ARM/unaligned_load_store.ll
@@ -1,6 +1,11 @@
-; RUN: llc < %s -march=arm -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
-; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -mattr=-neon -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
-; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=UNALIGNED
+; RUN: llc -mtriple=arm-eabi -pre-RA-sched=source %s -o - \
+; RUN: | FileCheck %s -check-prefix=EXPANDED
+
+; RUN: llc -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -mattr=-neon -arm-strict-align -pre-RA-sched=source %s -o - \
+; RUN: | FileCheck %s -check-prefix=EXPANDED
+
+; RUN: llc -mtriple=armv6-apple-darwin -mcpu=cortex-a8 %s -o - \
+; RUN: | FileCheck %s -check-prefix=UNALIGNED
; rdar://7113725
; rdar://12091029
diff --git a/test/CodeGen/ARM/unaligned_load_store_vector.ll b/test/CodeGen/ARM/unaligned_load_store_vector.ll
index 968a2c7..000ed48 100644
--- a/test/CodeGen/ARM/unaligned_load_store_vector.ll
+++ b/test/CodeGen/ARM/unaligned_load_store_vector.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=arm -mattr=+v7 -mattr=+neon | FileCheck %s
+;RUN: llc -mtriple=arm-eabi -mattr=+v7 -mattr=+neon %s -o - | FileCheck %s
;ALIGN = 1
;SIZE = 64
diff --git a/test/CodeGen/ARM/unord.ll b/test/CodeGen/ARM/unord.ll
index bd28034..7243e99 100644
--- a/test/CodeGen/ARM/unord.ll
+++ b/test/CodeGen/ARM/unord.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm | grep movne | count 1
-; RUN: llc < %s -march=arm | grep moveq | count 1
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f1(float %X, float %Y) {
%tmp = fcmp uno float %X, %Y
@@ -12,3 +11,10 @@ define i32 @f2(float %X, float %Y) {
%retval = select i1 %tmp, i32 1, i32 -1
ret i32 %retval
}
+
+; CHECK: movne
+; CHECK-NOT: movne
+
+; CHECK: moveq
+; CHECK-NOT: moveq
+
diff --git a/test/CodeGen/ARM/uxt_rot.ll b/test/CodeGen/ARM/uxt_rot.ll
index 628c079..235416a 100644
--- a/test/CodeGen/ARM/uxt_rot.ll
+++ b/test/CodeGen/ARM/uxt_rot.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+v6 | grep uxtb | count 1
-; RUN: llc < %s -march=arm -mattr=+v6 | grep uxtab | count 1
-; RUN: llc < %s -march=arm -mattr=+v6 | grep uxth | count 1
+; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s
define zeroext i8 @test1(i32 %A.u) {
%B.u = trunc i32 %A.u to i8
@@ -22,3 +20,13 @@ define zeroext i32 @test3(i32 %A.u) {
%F.u = zext i16 %E.u to i32
ret i32 %F.u
}
+
+; CHECK: uxtb
+; CHECK-NOT: uxtb
+
+; CHECK: uxtab
+; CHECK-NOT: uxtab
+
+; CHECK: uxth
+; CHECK-NOT: uxth
+
diff --git a/test/CodeGen/ARM/v1-constant-fold.ll b/test/CodeGen/ARM/v1-constant-fold.ll
index eb49a81..7421d25 100644
--- a/test/CodeGen/ARM/v1-constant-fold.ll
+++ b/test/CodeGen/ARM/v1-constant-fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mattr=+v7,+vfp3,-neon | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mattr=+v7,+vfp3,-neon | FileCheck %s
; PR15611. Check that we don't crash when constant folding v1i32 types.
@@ -11,7 +11,7 @@ bb:
%tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3
%tmp4 = add <4 x i32> %tmp3, <i32 -1, i32 -1, i32 -1, i32 -1>
; CHECK: bl bar
- tail call void @bar(<4 x i32> %tmp4)
+ call void @bar(<4 x i32> %tmp4)
ret void
}
diff --git a/test/CodeGen/ARM/vaba.ll b/test/CodeGen/ARM/vaba.ll
index 97139e9..6478b18 100644
--- a/test/CodeGen/ARM/vaba.ll
+++ b/test/CodeGen/ARM/vaba.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: vabas8:
diff --git a/test/CodeGen/ARM/vabd.ll b/test/CodeGen/ARM/vabd.ll
index 2eb6d93..9ba8be2 100644
--- a/test/CodeGen/ARM/vabd.ll
+++ b/test/CodeGen/ARM/vabd.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vabds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vabds8:
diff --git a/test/CodeGen/ARM/vabs.ll b/test/CodeGen/ARM/vabs.ll
index 96dd38e..3a1aec8 100644
--- a/test/CodeGen/ARM/vabs.ll
+++ b/test/CodeGen/ARM/vabs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vabss8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vabss8:
@@ -28,7 +28,7 @@ define <2 x float> @vabsf32(<2 x float>* %A) nounwind {
;CHECK-LABEL: vabsf32:
;CHECK: vabs.f32
%tmp1 = load <2 x float>* %A
- %tmp2 = call <2 x float> @llvm.arm.neon.vabs.v2f32(<2 x float> %tmp1)
+ %tmp2 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
@@ -60,19 +60,19 @@ define <4 x float> @vabsQf32(<4 x float>* %A) nounwind {
;CHECK-LABEL: vabsQf32:
;CHECK: vabs.f32
%tmp1 = load <4 x float>* %A
- %tmp2 = call <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float> %tmp1)
+ %tmp2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone
-declare <2 x float> @llvm.arm.neon.vabs.v2f32(<2 x float>) nounwind readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vqabss8:
diff --git a/test/CodeGen/ARM/vadd.ll b/test/CodeGen/ARM/vadd.ll
index fcb5408..86b0d02 100644
--- a/test/CodeGen/ARM/vadd.ll
+++ b/test/CodeGen/ARM/vadd.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vaddi8:
diff --git a/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll b/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
new file mode 100644
index 0000000..19d6cbe
--- /dev/null
+++ b/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=arm-nacl-gnueabi | FileCheck %s
+
+declare void @llvm.va_start(i8*)
+declare void @external_func(i8*)
+
+@va_list = external global i8*
+
+; On ARM, varargs arguments are passed in r0-r3 with the rest on the
+; stack. A varargs function must therefore spill rN-r3 just below the
+; function's initial stack pointer.
+;
+; This test checks for a bug in which a gap was left between the spill
+; area and varargs arguments on the stack when using 16 byte stack
+; alignment.
+
+define void @varargs_func(i32 %arg1, ...) {
+ call void @llvm.va_start(i8* bitcast (i8** @va_list to i8*))
+ call void @external_func(i8* bitcast (i8** @va_list to i8*))
+ ret void
+}
+; CHECK-LABEL: varargs_func:
+; Reserve space for the varargs save area. This currently reserves
+; more than enough (16 bytes rather than the 12 bytes needed).
+; CHECK: sub sp, sp, #16
+; CHECK: push {lr}
+; Align the stack pointer to a multiple of 16.
+; CHECK: sub sp, sp, #12
+; Calculate the address of the varargs save area and save varargs
+; arguments into it.
+; CHECK-NEXT: add r0, sp, #20
+; CHECK-NEXT: stm r0, {r1, r2, r3}
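Editorial illustration, not part of the patch: the comment block in this test describes the AAPCS varargs convention — the first arguments arrive in r0-r3, the callee spills the registers not consumed by named parameters just below its incoming stack pointer, and va_arg then walks the spilled registers and the caller-pushed arguments as one contiguous area (the bug being tested left a gap between the two). A minimal C sketch of a function with the same shape, one named argument followed by "..."; the function name is invented here and is not part of the test.

#include <stdarg.h>

/* One named argument, as in varargs_func above, so the callee spills
 * r1-r3 below the incoming sp; va_arg reads the spilled registers first
 * and then continues into the caller-pushed stack arguments. */
int sum_rest(int count, ...) {
  va_list ap;
  int total = 0;
  va_start(ap, count);
  for (int i = 0; i < count; ++i)
    total += va_arg(ap, int);
  va_end(ap);
  return total;
}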
diff --git a/test/CodeGen/ARM/vargs.ll b/test/CodeGen/ARM/vargs.ll
index 5f3536c..3b810f3 100644
--- a/test/CodeGen/ARM/vargs.ll
+++ b/test/CodeGen/ARM/vargs.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
+
@str = internal constant [43 x i8] c"Hello World %d %d %d %d %d %d %d %d %d %d\0A\00" ; <[43 x i8]*> [#uses=1]
define i32 @main() {
diff --git a/test/CodeGen/ARM/vbits.ll b/test/CodeGen/ARM/vbits.ll
index 7b48441..dfeaacf 100644
--- a/test/CodeGen/ARM/vbits.ll
+++ b/test/CodeGen/ARM/vbits.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -mcpu=cortex-a8 %s -o - | FileCheck %s
define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: v_andi8:
diff --git a/test/CodeGen/ARM/vbsl.ll b/test/CodeGen/ARM/vbsl.ll
index 1e53e51..ddc37cc 100644
--- a/test/CodeGen/ARM/vbsl.ll
+++ b/test/CodeGen/ARM/vbsl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; rdar://12471808
diff --git a/test/CodeGen/ARM/vceq.ll b/test/CodeGen/ARM/vceq.ll
index 0a1f2eb..e3202e4 100644
--- a/test/CodeGen/ARM/vceq.ll
+++ b/test/CodeGen/ARM/vceq.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vceqi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vceqi8:
diff --git a/test/CodeGen/ARM/vcge.ll b/test/CodeGen/ARM/vcge.ll
index 81a59db..3739f5e 100644
--- a/test/CodeGen/ARM/vcge.ll
+++ b/test/CodeGen/ARM/vcge.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vcges8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vcges8:
@@ -145,7 +145,7 @@ define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vacge.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
@@ -154,12 +154,12 @@ define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vacge.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
-declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <8 x i8> @vcgei8Z(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vcgei8Z:
diff --git a/test/CodeGen/ARM/vcgt.ll b/test/CodeGen/ARM/vcgt.ll
index 056866f..2f736f6 100644
--- a/test/CodeGen/ARM/vcgt.ll
+++ b/test/CodeGen/ARM/vcgt.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -regalloc=basic %s -o - | FileCheck %s
define <8 x i8> @vcgts8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vcgts8:
@@ -146,7 +146,7 @@ define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vacgt.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
@@ -155,7 +155,7 @@ define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vacgt.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
@@ -172,8 +172,8 @@ define <4 x i32> @vcgt_zext(<4 x float>* %A, <4 x float>* %B) nounwind {
ret <4 x i32> %tmp4
}
-declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <8 x i8> @vcgti8Z(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vcgti8Z:
diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll
index 0b53979..390559b 100644
--- a/test/CodeGen/ARM/vcnt.ll
+++ b/test/CodeGen/ARM/vcnt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; NB: this tests vcnt, vclz, and vcls
define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll
index 527f93b..d611267 100644
--- a/test/CodeGen/ARM/vcombine.ll
+++ b/test/CodeGen/ARM/vcombine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK: vcombine8
diff --git a/test/CodeGen/ARM/vcvt.ll b/test/CodeGen/ARM/vcvt.ll
index 4f17dc5..af4e6a3 100644
--- a/test/CodeGen/ARM/vcvt.ll
+++ b/test/CodeGen/ARM/vcvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon,+fp16 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon,+fp16 %s -o - | FileCheck %s
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
;CHECK-LABEL: vcvt_f32tos32:
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index b24be26..89f355c 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs %s -o - \
+; RUN: | FileCheck %s
define <8 x i8> @v_dup8(i8 %A) nounwind {
;CHECK-LABEL: v_dup8:
@@ -331,3 +332,35 @@ define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
%2 = insertelement <8 x i8> %1, i8 %x, i32 1
ret <8 x i8> %2
}
+
+; Check that an SPR splat produces a vdup.
+
+define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
+;CHECK-LABEL: check_spr_splat2:
+;CHECK: vdup.32 d
+ %conv = sitofp i16 %q to float
+ %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
+ %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
+ %sub = fsub <2 x float> %splat.splat, %p
+ ret <2 x float> %sub
+}
+
+define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
+;CHECK-LABEL: check_spr_splat4:
+;CHECK: vdup.32 q
+ %conv = sitofp i16 %q to float
+ %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
+ %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+ %sub = fsub <4 x float> %splat.splat, %p
+ ret <4 x float> %sub
+}
+
+define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
+;CHECK-LABEL: check_spr_splat4_lane1:
+;CHECK: vdup.32 q{{.*}}, d{{.*}}[1]
+ %conv = sitofp i16 %q to float
+ %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
+ %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %sub = fsub <4 x float> %splat.splat, %p
+ ret <4 x float> %sub
+}
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index 5555a47..4407451 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: test_vextd:
diff --git a/test/CodeGen/ARM/vfcmp.ll b/test/CodeGen/ARM/vfcmp.ll
index a23db7b..4b2fea9 100644
--- a/test/CodeGen/ARM/vfcmp.ll
+++ b/test/CodeGen/ARM/vfcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; This tests fcmp operations that do not map directly to NEON instructions.
diff --git a/test/CodeGen/ARM/vfp-regs-dwarf.ll b/test/CodeGen/ARM/vfp-regs-dwarf.ll
new file mode 100644
index 0000000..4976729
--- /dev/null
+++ b/test/CodeGen/ARM/vfp-regs-dwarf.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s
+
+; Generated from:
+; void stack_offsets() {
+; asm("" ::: "d8", "d9", "d11", "d13");
+; }
+; Compiled with: "clang -target armv7-linux-gnueabihf -O3"
+
+; The important point we're checking here is that the .cfi directives describe
+; the layout of the VFP registers correctly. The fact that the numbers are
+; monotonic in memory is also a nice property to have.
+
+define void @stack_offsets() {
+; CHECK-LABEL: stack_offsets:
+; CHECK: vpush {d13}
+; CHECK: vpush {d11}
+; CHECK: vpush {d8, d9}
+
+; CHECK: .cfi_offset {{269|d13}}, -8
+; CHECK: .cfi_offset {{267|d11}}, -16
+; CHECK: .cfi_offset {{265|d9}}, -24
+; CHECK: .cfi_offset {{264|d8}}, -32
+
+; CHECK: vpop {d8, d9}
+; CHECK: vpop {d11}
+; CHECK: vpop {d13}
+ call void asm sideeffect "", "~{d8},~{d9},~{d11},~{d13}"() #1
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/Users/tim/llvm/build/tmp.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"tmp.c", metadata !"/Users/tim/llvm/build"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @stack_offsets, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/Users/tim/llvm/build/tmp.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null}
+!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+
diff --git a/test/CodeGen/ARM/vhadd.ll b/test/CodeGen/ARM/vhadd.ll
index 9c2ed57..6183db3 100644
--- a/test/CodeGen/ARM/vhadd.ll
+++ b/test/CodeGen/ARM/vhadd.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vhadds8:
diff --git a/test/CodeGen/ARM/vhsub.ll b/test/CodeGen/ARM/vhsub.ll
index 4bc2e87..f1a0cb2 100644
--- a/test/CodeGen/ARM/vhsub.ll
+++ b/test/CodeGen/ARM/vhsub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vhsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vhsubs8:
diff --git a/test/CodeGen/ARM/vicmp.ll b/test/CodeGen/ARM/vicmp.ll
index 0a8f103..bebb320 100644
--- a/test/CodeGen/ARM/vicmp.ll
+++ b/test/CodeGen/ARM/vicmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm -mattr=+neon %s -o - | FileCheck %s
; This tests icmp operations that do not map directly to NEON instructions.
; Not-equal (ne) operations are implemented by VCEQ/VMVN. Less-than (lt/ult)
diff --git a/test/CodeGen/ARM/vld1.ll b/test/CodeGen/ARM/vld1.ll
index 444d0d5..caeeada 100644
--- a/test/CodeGen/ARM/vld1.ll
+++ b/test/CodeGen/ARM/vld1.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
+
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic %s -o - \
+; RUN: | FileCheck %s
define <8 x i8> @vld1i8(i8* %A) nounwind {
;CHECK-LABEL: vld1i8:
diff --git a/test/CodeGen/ARM/vld2.ll b/test/CodeGen/ARM/vld2.ll
index fddafea..7ac5cc7 100644
--- a/test/CodeGen/ARM/vld2.ll
+++ b/test/CodeGen/ARM/vld2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
diff --git a/test/CodeGen/ARM/vld3.ll b/test/CodeGen/ARM/vld3.ll
index 400541f..171a03c 100644
--- a/test/CodeGen/ARM/vld3.ll
+++ b/test/CodeGen/ARM/vld3.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o -| FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -regalloc=basic %s -o - | FileCheck %s
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
@@ -83,6 +83,19 @@ define <1 x i64> @vld3i64(i64* %A) nounwind {
ret <1 x i64> %tmp4
}
+define <1 x i64> @vld3i64_update(i64** %ptr, i64* %A) nounwind {
+;CHECK-LABEL: vld3i64_update:
+;CHECK: vld1.64 {d16, d17, d18}, [r1:64]!
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16)
+ %tmp5 = getelementptr i64* %A, i32 3
+ store i64* %tmp5, i64** %ptr
+ %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
+ %tmp4 = add <1 x i64> %tmp2, %tmp3
+ ret <1 x i64> %tmp4
+}
+
define <16 x i8> @vld3Qi8(i8* %A) nounwind {
;CHECK-LABEL: vld3Qi8:
;Check the alignment value. Max for this instruction is 64 bits:
diff --git a/test/CodeGen/ARM/vld4.ll b/test/CodeGen/ARM/vld4.ll
index f7376b5..94ad143 100644
--- a/test/CodeGen/ARM/vld4.ll
+++ b/test/CodeGen/ARM/vld4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
@@ -83,6 +83,19 @@ define <1 x i64> @vld4i64(i64* %A) nounwind {
ret <1 x i64> %tmp4
}
+define <1 x i64> @vld4i64_update(i64** %ptr, i64* %A) nounwind {
+;CHECK-LABEL: vld4i64_update:
+;CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]!
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64)
+ %tmp5 = getelementptr i64* %A, i32 4
+ store i64* %tmp5, i64** %ptr
+ %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
+ %tmp4 = add <1 x i64> %tmp2, %tmp3
+ ret <1 x i64> %tmp4
+}
+
define <16 x i8> @vld4Qi8(i8* %A) nounwind {
;CHECK-LABEL: vld4Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll
index 5509f3e..64aac56 100644
--- a/test/CodeGen/ARM/vlddup.ll
+++ b/test/CodeGen/ARM/vlddup.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vld1dupi8(i8* %A) nounwind {
;CHECK-LABEL: vld1dupi8:
diff --git a/test/CodeGen/ARM/vldlane.ll b/test/CodeGen/ARM/vldlane.ll
index 7a83a4c..c7d69ff 100644
--- a/test/CodeGen/ARM/vldlane.ll
+++ b/test/CodeGen/ARM/vldlane.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
+
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic %s -o - \
+; RUN: | FileCheck %s
define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld1lanei8:
diff --git a/test/CodeGen/ARM/vminmax.ll b/test/CodeGen/ARM/vminmax.ll
index 81f4578..1167ebe 100644
--- a/test/CodeGen/ARM/vminmax.ll
+++ b/test/CodeGen/ARM/vminmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vmins8:
diff --git a/test/CodeGen/ARM/vmla.ll b/test/CodeGen/ARM/vmla.ll
index caf6556..6073fc5 100644
--- a/test/CodeGen/ARM/vmla.ll
+++ b/test/CodeGen/ARM/vmla.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vmlai8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
;CHECK-LABEL: vmlai8:
diff --git a/test/CodeGen/ARM/vmls.ll b/test/CodeGen/ARM/vmls.ll
index 61f3424..f86739c 100644
--- a/test/CodeGen/ARM/vmls.ll
+++ b/test/CodeGen/ARM/vmls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vmlsi8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
;CHECK-LABEL: vmlsi8:
diff --git a/test/CodeGen/ARM/vmov.ll b/test/CodeGen/ARM/vmov.ll
index 8b63138..7900af4 100644
--- a/test/CodeGen/ARM/vmov.ll
+++ b/test/CodeGen/ARM/vmov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @v_movi8() nounwind {
;CHECK-LABEL: v_movi8:
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index de329ac..0fa43d8 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vmuli8:
diff --git a/test/CodeGen/ARM/vneg.ll b/test/CodeGen/ARM/vneg.ll
index 1be4f74..4d548dd 100644
--- a/test/CodeGen/ARM/vneg.ll
+++ b/test/CodeGen/ARM/vneg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vnegs8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vnegs8:
diff --git a/test/CodeGen/ARM/vpadal.ll b/test/CodeGen/ARM/vpadal.ll
index a616a8d..ffeac73 100644
--- a/test/CodeGen/ARM/vpadal.ll
+++ b/test/CodeGen/ARM/vpadal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <4 x i16> @vpadals8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vpadals8:
diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll
index f84721f..01cb1c7 100644
--- a/test/CodeGen/ARM/vpadd.ll
+++ b/test/CodeGen/ARM/vpadd.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vpaddi8:
@@ -152,6 +152,17 @@ define void @addCombineToVPADDL() nounwind ssp {
ret void
}
+; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
+; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
+define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
+;CHECK-LABEL: fromExtendingExtractVectorElt:
+;CHECK: vpaddl.s16
+ %tmp1 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
+ %tmp2 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
+ %x = add <2 x i16> %tmp2, %tmp1
+ ret <2 x i16> %x
+}
+
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vpminmax.ll b/test/CodeGen/ARM/vpminmax.ll
index c68b319..0b893e5 100644
--- a/test/CodeGen/ARM/vpminmax.ll
+++ b/test/CodeGen/ARM/vpminmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vpmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vpmins8:
diff --git a/test/CodeGen/ARM/vqadd.ll b/test/CodeGen/ARM/vqadd.ll
index 7840766..81acc8b 100644
--- a/test/CodeGen/ARM/vqadd.ll
+++ b/test/CodeGen/ARM/vqadd.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vqadds8:
diff --git a/test/CodeGen/ARM/vqshl.ll b/test/CodeGen/ARM/vqshl.ll
index b5cd716..4afef6d 100644
--- a/test/CodeGen/ARM/vqshl.ll
+++ b/test/CodeGen/ARM/vqshl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vqshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vqshls8:
diff --git a/test/CodeGen/ARM/vqshrn.ll b/test/CodeGen/ARM/vqshrn.ll
index 4abae70..f02482c 100644
--- a/test/CodeGen/ARM/vqshrn.ll
+++ b/test/CodeGen/ARM/vqshrn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vqshrns8(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vqshrns8:
diff --git a/test/CodeGen/ARM/vqsub.ll b/test/CodeGen/ARM/vqsub.ll
index 90bc349..4af4380 100644
--- a/test/CodeGen/ARM/vqsub.ll
+++ b/test/CodeGen/ARM/vqsub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vqsubs8:
diff --git a/test/CodeGen/ARM/vrec.ll b/test/CodeGen/ARM/vrec.ll
index c0deca9..91979e5 100644
--- a/test/CodeGen/ARM/vrec.ll
+++ b/test/CodeGen/ARM/vrec.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vrecpei32:
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll
index b6da694..eb76ba6 100644
--- a/test/CodeGen/ARM/vrev.ll
+++ b/test/CodeGen/ARM/vrev.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8:
diff --git a/test/CodeGen/ARM/vsel.ll b/test/CodeGen/ARM/vsel.ll
index 7e1f714..746b1b0 100644
--- a/test/CodeGen/ARM/vsel.ll
+++ b/test/CodeGen/ARM/vsel.ll
@@ -61,7 +61,7 @@ define void @test_vsel32slt(i32 %lhs32, i32 %rhs32, float %a, float %b) {
%val1 = select i1 %tst1, float %a, float %b
store float %val1, float* @varfloat
; CHECK: cmp r0, r1
-; CHECK: vselgt.f32 s0, s1, s0
+; CHECK: vselge.f32 s0, s1, s0
ret void
}
define void @test_vsel64slt(i32 %lhs32, i32 %rhs32, double %a, double %b) {
@@ -70,7 +70,7 @@ define void @test_vsel64slt(i32 %lhs32, i32 %rhs32, double %a, double %b) {
%val1 = select i1 %tst1, double %a, double %b
store double %val1, double* @vardouble
; CHECK: cmp r0, r1
-; CHECK: vselgt.f64 d16, d1, d0
+; CHECK: vselge.f64 d16, d1, d0
ret void
}
define void @test_vsel32sle(i32 %lhs32, i32 %rhs32, float %a, float %b) {
@@ -79,7 +79,7 @@ define void @test_vsel32sle(i32 %lhs32, i32 %rhs32, float %a, float %b) {
%val1 = select i1 %tst1, float %a, float %b
store float %val1, float* @varfloat
; CHECK: cmp r0, r1
-; CHECK: vselge.f32 s0, s1, s0
+; CHECK: vselgt.f32 s0, s1, s0
ret void
}
define void @test_vsel64sle(i32 %lhs32, i32 %rhs32, double %a, double %b) {
@@ -88,7 +88,7 @@ define void @test_vsel64sle(i32 %lhs32, i32 %rhs32, double %a, double %b) {
%val1 = select i1 %tst1, double %a, double %b
store double %val1, double* @vardouble
; CHECK: cmp r0, r1
-; CHECK: vselge.f64 d16, d1, d0
+; CHECK: vselgt.f64 d16, d1, d0
ret void
}
define void @test_vsel32ogt(float %lhs32, float %rhs32, float %a, float %b) {
diff --git a/test/CodeGen/ARM/vselect_imax.ll b/test/CodeGen/ARM/vselect_imax.ll
index 9ea56a4..e999034 100644
--- a/test/CodeGen/ARM/vselect_imax.ll
+++ b/test/CodeGen/ARM/vselect_imax.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -march=arm -mcpu=cortex-a8 | FileCheck %s --check-prefix=COST
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; Make sure that ARM backend with NEON handles vselect.
define void @vmax_v4i32(<4 x i32>* %m, <4 x i32> %a, <4 x i32> %b) {
diff --git a/test/CodeGen/ARM/vshift.ll b/test/CodeGen/ARM/vshift.ll
index de380d3..618a137 100644
--- a/test/CodeGen/ARM/vshift.ll
+++ b/test/CodeGen/ARM/vshift.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vshls8:
@@ -180,7 +180,7 @@ define <8 x i8> @vlshri8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vlshri8:
;CHECK: vshr.u8
%tmp1 = load <8 x i8>* %A
- %tmp2 = lshr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp2 = lshr <8 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <8 x i8> %tmp2
}
@@ -188,7 +188,7 @@ define <4 x i16> @vlshri16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vlshri16:
;CHECK: vshr.u16
%tmp1 = load <4 x i16>* %A
- %tmp2 = lshr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
+ %tmp2 = lshr <4 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15 >
ret <4 x i16> %tmp2
}
@@ -196,7 +196,7 @@ define <2 x i32> @vlshri32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vlshri32:
;CHECK: vshr.u32
%tmp1 = load <2 x i32>* %A
- %tmp2 = lshr <2 x i32> %tmp1, < i32 32, i32 32 >
+ %tmp2 = lshr <2 x i32> %tmp1, < i32 31, i32 31 >
ret <2 x i32> %tmp2
}
@@ -204,7 +204,7 @@ define <1 x i64> @vlshri64(<1 x i64>* %A) nounwind {
;CHECK-LABEL: vlshri64:
;CHECK: vshr.u64
%tmp1 = load <1 x i64>* %A
- %tmp2 = lshr <1 x i64> %tmp1, < i64 64 >
+ %tmp2 = lshr <1 x i64> %tmp1, < i64 63 >
ret <1 x i64> %tmp2
}
@@ -252,7 +252,7 @@ define <16 x i8> @vlshrQi8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vlshrQi8:
;CHECK: vshr.u8
%tmp1 = load <16 x i8>* %A
- %tmp2 = lshr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp2 = lshr <16 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <16 x i8> %tmp2
}
@@ -260,7 +260,7 @@ define <8 x i16> @vlshrQi16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vlshrQi16:
;CHECK: vshr.u16
%tmp1 = load <8 x i16>* %A
- %tmp2 = lshr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ %tmp2 = lshr <8 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
ret <8 x i16> %tmp2
}
@@ -268,7 +268,7 @@ define <4 x i32> @vlshrQi32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vlshrQi32:
;CHECK: vshr.u32
%tmp1 = load <4 x i32>* %A
- %tmp2 = lshr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
+ %tmp2 = lshr <4 x i32> %tmp1, < i32 31, i32 31, i32 31, i32 31 >
ret <4 x i32> %tmp2
}
@@ -276,7 +276,7 @@ define <2 x i64> @vlshrQi64(<2 x i64>* %A) nounwind {
;CHECK-LABEL: vlshrQi64:
;CHECK: vshr.u64
%tmp1 = load <2 x i64>* %A
- %tmp2 = lshr <2 x i64> %tmp1, < i64 64, i64 64 >
+ %tmp2 = lshr <2 x i64> %tmp1, < i64 63, i64 63 >
ret <2 x i64> %tmp2
}
@@ -331,7 +331,7 @@ define <8 x i8> @vashri8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vashri8:
;CHECK: vshr.s8
%tmp1 = load <8 x i8>* %A
- %tmp2 = ashr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp2 = ashr <8 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <8 x i8> %tmp2
}
@@ -339,7 +339,7 @@ define <4 x i16> @vashri16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vashri16:
;CHECK: vshr.s16
%tmp1 = load <4 x i16>* %A
- %tmp2 = ashr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
+ %tmp2 = ashr <4 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15 >
ret <4 x i16> %tmp2
}
@@ -347,7 +347,7 @@ define <2 x i32> @vashri32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vashri32:
;CHECK: vshr.s32
%tmp1 = load <2 x i32>* %A
- %tmp2 = ashr <2 x i32> %tmp1, < i32 32, i32 32 >
+ %tmp2 = ashr <2 x i32> %tmp1, < i32 31, i32 31 >
ret <2 x i32> %tmp2
}
@@ -355,7 +355,7 @@ define <1 x i64> @vashri64(<1 x i64>* %A) nounwind {
;CHECK-LABEL: vashri64:
;CHECK: vshr.s64
%tmp1 = load <1 x i64>* %A
- %tmp2 = ashr <1 x i64> %tmp1, < i64 64 >
+ %tmp2 = ashr <1 x i64> %tmp1, < i64 63 >
ret <1 x i64> %tmp2
}
@@ -403,7 +403,7 @@ define <16 x i8> @vashrQi8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vashrQi8:
;CHECK: vshr.s8
%tmp1 = load <16 x i8>* %A
- %tmp2 = ashr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp2 = ashr <16 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <16 x i8> %tmp2
}
@@ -411,7 +411,7 @@ define <8 x i16> @vashrQi16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vashrQi16:
;CHECK: vshr.s16
%tmp1 = load <8 x i16>* %A
- %tmp2 = ashr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ %tmp2 = ashr <8 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
ret <8 x i16> %tmp2
}
@@ -419,7 +419,7 @@ define <4 x i32> @vashrQi32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vashrQi32:
;CHECK: vshr.s32
%tmp1 = load <4 x i32>* %A
- %tmp2 = ashr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
+ %tmp2 = ashr <4 x i32> %tmp1, < i32 31, i32 31, i32 31, i32 31 >
ret <4 x i32> %tmp2
}
@@ -427,6 +427,6 @@ define <2 x i64> @vashrQi64(<2 x i64>* %A) nounwind {
;CHECK-LABEL: vashrQi64:
;CHECK: vshr.s64
%tmp1 = load <2 x i64>* %A
- %tmp2 = ashr <2 x i64> %tmp1, < i64 64, i64 64 >
+ %tmp2 = ashr <2 x i64> %tmp1, < i64 63, i64 63 >
ret <2 x i64> %tmp2
}
diff --git a/test/CodeGen/ARM/vshiftins.ll b/test/CodeGen/ARM/vshiftins.ll
index 27610bf..9526c32 100644
--- a/test/CodeGen/ARM/vshiftins.ll
+++ b/test/CodeGen/ARM/vshiftins.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vsli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vsli8:
diff --git a/test/CodeGen/ARM/vshl.ll b/test/CodeGen/ARM/vshl.ll
index 462f7fe..6228652 100644
--- a/test/CodeGen/ARM/vshl.ll
+++ b/test/CodeGen/ARM/vshl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vshls8:
diff --git a/test/CodeGen/ARM/vshll.ll b/test/CodeGen/ARM/vshll.ll
index ae80664..27873eb 100644
--- a/test/CodeGen/ARM/vshll.ll
+++ b/test/CodeGen/ARM/vshll.ll
@@ -1,51 +1,57 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i16> @vshlls8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vshlls8:
;CHECK: vshll.s8
- %tmp1 = load <8 x i8>* %A
- %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
- ret <8 x i16> %tmp2
+ %tmp1 = load <8 x i8>* %A
+ %sext = sext <8 x i8> %tmp1 to <8 x i16>
+ %shift = shl <8 x i16> %sext, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %shift
}
define <4 x i32> @vshlls16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vshlls16:
;CHECK: vshll.s16
- %tmp1 = load <4 x i16>* %A
- %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
- ret <4 x i32> %tmp2
+ %tmp1 = load <4 x i16>* %A
+ %sext = sext <4 x i16> %tmp1 to <4 x i32>
+ %shift = shl <4 x i32> %sext, <i32 15, i32 15, i32 15, i32 15>
+ ret <4 x i32> %shift
}
define <2 x i64> @vshlls32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vshlls32:
;CHECK: vshll.s32
- %tmp1 = load <2 x i32>* %A
- %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
- ret <2 x i64> %tmp2
+ %tmp1 = load <2 x i32>* %A
+ %sext = sext <2 x i32> %tmp1 to <2 x i64>
+ %shift = shl <2 x i64> %sext, <i64 31, i64 31>
+ ret <2 x i64> %shift
}
define <8 x i16> @vshllu8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vshllu8:
;CHECK: vshll.u8
- %tmp1 = load <8 x i8>* %A
- %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
- ret <8 x i16> %tmp2
+ %tmp1 = load <8 x i8>* %A
+ %zext = zext <8 x i8> %tmp1 to <8 x i16>
+ %shift = shl <8 x i16> %zext, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %shift
}
define <4 x i32> @vshllu16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vshllu16:
;CHECK: vshll.u16
- %tmp1 = load <4 x i16>* %A
- %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
- ret <4 x i32> %tmp2
+ %tmp1 = load <4 x i16>* %A
+ %zext = zext <4 x i16> %tmp1 to <4 x i32>
+ %shift = shl <4 x i32> %zext, <i32 15, i32 15, i32 15, i32 15>
+ ret <4 x i32> %shift
}
define <2 x i64> @vshllu32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vshllu32:
;CHECK: vshll.u32
- %tmp1 = load <2 x i32>* %A
- %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
- ret <2 x i64> %tmp2
+ %tmp1 = load <2 x i32>* %A
+ %zext = zext <2 x i32> %tmp1 to <2 x i64>
+ %shift = shl <2 x i64> %zext, <i64 31, i64 31>
+ ret <2 x i64> %shift
}
; The following tests use the maximum shift count, so the signedness is
@@ -53,31 +59,58 @@ define <2 x i64> @vshllu32(<2 x i32>* %A) nounwind {
define <8 x i16> @vshlli8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vshlli8:
;CHECK: vshll.i8
- %tmp1 = load <8 x i8>* %A
- %tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >)
- ret <8 x i16> %tmp2
+ %tmp1 = load <8 x i8>* %A
+ %sext = sext <8 x i8> %tmp1 to <8 x i16>
+ %shift = shl <8 x i16> %sext, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ret <8 x i16> %shift
}
define <4 x i32> @vshlli16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vshlli16:
;CHECK: vshll.i16
- %tmp1 = load <4 x i16>* %A
- %tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 16, i16 16, i16 16, i16 16 >)
- ret <4 x i32> %tmp2
+ %tmp1 = load <4 x i16>* %A
+ %zext = zext <4 x i16> %tmp1 to <4 x i32>
+ %shift = shl <4 x i32> %zext, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %shift
}
define <2 x i64> @vshlli32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vshlli32:
;CHECK: vshll.i32
- %tmp1 = load <2 x i32>* %A
- %tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 32, i32 32 >)
- ret <2 x i64> %tmp2
+ %tmp1 = load <2 x i32>* %A
+ %zext = zext <2 x i32> %tmp1 to <2 x i64>
+ %shift = shl <2 x i64> %zext, <i64 32, i64 32>
+ ret <2 x i64> %shift
}
-declare <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+; And these have a shift just out of range so separate vmovl and vshl
+; instructions are needed.
+define <8 x i16> @vshllu8_bad(<8 x i8>* %A) nounwind {
+; CHECK-LABEL: vshllu8_bad:
+; CHECK: vmovl.u8
+; CHECK: vshl.i16
+ %tmp1 = load <8 x i8>* %A
+ %zext = zext <8 x i8> %tmp1 to <8 x i16>
+ %shift = shl <8 x i16> %zext, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
+ ret <8 x i16> %shift
+}
+
+define <4 x i32> @vshlls16_bad(<4 x i16>* %A) nounwind {
+; CHECK-LABEL: vshlls16_bad:
+; CHECK: vmovl.s16
+; CHECK: vshl.i32
+ %tmp1 = load <4 x i16>* %A
+ %sext = sext <4 x i16> %tmp1 to <4 x i32>
+ %shift = shl <4 x i32> %sext, <i32 17, i32 17, i32 17, i32 17>
+ ret <4 x i32> %shift
+}
-declare <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+define <2 x i64> @vshllu32_bad(<2 x i32>* %A) nounwind {
+; CHECK-LABEL: vshllu32_bad:
+; CHECK: vmovl.u32
+; CHECK: vshl.i64
+ %tmp1 = load <2 x i32>* %A
+ %zext = zext <2 x i32> %tmp1 to <2 x i64>
+ %shift = shl <2 x i64> %zext, <i64 33, i64 33>
+ ret <2 x i64> %shift
+}
diff --git a/test/CodeGen/ARM/vshrn.ll b/test/CodeGen/ARM/vshrn.ll
index 40a94fe..8aa009a 100644
--- a/test/CodeGen/ARM/vshrn.ll
+++ b/test/CodeGen/ARM/vshrn.ll
@@ -1,32 +1,61 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vshrns8(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vshrns8:
;CHECK: vshrn.i16
%tmp1 = load <8 x i16>* %A
- %tmp2 = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
- ret <8 x i8> %tmp2
+ %tmp2 = lshr <8 x i16> %tmp1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+ ret <8 x i8> %tmp3
}
define <4 x i16> @vshrns16(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vshrns16:
;CHECK: vshrn.i32
%tmp1 = load <4 x i32>* %A
- %tmp2 = call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
- ret <4 x i16> %tmp2
+ %tmp2 = ashr <4 x i32> %tmp1, <i32 16, i32 16, i32 16, i32 16>
+ %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+ ret <4 x i16> %tmp3
}
define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
;CHECK-LABEL: vshrns32:
;CHECK: vshrn.i64
%tmp1 = load <2 x i64>* %A
- %tmp2 = call <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
- ret <2 x i32> %tmp2
+ %tmp2 = ashr <2 x i64> %tmp1, <i64 32, i64 32>
+ %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vshrns8_bad(<8 x i16>* %A) nounwind {
+; CHECK-LABEL: vshrns8_bad:
+; CHECK: vshr.s16
+; CHECK: vmovn.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = ashr <8 x i16> %tmp1, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
+ %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vshrns16_bad(<4 x i32>* %A) nounwind {
+; CHECK-LABEL: vshrns16_bad:
+; CHECK: vshr.u32
+; CHECK: vmovn.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = lshr <4 x i32> %tmp1, <i32 17, i32 17, i32 17, i32 17>
+ %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+ ret <4 x i16> %tmp3
}
-declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+define <2 x i32> @vshrns32_bad(<2 x i64>* %A) nounwind {
+; CHECK-LABEL: vshrns32_bad:
+; CHECK: vshr.u64
+; CHECK: vmovn.i64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = lshr <2 x i64> %tmp1, <i64 33, i64 33>
+ %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+ ret <2 x i32> %tmp3
+}
define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vrshrns8:
diff --git a/test/CodeGen/ARM/vsra.ll b/test/CodeGen/ARM/vsra.ll
index 7a211c3..fa5985a 100644
--- a/test/CodeGen/ARM/vsra.ll
+++ b/test/CodeGen/ARM/vsra.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vsras8:
;CHECK: vsra.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = ashr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
- %tmp4 = add <8 x i8> %tmp1, %tmp3
+ %tmp3 = ashr <8 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
@@ -15,7 +15,7 @@ define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsra.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = ashr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
+ %tmp3 = ashr <4 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15 >
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
@@ -25,7 +25,7 @@ define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsra.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = ashr <2 x i32> %tmp2, < i32 32, i32 32 >
+ %tmp3 = ashr <2 x i32> %tmp2, < i32 31, i32 31 >
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
@@ -35,7 +35,7 @@ define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vsra.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
- %tmp3 = ashr <1 x i64> %tmp2, < i64 64 >
+ %tmp3 = ashr <1 x i64> %tmp2, < i64 63 >
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
@@ -45,7 +45,7 @@ define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vsra.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = ashr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp3 = ashr <16 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
@@ -55,7 +55,7 @@ define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vsra.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = ashr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ %tmp3 = ashr <8 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
@@ -65,7 +65,7 @@ define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vsra.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = ashr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
+ %tmp3 = ashr <4 x i32> %tmp2, < i32 31, i32 31, i32 31, i32 31 >
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
@@ -75,7 +75,7 @@ define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vsra.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = ashr <2 x i64> %tmp2, < i64 64, i64 64 >
+ %tmp3 = ashr <2 x i64> %tmp2, < i64 63, i64 63 >
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
@@ -85,7 +85,7 @@ define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsra.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = lshr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp3 = lshr <8 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
@@ -95,7 +95,7 @@ define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsra.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = lshr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
+ %tmp3 = lshr <4 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15 >
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
@@ -105,7 +105,7 @@ define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsra.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = lshr <2 x i32> %tmp2, < i32 32, i32 32 >
+ %tmp3 = lshr <2 x i32> %tmp2, < i32 31, i32 31 >
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
@@ -115,7 +115,7 @@ define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vsra.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
- %tmp3 = lshr <1 x i64> %tmp2, < i64 64 >
+ %tmp3 = lshr <1 x i64> %tmp2, < i64 63 >
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
@@ -125,7 +125,7 @@ define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vsra.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = lshr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+ %tmp3 = lshr <16 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
@@ -135,7 +135,7 @@ define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vsra.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = lshr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
+ %tmp3 = lshr <8 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
@@ -145,7 +145,7 @@ define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vsra.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = lshr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
+ %tmp3 = lshr <4 x i32> %tmp2, < i32 31, i32 31, i32 31, i32 31 >
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
@@ -155,7 +155,7 @@ define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vsra.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = lshr <2 x i64> %tmp2, < i64 64, i64 64 >
+ %tmp3 = lshr <2 x i64> %tmp2, < i64 63, i64 63 >
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
diff --git a/test/CodeGen/ARM/vst1.ll b/test/CodeGen/ARM/vst1.ll
index 36439fd..14f3ff0 100644
--- a/test/CodeGen/ARM/vst1.ll
+++ b/test/CodeGen/ARM/vst1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vst1i8:
diff --git a/test/CodeGen/ARM/vst2.ll b/test/CodeGen/ARM/vst2.ll
index 7551a56..2180259 100644
--- a/test/CodeGen/ARM/vst2.ll
+++ b/test/CodeGen/ARM/vst2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vst2i8:
diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll
index 91eb7fc..5f150ed 100644
--- a/test/CodeGen/ARM/vst3.ll
+++ b/test/CodeGen/ARM/vst3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon -fast-isel=0 -O0 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -fast-isel=0 -O0 %s -o - | FileCheck %s
define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vst3i8:
@@ -61,6 +61,18 @@ define void @vst3i64(i64* %A, <1 x i64>* %B) nounwind {
ret void
}
+define void @vst3i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vst3i64_update
+;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
+ %A = load i64** %ptr
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = load <1 x i64>* %B
+ call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
+ %tmp2 = getelementptr i64* %A, i32 3
+ store i64* %tmp2, i64** %ptr
+ ret void
+}
+
define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vst3Qi8:
;Check the alignment value. Max for this instruction is 64 bits:
diff --git a/test/CodeGen/ARM/vst4.ll b/test/CodeGen/ARM/vst4.ll
index ef5c83a..44c76b5 100644
--- a/test/CodeGen/ARM/vst4.ll
+++ b/test/CodeGen/ARM/vst4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vst4i8:
@@ -60,6 +60,18 @@ define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind {
ret void
}
+define void @vst4i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vst4i64_update:
+;CHECK: vst1.64 {d16, d17, d18, d19}, [r1]!
+ %A = load i64** %ptr
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = load <1 x i64>* %B
+ call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
+ %tmp2 = getelementptr i64* %A, i32 4
+ store i64* %tmp2, i64** %ptr
+ ret void
+}
+
define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vst4Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll
index 34c5c70..7dd6e7b 100644
--- a/test/CodeGen/ARM/vstlane.ll
+++ b/test/CodeGen/ARM/vstlane.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm -mattr=+neon %s -o - | FileCheck %s
define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vst1lanei8:
diff --git a/test/CodeGen/ARM/vsub.ll b/test/CodeGen/ARM/vsub.ll
index 6b95b97..d1a094b 100644
--- a/test/CodeGen/ARM/vsub.ll
+++ b/test/CodeGen/ARM/vsub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vsubi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vsubi8:
diff --git a/test/CodeGen/ARM/vtbl.ll b/test/CodeGen/ARM/vtbl.ll
index 21614b0..32258a3 100644
--- a/test/CodeGen/ARM/vtbl.ll
+++ b/test/CodeGen/ARM/vtbl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll
index 7d101bc..cdae7f8 100644
--- a/test/CodeGen/ARM/vtrn.ll
+++ b/test/CodeGen/ARM/vtrn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vtrni8:
diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll
index 2d193c1..832be6c 100644
--- a/test/CodeGen/ARM/vuzp.ll
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vuzpi8:
diff --git a/test/CodeGen/ARM/vzip.ll b/test/CodeGen/ARM/vzip.ll
index f71aef7..f74dc62 100644
--- a/test/CodeGen/ARM/vzip.ll
+++ b/test/CodeGen/ARM/vzip.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vzipi8:
diff --git a/test/CodeGen/ARM/warn-stack.ll b/test/CodeGen/ARM/warn-stack.ll
index 9538bbf..90a3e1f 100644
--- a/test/CodeGen/ARM/warn-stack.ll
+++ b/test/CodeGen/ARM/warn-stack.ll
@@ -12,7 +12,7 @@ entry:
ret void
}
-; CHECK: warning: Stack size limit exceeded (96) in warn.
+; CHECK: warning: stack size limit exceeded (96) in warn
define void @warn() nounwind ssp {
entry:
%buffer = alloca [80 x i8], align 1
diff --git a/test/CodeGen/ARM/weak.ll b/test/CodeGen/ARM/weak.ll
index 5ac4b8c..375ce22 100644
--- a/test/CodeGen/ARM/weak.ll
+++ b/test/CodeGen/ARM/weak.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=arm | grep .weak.*f
-; RUN: llc < %s -march=arm | grep .weak.*h
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define weak i32 @f() {
entry:
@@ -14,3 +13,6 @@ entry:
declare extern_weak void @h()
+; CHECK: {{.}}weak{{.*}}f
+; CHECK: {{.}}weak{{.*}}h
+
diff --git a/test/CodeGen/ARM/weak2.ll b/test/CodeGen/ARM/weak2.ll
index cf327bb..82ab90e 100644
--- a/test/CodeGen/ARM/weak2.ll
+++ b/test/CodeGen/ARM/weak2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep .weak
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f(i32 %a) {
entry:
@@ -16,3 +16,6 @@ UnifiedReturnBlock: ; preds = %entry
}
declare extern_weak i32 @test_weak(...)
+
+; CHECK: {{.}}weak
+
diff --git a/test/CodeGen/ARM/zero-cycle-zero.ll b/test/CodeGen/ARM/zero-cycle-zero.ll
new file mode 100644
index 0000000..121a87f
--- /dev/null
+++ b/test/CodeGen/ARM/zero-cycle-zero.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mtriple=armv8 -mcpu=cyclone < %s | FileCheck %s --check-prefix=CHECK-CYCLONE
+; RUN: llc -mtriple=armv8 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-SWIFT
+
+declare arm_aapcs_vfpcc void @take_vec64(<2 x i32>)
+
+define void @test_vec64() {
+; CHECK-CYCLONE-LABEL: test_vec64:
+; CHECK-SWIFT-LABEL: test_vec64:
+
+ call arm_aapcs_vfpcc void @take_vec64(<2 x i32> <i32 0, i32 0>)
+ call arm_aapcs_vfpcc void @take_vec64(<2 x i32> <i32 0, i32 0>)
+; CHECK-CYCLONE-NOT: vmov.f64 d0,
+; CHECK-CYCLONE: vmov.i32 d0, #0
+; CHECK-CYCLONE: bl
+; CHECK-CYCLONE: vmov.i32 d0, #0
+; CHECK-CYCLONE: bl
+
+; CHECK-SWIFT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK-SWIFT: vmov.i32 [[ZEROREG]], #0
+; CHECK-SWIFT: vorr d0, [[ZEROREG]], [[ZEROREG]]
+; CHECK-SWIFT: bl
+; CHECK-SWIFT: vorr d0, [[ZEROREG]], [[ZEROREG]]
+; CHECK-SWIFT: bl
+
+ ret void
+}
+
+declare arm_aapcs_vfpcc void @take_vec128(<8 x i16>)
+
+define void @test_vec128() {
+; CHECK-CYCLONE-LABEL: test_vec128:
+; CHECK-SWIFT-LABEL: test_vec128:
+
+ call arm_aapcs_vfpcc void @take_vec128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+ call arm_aapcs_vfpcc void @take_vec128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+; CHECK-CYCLONE-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK-CYCLONE: vmov.i32 q0, #0
+; CHECK-CYCLONE: bl
+; CHECK-CYCLONE: vmov.i32 q0, #0
+; CHECK-CYCLONE: bl
+
+; CHECK-SWIFT-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK-SWIFT: vmov.i32 [[ZEROREG:q[0-9]+]], #0
+; CHECK-SWIFT: vorr q0, [[ZEROREG]], [[ZEROREG]]
+; CHECK-SWIFT: bl
+; CHECK-SWIFT: vorr q0, [[ZEROREG]], [[ZEROREG]]
+; CHECK-SWIFT: bl
+
+ ret void
+}
+
+declare void @take_i32(i32)
+
+define void @test_i32() {
+; CHECK-CYCLONE-LABEL: test_i32:
+; CHECK-SWIFT-LABEL: test_i32:
+
+ call arm_aapcs_vfpcc void @take_i32(i32 0)
+ call arm_aapcs_vfpcc void @take_i32(i32 0)
+; CHECK-CYCLONE-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK-CYCLONE: mov r0, #0
+; CHECK-CYCLONE: bl
+; CHECK-CYCLONE: mov r0, #0
+; CHECK-CYCLONE: bl
+
+; It doesn't particularly matter what Swift does here, there isn't carefully
+; crafted behaviour that we might break in Cyclone.
+
+ ret void
+}
diff --git a/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll b/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll
new file mode 100644
index 0000000..6fb7c3f
--- /dev/null
+++ b/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin
+
+; Can't copy or spill / restore CPSR.
+; rdar://9105206
+
+define fastcc void @t() ssp align 2 {
+entry:
+ br i1 undef, label %bb3.i, label %bb2.i
+
+bb2.i: ; preds = %entry
+ br label %bb3.i
+
+bb3.i: ; preds = %bb2.i, %entry
+ br i1 undef, label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71, label %bb.i69
+
+bb.i69: ; preds = %bb3.i
+ br label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+
+_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71: ; preds = %bb.i69, %bb3.i
+ %0 = select i1 undef, float 0.000000e+00, float undef
+ %1 = fdiv float %0, undef
+ %2 = fcmp ult float %1, 0xBF847AE140000000
+ %storemerge9 = select i1 %2, float %1, float 0.000000e+00
+ store float %storemerge9, float* undef, align 4
+ br i1 undef, label %bb42, label %bb47
+
+bb42: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+ br i1 undef, label %bb46, label %bb53
+
+bb46: ; preds = %bb42
+ br label %bb48
+
+bb47: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+ br label %bb48
+
+bb48: ; preds = %bb47, %bb46
+ br i1 undef, label %bb1.i14, label %bb.i13
+
+bb.i13: ; preds = %bb48
+ br label %bb1.i14
+
+bb1.i14: ; preds = %bb.i13, %bb48
+ br label %bb53
+
+bb53: ; preds = %bb1.i14, %bb42
+ ret void
+}
diff --git a/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll
new file mode 100644
index 0000000..2b083d8
--- /dev/null
+++ b/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin
+
+; rdar://9146594
+
+define void @drt_vsprintf() nounwind ssp {
+entry:
+ %do_tab_convert = alloca i32, align 4
+ br i1 undef, label %if.then24, label %if.else295, !dbg !13
+
+if.then24: ; preds = %entry
+ unreachable
+
+if.else295: ; preds = %entry
+ call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18
+ store i32 0, i32* %do_tab_convert, align 4, !dbg !19
+ unreachable
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.gv = !{!0}
+!llvm.dbg.sp = !{!1, !7, !10, !11, !12}
+
+!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!8 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!9 = metadata !{null}
+!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!13 = metadata !{i32 653, i32 5, metadata !14, null}
+!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{i32 853, i32 11, metadata !17, null}
+!19 = metadata !{i32 853, i32 29, metadata !17, null}
+!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"}
+!21 = metadata !{i32 0}
diff --git a/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll b/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll
new file mode 100644
index 0000000..6f0ec34
--- /dev/null
+++ b/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+define void @foo(i64 %val) {
+; CHECK: foo
+; The stack frame store is not 64-bit aligned. Make sure we use an
+; instruction that can handle that.
+; CHECK: stur x0, [sp, #20]
+ %a = alloca [49 x i32], align 4
+ %p32 = getelementptr inbounds [49 x i32]* %a, i64 0, i64 2
+ %p = bitcast i32* %p32 to i64*
+ store i64 %val, i64* %p, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll b/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll
new file mode 100644
index 0000000..88232fc
--- /dev/null
+++ b/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=arm64-apple-iOS5.0
+
+; CPSR is not allocatable so fast allocatable wouldn't mark them killed.
+; rdar://9313272
+
+define hidden void @t() nounwind {
+entry:
+ %cmp = icmp eq i32* null, undef
+ %frombool = zext i1 %cmp to i8
+ store i8 %frombool, i8* undef, align 1
+ %tmp4 = load i8* undef, align 1
+ %tobool = trunc i8 %tmp4 to i1
+ br i1 %tobool, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %entry
+ unreachable
+
+if.end: ; preds = %entry
+ br i1 undef, label %land.lhs.true14, label %if.end33
+
+land.lhs.true14: ; preds = %if.end
+ unreachable
+
+if.end33: ; preds = %if.end
+ unreachable
+}
diff --git a/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll b/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll
new file mode 100644
index 0000000..ea1cd02
--- /dev/null
+++ b/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+
+; Can't fold the increment by 1<<12 into a post-increment load
+; rdar://10301335
+
+@test_data = common global i32 0, align 4
+
+define void @t() nounwind ssp {
+; CHECK-LABEL: t:
+entry:
+ br label %for.body
+
+for.body:
+; CHECK: for.body
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
+; CHECK: add x[[REG:[0-9]+]],
+; CHECK: x[[REG]], #4096
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 12
+ %add = add nsw i64 %0, 34628173824
+ %1 = inttoptr i64 %add to i32*
+ %2 = load volatile i32* %1, align 4096
+ store volatile i32 %2, i32* @test_data, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 200
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll b/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll
new file mode 100644
index 0000000..d47dbb2
--- /dev/null
+++ b/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=arm64
+
+; The target lowering for integer comparisons was replacing some DAG nodes
+; during operation legalization, which resulted in dangling pointers,
+; cycles in DAGs, and eventually crashes. This is the testcase for
+; one of those crashes. (rdar://10653656)
+
+define void @test(i1 zeroext %IsArrow) nounwind ssp align 2 {
+entry:
+ br i1 undef, label %return, label %lor.lhs.false
+
+lor.lhs.false:
+ br i1 undef, label %return, label %if.end
+
+if.end:
+ %tmp.i = load i64* undef, align 8
+ %and.i.i.i = and i64 %tmp.i, -16
+ br i1 %IsArrow, label %if.else_crit_edge, label %if.end32
+
+if.else_crit_edge:
+ br i1 undef, label %if.end32, label %return
+
+if.end32:
+ %0 = icmp ult i32 undef, 3
+ %1 = zext i64 %tmp.i to i320
+ %.pn.v = select i1 %0, i320 128, i320 64
+ %.pn = shl i320 %1, %.pn.v
+ %ins346392 = or i320 %.pn, 0
+ store i320 %ins346392, i320* undef, align 8
+ br i1 undef, label %sw.bb.i.i, label %exit
+
+sw.bb.i.i:
+ unreachable
+
+exit:
+ unreachable
+
+return:
+ ret void
+}
diff --git a/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll b/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll
new file mode 100644
index 0000000..a4d37e4
--- /dev/null
+++ b/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i32 @foo(<4 x i32> %a, i32 %n) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = bitcast <4 x i32> %a to i128
+ %c = trunc i128 %b to i32
+ ret i32 %c
+}
+
+define i64 @bar(<2 x i64> %a, i64 %n) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: fmov x0, d0
+; CHECK-NEXT: ret
+ %b = bitcast <2 x i64> %a to i128
+ %c = trunc i128 %b to i64
+ ret i64 %c
+}
+
diff --git a/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll
new file mode 100644
index 0000000..d59b0d0
--- /dev/null
+++ b/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march arm64 -mcpu=cyclone | FileCheck %s
+; <rdar://problem/11294426>
+
+@b = private unnamed_addr constant [3 x i32] [i32 1768775988, i32 1685481784, i32 1836253201], align 4
+
+; The important thing for this test is that we need an unaligned load of `l_b'
+; ("ldr w2, [x1, #8]" in this case).
+
+; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}}
+; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}}
+; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8]
+; CHECK-NEXT: str [[VAL]], [x0, #8]
+; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]]
+; CHECK-NEXT: str [[VAL2]], [x0]
+
+define void @foo(i8* %a) {
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([3 x i32]* @b to i8*), i64 12, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll b/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll
new file mode 100644
index 0000000..d1840d3
--- /dev/null
+++ b/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX
+; <rdar://problem/11392109>
+
+define hidden void @t() optsize ssp {
+entry:
+ store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8
+; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF]
+; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff
+; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}]
+ unreachable
+}
+
+declare i64 @x(i32) optsize
+
+; Worth checking the Linux code is sensible too: only way to access
+; the GOT is via a 64-bit load. Just loading wN is unacceptable
+; (there's no ELF relocation to do that).
+
+; CHECK-LINUX: adrp {{x[0-9]+}}, :got:x
+; CHECK-LINUX: ldr {{x[0-9]+}}, [{{x[0-9]+}}, :got_lo12:x]
diff --git a/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll b/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll
new file mode 100644
index 0000000..4b037db
--- /dev/null
+++ b/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -verify-machineinstrs | FileCheck %s
+
+; LdStOpt bug created illegal instruction:
+; %D1<def>, %D2<def> = LDPSi %X0, 1
+; rdar://11512047
+
+%0 = type opaque
+%struct.CGRect = type { %struct.CGPoint, %struct.CGSize }
+%struct.CGPoint = type { double, double }
+%struct.CGSize = type { double, double }
+
+@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
+define hidden %struct.CGRect @t(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: t:
+; CHECK: ldp d{{[0-9]+}}, d{{[0-9]+}}
+ %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+ %0 = bitcast %0* %self to i8*
+ %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+ %add.ptr10.0 = bitcast i8* %add.ptr to double*
+ %tmp11 = load double* %add.ptr10.0, align 8
+ %add.ptr.sum = add i64 %ivar, 8
+ %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+ %1 = bitcast i8* %add.ptr10.1 to double*
+ %tmp12 = load double* %1, align 8
+ %add.ptr.sum17 = add i64 %ivar, 16
+ %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17
+ %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+ %tmp = load double* %add.ptr4.1.0, align 8
+ %add.ptr4.1.sum = add i64 %ivar, 24
+ %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum
+ %2 = bitcast i8* %add.ptr4.1.1 to double*
+ %tmp5 = load double* %2, align 8
+ %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+ %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+ %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+ %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+ %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+ %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+ ret %struct.CGRect %insert3
+}
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!4 = metadata !{}
diff --git a/test/CodeGen/ARM64/2012-06-06-FPToUI.ll b/test/CodeGen/ARM64/2012-06-06-FPToUI.ll
new file mode 100644
index 0000000..dda4ff5
--- /dev/null
+++ b/test/CodeGen/ARM64/2012-06-06-FPToUI.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=arm64 -O0 < %s | FileCheck %s
+; RUN: llc -march=arm64 -O3 < %s | FileCheck %s
+
+@.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1
+@.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1
+@.str2 = private unnamed_addr constant [8 x i8] c"%f %lu\0A\00", align 1
+@.str3 = private unnamed_addr constant [7 x i8] c"%f %u\0A\00", align 1
+
+define void @testDouble(double %d) ssp {
+; CHECK: fcvtzu x{{.}}, d{{.}}
+; CHECK: fcvtzu w{{.}}, d{{.}}
+entry:
+ %d.addr = alloca double, align 8
+ store double %d, double* %d.addr, align 8
+ %0 = load double* %d.addr, align 8
+ %1 = load double* %d.addr, align 8
+ %conv = fptoui double %1 to i64
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
+ %2 = load double* %d.addr, align 8
+ %3 = load double* %d.addr, align 8
+ %conv1 = fptoui double %3 to i32
+ %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
+ ret void
+}
+
+declare i32 @printf(i8*, ...)
+
+define void @testFloat(float %f) ssp {
+; CHECK: fcvtzu x{{.}}, s{{.}}
+; CHECK: fcvtzu w{{.}}, s{{.}}
+entry:
+ %f.addr = alloca float, align 4
+ store float %f, float* %f.addr, align 4
+ %0 = load float* %f.addr, align 4
+ %conv = fpext float %0 to double
+ %1 = load float* %f.addr, align 4
+ %conv1 = fptoui float %1 to i64
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
+ %2 = load float* %f.addr, align 4
+ %conv2 = fpext float %2 to double
+ %3 = load float* %f.addr, align 4
+ %conv3 = fptoui float %3 to i32
+ %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
+ ret void
+}
+
+define i32 @main(i32 %argc, i8** %argv) ssp {
+entry:
+ %retval = alloca i32, align 4
+ %argc.addr = alloca i32, align 4
+ %argv.addr = alloca i8**, align 8
+ store i32 0, i32* %retval
+ store i32 %argc, i32* %argc.addr, align 4
+ store i8** %argv, i8*** %argv.addr, align 8
+ call void @testDouble(double 1.159198e+01)
+ call void @testFloat(float 0x40272F1800000000)
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
diff --git a/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll b/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll
new file mode 100644
index 0000000..55ecfb5
--- /dev/null
+++ b/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios
+; rdar://11849816
+
+@shlib_path_substitutions = external hidden unnamed_addr global i8**, align 8
+
+declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+
+declare noalias i8* @xmalloc(i64) optsize
+
+declare i64 @strlen(i8* nocapture) nounwind readonly optsize
+
+declare i8* @__strcpy_chk(i8*, i8*, i64) nounwind optsize
+
+declare i8* @__strcat_chk(i8*, i8*, i64) nounwind optsize
+
+declare noalias i8* @xstrdup(i8*) optsize
+
+define i8* @dyld_fix_path(i8* %path) nounwind optsize ssp {
+entry:
+ br i1 undef, label %if.end56, label %for.cond
+
+for.cond: ; preds = %entry
+ br i1 undef, label %for.cond10, label %for.body
+
+for.body: ; preds = %for.cond
+ unreachable
+
+for.cond10: ; preds = %for.cond
+ br i1 undef, label %if.end56, label %for.body14
+
+for.body14: ; preds = %for.cond10
+ %call22 = tail call i64 @strlen(i8* undef) nounwind optsize
+ %sext = shl i64 %call22, 32
+ %conv30 = ashr exact i64 %sext, 32
+ %add29 = sub i64 0, %conv30
+ %sub = add i64 %add29, 0
+ %add31 = shl i64 %sub, 32
+ %sext59 = add i64 %add31, 4294967296
+ %conv33 = ashr exact i64 %sext59, 32
+ %call34 = tail call noalias i8* @xmalloc(i64 %conv33) nounwind optsize
+ br i1 undef, label %cond.false45, label %cond.true43
+
+cond.true43: ; preds = %for.body14
+ unreachable
+
+cond.false45: ; preds = %for.body14
+ %add.ptr = getelementptr inbounds i8* %path, i64 %conv30
+ unreachable
+
+if.end56: ; preds = %for.cond10, %entry
+ ret i8* null
+}
+
+declare i32 @strncmp(i8* nocapture, i8* nocapture, i64) nounwind readonly optsize
+
+declare i8* @strcpy(i8*, i8* nocapture) nounwind
diff --git a/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll b/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll
new file mode 100644
index 0000000..b40a581
--- /dev/null
+++ b/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+;FAST-LABEL: _Z9example25v:
+;FAST: fcmgt.4s
+;FAST: ret
+
+;CHECK-LABEL: _Z9example25v:
+;CHECK: fcmgt.4s
+;CHECK: ret
+
+define <4 x i32> @_Z9example25v( <4 x float> %N0, <4 x float> %N1) {
+ %A = fcmp olt <4 x float> %N0, %N1
+ %B = zext <4 x i1> %A to <4 x i32>
+ ret <4 x i32> %B
+}
diff --git a/test/CodeGen/ARM64/2013-01-23-frem-crash.ll b/test/CodeGen/ARM64/2013-01-23-frem-crash.ll
new file mode 100644
index 0000000..9451124
--- /dev/null
+++ b/test/CodeGen/ARM64/2013-01-23-frem-crash.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64
+; Make sure we are not crashing on this test.
+
+define void @autogen_SD13158() {
+entry:
+ %B26 = frem float 0.000000e+00, undef
+ br i1 undef, label %CF, label %CF77
+
+CF: ; preds = %CF, %CF76
+ store float %B26, float* undef
+ br i1 undef, label %CF, label %CF77
+
+CF77: ; preds = %CF
+ ret void
+}
diff --git a/test/CodeGen/ARM64/2013-01-23-sext-crash.ll b/test/CodeGen/ARM64/2013-01-23-sext-crash.ll
new file mode 100644
index 0000000..404027b
--- /dev/null
+++ b/test/CodeGen/ARM64/2013-01-23-sext-crash.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=arm64
+
+; Make sure we are not crashing on this test.
+
+define void @autogen_SD12881() {
+BB:
+ %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+ br label %CF
+
+CF: ; preds = %CF83, %CF, %BB
+ br i1 undef, label %CF, label %CF83
+
+CF83: ; preds = %CF
+ %FC70 = sitofp <4 x i32> %B17 to <4 x double>
+ br label %CF
+}
+
+
+define void @autogen_SD12881_2() {
+BB:
+ %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+ br label %CF
+
+CF: ; preds = %CF83, %CF, %BB
+ br i1 undef, label %CF, label %CF83
+
+CF83: ; preds = %CF
+ %FC70 = uitofp <4 x i32> %B17 to <4 x double>
+ br label %CF
+}
+
+define void @_Z12my_example2bv() nounwind noinline ssp {
+entry:
+ %0 = fptosi <2 x double> undef to <2 x i32>
+ store <2 x i32> %0, <2 x i32>* undef, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll b/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll
new file mode 100644
index 0000000..70e745f
--- /dev/null
+++ b/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple
+
+;CHECK-LABEL: Shuff:
+;CHECK: tbl.8b
+;CHECK: ret
+define <8 x i8 > @Shuff(<8 x i8> %in, <8 x i8>* %out) nounwind ssp {
+ %value = shufflevector <8 x i8> %in, <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i8> %value
+}
+
+
diff --git a/test/CodeGen/ARM64/AdvSIMD-Scalar.ll b/test/CodeGen/ARM64/AdvSIMD-Scalar.ll
new file mode 100644
index 0000000..6397ac5
--- /dev/null
+++ b/test/CodeGen/ARM64/AdvSIMD-Scalar.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -arm64-simd-scalar=true -asm-verbose=false | FileCheck %s
+;
+define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: bar:
+; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
+; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
+; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
+ %add = add <2 x i64> %a, %b
+ %vgetq_lane = extractelement <2 x i64> %add, i32 0
+ %vgetq_lane2 = extractelement <2 x i64> %b, i32 0
+ %add3 = add i64 %vgetq_lane, %vgetq_lane2
+ %sub = sub i64 %vgetq_lane, %vgetq_lane2
+ %vecinit = insertelement <2 x i64> undef, i64 %add3, i32 0
+ %vecinit8 = insertelement <2 x i64> %vecinit, i64 %sub, i32 1
+ ret <2 x i64> %vecinit8
+}
+
+define double @subdd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: subdd_su64:
+; CHECK: sub d0, d1, d0
+; CHECK-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %sub.i = sub nsw i64 %vecext1, %vecext
+ %retval = bitcast i64 %sub.i to double
+ ret double %retval
+}
+
+define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vaddd_su64:
+; CHECK: add d0, d1, d0
+; CHECK-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %add.i = add nsw i64 %vecext1, %vecext
+ %retval = bitcast i64 %add.i to double
+ ret double %retval
+}
diff --git a/test/CodeGen/ARM64/aapcs.ll b/test/CodeGen/ARM64/aapcs.ll
new file mode 100644
index 0000000..27d2aa7
--- /dev/null
+++ b/test/CodeGen/ARM64/aapcs.ll
@@ -0,0 +1,86 @@
+; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s
+
+@var = global i32 0, align 4
+
+define i128 @test_i128_align(i32, i128 %arg, i32 %after) {
+ store i32 %after, i32* @var, align 4
+; CHECK: str w4, [{{x[0-9]+}}, :lo12:var]
+
+ ret i128 %arg
+; CHECK: mov x0, x2
+; CHECK: mov x1, x3
+}
+
+@var64 = global i64 0, align 8
+
+ ; Check stack slots are 64-bit at all times.
+define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
+ i32 %int, i64 %long) {
+ ; Part of last store. Blasted scheduler.
+; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32]
+
+ %ext_bool = zext i1 %bool to i64
+ store volatile i64 %ext_bool, i64* @var64, align 8
+; CHECK: ldr w[[EXT:[0-9]+]], [sp]
+; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1
+; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_char = zext i8 %char to i64
+ store volatile i64 %ext_char, i64* @var64, align 8
+; CHECK: ldrb w[[EXT:[0-9]+]], [sp, #8]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_short = zext i16 %short to i64
+ store volatile i64 %ext_short, i64* @var64, align 8
+; CHECK: ldrh w[[EXT:[0-9]+]], [sp, #16]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_int = zext i32 %int to i64
+ store volatile i64 %ext_int, i64* @var64, align 8
+; CHECK: ldr w[[EXT:[0-9]+]], [sp, #24]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ store volatile i64 %long, i64* @var64, align 8
+; CHECK: str [[LONG]], [{{x[0-9]+}}, :lo12:var64]
+
+ ret void
+}
+
+; While we're here, make sure the callee performs the extensions itself (in the
+; absence of zext/sext attributes on the arguments).
+
+define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) {
+ %ext_bool = zext i1 %bool to i64
+ store volatile i64 %ext_bool, i64* @var64
+; CHECK: and [[EXT:x[0-9]+]], x0, #0x1
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_char = sext i8 %char to i64
+ store volatile i64 %ext_char, i64* @var64
+; CHECK: sxtb [[EXT:x[0-9]+]], x1
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_short = zext i16 %short to i64
+ store volatile i64 %ext_short, i64* @var64
+; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_int = zext i32 %int to i64
+ store volatile i64 %ext_int, i64* @var64
+; CHECK: uxtw [[EXT:x[0-9]+]], x3
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ ret void
+}
+
+declare void @variadic(i32 %a, ...)
+
+ ; Under AAPCS, variadic functions have the same calling convention as other
+ ; functions, so the extra arguments should go in registers rather than on the stack.
+define void @test_variadic() {
+ call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0)
+; CHECK: fmov d0, #2.0
+; CHECK: orr x1, xzr, #0x1
+; CHECK: bl variadic
+ ret void
+}
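
A rough C-level sketch of test_variadic above (the declaration and call below are
assumptions, not taken from the patch): under AAPCS the anonymous i64 and double
arguments travel in x1 and d0 rather than on the stack, which is what the
orr/fmov/bl checks verify.

    void variadic(int a, ...);

    void call_variadic(void) {
      /* 1L is materialized into x1 and 2.0 into d0 under AAPCS;
         nothing is placed on the stack for these arguments. */
      variadic(0, 1L, 2.0);
    }
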
diff --git a/test/CodeGen/ARM64/abi-varargs.ll b/test/CodeGen/ARM64/abi-varargs.ll
new file mode 100644
index 0000000..92db392
--- /dev/null
+++ b/test/CodeGen/ARM64/abi-varargs.ll
@@ -0,0 +1,191 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+target triple = "arm64-apple-ios7.0.0"
+
+; rdar://13625505
+; Here we have 9 fixed integer arguments; the 9th argument is passed on the
+; stack, and the varargs start right after it at 8-byte alignment.
+define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
+; CHECK-LABEL: fn9:
+; 9th fixed argument
+; CHECK: ldr {{w[0-9]+}}, [sp, #64]
+; CHECK: add [[ARGS:x[0-9]+]], sp, #72
+; CHECK: add {{x[0-9]+}}, [[ARGS]], #8
+; First vararg
+; CHECK: ldr {{w[0-9]+}}, [sp, #72]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
+; Second vararg
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
+; Third vararg
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca i32, align 4
+ %7 = alloca i32, align 4
+ %8 = alloca i32, align 4
+ %9 = alloca i32, align 4
+ %args = alloca i8*, align 8
+ %a10 = alloca i32, align 4
+ %a11 = alloca i32, align 4
+ %a12 = alloca i32, align 4
+ store i32 %a1, i32* %1, align 4
+ store i32 %a2, i32* %2, align 4
+ store i32 %a3, i32* %3, align 4
+ store i32 %a4, i32* %4, align 4
+ store i32 %a5, i32* %5, align 4
+ store i32 %a6, i32* %6, align 4
+ store i32 %a7, i32* %7, align 4
+ store i32 %a8, i32* %8, align 4
+ store i32 %a9, i32* %9, align 4
+ %10 = bitcast i8** %args to i8*
+ call void @llvm.va_start(i8* %10)
+ %11 = va_arg i8** %args, i32
+ store i32 %11, i32* %a10, align 4
+ %12 = va_arg i8** %args, i32
+ store i32 %12, i32* %a11, align 4
+ %13 = va_arg i8** %args, i32
+ store i32 %13, i32* %a12, align 4
+ ret void
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+define i32 @main() nounwind ssp {
+; CHECK-LABEL: main:
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp, #8]
+; CHECK: str {{w[0-9]+}}, [sp]
+ %a1 = alloca i32, align 4
+ %a2 = alloca i32, align 4
+ %a3 = alloca i32, align 4
+ %a4 = alloca i32, align 4
+ %a5 = alloca i32, align 4
+ %a6 = alloca i32, align 4
+ %a7 = alloca i32, align 4
+ %a8 = alloca i32, align 4
+ %a9 = alloca i32, align 4
+ %a10 = alloca i32, align 4
+ %a11 = alloca i32, align 4
+ %a12 = alloca i32, align 4
+ store i32 1, i32* %a1, align 4
+ store i32 2, i32* %a2, align 4
+ store i32 3, i32* %a3, align 4
+ store i32 4, i32* %a4, align 4
+ store i32 5, i32* %a5, align 4
+ store i32 6, i32* %a6, align 4
+ store i32 7, i32* %a7, align 4
+ store i32 8, i32* %a8, align 4
+ store i32 9, i32* %a9, align 4
+ store i32 10, i32* %a10, align 4
+ store i32 11, i32* %a11, align 4
+ store i32 12, i32* %a12, align 4
+ %1 = load i32* %a1, align 4
+ %2 = load i32* %a2, align 4
+ %3 = load i32* %a3, align 4
+ %4 = load i32* %a4, align 4
+ %5 = load i32* %a5, align 4
+ %6 = load i32* %a6, align 4
+ %7 = load i32* %a7, align 4
+ %8 = load i32* %a8, align 4
+ %9 = load i32* %a9, align 4
+ %10 = load i32* %a10, align 4
+ %11 = load i32* %a11, align 4
+ %12 = load i32* %a12, align 4
+ call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+ ret i32 0
+}
+
+;rdar://13668483
+@.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1
+define void @foo(i8* %fmt, ...) nounwind {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8
+; CHECK: ldr {{w[0-9]+}}, [sp, #48]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15
+; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
+; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]]
+ %fmt.addr = alloca i8*, align 8
+ %args = alloca i8*, align 8
+ %vc = alloca i32, align 4
+ %vv = alloca <4 x i32>, align 16
+ store i8* %fmt, i8** %fmt.addr, align 8
+ %args1 = bitcast i8** %args to i8*
+ call void @llvm.va_start(i8* %args1)
+ %0 = va_arg i8** %args, i32
+ store i32 %0, i32* %vc, align 4
+ %1 = va_arg i8** %args, <4 x i32>
+ store <4 x i32> %1, <4 x i32>* %vv, align 16
+ ret void
+}
+
+define void @bar(i32 %x, <4 x i32> %y) nounwind {
+entry:
+; CHECK-LABEL: bar:
+; CHECK: str {{q[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp]
+ %x.addr = alloca i32, align 4
+ %y.addr = alloca <4 x i32>, align 16
+ store i32 %x, i32* %x.addr, align 4
+ store <4 x i32> %y, <4 x i32>* %y.addr, align 16
+ %0 = load i32* %x.addr, align 4
+ %1 = load <4 x i32>* %y.addr, align 16
+ call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
+ ret void
+}
+
+; rdar://13668927
+; When passing a 16-byte-aligned small struct as a vararg, make sure the caller
+; keeps it 16-byte aligned on the stack.
+%struct.s41 = type { i32, i16, i32, i16 }
+define void @foo2(i8* %fmt, ...) nounwind {
+entry:
+; CHECK-LABEL: foo2:
+; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8
+; CHECK: ldr {{w[0-9]+}}, [sp, #48]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15
+; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
+; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]]
+ %fmt.addr = alloca i8*, align 8
+ %args = alloca i8*, align 8
+ %vc = alloca i32, align 4
+ %vs = alloca %struct.s41, align 16
+ store i8* %fmt, i8** %fmt.addr, align 8
+ %args1 = bitcast i8** %args to i8*
+ call void @llvm.va_start(i8* %args1)
+ %0 = va_arg i8** %args, i32
+ store i32 %0, i32* %vc, align 4
+ %ap.cur = load i8** %args
+ %1 = getelementptr i8* %ap.cur, i32 15
+ %2 = ptrtoint i8* %1 to i64
+ %3 = and i64 %2, -16
+ %ap.align = inttoptr i64 %3 to i8*
+ %ap.next = getelementptr i8* %ap.align, i32 16
+ store i8* %ap.next, i8** %args
+ %4 = bitcast i8* %ap.align to %struct.s41*
+ %5 = bitcast %struct.s41* %vs to i8*
+ %6 = bitcast %struct.s41* %4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 16, i32 16, i1 false)
+ ret void
+}
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define void @bar2(i32 %x, i128 %s41.coerce) nounwind {
+entry:
+; CHECK-LABEL: bar2:
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp]
+ %x.addr = alloca i32, align 4
+ %s41 = alloca %struct.s41, align 16
+ store i32 %x, i32* %x.addr, align 4
+ %0 = bitcast %struct.s41* %s41 to i128*
+ store i128 %s41.coerce, i128* %0, align 1
+ %1 = load i32* %x.addr, align 4
+ %2 = bitcast %struct.s41* %s41 to i128*
+ %3 = load i128* %2, align 1
+ call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
+ ret void
+}
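
A rough C-level sketch of the rdar://13668927 case above (the struct and the
signatures are assumptions, not the original source): a small struct forced to
16-byte alignment must stay 16-byte aligned in the caller's vararg area, which is
what the stp at [sp, #16] in bar2 checks.

    struct s41 {
      int i1; short s1; int i2; short s2;
    } __attribute__((aligned(16)));

    void foo2(char *fmt, ...);

    void bar2(int x, struct s41 s) {
      /* The caller must place the 16-byte-aligned struct at a
         16-byte-aligned slot in its variadic save area. */
      foo2("fmt", x, s);
    }
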
diff --git a/test/CodeGen/ARM64/abi.ll b/test/CodeGen/ARM64/abi.ll
new file mode 100644
index 0000000..a7693b6
--- /dev/null
+++ b/test/CodeGen/ARM64/abi.ll
@@ -0,0 +1,236 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+target triple = "arm64-apple-darwin"
+
+; rdar://9932559
+define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline {
+entry:
+; CHECK-LABEL: i8i16callee:
+; The last four arguments are passed on the stack at sp, sp+2, sp+4 and sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #5]
+; CHECK: ldrsh {{w[0-9]+}}, [sp, #2]
+; CHECK: ldrsb {{w[0-9]+}}, [sp]
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #4]
+; FAST-LABEL: i8i16callee:
+; FAST: ldrb {{w[0-9]+}}, [sp, #5]
+; FAST: ldrb {{w[0-9]+}}, [sp, #4]
+; FAST: ldrh {{w[0-9]+}}, [sp, #2]
+; FAST: ldrb {{w[0-9]+}}, [sp]
+ %conv = sext i8 %a4 to i64
+ %conv3 = sext i16 %a5 to i64
+ %conv8 = sext i8 %b1 to i64
+ %conv9 = sext i16 %b2 to i64
+ %conv11 = sext i8 %b3 to i64
+ %conv13 = sext i8 %b4 to i64
+ %add10 = add i64 %a2, %a1
+ %add12 = add i64 %add10, %a3
+ %add14 = add i64 %add12, %conv
+ %add = add i64 %add14, %conv3
+ %add1 = add i64 %add, %a6
+ %add2 = add i64 %add1, %a7
+ %add4 = add i64 %add2, %a8
+ %add5 = add i64 %add4, %conv8
+ %add6 = add i64 %add5, %conv9
+ %add7 = add i64 %add6, %conv11
+ %add15 = add i64 %add7, %conv13
+ %sext = shl i64 %add15, 32
+ %conv17 = ashr exact i64 %sext, 32
+ ret i64 %conv17
+}
+
+define i32 @i8i16caller() nounwind readnone {
+entry:
+; CHECK: i8i16caller
+; The last four arguments are passed on the stack at sp, sp+2, sp+4 and sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: strb {{w[0-9]+}}, [sp, #5]
+; CHECK: strb {{w[0-9]+}}, [sp, #4]
+; CHECK: strh {{w[0-9]+}}, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK: bl
+; FAST: i8i16caller
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strh {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #4]
+; FAST: strb {{w[0-9]+}}, [sp, #5]
+; FAST: bl
+ %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 signext 97, i16 signext 98, i8 signext 99, i8 signext 100)
+ %conv = trunc i64 %call to i32
+ ret i32 %conv
+}
+
+; rdar://12651543
+define double @circle_center([2 x float] %a) nounwind ssp {
+ %call = tail call double @ext([2 x float] %a) nounwind
+; CHECK: circle_center
+; CHECK: bl
+ ret double %call
+}
+declare double @ext([2 x float])
+
+; rdar://12656141
+; A 16-byte vector should be 16-byte aligned when passed on the stack.
+; A double argument will also be passed on the stack, so the vector should be at sp+16.
+define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: fixed_4i
+; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
+; FAST: fixed_4i
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
+ %0 = load <4 x i32>* %in, align 16
+ %call = tail call double @args_vec_4i(double 3.000000e+00, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, double 3.000000e+00, <4 x i32> %0, i8 signext 3)
+ ret double %call
+}
+declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, double, <4 x i32>, i8 signext)
+
+; rdar://12695237
+; d8 at sp, i in register w0.
+@g_d = common global double 0.000000e+00, align 8
+define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4,
+ double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp {
+entry:
+; CHECK: test1
+; CHECK: ldr [[REG_1:d[0-9]+]], [sp]
+; CHECK: scvtf [[REG_2:s[0-9]+]], w0
+; CHECK: fadd s0, [[REG_2]], s0
+ %conv = sitofp i32 %i to float
+ %add = fadd float %conv, %f1
+ %conv1 = fpext float %add to double
+ %add2 = fadd double %conv1, %d7
+ %add3 = fadd double %add2, %d8
+ store double %add3, double* @g_d, align 8
+ ret void
+}
+
+; i9 at sp, d1 in register s0.
+define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp {
+entry:
+; CHECK: test2
+; CHECK: scvtf [[REG_2:s[0-9]+]], w0
+; CHECK: fadd s0, [[REG_2]], s0
+; CHECK: ldr [[REG_1:s[0-9]+]], [sp]
+ %conv = sitofp i32 %i1 to float
+ %add = fadd float %conv, %d1
+ %conv1 = fpext float %add to double
+ %conv2 = sitofp i32 %i8 to double
+ %add3 = fadd double %conv2, %conv1
+ %conv4 = sitofp i32 %i9 to double
+ %add5 = fadd double %conv4, %add3
+ store double %add5, double* @g_d, align 8
+ ret void
+}
+
+; rdar://12648441
+; Check alignment on stack for v64, f64, i64, f32, i32.
+define double @test3(<2 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: test3
+; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
+; FAST: test3
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
+ %0 = load <2 x i32>* %in, align 8
+ %call = tail call double @args_vec_2i(double 3.000000e+00, <2 x i32> %0,
+ <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0,
+ <2 x i32> %0, float 3.000000e+00, <2 x i32> %0, i8 signext 3)
+ ret double %call
+}
+declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>,
+ <2 x i32>, <2 x i32>, <2 x i32>, float, <2 x i32>, i8 signext)
+
+define double @test4(double* nocapture %in) nounwind {
+entry:
+; CHECK: test4
+; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
+; CHECK: str [[REG_2:w[0-9]+]], [sp]
+; CHECK: orr w0, wzr, #0x3
+ %0 = load double* %in, align 8
+ %call = tail call double @args_f64(double 3.000000e+00, double %0, double %0,
+ double %0, double %0, double %0, double %0, double %0,
+ float 3.000000e+00, double %0, i8 signext 3)
+ ret double %call
+}
+declare double @args_f64(double, double, double, double, double, double, double,
+ double, float, double, i8 signext)
+
+define i64 @test5(i64* nocapture %in) nounwind {
+entry:
+; CHECK: test5
+; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16]
+; CHECK: str [[REG_1:x[0-9]+]], [sp, #8]
+; CHECK: str [[REG_2:w[0-9]+]], [sp]
+ %0 = load i64* %in, align 8
+ %call = tail call i64 @args_i64(i64 3, i64 %0, i64 %0, i64 %0, i64 %0, i64 %0,
+ i64 %0, i64 %0, i32 3, i64 %0, i8 signext 3)
+ ret i64 %call
+}
+declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64,
+ i8 signext)
+
+define i32 @test6(float* nocapture %in) nounwind {
+entry:
+; CHECK: test6
+; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
+; CHECK: str [[REG_1:s[0-9]+]], [sp, #4]
+; CHECK: strh [[REG_3:w[0-9]+]], [sp]
+ %0 = load float* %in, align 4
+ %call = tail call i32 @args_f32(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0,
+ float 6.0, float 7.0, float 8.0, i16 signext 3, float %0,
+ i8 signext 3)
+ ret i32 %call
+}
+declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32,
+ float, float, float, float, float, float, float, float,
+ i16 signext, float, i8 signext)
+
+define i32 @test7(i32* nocapture %in) nounwind {
+entry:
+; CHECK: test7
+; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
+; CHECK: str [[REG_1:w[0-9]+]], [sp, #4]
+; CHECK: strh [[REG_3:w[0-9]+]], [sp]
+ %0 = load i32* %in, align 4
+ %call = tail call i32 @args_i32(i32 3, i32 %0, i32 %0, i32 %0, i32 %0, i32 %0,
+ i32 %0, i32 %0, i16 signext 3, i32 %0, i8 signext 4)
+ ret i32 %call
+}
+declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
+ i8 signext)
+
+define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
+entry:
+; CHECK: test8
+; CHECK: strb {{w[0-9]+}}, [sp, #3]
+; CHECK: strb wzr, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp, #1]
+; CHECK: strb wzr, [sp]
+; CHECK: bl
+; FAST: test8
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strb {{w[0-9]+}}, [sp, #1]
+; FAST: strb {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #3]
+; FAST: bl
+ tail call void @args_i1(i1 zeroext false, i1 zeroext true, i1 zeroext false,
+ i1 zeroext true, i1 zeroext false, i1 zeroext true,
+ i1 zeroext false, i1 zeroext true, i1 zeroext false,
+ i1 zeroext true, i1 zeroext false, i1 zeroext true)
+ ret i32 0
+}
+
+declare void @args_i1(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext,
+ i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext,
+ i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext)
+
+define i32 @i1_stack_incoming(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f,
+ i64 %g, i64 %h, i64 %i, i1 zeroext %j) {
+; CHECK-LABEL: i1_stack_incoming:
+; CHECK: ldrb w0, [sp, #8]
+; CHECK: ret
+ %v = zext i1 %j to i32
+ ret i32 %v
+}
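
A plausible C declaration behind i8i16callee above (an assumption, not the
original source): with the first eight integer arguments in x0-x7, the Darwin
arm64 ABI packs the remaining small integers on the stack at their natural
alignment, which yields the sp, sp+2, sp+4 and sp+5 slots the test checks.

    /* b1, b2, b3 and b4 land at sp, sp+2, sp+4 and sp+5 respectively. */
    long i8i16callee(long a1, long a2, long a3, signed char a4, short a5,
                     long a6, long a7, long a8,
                     signed char b1, short b2, signed char b3, signed char b4);
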
diff --git a/test/CodeGen/ARM64/abi_align.ll b/test/CodeGen/ARM64/abi_align.ll
new file mode 100644
index 0000000..61c661e
--- /dev/null
+++ b/test/CodeGen/ARM64/abi_align.ll
@@ -0,0 +1,529 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+target triple = "arm64-apple-darwin"
+
+; rdar://12648441
+; Generated from arm64-arguments.c with -O2.
+; Test passing structs with sizes < 8, < 16 and > 16 bytes,
+; both with and without 16-byte alignment.
+
+; Structs with size < 8
+%struct.s38 = type { i32, i16 }
+; With an alignment of 16, the size is padded to a multiple of 16 bytes.
+%struct.s39 = type { i32, i16, [10 x i8] }
+; Structs with size < 16
+%struct.s40 = type { i32, i16, i32, i16 }
+%struct.s41 = type { i32, i16, i32, i16 }
+; Structs with size > 16
+%struct.s42 = type { i32, i16, i32, i16, i32, i16 }
+%struct.s43 = type { i32, i16, i32, i16, i32, i16, [10 x i8] }
+
+@g38 = common global %struct.s38 zeroinitializer, align 4
+@g38_2 = common global %struct.s38 zeroinitializer, align 4
+@g39 = common global %struct.s39 zeroinitializer, align 16
+@g39_2 = common global %struct.s39 zeroinitializer, align 16
+@g40 = common global %struct.s40 zeroinitializer, align 4
+@g40_2 = common global %struct.s40 zeroinitializer, align 4
+@g41 = common global %struct.s41 zeroinitializer, align 16
+@g41_2 = common global %struct.s41 zeroinitializer, align 16
+@g42 = common global %struct.s42 zeroinitializer, align 4
+@g42_2 = common global %struct.s42 zeroinitializer, align 4
+@g43 = common global %struct.s43 zeroinitializer, align 16
+@g43_2 = common global %struct.s43 zeroinitializer, align 16
+
+; structs with size < 8 bytes, passed via i64 in x1 and x2
+define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 {
+entry:
+; CHECK: f38
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w2
+ %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32
+ %s1.sroa.1.4.extract.shift = lshr i64 %s1.coerce, 32
+ %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce to i32
+ %s2.sroa.1.4.extract.shift = lshr i64 %s2.coerce, 32
+ %sext8 = shl nuw nsw i64 %s1.sroa.1.4.extract.shift, 16
+ %sext = trunc i64 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %sext1011 = shl nuw nsw i64 %s2.sroa.1.4.extract.shift, 16
+ %sext10 = trunc i64 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller38() #1 {
+entry:
+; CHECK: caller38
+; CHECK: ldr x1,
+; CHECK: ldr x2,
+ %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
+ %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
+ %call = tail call i32 @f38(i32 3, i64 %0, i64 %1) #5
+ ret i32 %call
+}
+
+declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, i64 %s1.coerce, i64 %s2.coerce) #0
+
+; structs with size < 8 bytes, passed on stack at [sp+8] and [sp+16]
+; i9 at [sp]
+define i32 @caller38_stack() #1 {
+entry:
+; CHECK: caller38_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+ %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
+ %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
+ %call = tail call i32 @f38_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, i64 %0, i64 %1) #5
+ ret i32 %call
+}
+
+; structs with size < 8 bytes, alignment of 16
+; passed via i128 in x1 and x3
+define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
+entry:
+; CHECK: f39
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+ %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
+ %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32
+ %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32
+ %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32
+ %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16
+ %sext = trunc i128 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16
+ %sext10 = trunc i128 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller39() #1 {
+entry:
+; CHECK: caller39
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+ %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
+ %call = tail call i32 @f39(i32 3, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0
+
+; structs with size < 8 bytes, alignment 16
+; passed on stack at [sp+16] and [sp+32]
+define i32 @caller39_stack() #1 {
+entry:
+; CHECK: caller39_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+ %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
+ %call = tail call i32 @f39_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+; structs with size < 16 bytes
+; passed via i128 in x1 and x3
+define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 {
+entry:
+; CHECK: f40
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+ %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0
+ %s2.coerce.fca.0.extract = extractvalue [2 x i64] %s2.coerce, 0
+ %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce.fca.0.extract to i32
+ %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce.fca.0.extract to i32
+ %s1.sroa.0.4.extract.shift = lshr i64 %s1.coerce.fca.0.extract, 32
+ %sext8 = shl nuw nsw i64 %s1.sroa.0.4.extract.shift, 16
+ %sext = trunc i64 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %s2.sroa.0.4.extract.shift = lshr i64 %s2.coerce.fca.0.extract, 32
+ %sext1011 = shl nuw nsw i64 %s2.sroa.0.4.extract.shift, 16
+ %sext10 = trunc i64 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller40() #1 {
+entry:
+; CHECK: caller40
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+ %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
+ %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4
+ %call = tail call i32 @f40(i32 3, [2 x i64] %0, [2 x i64] %1) #5
+ ret i32 %call
+}
+
+declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0
+
+; structs with size < 16 bytes
+; passed on stack at [sp+8] and [sp+24]
+define i32 @caller40_stack() #1 {
+entry:
+; CHECK: caller40_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+ %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
+ %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4
+ %call = tail call i32 @f40_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, [2 x i64] %0, [2 x i64] %1) #5
+ ret i32 %call
+}
+
+; structs with size < 16 bytes, alignment of 16
+; passed via i128 in x1 and x3
+define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
+entry:
+; CHECK: f41
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+ %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
+ %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32
+ %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32
+ %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32
+ %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16
+ %sext = trunc i128 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16
+ %sext10 = trunc i128 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller41() #1 {
+entry:
+; CHECK: caller41
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+ %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16
+ %call = tail call i32 @f41(i32 3, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0
+
+; structs with size < 16 bytes, alignment of 16
+; passed on stack at [sp+16] and [sp+32]
+define i32 @caller41_stack() #1 {
+entry:
+; CHECK: caller41_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+ %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16
+ %call = tail call i32 @f41_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+; structs with size of 22 bytes, passed indirectly in x1 and x2
+define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 {
+entry:
+; CHECK: f42
+; CHECK: ldr w[[A:[0-9]+]], [x1]
+; CHECK: ldr w[[B:[0-9]+]], [x2]
+; CHECK: add w[[C:[0-9]+]], w[[A]], w0
+; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
+; FAST: f42
+; FAST: ldr w[[A:[0-9]+]], [x1]
+; FAST: ldr w[[B:[0-9]+]], [x2]
+; FAST: add w[[C:[0-9]+]], w[[A]], w0
+; FAST: add {{w[0-9]+}}, w[[C]], w[[B]]
+ %i1 = getelementptr inbounds %struct.s42* %s1, i64 0, i32 0
+ %0 = load i32* %i1, align 4, !tbaa !0
+ %i2 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 0
+ %1 = load i32* %i2, align 4, !tbaa !0
+ %s = getelementptr inbounds %struct.s42* %s1, i64 0, i32 1
+ %2 = load i16* %s, align 2, !tbaa !3
+ %conv = sext i16 %2 to i32
+ %s5 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 1
+ %3 = load i16* %s5, align 2, !tbaa !3
+ %conv6 = sext i16 %3 to i32
+ %add = add i32 %0, %i
+ %add3 = add i32 %add, %1
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+; For s1, we allocate a 22-byte space and pass its address via x1
+define i32 @caller42() #3 {
+entry:
+; CHECK: caller42
+; CHECK: str {{x[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; CHECK: str {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{q[0-9]+}}, [sp]
+; CHECK: add x1, sp, #32
+; CHECK: mov x2, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+
+; FAST: caller42
+; FAST: sub sp, sp, #96
+; Space for s1 is allocated at fp-24 = sp+72
+; Space for s2 is allocated at sp+48
+; FAST: sub x[[A:[0-9]+]], fp, #24
+; FAST: add x[[A:[0-9]+]], sp, #48
+; Call memcpy with size = 24 (0x18)
+; FAST: orr {{x[0-9]+}}, xzr, #0x18
+ %tmp = alloca %struct.s42, align 4
+ %tmp1 = alloca %struct.s42, align 4
+ %0 = bitcast %struct.s42* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+ %1 = bitcast %struct.s42* %tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+ %call = call i32 @f42(i32 3, %struct.s42* %tmp, %struct.s42* %tmp1) #5
+ ret i32 %call
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #4
+
+declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, %struct.s42* nocapture %s1,
+ %struct.s42* nocapture %s2) #2
+
+define i32 @caller42_stack() #3 {
+entry:
+; CHECK: caller42_stack
+; CHECK: mov fp, sp
+; CHECK: sub sp, sp, #96
+; CHECK: stur {{x[0-9]+}}, [fp, #-16]
+; CHECK: stur {{q[0-9]+}}, [fp, #-32]
+; CHECK: str {{x[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; Space for s1 is allocated at fp-32 = sp+64
+; Space for s2 is allocated at sp+32
+; CHECK: add x[[B:[0-9]+]], sp, #32
+; CHECK: str x[[B]], [sp, #16]
+; CHECK: sub x[[A:[0-9]+]], fp, #32
+; Address of s1 is passed on stack at sp+8
+; CHECK: str x[[A]], [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+
+; FAST: caller42_stack
+; Space for s1 is allocated at fp-24
+; Space for s2 is allocated at fp-48
+; FAST: sub x[[A:[0-9]+]], fp, #24
+; FAST: sub x[[B:[0-9]+]], fp, #48
+; Call memcpy with size = 24 (0x18)
+; FAST: orr {{x[0-9]+}}, xzr, #0x18
+; FAST: str {{w[0-9]+}}, [sp]
+; Address of s1 is passed on stack at sp+8
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+ %tmp = alloca %struct.s42, align 4
+ %tmp1 = alloca %struct.s42, align 4
+ %0 = bitcast %struct.s42* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+ %1 = bitcast %struct.s42* %tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+ %call = call i32 @f42_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, %struct.s42* %tmp, %struct.s42* %tmp1) #5
+ ret i32 %call
+}
+
+; structs with size of 22 bytes, alignment of 16
+; passed indirectly in x1 and x2
+define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 {
+entry:
+; CHECK: f43
+; CHECK: ldr w[[A:[0-9]+]], [x1]
+; CHECK: ldr w[[B:[0-9]+]], [x2]
+; CHECK: add w[[C:[0-9]+]], w[[A]], w0
+; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
+; FAST: f43
+; FAST: ldr w[[A:[0-9]+]], [x1]
+; FAST: ldr w[[B:[0-9]+]], [x2]
+; FAST: add w[[C:[0-9]+]], w[[A]], w0
+; FAST: add {{w[0-9]+}}, w[[C]], w[[B]]
+ %i1 = getelementptr inbounds %struct.s43* %s1, i64 0, i32 0
+ %0 = load i32* %i1, align 4, !tbaa !0
+ %i2 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 0
+ %1 = load i32* %i2, align 4, !tbaa !0
+ %s = getelementptr inbounds %struct.s43* %s1, i64 0, i32 1
+ %2 = load i16* %s, align 2, !tbaa !3
+ %conv = sext i16 %2 to i32
+ %s5 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 1
+ %3 = load i16* %s5, align 2, !tbaa !3
+ %conv6 = sext i16 %3 to i32
+ %add = add i32 %0, %i
+ %add3 = add i32 %add, %1
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller43() #3 {
+entry:
+; CHECK: caller43
+; CHECK: str {{q[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; CHECK: str {{q[0-9]+}}, [sp, #16]
+; CHECK: str {{q[0-9]+}}, [sp]
+; CHECK: add x1, sp, #32
+; CHECK: mov x2, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+
+; FAST: caller43
+; FAST: mov fp, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+; FAST: add x1, sp, #32
+; FAST: mov x2, sp
+; FAST: str {{x[0-9]+}}, [sp, #32]
+; FAST: str {{x[0-9]+}}, [sp, #40]
+; FAST: str {{x[0-9]+}}, [sp, #48]
+; FAST: str {{x[0-9]+}}, [sp, #56]
+; FAST: str {{x[0-9]+}}, [sp]
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+; FAST: str {{x[0-9]+}}, [sp, #24]
+ %tmp = alloca %struct.s43, align 16
+ %tmp1 = alloca %struct.s43, align 16
+ %0 = bitcast %struct.s43* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+ %1 = bitcast %struct.s43* %tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+ %call = call i32 @f43(i32 3, %struct.s43* %tmp, %struct.s43* %tmp1) #5
+ ret i32 %call
+}
+
+declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, %struct.s43* nocapture %s1,
+ %struct.s43* nocapture %s2) #2
+
+define i32 @caller43_stack() #3 {
+entry:
+; CHECK: caller43_stack
+; CHECK: mov fp, sp
+; CHECK: sub sp, sp, #96
+; CHECK: stur {{q[0-9]+}}, [fp, #-16]
+; CHECK: stur {{q[0-9]+}}, [fp, #-32]
+; CHECK: str {{q[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; Space for s1 is allocated at fp-32 = sp+64
+; Space for s2 is allocated at sp+32
+; CHECK: add x[[B:[0-9]+]], sp, #32
+; CHECK: str x[[B]], [sp, #16]
+; CHECK: sub x[[A:[0-9]+]], fp, #32
+; Address of s1 is passed on stack at sp+8
+; CHECK: str x[[A]], [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: str w[[C]], [sp]
+
+; FAST: caller43_stack
+; FAST: sub sp, sp, #96
+; Space for s1 is allocated at fp-32 = sp+64
+; Space for s2 is allocated at sp+32
+; FAST: sub x[[A:[0-9]+]], fp, #32
+; FAST: add x[[B:[0-9]+]], sp, #32
+; FAST: stur {{x[0-9]+}}, [fp, #-32]
+; FAST: stur {{x[0-9]+}}, [fp, #-24]
+; FAST: stur {{x[0-9]+}}, [fp, #-16]
+; FAST: stur {{x[0-9]+}}, [fp, #-8]
+; FAST: str {{x[0-9]+}}, [sp, #32]
+; FAST: str {{x[0-9]+}}, [sp, #40]
+; FAST: str {{x[0-9]+}}, [sp, #48]
+; FAST: str {{x[0-9]+}}, [sp, #56]
+; FAST: str {{w[0-9]+}}, [sp]
+; Address of s1 is passed on stack at sp+8
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+ %tmp = alloca %struct.s43, align 16
+ %tmp1 = alloca %struct.s43, align 16
+ %0 = bitcast %struct.s43* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+ %1 = bitcast %struct.s43* %tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+ %call = call i32 @f43_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, %struct.s43* %tmp, %struct.s43* %tmp1) #5
+ ret i32 %call
+}
+
+; rdar://13668927
+; Check that we don't split an i128.
+declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
+ i32 %i6, i32 %i7, i128 %s1, i32 %i8)
+
+define i32 @i128_split() {
+entry:
+; CHECK: i128_split
+; "i128 %0" should be on stack at [sp].
+; "i32 8" should be on stack at [sp, #16].
+; CHECK: str {{w[0-9]+}}, [sp, #16]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
+; FAST: i128_split
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16]
+; FAST: stp {{x[0-9]+}}, {{x[0-9]+}}, [x[[ADDR]]]
+ %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+ %call = tail call i32 @callee_i128_split(i32 1, i32 2, i32 3, i32 4, i32 5,
+ i32 6, i32 7, i128 %0, i32 8) #5
+ ret i32 %call
+}
+
+declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
+ i32 %i6, i32 %i7, i64 %s1, i32 %i8)
+
+define i32 @i64_split() {
+entry:
+; CHECK: i64_split
+; "i64 %0" should be in register x7.
+; "i32 8" should be on stack at [sp].
+; CHECK: ldr x7, [{{x[0-9]+}}]
+; CHECK: str {{w[0-9]+}}, [sp]
+; FAST: i64_split
+; FAST: ldr x7, [{{x[0-9]+}}]
+; FAST: str {{w[0-9]+}}, [sp]
+ %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16
+ %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5,
+ i32 6, i32 7, i64 %0, i32 8) #5
+ ret i32 %call
+}
+
+attributes #0 = { noinline nounwind readnone "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #1 = { nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #2 = { noinline nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #3 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #4 = { nounwind }
+attributes #5 = { nobuiltin }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"short", metadata !1}
+!4 = metadata !{i64 0, i64 4, metadata !0, i64 4, i64 2, metadata !3, i64 8, i64 4, metadata !0, i64 12, i64 2, metadata !3, i64 16, i64 4, metadata !0, i64 20, i64 2, metadata !3}
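
The struct types in abi_align.ll were generated from arm64-arguments.c; plausible
C definitions (a reconstruction, not copied from that file) look like:

    struct s38 { int i; short s; };                                /* size < 8            */
    struct s39 { int i; short s; }
        __attribute__((aligned(16)));                              /* size < 8, align 16  */
    struct s40 { int i; short s; int i2; short s2; };              /* size < 16           */
    struct s41 { int i; short s; int i2; short s2; }
        __attribute__((aligned(16)));                              /* size < 16, align 16 */
    struct s42 { int i; short s; int i2; short s2;
                 int i3; short s3; };                              /* size > 16           */
    struct s43 { int i; short s; int i2; short s2;
                 int i3; short s3; }
        __attribute__((aligned(16)));                              /* size > 16, align 16 */
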
diff --git a/test/CodeGen/ARM64/addp.ll b/test/CodeGen/ARM64/addp.ll
new file mode 100644
index 0000000..8283a00
--- /dev/null
+++ b/test/CodeGen/ARM64/addp.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define double @foo(<2 x double> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: faddp.2d d0, v0
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x double> %a, i32 0
+ %lane1.i = extractelement <2 x double> %a, i32 1
+ %vpaddd.i = fadd double %lane0.i, %lane1.i
+ ret double %vpaddd.i
+}
+
+define i64 @foo0(<2 x i64> %a) nounwind {
+; CHECK-LABEL: foo0:
+; CHECK: addp.2d d0, v0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x i64> %a, i32 0
+ %lane1.i = extractelement <2 x i64> %a, i32 1
+ %vpaddd.i = add i64 %lane0.i, %lane1.i
+ ret i64 %vpaddd.i
+}
+
+define float @foo1(<2 x float> %a) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: faddp.2s
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x float> %a, i32 0
+ %lane1.i = extractelement <2 x float> %a, i32 1
+ %vpaddd.i = fadd float %lane0.i, %lane1.i
+ ret float %vpaddd.i
+}
diff --git a/test/CodeGen/ARM64/addr-mode-folding.ll b/test/CodeGen/ARM64/addr-mode-folding.ll
new file mode 100644
index 0000000..dff2331
--- /dev/null
+++ b/test/CodeGen/ARM64/addr-mode-folding.ll
@@ -0,0 +1,171 @@
+; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+; <rdar://problem/13621857>
+
+@block = common global i8* null, align 8
+
+define i32 @fct(i32 %i1, i32 %i2) {
+; CHECK: @fct
+; The sign extension is used more than once, so it should not be folded into the
+; addressing mode. However, CodeGenPrepare does not share the sext across uses,
+; so it ends up being folded anyway; the check below is disabled because of that.
+; _CHECK-NOT_: , sxtw]
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %0 = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+ %1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+ %2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %1, %2
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %1, %2
+ %conv8 = zext i1 %cmp7 to i32
+ br label %return
+
+if.end: ; preds = %entry
+ %inc = add nsw i32 %i1, 1
+ %inc9 = add nsw i32 %i2, 1
+ %idxprom10 = sext i32 %inc to i64
+ %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10
+ %3 = load i8* %arrayidx11, align 1
+ %idxprom12 = sext i32 %inc9 to i64
+ %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12
+ %4 = load i8* %arrayidx13, align 1
+ %cmp16 = icmp eq i8 %3, %4
+ br i1 %cmp16, label %if.end23, label %if.then18
+
+if.then18: ; preds = %if.end
+ %cmp21 = icmp ugt i8 %3, %4
+ %conv22 = zext i1 %cmp21 to i32
+ br label %return
+
+if.end23: ; preds = %if.end
+ %inc24 = add nsw i32 %i1, 2
+ %inc25 = add nsw i32 %i2, 2
+ %idxprom26 = sext i32 %inc24 to i64
+ %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26
+ %5 = load i8* %arrayidx27, align 1
+ %idxprom28 = sext i32 %inc25 to i64
+ %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28
+ %6 = load i8* %arrayidx29, align 1
+ %cmp32 = icmp eq i8 %5, %6
+ br i1 %cmp32, label %return, label %if.then34
+
+if.then34: ; preds = %if.end23
+ %cmp37 = icmp ugt i8 %5, %6
+ %conv38 = zext i1 %cmp37 to i32
+ br label %return
+
+return: ; preds = %if.end23, %if.then34, %if.then18, %if.then
+ %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ]
+ ret i32 %retval.0
+}
+
+define i32 @fct1(i32 %i1, i32 %i2) optsize {
+; CHECK: @fct1
+; Addressing modes are folded when optimizing for code size.
+; CHECK: , sxtw]
+; CHECK: , sxtw]
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %0 = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+ %1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+ %2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %1, %2
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %1, %2
+ %conv8 = zext i1 %cmp7 to i32
+ br label %return
+
+if.end: ; preds = %entry
+ %inc = add nsw i32 %i1, 1
+ %inc9 = add nsw i32 %i2, 1
+ %idxprom10 = sext i32 %inc to i64
+ %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10
+ %3 = load i8* %arrayidx11, align 1
+ %idxprom12 = sext i32 %inc9 to i64
+ %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12
+ %4 = load i8* %arrayidx13, align 1
+ %cmp16 = icmp eq i8 %3, %4
+ br i1 %cmp16, label %if.end23, label %if.then18
+
+if.then18: ; preds = %if.end
+ %cmp21 = icmp ugt i8 %3, %4
+ %conv22 = zext i1 %cmp21 to i32
+ br label %return
+
+if.end23: ; preds = %if.end
+ %inc24 = add nsw i32 %i1, 2
+ %inc25 = add nsw i32 %i2, 2
+ %idxprom26 = sext i32 %inc24 to i64
+ %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26
+ %5 = load i8* %arrayidx27, align 1
+ %idxprom28 = sext i32 %inc25 to i64
+ %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28
+ %6 = load i8* %arrayidx29, align 1
+ %cmp32 = icmp eq i8 %5, %6
+ br i1 %cmp32, label %return, label %if.then34
+
+if.then34: ; preds = %if.end23
+ %cmp37 = icmp ugt i8 %5, %6
+ %conv38 = zext i1 %cmp37 to i32
+ br label %return
+
+return: ; preds = %if.end23, %if.then34, %if.then18, %if.then
+ %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ]
+ ret i32 %retval.0
+}
+
+; CHECK: @test
+; CHECK-NOT: , uxtw #2]
+define i32 @test(i32* %array, i8 zeroext %c, i32 %arg) {
+entry:
+ %conv = zext i8 %c to i32
+ %add = sub i32 0, %arg
+ %tobool = icmp eq i32 %conv, %add
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %idxprom = zext i8 %c to i64
+ %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom
+ %0 = load volatile i32* %arrayidx, align 4
+ %1 = load volatile i32* %arrayidx, align 4
+ %add3 = add nsw i32 %1, %0
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ]
+ ret i32 %res.0
+}
+
+
+; CHECK: @test2
+; CHECK: , uxtw #2]
+; CHECK: , uxtw #2]
+define i32 @test2(i32* %array, i8 zeroext %c, i32 %arg) optsize {
+entry:
+ %conv = zext i8 %c to i32
+ %add = sub i32 0, %arg
+ %tobool = icmp eq i32 %conv, %add
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %idxprom = zext i8 %c to i64
+ %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom
+ %0 = load volatile i32* %arrayidx, align 4
+ %1 = load volatile i32* %arrayidx, align 4
+ %add3 = add nsw i32 %1, %0
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ]
+ ret i32 %res.0
+}
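
A rough shape for the C source behind fct/fct1 above (assumed, not the original):
the sign-extended indices feed several loads, so at -O3 folding the sext into
every addressing mode would duplicate it, while fct1's optsize attribute makes
the fold acceptable.

    extern unsigned char *block;

    int fct(int i1, int i2) {
      for (int k = 0; k < 3; ++k) {
        unsigned char a = block[i1 + k];
        unsigned char b = block[i2 + k];
        if (a != b)
          return a > b;       /* 1 if a > b, 0 otherwise */
      }
      return 1;
    }
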
diff --git a/test/CodeGen/ARM64/addr-type-promotion.ll b/test/CodeGen/ARM64/addr-type-promotion.ll
new file mode 100644
index 0000000..0677603
--- /dev/null
+++ b/test/CodeGen/ARM64/addr-type-promotion.ll
@@ -0,0 +1,82 @@
+; RUN: llc -march arm64 < %s | FileCheck %s
+; rdar://13452552
+; ModuleID = 'reduced_test.ll'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios3.0.0"
+
+@block = common global i8* null, align 8
+
+define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
+; CHECK: fullGtU
+; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE
+; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF]
+; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]]
+; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], x0, sxtw]
+; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], x1, sxtw]
+; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
+; CHECK-NEXT: b.ne
+; Next BB
+; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw
+; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw
+; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1]
+; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1]
+; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]]
+; CHECK-NEXT: b.ne
+; Next BB
+; CHECK: ldrb [[LOADEDVAL3:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #2]
+; CHECK-NEXT: ldrb [[LOADEDVAL4:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #2]
+; CHECK-NEXT: cmp [[LOADEDVAL3]], [[LOADEDVAL4]]
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %tmp = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %tmp, i64 %idxprom
+ %tmp1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %tmp, i64 %idxprom1
+ %tmp2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %tmp1, %tmp2
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %tmp1, %tmp2
+ %conv9 = zext i1 %cmp7 to i8
+ br label %return
+
+if.end: ; preds = %entry
+ %inc = add nsw i32 %i1, 1
+ %inc10 = add nsw i32 %i2, 1
+ %idxprom11 = sext i32 %inc to i64
+ %arrayidx12 = getelementptr inbounds i8* %tmp, i64 %idxprom11
+ %tmp3 = load i8* %arrayidx12, align 1
+ %idxprom13 = sext i32 %inc10 to i64
+ %arrayidx14 = getelementptr inbounds i8* %tmp, i64 %idxprom13
+ %tmp4 = load i8* %arrayidx14, align 1
+ %cmp17 = icmp eq i8 %tmp3, %tmp4
+ br i1 %cmp17, label %if.end25, label %if.then19
+
+if.then19: ; preds = %if.end
+ %cmp22 = icmp ugt i8 %tmp3, %tmp4
+ %conv24 = zext i1 %cmp22 to i8
+ br label %return
+
+if.end25: ; preds = %if.end
+ %inc26 = add nsw i32 %i1, 2
+ %inc27 = add nsw i32 %i2, 2
+ %idxprom28 = sext i32 %inc26 to i64
+ %arrayidx29 = getelementptr inbounds i8* %tmp, i64 %idxprom28
+ %tmp5 = load i8* %arrayidx29, align 1
+ %idxprom30 = sext i32 %inc27 to i64
+ %arrayidx31 = getelementptr inbounds i8* %tmp, i64 %idxprom30
+ %tmp6 = load i8* %arrayidx31, align 1
+ %cmp34 = icmp eq i8 %tmp5, %tmp6
+ br i1 %cmp34, label %return, label %if.then36
+
+if.then36: ; preds = %if.end25
+ %cmp39 = icmp ugt i8 %tmp5, %tmp6
+ %conv41 = zext i1 %cmp39 to i8
+ br label %return
+
+return: ; preds = %if.then36, %if.end25, %if.then19, %if.then
+ %retval.0 = phi i8 [ %conv9, %if.then ], [ %conv24, %if.then19 ], [ %conv41, %if.then36 ], [ 0, %if.end25 ]
+ ret i8 %retval.0
+}
diff --git a/test/CodeGen/ARM64/addrmode.ll b/test/CodeGen/ARM64/addrmode.ll
new file mode 100644
index 0000000..e131237
--- /dev/null
+++ b/test/CodeGen/ARM64/addrmode.ll
@@ -0,0 +1,72 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+; rdar://10232252
+
+@object = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
+; base + offset (imm9)
+; CHECK: @t1
+; CHECK: ldr xzr, [x{{[0-9]+}}, #8]
+; CHECK: ret
+define void @t1() {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 1
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + offset (> imm9)
+; CHECK: @t2
+; CHECK: sub [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #264
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t2() {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 -33
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + unsigned offset (> imm9 and <= imm12 * size of type in bytes)
+; CHECK: @t3
+; CHECK: ldr xzr, [x{{[0-9]+}}, #32760]
+; CHECK: ret
+define void @t3() {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 4095
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + unsigned offset (> imm12 * size of type in bytes)
+; CHECK: @t4
+; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #32768
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t4() {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 4096
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + reg
+; CHECK: @t5
+; CHECK: ldr xzr, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #3]
+; CHECK: ret
+define void @t5(i64 %a) {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 %a
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + reg + imm
+; CHECK: @t6
+; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3
+; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #32768
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t6(i64 %a) {
+ %tmp1 = getelementptr inbounds i64* @object, i64 %a
+ %incdec.ptr = getelementptr inbounds i64* %tmp1, i64 4096
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
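
The offset choices in t1-t6 above follow from the two immediate forms of LDR: a
signed 9-bit unscaled offset (-256 to +255) and an unsigned 12-bit offset scaled
by the access size. For 8-byte loads the scaled form reaches 4095 * 8 = 32760
bytes, so, as a sketch under that assumption:

    extern long object[];

    long in_range(void)      { return object[4095]; } /* folds: ldr xN, [xM, #32760] */
    long out_of_range(void)  { return object[4096]; } /* needs a separate add of #32768 */
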
diff --git a/test/CodeGen/ARM64/alloc-no-stack-realign.ll b/test/CodeGen/ARM64/alloc-no-stack-realign.ll
new file mode 100644
index 0000000..f396bc9
--- /dev/null
+++ b/test/CodeGen/ARM64/alloc-no-stack-realign.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false | FileCheck %s
+
+; rdar://12713765
+; Make sure we are not creating stack objects that are assumed to be 64-byte
+; aligned.
+@T3_retval = common global <16 x float> zeroinitializer, align 16
+
+define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp {
+entry:
+; CHECK: test
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]]
+ %retval = alloca <16 x float>, align 16
+ %0 = load <16 x float>* @T3_retval, align 16
+ store <16 x float> %0, <16 x float>* %retval
+ %1 = load <16 x float>* %retval
+ store <16 x float> %1, <16 x float>* %agg.result, align 16
+ ret void
+}
diff --git a/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll b/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll
new file mode 100644
index 0000000..3750f31
--- /dev/null
+++ b/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s
+
+; CHECK: foo
+; CHECK: ldr w[[REG:[0-9]+]], [x19, #264]
+; CHECK: str w[[REG]], [x19, #132]
+; CHECK: ldr w{{[0-9]+}}, [x19, #264]
+
+define i32 @foo(i32 %a) nounwind {
+ %retval = alloca i32, align 4
+ %a.addr = alloca i32, align 4
+ %arr = alloca [32 x i32], align 4
+ %i = alloca i32, align 4
+ %arr2 = alloca [32 x i32], align 4
+ %j = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ %tmp1 = zext i32 %tmp to i64
+ %v = mul i64 4, %tmp1
+ %vla = alloca i8, i64 %v, align 4
+ %tmp2 = bitcast i8* %vla to i32*
+ %tmp3 = load i32* %a.addr, align 4
+ store i32 %tmp3, i32* %i, align 4
+ %tmp4 = load i32* %a.addr, align 4
+ store i32 %tmp4, i32* %j, align 4
+ %tmp5 = load i32* %j, align 4
+ store i32 %tmp5, i32* %retval
+ %x = load i32* %retval
+ ret i32 %x
+}
diff --git a/test/CodeGen/ARM64/andCmpBrToTBZ.ll b/test/CodeGen/ARM64/andCmpBrToTBZ.ll
new file mode 100644
index 0000000..4194977
--- /dev/null
+++ b/test/CodeGen/ARM64/andCmpBrToTBZ.ll
@@ -0,0 +1,72 @@
+; RUN: llc -O1 -march=arm64 -enable-andcmp-sinking=true < %s | FileCheck %s
+; ModuleID = 'and-cbz-extr-mr.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+define zeroext i1 @foo(i1 %IsEditable, i1 %isTextField, i8* %str1, i8* %str2, i8* %str3, i8* %str4, i8* %str5, i8* %str6, i8* %str7, i8* %str8, i8* %str9, i8* %str10, i8* %str11, i8* %str12, i8* %str13, i32 %int1, i8* %str14) unnamed_addr #0 align 2 {
+; CHECK: _foo:
+entry:
+ %tobool = icmp eq i8* %str14, null
+ br i1 %tobool, label %return, label %if.end
+
+; CHECK: %if.end
+; CHECK: tbz
+if.end: ; preds = %entry
+ %and.i.i.i = and i32 %int1, 4
+ %tobool.i.i.i = icmp eq i32 %and.i.i.i, 0
+ br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i
+
+land.rhs.i: ; preds = %if.end
+ %cmp.i.i.i = icmp eq i8* %str12, %str13
+ br i1 %cmp.i.i.i, label %if.then3, label %lor.rhs.i.i.i
+
+lor.rhs.i.i.i: ; preds = %land.rhs.i
+ %cmp.i13.i.i.i = icmp eq i8* %str10, %str11
+ br i1 %cmp.i13.i.i.i, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, label %if.end5
+
+_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit: ; preds = %lor.rhs.i.i.i
+ %cmp.i.i.i.i = icmp eq i8* %str8, %str9
+ br i1 %cmp.i.i.i.i, label %if.then3, label %if.end5
+
+if.then3: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %land.rhs.i
+ %tmp11 = load i8* %str14, align 8
+ %tmp12 = and i8 %tmp11, 2
+ %tmp13 = icmp ne i8 %tmp12, 0
+ br label %return
+
+if.end5: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %lor.rhs.i.i.i
+; CHECK: %if.end5
+; CHECK: tbz
+ br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i19
+
+land.rhs.i19: ; preds = %if.end5
+ %cmp.i.i.i18 = icmp eq i8* %str6, %str7
+ br i1 %cmp.i.i.i18, label %if.then7, label %lor.rhs.i.i.i23
+
+lor.rhs.i.i.i23: ; preds = %land.rhs.i19
+ %cmp.i13.i.i.i22 = icmp eq i8* %str3, %str4
+ br i1 %cmp.i13.i.i.i22, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, label %if.end12
+
+_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28: ; preds = %lor.rhs.i.i.i23
+ %cmp.i.i.i.i26 = icmp eq i8* %str1, %str2
+ br i1 %cmp.i.i.i.i26, label %if.then7, label %if.end12
+
+if.then7: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %land.rhs.i19
+ br i1 %isTextField, label %if.then9, label %if.end12
+
+if.then9: ; preds = %if.then7
+ %tmp23 = load i8* %str5, align 8
+ %tmp24 = and i8 %tmp23, 2
+ %tmp25 = icmp ne i8 %tmp24, 0
+ br label %return
+
+if.end12: ; preds = %if.then7, %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %lor.rhs.i.i.i23, %if.end5, %if.end
+ %lnot = xor i1 %IsEditable, true
+ br label %return
+
+return: ; preds = %if.end12, %if.then9, %if.then3, %entry
+ %retval.0 = phi i1 [ %tmp13, %if.then3 ], [ %tmp25, %if.then9 ], [ %lnot, %if.end12 ], [ true, %entry ]
+ ret i1 %retval.0
+}
+
+attributes #0 = { nounwind ssp }
diff --git a/test/CodeGen/ARM64/anyregcc-crash.ll b/test/CodeGen/ARM64/anyregcc-crash.ll
new file mode 100644
index 0000000..241cf97
--- /dev/null
+++ b/test/CodeGen/ARM64/anyregcc-crash.ll
@@ -0,0 +1,19 @@
+; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
+;
+; Check that misuse of anyregcc results in a compile-time error.
+
+; CHECK: LLVM ERROR: ran out of registers during register allocation
+define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+ i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+ i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+ i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
+entry:
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+ i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+ i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+ i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+ i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32)
+ ret i64 %result
+}
+
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/ARM64/anyregcc.ll b/test/CodeGen/ARM64/anyregcc.ll
new file mode 100644
index 0000000..e26875d
--- /dev/null
+++ b/test/CodeGen/ARM64/anyregcc.ll
@@ -0,0 +1,363 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Stackmap Header: no constants - 8 callsites
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 8
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 8
+
+; Functions and stack size
+; CHECK-NEXT: .quad _test
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _property_access1
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _property_access2
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad _property_access3
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad _anyreg_test1
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _anyreg_test2
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _patchpoint_spilldef
+; CHECK-NEXT: .quad 112
+; CHECK-NEXT: .quad _patchpoint_spillargs
+; CHECK-NEXT: .quad 128
+
+
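As a reading aid for the per-callsite records that follow: each location decoded
by the .byte/.short/.long CHECK lines is an 8-byte record. Assuming the
version-1 stackmap layout that the header above declares, it can be sketched as:

    #include <stdint.h>

    /* Sketch of one stackmap location record (version-1 layout assumed). */
    struct StackMapLocation {
        uint8_t  type;      /* 1 = Register, 2 = Direct, 3 = Indirect, 4 = Constant */
        uint8_t  size;      /* size of the value in bytes (4 or 8 in these tests)   */
        uint16_t dwarf_reg; /* DWARF register number; 29 is the frame pointer (x29) */
        int32_t  offset;    /* frame offset or small constant, depending on type    */
    };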
+; test
+; CHECK-LABEL: .long L{{.*}}-_test
+; CHECK-NEXT: .short 0
+; 3 locations
+; CHECK-NEXT: .short 3
+; Loc 0: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Constant 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 3
+define i64 @test() nounwind ssp uwtable {
+entry:
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
+ ret i64 0
+}
+
+; property access 1 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL: .long L{{.*}}-_property_access1
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
+ ret i64 %ret
+}
+
+; property access 2 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL: .long L{{.*}}-_property_access2
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @property_access2() nounwind ssp uwtable {
+entry:
+ %obj = alloca i64, align 8
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
+ ret i64 %ret
+}
+
+; property access 3 - %obj is a frame index
+; CHECK-LABEL: .long L{{.*}}-_property_access3
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Direct FP - 8
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -8
+define i64 @property_access3() nounwind ssp uwtable {
+entry:
+ %obj = alloca i64, align 8
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
+ ret i64 %ret
+}
+
+; anyreg_test1
+; CHECK-LABEL: .long L{{.*}}-_anyreg_test1
+; CHECK-NEXT: .short 0
+; 14 locations
+; CHECK-NEXT: .short 14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 4: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 5: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 6: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 7: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 8: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 9: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 10: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 11: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 12: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 13: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ ret i64 %ret
+}
+
+; anyreg_test2
+; CHECK-LABEL: .long L{{.*}}-_anyreg_test2
+; CHECK-NEXT: .short 0
+; 14 locations
+; CHECK-NEXT: .short 14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 4: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 5: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 6: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 7: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 8: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 9: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 10: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 11: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 12: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 13: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ ret i64 %ret
+}
+
+; Test spilling the return value of an anyregcc call.
+;
+; <rdar://problem/15432754> [JS] Assertion: "Folded a def to a non-store!"
+;
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 3
+; Loc 0: Register (some register that will be spilled to the stack)
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ ret i64 %result
+}
+
+; Test spilling the arguments of an anyregcc call.
+;
+; <rdar://problem/15487687> [JS] AnyRegCC argument ends up being spilled
+;
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 5
+; Loc 0: Return value in a register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Arg0 in a Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Arg1 in a Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Arg2 spilled to FP - 96
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -96
+; Loc 4: Arg3 spilled to FP - 88
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -88
+define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/ARM64/arith-saturating.ll b/test/CodeGen/ARM64/arith-saturating.ll
new file mode 100644
index 0000000..437ebb8
--- /dev/null
+++ b/test/CodeGen/ARM64/arith-saturating.ll
@@ -0,0 +1,153 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qadds:
+; CHECK: sqadd s0, s0, s1
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vecext1 = extractelement <4 x i32> %c, i32 0
+ %vqadd.i = tail call i32 @llvm.arm64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+ ret i32 %vqadd.i
+}
+
+define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qaddd:
+; CHECK: sqadd d0, d0, d1
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vecext1 = extractelement <2 x i64> %c, i32 0
+ %vqadd.i = tail call i64 @llvm.arm64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+ ret i64 %vqadd.i
+}
+
+define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqadds:
+; CHECK: uqadd s0, s0, s1
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vecext1 = extractelement <4 x i32> %c, i32 0
+ %vqadd.i = tail call i32 @llvm.arm64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+ ret i32 %vqadd.i
+}
+
+define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqaddd:
+; CHECK: uqadd d0, d0, d1
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vecext1 = extractelement <2 x i64> %c, i32 0
+ %vqadd.i = tail call i64 @llvm.arm64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+ ret i64 %vqadd.i
+}
+
+declare i64 @llvm.arm64.neon.uqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.uqadd.i32(i32, i32) nounwind readnone
+declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32) nounwind readnone
+
+define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qsubs:
+; CHECK: sqsub s0, s0, s1
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vecext1 = extractelement <4 x i32> %c, i32 0
+ %vqsub.i = tail call i32 @llvm.arm64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+ ret i32 %vqsub.i
+}
+
+define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qsubd:
+; CHECK: sqsub d0, d0, d1
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vecext1 = extractelement <2 x i64> %c, i32 0
+ %vqsub.i = tail call i64 @llvm.arm64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+ ret i64 %vqsub.i
+}
+
+define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqsubs:
+; CHECK: uqsub s0, s0, s1
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vecext1 = extractelement <4 x i32> %c, i32 0
+ %vqsub.i = tail call i32 @llvm.arm64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+ ret i32 %vqsub.i
+}
+
+define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqsubd:
+; CHECK: uqsub d0, d0, d1
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vecext1 = extractelement <2 x i64> %c, i32 0
+ %vqsub.i = tail call i64 @llvm.arm64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+ ret i64 %vqsub.i
+}
+
+declare i64 @llvm.arm64.neon.uqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.uqsub.i32(i32, i32) nounwind readnone
+declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32) nounwind readnone
+
+define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
+; CHECK-LABEL: qabss:
+; CHECK: sqabs s0, s0
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vqabs.i = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %vecext) nounwind
+ ret i32 %vqabs.i
+}
+
+define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
+; CHECK-LABEL: qabsd:
+; CHECK: sqabs d0, d0
+; CHECK: ret
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqabs.i = tail call i64 @llvm.arm64.neon.sqabs.i64(i64 %vecext) nounwind
+ ret i64 %vqabs.i
+}
+
+define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
+; CHECK-LABEL: qnegs:
+; CHECK: sqneg s0, s0
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vqneg.i = tail call i32 @llvm.arm64.neon.sqneg.i32(i32 %vecext) nounwind
+ ret i32 %vqneg.i
+}
+
+define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
+; CHECK-LABEL: qnegd:
+; CHECK: sqneg d0, d0
+; CHECK: ret
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqneg.i = tail call i64 @llvm.arm64.neon.sqneg.i64(i64 %vecext) nounwind
+ ret i64 %vqneg.i
+}
+
+declare i64 @llvm.arm64.neon.sqneg.i64(i64) nounwind readnone
+declare i32 @llvm.arm64.neon.sqneg.i32(i32) nounwind readnone
+declare i64 @llvm.arm64.neon.sqabs.i64(i64) nounwind readnone
+declare i32 @llvm.arm64.neon.sqabs.i32(i32) nounwind readnone
+
+
+define i32 @vqmovund(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovund:
+; CHECK: sqxtun s0, d0
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqmovun.i = tail call i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind
+ ret i32 %vqmovun.i
+}
+
+define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovnd_s:
+; CHECK: sqxtn s0, d0
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind
+ ret i32 %vqmovn.i
+}
+
+define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovnd_u:
+; CHECK: uqxtn s0, d0
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind
+ ret i32 %vqmovn.i
+}
+
+declare i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone
diff --git a/test/CodeGen/ARM64/arith.ll b/test/CodeGen/ARM64/arith.ll
new file mode 100644
index 0000000..b6ff0da
--- /dev/null
+++ b/test/CodeGen/ARM64/arith.ll
@@ -0,0 +1,262 @@
+; RUN: llc < %s -march=arm64 -asm-verbose=false | FileCheck %s
+
+define i32 @t1(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: add w0, w1, w0
+; CHECK: ret
+ %add = add i32 %b, %a
+ ret i32 %add
+}
+
+define i32 @t2(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: udiv w0, w0, w1
+; CHECK: ret
+ %udiv = udiv i32 %a, %b
+ ret i32 %udiv
+}
+
+define i64 @t3(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: udiv x0, x0, x1
+; CHECK: ret
+ %udiv = udiv i64 %a, %b
+ ret i64 %udiv
+}
+
+define i32 @t4(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: sdiv w0, w0, w1
+; CHECK: ret
+ %sdiv = sdiv i32 %a, %b
+ ret i32 %sdiv
+}
+
+define i64 @t5(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: sdiv x0, x0, x1
+; CHECK: ret
+ %sdiv = sdiv i64 %a, %b
+ ret i64 %sdiv
+}
+
+define i32 @t6(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: lslv w0, w0, w1
+; CHECK: ret
+ %shl = shl i32 %a, %b
+ ret i32 %shl
+}
+
+define i64 @t7(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t7:
+; CHECK: lslv x0, x0, x1
+; CHECK: ret
+ %shl = shl i64 %a, %b
+ ret i64 %shl
+}
+
+define i32 @t8(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: lsrv w0, w0, w1
+; CHECK: ret
+ %lshr = lshr i32 %a, %b
+ ret i32 %lshr
+}
+
+define i64 @t9(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t9:
+; CHECK: lsrv x0, x0, x1
+; CHECK: ret
+ %lshr = lshr i64 %a, %b
+ ret i64 %lshr
+}
+
+define i32 @t10(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t10:
+; CHECK: asrv w0, w0, w1
+; CHECK: ret
+ %ashr = ashr i32 %a, %b
+ ret i32 %ashr
+}
+
+define i64 @t11(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t11:
+; CHECK: asrv x0, x0, x1
+; CHECK: ret
+ %ashr = ashr i64 %a, %b
+ ret i64 %ashr
+}
+
+define i32 @t12(i16 %a, i32 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t12:
+; CHECK: add w0, w1, w0, sxth
+; CHECK: ret
+ %c = sext i16 %a to i32
+ %e = add i32 %x, %c
+ ret i32 %e
+}
+
+define i32 @t13(i16 %a, i32 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t13:
+; CHECK: add w0, w1, w0, sxth #2
+; CHECK: ret
+ %c = sext i16 %a to i32
+ %d = shl i32 %c, 2
+ %e = add i32 %x, %d
+ ret i32 %e
+}
+
+define i64 @t14(i16 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t14:
+; CHECK: add x0, x1, w0, uxth #3
+; CHECK: ret
+ %c = zext i16 %a to i64
+ %d = shl i64 %c, 3
+ %e = add i64 %x, %d
+ ret i64 %e
+}
+
+; rdar://9160598
+define i64 @t15(i64 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t15:
+; CHECK: add x0, x1, w0, uxtw
+; CHECK: ret
+ %b = and i64 %a, 4294967295
+ %c = add i64 %x, %b
+ ret i64 %c
+}
+
+define i64 @t16(i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t16:
+; CHECK: lsl x0, x0, #1
+; CHECK: ret
+ %a = shl i64 %x, 1
+ ret i64 %a
+}
+
+; rdar://9166974
+define i64 @t17(i16 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t17:
+; CHECK: sxth [[REG:x[0-9]+]], x0
+; CHECK: sub x0, xzr, [[REG]], lsl #32
+; CHECK: ret
+ %tmp16 = sext i16 %a to i64
+ %tmp17 = mul i64 %tmp16, -4294967296
+ ret i64 %tmp17
+}
+
+define i32 @t18(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t18:
+; CHECK: sdiv w0, w0, w1
+; CHECK: ret
+ %sdiv = call i32 @llvm.arm64.sdiv.i32(i32 %a, i32 %b)
+ ret i32 %sdiv
+}
+
+define i64 @t19(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t19:
+; CHECK: sdiv x0, x0, x1
+; CHECK: ret
+ %sdiv = call i64 @llvm.arm64.sdiv.i64(i64 %a, i64 %b)
+ ret i64 %sdiv
+}
+
+define i32 @t20(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t20:
+; CHECK: udiv w0, w0, w1
+; CHECK: ret
+ %udiv = call i32 @llvm.arm64.udiv.i32(i32 %a, i32 %b)
+ ret i32 %udiv
+}
+
+define i64 @t21(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t21:
+; CHECK: udiv x0, x0, x1
+; CHECK: ret
+ %udiv = call i64 @llvm.arm64.udiv.i64(i64 %a, i64 %b)
+ ret i64 %udiv
+}
+
+declare i32 @llvm.arm64.sdiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.arm64.sdiv.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.udiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.arm64.udiv.i64(i64, i64) nounwind readnone
+
+; 32-bit not.
+define i32 @inv_32(i32 %x) nounwind ssp {
+entry:
+; CHECK: inv_32
+; CHECK: mvn w0, w0
+; CHECK: ret
+ %inv = xor i32 %x, -1
+ ret i32 %inv
+}
+
+; 64-bit not.
+define i64 @inv_64(i64 %x) nounwind ssp {
+entry:
+; CHECK: inv_64
+; CHECK: mvn x0, x0
+; CHECK: ret
+ %inv = xor i64 %x, -1
+ ret i64 %inv
+}
+
+; Multiplying by a power of two plus or minus one is better done via a shift
+; and an add/sub than via madd/msub: the latter take 4+ cycles, the former only
+; two in total, even as a two-instruction subtract sequence (worked out after f3 below).
+define i32 @f0(i32 %a) nounwind readnone ssp {
+; CHECK-LABEL: f0:
+; CHECK-NEXT: add w0, w0, w0, lsl #3
+; CHECK-NEXT: ret
+ %res = mul i32 %a, 9
+ ret i32 %res
+}
+
+define i64 @f1(i64 %a) nounwind readnone ssp {
+; CHECK-LABEL: f1:
+; CHECK-NEXT: lsl x8, x0, #4
+; CHECK-NEXT: sub x0, x8, x0
+; CHECK-NEXT: ret
+ %res = mul i64 %a, 15
+ ret i64 %res
+}
+
+define i32 @f2(i32 %a) nounwind readnone ssp {
+; CHECK-LABEL: f2:
+; CHECK-NEXT: lsl w8, w0, #3
+; CHECK-NEXT: sub w0, w8, w0
+; CHECK-NEXT: ret
+ %res = mul nsw i32 %a, 7
+ ret i32 %res
+}
+
+define i64 @f3(i64 %a) nounwind readnone ssp {
+; CHECK-LABEL: f3:
+; CHECK-NEXT: add x0, x0, x0, lsl #4
+; CHECK-NEXT: ret
+ %res = mul nsw i64 %a, 17
+ ret i64 %res
+}
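The arithmetic behind the four checks above: 9*a = a + 8*a, so f0 lowers to
add w0, w0, w0, lsl #3; 15*a = 16*a - a, so f1 becomes lsl x8, x0, #4 followed
by sub x0, x8, x0; likewise 7*a = 8*a - a (f2) and 17*a = a + 16*a (f3).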
diff --git a/test/CodeGen/ARM64/atomic-128.ll b/test/CodeGen/ARM64/atomic-128.ll
new file mode 100644
index 0000000..a0039a3
--- /dev/null
+++ b/test/CodeGen/ARM64/atomic-128.ll
@@ -0,0 +1,213 @@
+; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs | FileCheck %s
+
+@var = global i128 0
+
+define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x0]
+; CHECK: cmp [[RESULTLO]], x2
+; CHECK: sbc xzr, [[RESULTHI]], x3
+; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+; CHECK: [[LABEL2]]:
+ %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+ ret i128 %val
+}
+
+define void @fetch_and_nand(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: bic [[SCRATCH_REGLO:x[0-9]+]], x2, [[DEST_REGLO]]
+; CHECK: bic [[SCRATCH_REGHI:x[0-9]+]], x3, [[DEST_REGHI]]
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str [[DEST_REGHI]]
+; CHECK: str [[DEST_REGLO]]
+ %val = atomicrmw nand i128* %p, i128 %bits release
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_or(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: orr [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK: orr [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str [[DEST_REGHI]]
+; CHECK: str [[DEST_REGLO]]
+ %val = atomicrmw or i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_add(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_add:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: adds [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK: adc [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str [[DEST_REGHI]]
+; CHECK: str [[DEST_REGLO]]
+ %val = atomicrmw add i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_sub(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_sub:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: subs [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK: sbc [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str [[DEST_REGHI]]
+; CHECK: str [[DEST_REGLO]]
+ %val = atomicrmw sub i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_min(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_min:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: sbc xzr, [[DEST_REGHI]], x3
+; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, lt
+; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, lt
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str [[DEST_REGHI]]
+; CHECK: str [[DEST_REGLO]]
+ %val = atomicrmw min i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_max(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_max:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: sbc xzr, [[DEST_REGHI]], x3
+; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, gt
+; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, gt
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str [[DEST_REGHI]]
+; CHECK: str [[DEST_REGLO]]
+ %val = atomicrmw max i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_umin(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umin:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: sbc xzr, [[DEST_REGHI]], x3
+; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, cc
+; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, cc
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str [[DEST_REGHI]]
+; CHECK: str [[DEST_REGLO]]
+ %val = atomicrmw umin i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_umax(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umax:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: sbc xzr, [[DEST_REGHI]], x3
+; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, hi
+; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, hi
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK: str [[DEST_REGHI]]
+; CHECK: str [[DEST_REGLO]]
+ %val = atomicrmw umax i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define i128 @atomic_load_seq_cst(i128* %p) {
+; CHECK-LABEL: atomic_load_seq_cst:
+; CHECK-NOT: dmb
+; CHECK: ldaxp
+; CHECK-NOT: dmb
+ %r = load atomic i128* %p seq_cst, align 16
+ ret i128 %r
+}
+
+define i128 @atomic_load_relaxed(i128* %p) {
+; CHECK-LABEL: atomic_load_relaxed:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: orr [[SAMELO:x[0-9]+]], [[LO]], xzr
+; CHECK: orr [[SAMEHI:x[0-9]+]], [[HI]], xzr
+; CHECK: stxp [[SUCCESS:w[0-9]+]], [[SAMELO]], [[SAMEHI]], [x0]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ %r = load atomic i128* %p monotonic, align 16
+ ret i128 %r
+}
+
+
+define void @atomic_store_seq_cst(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_seq_cst:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp xzr, xzr, [x2]
+; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p seq_cst, align 16
+ ret void
+}
+
+define void @atomic_store_release(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_release:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p release, align 16
+ ret void
+}
+
+define void @atomic_store_relaxed(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_relaxed:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p unordered, align 16
+ ret void
+}
diff --git a/test/CodeGen/ARM64/atomic.ll b/test/CodeGen/ARM64/atomic.ll
new file mode 100644
index 0000000..cf8cf7d
--- /dev/null
+++ b/test/CodeGen/ARM64/atomic.ll
@@ -0,0 +1,343 @@
+; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
+
+define i32 @val_compare_and_swap(i32* %p) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK: orr [[NEWVAL_REG:w[0-9]+]], wzr, #0x4
+; CHECK: orr [[OLDVAL_REG:w[0-9]+]], wzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[RESULT:w[0-9]+]], [x0]
+; CHECK: cmp [[RESULT]], [[OLDVAL_REG]]
+; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: [[LABEL2]]:
+ %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire
+ ret i32 %val
+}
+
+define i64 @val_compare_and_swap_64(i64* %p) {
+; CHECK-LABEL: val_compare_and_swap_64:
+; CHECK: orr [[NEWVAL_REG:x[0-9]+]], xzr, #0x4
+; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr [[RESULT:x[0-9]+]], [x0]
+; CHECK: cmp [[RESULT]], [[OLDVAL_REG]]
+; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK-NOT: stxr [[NEWVAL_REG]], [[NEWVAL_REG]]
+; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: [[LABEL2]]:
+ %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic
+ ret i64 %val
+}
+
+define i32 @fetch_and_nand(i32* %p) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK: orr [[OLDVAL_REG:w[0-9]+]], wzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0]
+; CHECK: bic [[SCRATCH2_REG:w[0-9]+]], [[OLDVAL_REG]], w[[DEST_REG]]
+; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
+; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov x0, x[[DEST_REG]]
+ %val = atomicrmw nand i32* %p, i32 7 release
+ ret i32 %val
+}
+
+define i64 @fetch_and_nand_64(i64* %p) {
+; CHECK-LABEL: fetch_and_nand_64:
+; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x0]
+; CHECK: bic [[SCRATCH2_REG:x[0-9]+]], [[OLDVAL_REG]], [[DEST_REG]]
+; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov x0, [[DEST_REG]]
+ %val = atomicrmw nand i64* %p, i64 7 acq_rel
+ ret i64 %val
+}
+
+define i32 @fetch_and_or(i32* %p) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #5
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0]
+; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]]
+; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
+; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov x0, x[[DEST_REG]]
+ %val = atomicrmw or i32* %p, i32 5 seq_cst
+ ret i32 %val
+}
+
+define i64 @fetch_and_or_64(i64* %p) {
+; CHECK: fetch_and_or_64:
+; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x0]
+; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], [[OLDVAL_REG]]
+; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov x0, [[DEST_REG]]
+ %val = atomicrmw or i64* %p, i64 7 monotonic
+ ret i64 %val
+}
+
+define void @acquire_fence() {
+ fence acquire
+ ret void
+ ; CHECK-LABEL: acquire_fence:
+ ; CHECK: dmb ishld
+}
+
+define void @release_fence() {
+ fence release
+ ret void
+ ; CHECK-LABEL: release_fence:
+ ; CHECK: dmb ish{{$}}
+}
+
+define void @seq_cst_fence() {
+ fence seq_cst
+ ret void
+ ; CHECK-LABEL: seq_cst_fence:
+ ; CHECK: dmb ish{{$}}
+}
+
+define i32 @atomic_load(i32* %p) {
+ %r = load atomic i32* %p seq_cst, align 4
+ ret i32 %r
+ ; CHECK-LABEL: atomic_load:
+ ; CHECK: ldar
+}
+
+define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_8:
+ %ptr_unsigned = getelementptr i8* %p, i32 4095
+ %val_unsigned = load atomic i8* %ptr_unsigned monotonic, align 1
+; CHECK: ldrb {{w[0-9]+}}, [x0, #4095]
+
+ %ptr_regoff = getelementptr i8* %p, i32 %off32
+ %val_regoff = load atomic i8* %ptr_regoff unordered, align 1
+ %tot1 = add i8 %val_unsigned, %val_regoff
+ ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: ldrb {{w[0-9]+}}, [x0, x1, sxtw]
+
+ %ptr_unscaled = getelementptr i8* %p, i32 -256
+ %val_unscaled = load atomic i8* %ptr_unscaled monotonic, align 1
+ %tot2 = add i8 %tot1, %val_unscaled
+; CHECK: ldurb {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
+ %val_random = load atomic i8* %ptr_random unordered, align 1
+ %tot3 = add i8 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: ldrb {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret i8 %tot3
+}
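This body, and the wider relaxed-ordering loads and stores that follow, all
exercise the same four AArch64 addressing cases: an offset that fits the scaled
unsigned 12-bit immediate form (#4095 here), a register offset with an extend,
a negative offset that only the unscaled 9-bit form accepts (ldurb/ldur with
#-256), and an offset such as 0x123000 = 0x123 << 12 that fits no load/store
immediate and is therefore first materialized with an add of a shifted 12-bit
immediate.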
+
+define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_16:
+ %ptr_unsigned = getelementptr i16* %p, i32 4095
+ %val_unsigned = load atomic i16* %ptr_unsigned monotonic, align 2
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr i16* %p, i32 %off32
+ %val_regoff = load atomic i16* %ptr_regoff unordered, align 2
+ %tot1 = add i16 %val_unsigned, %val_regoff
+ ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: ldrh {{w[0-9]+}}, [x0, x1, sxtw #1]
+
+ %ptr_unscaled = getelementptr i16* %p, i32 -128
+ %val_unscaled = load atomic i16* %ptr_unscaled monotonic, align 2
+ %tot2 = add i16 %tot1, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
+ %val_random = load atomic i16* %ptr_random unordered, align 2
+ %tot3 = add i16 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: ldrh {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret i16 %tot3
+}
+
+define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_32:
+ %ptr_unsigned = getelementptr i32* %p, i32 4095
+ %val_unsigned = load atomic i32* %ptr_unsigned monotonic, align 4
+; CHECK: ldr {{w[0-9]+}}, [x0, #16380]
+
+ %ptr_regoff = getelementptr i32* %p, i32 %off32
+ %val_regoff = load atomic i32* %ptr_regoff unordered, align 4
+ %tot1 = add i32 %val_unsigned, %val_regoff
+ ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: ldr {{w[0-9]+}}, [x0, x1, sxtw #2]
+
+ %ptr_unscaled = getelementptr i32* %p, i32 -64
+ %val_unscaled = load atomic i32* %ptr_unscaled monotonic, align 4
+ %tot2 = add i32 %tot1, %val_unscaled
+; CHECK: ldur {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
+ %val_random = load atomic i32* %ptr_random unordered, align 4
+ %tot3 = add i32 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: ldr {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret i32 %tot3
+}
+
+define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_64:
+ %ptr_unsigned = getelementptr i64* %p, i32 4095
+ %val_unsigned = load atomic i64* %ptr_unsigned monotonic, align 8
+; CHECK: ldr {{x[0-9]+}}, [x0, #32760]
+
+ %ptr_regoff = getelementptr i64* %p, i32 %off32
+ %val_regoff = load atomic i64* %ptr_regoff unordered, align 8
+ %tot1 = add i64 %val_unsigned, %val_regoff
+ ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: ldr {{x[0-9]+}}, [x0, x1, sxtw #3]
+
+ %ptr_unscaled = getelementptr i64* %p, i32 -32
+ %val_unscaled = load atomic i64* %ptr_unscaled monotonic, align 8
+ %tot2 = add i64 %tot1, %val_unscaled
+; CHECK: ldur {{x[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm)
+ %val_random = load atomic i64* %ptr_random unordered, align 8
+ %tot3 = add i64 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: ldr {{x[0-9]+}}, [x[[ADDR]]]
+
+ ret i64 %tot3
+}
+
+
+define void @atomic_store(i32* %p) {
+ store atomic i32 4, i32* %p seq_cst, align 4
+ ret void
+  ; CHECK-LABEL: atomic_store:
+ ; CHECK: stlr
+}
+
+define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) {
+; CHECK-LABEL: atomic_store_relaxed_8:
+ %ptr_unsigned = getelementptr i8* %p, i32 4095
+ store atomic i8 %val, i8* %ptr_unsigned monotonic, align 1
+; CHECK: strb {{w[0-9]+}}, [x0, #4095]
+
+ %ptr_regoff = getelementptr i8* %p, i32 %off32
+ store atomic i8 %val, i8* %ptr_regoff unordered, align 1
+ ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: strb {{w[0-9]+}}, [x0, x1, sxtw]
+
+ %ptr_unscaled = getelementptr i8* %p, i32 -256
+ store atomic i8 %val, i8* %ptr_unscaled monotonic, align 1
+; CHECK: sturb {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
+ store atomic i8 %val, i8* %ptr_random unordered, align 1
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: strb {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) {
+; CHECK-LABEL: atomic_store_relaxed_16:
+ %ptr_unsigned = getelementptr i16* %p, i32 4095
+ store atomic i16 %val, i16* %ptr_unsigned monotonic, align 2
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr i16* %p, i32 %off32
+ store atomic i16 %val, i16* %ptr_regoff unordered, align 2
+ ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: strh {{w[0-9]+}}, [x0, x1, sxtw #1]
+
+ %ptr_unscaled = getelementptr i16* %p, i32 -128
+ store atomic i16 %val, i16* %ptr_unscaled monotonic, align 2
+; CHECK: sturh {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
+ store atomic i16 %val, i16* %ptr_random unordered, align 2
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: strh {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) {
+; CHECK-LABEL: atomic_store_relaxed_32:
+ %ptr_unsigned = getelementptr i32* %p, i32 4095
+ store atomic i32 %val, i32* %ptr_unsigned monotonic, align 4
+; CHECK: str {{w[0-9]+}}, [x0, #16380]
+
+ %ptr_regoff = getelementptr i32* %p, i32 %off32
+ store atomic i32 %val, i32* %ptr_regoff unordered, align 4
+ ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: str {{w[0-9]+}}, [x0, x1, sxtw #2]
+
+ %ptr_unscaled = getelementptr i32* %p, i32 -64
+ store atomic i32 %val, i32* %ptr_unscaled monotonic, align 4
+; CHECK: stur {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
+ store atomic i32 %val, i32* %ptr_random unordered, align 4
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: str {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) {
+; CHECK-LABEL: atomic_store_relaxed_64:
+ %ptr_unsigned = getelementptr i64* %p, i32 4095
+ store atomic i64 %val, i64* %ptr_unsigned monotonic, align 8
+; CHECK: str {{x[0-9]+}}, [x0, #32760]
+
+ %ptr_regoff = getelementptr i64* %p, i32 %off32
+ store atomic i64 %val, i64* %ptr_regoff unordered, align 8
+ ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
+; CHECK: str {{x[0-9]+}}, [x0, x1, sxtw #3]
+
+ %ptr_unscaled = getelementptr i64* %p, i32 -32
+ store atomic i64 %val, i64* %ptr_unscaled monotonic, align 8
+; CHECK: stur {{x[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm)
+ store atomic i64 %val, i64* %ptr_random unordered, align 8
+; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: str {{x[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+; rdar://11531169
+; rdar://11531308
+
+%"class.X::Atomic" = type { %struct.x_atomic_t }
+%struct.x_atomic_t = type { i32 }
+
+@counter = external hidden global %"class.X::Atomic", align 4
+
+define i32 @next_id() nounwind optsize ssp align 2 {
+entry:
+ %0 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst
+ %add.i = add i32 %0, 1
+ %tobool = icmp eq i32 %add.i, 0
+ br i1 %tobool, label %if.else, label %return
+
+if.else: ; preds = %entry
+ %1 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst
+ %add.i2 = add i32 %1, 1
+ br label %return
+
+return: ; preds = %if.else, %entry
+ %retval.0 = phi i32 [ %add.i2, %if.else ], [ %add.i, %entry ]
+ ret i32 %retval.0
+}
diff --git a/test/CodeGen/ARM64/basic-pic.ll b/test/CodeGen/ARM64/basic-pic.ll
new file mode 100644
index 0000000..9fdb1e9
--- /dev/null
+++ b/test/CodeGen/ARM64/basic-pic.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
+
+@var = global i32 0
+
+define i32 @get_globalvar() {
+; CHECK-LABEL: get_globalvar:
+
+ %val = load i32* @var
+; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
+; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], :got_lo12:var]
+; CHECK: ldr w0, [x[[GOTLOC]]]
+
+ ret i32 %val
+}
+
+define i32* @get_globalvaraddr() {
+; CHECK-LABEL: get_globalvaraddr:
+
+ %val = load i32* @var
+; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:var]
+
+ ret i32* @var
+}
+
+@hiddenvar = hidden global i32 0
+
+define i32 @get_hiddenvar() {
+; CHECK-LABEL: get_hiddenvar:
+
+ %val = load i32* @hiddenvar
+; CHECK: adrp x[[HI:[0-9]+]], hiddenvar
+; CHECK: ldr w0, [x[[HI]], :lo12:hiddenvar]
+
+ ret i32 %val
+}
+
+define i32* @get_hiddenvaraddr() {
+; CHECK-LABEL: get_hiddenvaraddr:
+
+ %val = load i32* @hiddenvar
+; CHECK: adrp [[HI:x[0-9]+]], hiddenvar
+; CHECK: add x0, [[HI]], :lo12:hiddenvar
+
+ ret i32* @hiddenvar
+}
+
+define void()* @get_func() {
+; CHECK-LABEL: get_func:
+
+ ret void()* bitcast(void()*()* @get_func to void()*)
+; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func
+; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:get_func]
+}
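The split above is the point of the test: with -relocation-model=pic,
default-visibility symbols such as @var and @get_func may be preempted at link
or load time, so their addresses are fetched from a GOT slot (adrp of :got:
plus an ldr of :got_lo12:), while @hiddenvar binds locally and is addressed
directly with an adrp/:lo12: pair.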
diff --git a/test/CodeGen/ARM64/big-imm-offsets.ll b/test/CodeGen/ARM64/big-imm-offsets.ll
new file mode 100644
index 0000000..a56df07
--- /dev/null
+++ b/test/CodeGen/ARM64/big-imm-offsets.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=arm64 < %s
+
+
+; Make sure large offsets aren't mistaken for valid immediate offsets.
+; <rdar://problem/13190511>
+define void @f(i32* nocapture %p) {
+entry:
+ %a = ptrtoint i32* %p to i64
+ %ao = add i64 %a, 25769803792
+ %b = inttoptr i64 %ao to i32*
+ store volatile i32 0, i32* %b, align 4
+ store volatile i32 0, i32* %b, align 4
+ ret void
+}
diff --git a/test/CodeGen/ARM64/big-stack.ll b/test/CodeGen/ARM64/big-stack.ll
new file mode 100644
index 0000000..56ca30c
--- /dev/null
+++ b/test/CodeGen/ARM64/big-stack.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-apple-macosx10"
+
+; Check that big stacks are generated correctly.
+; Currently, this is done by a sequence of sub instructions,
+; each of which can encode a 12-bit immediate, optionally shifted
+; left by 12; i.e., 16773120 is the biggest value per instruction.
+; <rdar://12513931>
+; CHECK-LABEL: foo:
+; CHECK: sub sp, sp, #16773120
+; CHECK: sub sp, sp, #16773120
+; CHECK: sub sp, sp, #8192
+define void @foo() nounwind ssp {
+entry:
+ %buffer = alloca [33554432 x i8], align 1
+ %arraydecay = getelementptr inbounds [33554432 x i8]* %buffer, i64 0, i64 0
+ call void @doit(i8* %arraydecay) nounwind
+ ret void
+}
+
+declare void @doit(i8*)
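The three sub immediates above add up exactly: the largest encodable step is
0xfff << 12 = 16773120, and 16773120 + 16773120 + 8192 = 33554432, the size of
the alloca (the function's own frame overhead is not part of these checks).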
diff --git a/test/CodeGen/ARM64/bitfield-extract.ll b/test/CodeGen/ARM64/bitfield-extract.ll
new file mode 100644
index 0000000..96b6967
--- /dev/null
+++ b/test/CodeGen/ARM64/bitfield-extract.ll
@@ -0,0 +1,406 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+%struct.X = type { i8, i8, [2 x i8] }
+%struct.Y = type { i32, i8 }
+%struct.Z = type { i8, i8, [2 x i8], i16 }
+%struct.A = type { i64, i8 }
+
+define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind optsize ssp {
+; CHECK-LABEL: foo:
+; CHECK: ubfm
+; CHECK-NOT: and
+; CHECK: ret
+
+ %tmp = bitcast %struct.X* %x to i32*
+ %tmp1 = load i32* %tmp, align 4
+ %b = getelementptr inbounds %struct.Y* %y, i64 0, i32 1
+ %bf.clear = lshr i32 %tmp1, 3
+ %bf.clear.lobit = and i32 %bf.clear, 1
+ %frombool = trunc i32 %bf.clear.lobit to i8
+ store i8 %frombool, i8* %b, align 1
+ ret void
+}
+
+define i32 @baz(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: baz:
+; CHECK: sbfm w0, w0, #0, #3
+ %tmp = trunc i64 %cav1.coerce to i32
+ %tmp1 = shl i32 %tmp, 28
+ %bf.val.sext = ashr exact i32 %tmp1, 28
+ ret i32 %bf.val.sext
+}
+
+define i32 @bar(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sbfm w0, w0, #4, #9
+ %tmp = trunc i64 %cav1.coerce to i32
+ %cav1.sroa.0.1.insert = shl i32 %tmp, 22
+ %tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26
+ ret i32 %tmp1
+}
+
+define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp {
+; CHECK-LABEL: fct1:
+; CHECK: ubfm
+; CHECK-NOT: and
+; CHECK: ret
+
+ %tmp = bitcast %struct.Z* %x to i64*
+ %tmp1 = load i64* %tmp, align 4
+ %b = getelementptr inbounds %struct.A* %y, i64 0, i32 0
+ %bf.clear = lshr i64 %tmp1, 3
+ %bf.clear.lobit = and i64 %bf.clear, 1
+ store i64 %bf.clear.lobit, i64* %b, align 8
+ ret void
+}
+
+define i64 @fct2(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: fct2:
+; CHECK: sbfm x0, x0, #0, #35
+ %tmp = shl i64 %cav1.coerce, 28
+ %bf.val.sext = ashr exact i64 %tmp, 28
+ ret i64 %bf.val.sext
+}
+
+define i64 @fct3(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: fct3:
+; CHECK: sbfm x0, x0, #4, #41
+ %cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22
+ %tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26
+ ret i64 %tmp1
+}
+
+define void @fct4(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct4:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #39
+; CHECK-NEXT: str [[REG1]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -16777216
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 16777215
+ %or = or i64 %and, %and1
+ store i64 %or, i64* %y, align 8
+ ret void
+}
+
+define void @fct5(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct5:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; CHECK-NEXT: str [[REG1]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ store i32 %or, i32* %y, align 8
+ ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some low bits
+define void @fct6(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct6:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %shr1 = lshr i32 %or, 2
+ store i32 %shr1, i32* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+define void @fct7(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct7:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %shl = shl i32 %or, 2
+ store i32 %shl, i32* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some low bits
+; (i64 version)
+define void @fct8(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct8:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -8
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 7
+ %or = or i64 %and, %and1
+ %shr1 = lshr i64 %or, 2
+ store i64 %shr1, i64* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; (i64 version)
+define void @fct9(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct9:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -8
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 7
+ %or = or i64 %and, %and1
+ %shl = shl i64 %or, 2
+ store i64 %shl, i64* %y, align 8
+ ret void
+}
+
+; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr)
+; (i32 version)
+define void @fct10(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct10:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #0, #2
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %and1 = and i32 %x, 7
+ %or = or i32 %and, %and1
+ %shl = shl i32 %or, 2
+ store i32 %shl, i32* %y, align 8
+ ret void
+}
+
+; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr)
+; (i64 version)
+define void @fct11(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct11:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #0, #2
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -8
+ %and1 = and i64 %x, 7
+ %or = or i64 %and, %and1
+ %shl = shl i64 %or, 2
+ store i64 %shl, i64* %y, align 8
+ ret void
+}
+
+define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 {
+; CHECK-LABEL: fct12bis:
+; CHECK-NOT: and
+; CHECK: ubfm w0, w0, #11, #11
+ %and.i.i = and i32 %tmp2, 2048
+ %tobool.i.i = icmp ne i32 %and.i.i, 0
+ ret i1 %tobool.i.i
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+define void @fct12(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct12:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfm [[REG2:w[0-9]+]], [[REG1]], #2, #29
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %shl = shl i32 %or, 2
+ %shr2 = lshr i32 %shl, 4
+ store i32 %shr2, i32* %y, align 8
+ ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+; (i64 version)
+define void @fct13(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct13:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfm [[REG2:x[0-9]+]], [[REG1]], #2, #61
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -8
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 7
+ %or = or i64 %and, %and1
+ %shl = shl i64 %or, 2
+ %shr2 = lshr i64 %shl, 4
+ store i64 %shr2, i64* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+define void @fct14(i32* nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct14:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], w1, #16, #23
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #4
+; CHECK-NEXT: bfm [[REG2]], w2, #5, #7
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG3:w[0-9]+]], [[REG2]], #2
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -256
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 255
+ %or = or i32 %and, %and1
+ %shl = lshr i32 %or, 4
+ %and2 = and i32 %shl, -8
+ %shr1 = lshr i32 %x1, 5
+ %and3 = and i32 %shr1, 7
+ %or1 = or i32 %and2, %and3
+ %shl1 = shl i32 %or1, 2
+ store i32 %shl1, i32* %y, align 8
+ ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+; (i64 version)
+define void @fct15(i64* nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct15:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfm [[REG1]], x1, #16, #23
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #4
+; CHECK-NEXT: bfm [[REG2]], x2, #5, #7
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG3:x[0-9]+]], [[REG2]], #2
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -256
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 255
+ %or = or i64 %and, %and1
+ %shl = lshr i64 %or, 4
+ %and2 = and i64 %shl, -8
+ %shr1 = lshr i64 %x1, 5
+ %and3 = and i64 %shr1, 7
+ %or1 = or i64 %and2, %and3
+ %shl1 = shl i64 %or1, 2
+ store i64 %shl1, i64* %y, align 8
+ ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits and a masking operation has to be kept
+define void @fct16(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct16:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; Create the constant
+; CHECK: movz [[REGCST:w[0-9]+]], #26, lsl #16
+; CHECK: movk [[REGCST]], #33120
+; Do the masking
+; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]]
+; CHECK-NEXT: bfm [[REG2]], w1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfm [[REG3:w[0-9]+]], [[REG2]], #2, #29
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, 1737056
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %shl = shl i32 %or, 2
+ %shr2 = lshr i32 %shl, 4
+ store i32 %shr2, i32* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits and a masking operation has to be kept
+; (i64 version)
+define void @fct17(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct17:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; Create the constant
+; CHECK: movz [[REGCST:x[0-9]+]], #26, lsl #16
+; CHECK: movk [[REGCST]], #33120
+; Do the masking
+; CHECK: and [[REG2:x[0-9]+]], [[REG1]], [[REGCST]]
+; CHECK-NEXT: bfm [[REG2]], x1, #16, #18
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfm [[REG3:x[0-9]+]], [[REG2]], #2, #61
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, 1737056
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 7
+ %or = or i64 %and, %and1
+ %shl = shl i64 %or, 2
+ %shr2 = lshr i64 %shl, 4
+ store i64 %shr2, i64* %y, align 8
+ ret void
+}
+
+define i64 @fct18(i32 %xor72) nounwind ssp {
+; CHECK-LABEL: fct18:
+; CHECK: ubfm x0, x0, #9, #16
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %result = and i64 %conv82, 255
+ ret i64 %result
+}
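+
+; Illustrative aside (general AArch64 background, not from the original test):
+; the "alias of ubfm" comments above refer to the AArch64 shift aliases, e.g.
+;   lsr x0, x1, #4   is the preferred form of   ubfm x0, x1, #4, #63
+;   lsr w0, w1, #4   is the preferred form of   ubfm w0, w1, #4, #31
+;   lsl x0, x1, #2   is the preferred form of   ubfm x0, x1, #62, #61
+; which is why some CHECK lines in these tests expect ubfm and others expect
+; the lsr/lsl spelling.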
diff --git a/test/CodeGen/ARM64/blockaddress.ll b/test/CodeGen/ARM64/blockaddress.ll
new file mode 100644
index 0000000..ac4f19e
--- /dev/null
+++ b/test/CodeGen/ARM64/blockaddress.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX
+; RUN: llc < %s -mtriple=arm64-linux-gnu -code-model=large| FileCheck %s --check-prefix=CHECK-LARGE
+
+; rdar://9188695
+
+define i64 @t() nounwind ssp {
+entry:
+; CHECK-LABEL: t:
+; CHECK: adrp [[REG:x[0-9]+]], Ltmp1@PAGE
+; CHECK: add {{x[0-9]+}}, [[REG]], Ltmp1@PAGEOFF
+
+; CHECK-LINUX-LABEL: t:
+; CHECK-LINUX: adrp [[REG:x[0-9]+]], .Ltmp1
+; CHECK-LINUX: add {{x[0-9]+}}, [[REG]], :lo12:.Ltmp1
+
+; CHECK-LARGE-LABEL: t:
+; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]]
+
+ %recover = alloca i64, align 8
+ store volatile i64 ptrtoint (i8* blockaddress(@t, %mylabel) to i64), i64* %recover, align 8
+ br label %mylabel
+
+mylabel:
+ %tmp = load volatile i64* %recover, align 8
+ ret i64 %tmp
+}
diff --git a/test/CodeGen/ARM64/build-vector.ll b/test/CodeGen/ARM64/build-vector.ll
new file mode 100644
index 0000000..1d137ae
--- /dev/null
+++ b/test/CodeGen/ARM64/build-vector.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+; Check that a vector built up with only one non-zero lane is initialized
+; intelligently.
+define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind {
+; CHECK-LABEL: one_lane:
+; CHECK: dup.16b v[[REG:[0-9]+]], wzr
+; CHECK-NEXT: ins.b v[[REG]][0], w1
+; v and q are aliases, and str is preferred over st.16b when possible
+; rdar://11246289
+; CHECK: str q[[REG]], [x0]
+; CHECK: ret
+ %conv = trunc i32 %skip0 to i8
+ %vset_lane = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %conv, i32 0
+ %tmp = bitcast i32* %out_int to <4 x i32>*
+ %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32>
+ store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16
+ ret void
+}
+
+; Check that building a vector from floats doesn't insert an unnecessary
+; copy for lane zero.
+define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: foo:
+; CHECK-NOT: ins.s v0[0], v0[0]
+; CHECK: ins.s v0[1], v1[0]
+; CHECK: ins.s v0[2], v2[0]
+; CHECK: ins.s v0[3], v3[0]
+; CHECK: ret
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float %b, i32 1
+ %3 = insertelement <4 x float> %2, float %c, i32 2
+ %4 = insertelement <4 x float> %3, float %d, i32 3
+ ret <4 x float> %4
+}
diff --git a/test/CodeGen/ARM64/call-tailcalls.ll b/test/CodeGen/ARM64/call-tailcalls.ll
new file mode 100644
index 0000000..487c1d9
--- /dev/null
+++ b/test/CodeGen/ARM64/call-tailcalls.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+@t = weak global i32 ()* null
+@x = external global i32, align 4
+
+define void @t2() {
+; CHECK-LABEL: t2:
+; CHECK: adrp x[[GOTADDR:[0-9]+]], _t@GOTPAGE
+; CHECK: ldr x[[ADDR:[0-9]+]], [x[[GOTADDR]], _t@GOTPAGEOFF]
+; CHECK: ldr x[[DEST:[0-9]+]], [x[[ADDR]]]
+; CHECK: br x[[DEST]]
+ %tmp = load i32 ()** @t
+ %tmp.upgrd.2 = tail call i32 %tmp()
+ ret void
+}
+
+define void @t3() {
+; CHECK-LABEL: t3:
+; CHECK: b _t2
+ tail call void @t2()
+ ret void
+}
+
+define double @t4(double %a) nounwind readonly ssp {
+; CHECK-LABEL: t4:
+; CHECK: b _sin
+ %tmp = tail call double @sin(double %a) nounwind readonly
+ ret double %tmp
+}
+
+define float @t5(float %a) nounwind readonly ssp {
+; CHECK-LABEL: t5:
+; CHECK: b _sinf
+ %tmp = tail call float @sinf(float %a) nounwind readonly
+ ret float %tmp
+}
+
+define void @t7() nounwind {
+; CHECK-LABEL: t7:
+; CHECK: b _foo
+; CHECK: b _bar
+
+ br i1 undef, label %bb, label %bb1.lr.ph
+
+bb1.lr.ph: ; preds = %entry
+ tail call void @bar() nounwind
+ ret void
+
+bb: ; preds = %entry
+ tail call void @foo() nounwind
+ ret void
+}
+
+define i32 @t8(i32 %x) nounwind ssp {
+; CHECK-LABEL: t8:
+; CHECK: b _a
+; CHECK: b _b
+; CHECK: b _c
+ %and = and i32 %x, 1
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %call = tail call i32 @a(i32 %x) nounwind
+ br label %return
+
+if.end: ; preds = %entry
+ %and1 = and i32 %x, 2
+ %tobool2 = icmp eq i32 %and1, 0
+ br i1 %tobool2, label %if.end5, label %if.then3
+
+if.then3: ; preds = %if.end
+ %call4 = tail call i32 @b(i32 %x) nounwind
+ br label %return
+
+if.end5: ; preds = %if.end
+ %call6 = tail call i32 @c(i32 %x) nounwind
+ br label %return
+
+return: ; preds = %if.end5, %if.then3, %if.then
+ %retval.0 = phi i32 [ %call, %if.then ], [ %call4, %if.then3 ], [ %call6, %if.end5 ]
+ ret i32 %retval.0
+}
+
+declare float @sinf(float) nounwind readonly
+declare double @sin(double) nounwind readonly
+declare void @bar() nounwind
+declare void @foo() nounwind
+declare i32 @a(i32)
+declare i32 @b(i32)
+declare i32 @c(i32)
diff --git a/test/CodeGen/ARM64/cast-opt.ll b/test/CodeGen/ARM64/cast-opt.ll
new file mode 100644
index 0000000..3d7f257
--- /dev/null
+++ b/test/CodeGen/ARM64/cast-opt.ll
@@ -0,0 +1,31 @@
+; RUN: llc -O3 -march=arm64 -mtriple arm64-apple-ios5.0.0 < %s | FileCheck %s
+; <rdar://problem/15992732>
+; Zero truncation is not necessary when the values are extended properly
+; already.
+
+@block = common global i8* null, align 8
+
+define zeroext i8 @foo(i32 %i1, i32 %i2) {
+; CHECK-LABEL: foo:
+; CHECK: csinc
+; CHECK-NOT: and
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %0 = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+ %1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+ %2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %1, %2
+ br i1 %cmp, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %1, %2
+ %conv9 = zext i1 %cmp7 to i8
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i8 [ %conv9, %if.then ], [ 1, %entry ]
+ ret i8 %retval.0
+}
diff --git a/test/CodeGen/ARM64/ccmp-heuristics.ll b/test/CodeGen/ARM64/ccmp-heuristics.ll
new file mode 100644
index 0000000..5575997
--- /dev/null
+++ b/test/CodeGen/ARM64/ccmp-heuristics.ll
@@ -0,0 +1,190 @@
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp | FileCheck %s
+target triple = "arm64-apple-ios7.0.0"
+
+@channelColumns = external global i64
+@channelTracks = external global i64
+@mazeRoute = external hidden unnamed_addr global i8*, align 8
+@TOP = external global i64*
+@BOT = external global i64*
+@netsAssign = external global i64*
+
+; Function from yacr2/maze.c
+; The branch at the end of %if.then is driven by %cmp5 and %cmp6.
+; Isel converts the and i1 into two branches, and arm64-ccmp should not convert
+; it back again. %cmp6 has much higher latency than %cmp5.
+; CHECK: Maze1
+; CHECK: %if.then
+; CHECK: cmp x{{[0-9]+}}, #2
+; CHECK-NEXT: b.cc
+; CHECK: %if.then
+; CHECK: cmp x{{[0-9]+}}, #2
+; CHECK-NEXT: b.cc
+define i32 @Maze1() nounwind ssp {
+entry:
+ %0 = load i64* @channelColumns, align 8, !tbaa !0
+ %cmp90 = icmp eq i64 %0, 0
+ br i1 %cmp90, label %for.end, label %for.body
+
+for.body: ; preds = %for.inc, %entry
+ %1 = phi i64 [ %0, %entry ], [ %37, %for.inc ]
+ %i.092 = phi i64 [ 1, %entry ], [ %inc53, %for.inc ]
+ %numLeft.091 = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ]
+ %2 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx = getelementptr inbounds i8* %2, i64 %i.092
+ %3 = load i8* %arrayidx, align 1, !tbaa !1
+ %tobool = icmp eq i8 %3, 0
+ br i1 %tobool, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ %4 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx1 = getelementptr inbounds i64* %4, i64 %i.092
+ %5 = load i64* %arrayidx1, align 8, !tbaa !0
+ %6 = load i64** @netsAssign, align 8, !tbaa !3
+ %arrayidx2 = getelementptr inbounds i64* %6, i64 %5
+ %7 = load i64* %arrayidx2, align 8, !tbaa !0
+ %8 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx3 = getelementptr inbounds i64* %8, i64 %i.092
+ %9 = load i64* %arrayidx3, align 8, !tbaa !0
+ %arrayidx4 = getelementptr inbounds i64* %6, i64 %9
+ %10 = load i64* %arrayidx4, align 8, !tbaa !0
+ %cmp5 = icmp ugt i64 %i.092, 1
+ %cmp6 = icmp ugt i64 %10, 1
+ %or.cond = and i1 %cmp5, %cmp6
+ br i1 %or.cond, label %land.lhs.true7, label %if.else
+
+land.lhs.true7: ; preds = %if.then
+ %11 = load i64* @channelTracks, align 8, !tbaa !0
+ %add = add i64 %11, 1
+ %call = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add, i64 %10, i64 0, i64 %7, i32 -1, i32 -1)
+ %tobool8 = icmp eq i32 %call, 0
+ br i1 %tobool8, label %land.lhs.true7.if.else_crit_edge, label %if.then9
+
+land.lhs.true7.if.else_crit_edge: ; preds = %land.lhs.true7
+ %.pre = load i64* @channelColumns, align 8, !tbaa !0
+ br label %if.else
+
+if.then9: ; preds = %land.lhs.true7
+ %12 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx10 = getelementptr inbounds i8* %12, i64 %i.092
+ store i8 0, i8* %arrayidx10, align 1, !tbaa !1
+ %13 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx11 = getelementptr inbounds i64* %13, i64 %i.092
+ %14 = load i64* %arrayidx11, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %14)
+ %15 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx12 = getelementptr inbounds i64* %15, i64 %i.092
+ %16 = load i64* %arrayidx12, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %16)
+ br label %for.inc
+
+if.else: ; preds = %land.lhs.true7.if.else_crit_edge, %if.then
+ %17 = phi i64 [ %.pre, %land.lhs.true7.if.else_crit_edge ], [ %1, %if.then ]
+ %cmp13 = icmp ult i64 %i.092, %17
+ %or.cond89 = and i1 %cmp13, %cmp6
+ br i1 %or.cond89, label %land.lhs.true16, label %if.else24
+
+land.lhs.true16: ; preds = %if.else
+ %18 = load i64* @channelTracks, align 8, !tbaa !0
+ %add17 = add i64 %18, 1
+ %call18 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add17, i64 %10, i64 0, i64 %7, i32 1, i32 -1)
+ %tobool19 = icmp eq i32 %call18, 0
+ br i1 %tobool19, label %if.else24, label %if.then20
+
+if.then20: ; preds = %land.lhs.true16
+ %19 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx21 = getelementptr inbounds i8* %19, i64 %i.092
+ store i8 0, i8* %arrayidx21, align 1, !tbaa !1
+ %20 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx22 = getelementptr inbounds i64* %20, i64 %i.092
+ %21 = load i64* %arrayidx22, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %21)
+ %22 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx23 = getelementptr inbounds i64* %22, i64 %i.092
+ %23 = load i64* %arrayidx23, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %23)
+ br label %for.inc
+
+if.else24: ; preds = %land.lhs.true16, %if.else
+ br i1 %cmp5, label %land.lhs.true26, label %if.else36
+
+land.lhs.true26: ; preds = %if.else24
+ %24 = load i64* @channelTracks, align 8, !tbaa !0
+ %cmp27 = icmp ult i64 %7, %24
+ br i1 %cmp27, label %land.lhs.true28, label %if.else36
+
+land.lhs.true28: ; preds = %land.lhs.true26
+ %add29 = add i64 %24, 1
+ %call30 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add29, i64 %10, i32 -1, i32 1)
+ %tobool31 = icmp eq i32 %call30, 0
+ br i1 %tobool31, label %if.else36, label %if.then32
+
+if.then32: ; preds = %land.lhs.true28
+ %25 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx33 = getelementptr inbounds i8* %25, i64 %i.092
+ store i8 0, i8* %arrayidx33, align 1, !tbaa !1
+ %26 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx34 = getelementptr inbounds i64* %26, i64 %i.092
+ %27 = load i64* %arrayidx34, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %27)
+ %28 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx35 = getelementptr inbounds i64* %28, i64 %i.092
+ %29 = load i64* %arrayidx35, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %29)
+ br label %for.inc
+
+if.else36: ; preds = %land.lhs.true28, %land.lhs.true26, %if.else24
+ %30 = load i64* @channelColumns, align 8, !tbaa !0
+ %cmp37 = icmp ult i64 %i.092, %30
+ br i1 %cmp37, label %land.lhs.true38, label %if.else48
+
+land.lhs.true38: ; preds = %if.else36
+ %31 = load i64* @channelTracks, align 8, !tbaa !0
+ %cmp39 = icmp ult i64 %7, %31
+ br i1 %cmp39, label %land.lhs.true40, label %if.else48
+
+land.lhs.true40: ; preds = %land.lhs.true38
+ %add41 = add i64 %31, 1
+ %call42 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add41, i64 %10, i32 1, i32 1)
+ %tobool43 = icmp eq i32 %call42, 0
+ br i1 %tobool43, label %if.else48, label %if.then44
+
+if.then44: ; preds = %land.lhs.true40
+ %32 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx45 = getelementptr inbounds i8* %32, i64 %i.092
+ store i8 0, i8* %arrayidx45, align 1, !tbaa !1
+ %33 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx46 = getelementptr inbounds i64* %33, i64 %i.092
+ %34 = load i64* %arrayidx46, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %34)
+ %35 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx47 = getelementptr inbounds i64* %35, i64 %i.092
+ %36 = load i64* %arrayidx47, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %36)
+ br label %for.inc
+
+if.else48: ; preds = %land.lhs.true40, %land.lhs.true38, %if.else36
+ %inc = add nsw i32 %numLeft.091, 1
+ br label %for.inc
+
+for.inc: ; preds = %if.else48, %if.then44, %if.then32, %if.then20, %if.then9, %for.body
+ %numLeft.1 = phi i32 [ %numLeft.091, %if.then9 ], [ %numLeft.091, %if.then20 ], [ %numLeft.091, %if.then32 ], [ %numLeft.091, %if.then44 ], [ %inc, %if.else48 ], [ %numLeft.091, %for.body ]
+ %inc53 = add i64 %i.092, 1
+ %37 = load i64* @channelColumns, align 8, !tbaa !0
+ %cmp = icmp ugt i64 %inc53, %37
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %numLeft.0.lcssa = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ]
+ ret i32 %numLeft.0.lcssa
+}
+
+; Materializable
+declare hidden fastcc i32 @Maze1Mech(i64, i64, i64, i64, i64, i32, i32) nounwind ssp
+
+; Materializable
+declare hidden fastcc void @CleanNet(i64) nounwind ssp
+
+!0 = metadata !{metadata !"long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/ARM64/ccmp.ll b/test/CodeGen/ARM64/ccmp.ll
new file mode 100644
index 0000000..79e6f94
--- /dev/null
+++ b/test/CodeGen/ARM64/ccmp.ll
@@ -0,0 +1,289 @@
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp -arm64-stress-ccmp | FileCheck %s
+target triple = "arm64-apple-ios"
+
+; CHECK: single_same
+; CHECK: cmp w0, #5
+; CHECK-NEXT: ccmp w1, #17, #4, ne
+; CHECK-NEXT: b.ne
+; CHECK: %if.then
+; CHECK: bl _foo
+; CHECK: %if.end
+define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ %cmp1 = icmp eq i32 %b, 17
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Different condition codes for the two compares.
+; CHECK: single_different
+; CHECK: cmp w0, #6
+; CHECK-NEXT: ccmp w1, #17, #0, ge
+; CHECK-NEXT: b.eq
+; CHECK: %if.then
+; CHECK: bl _foo
+; CHECK: %if.end
+define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp sle i32 %a, 5
+ %cmp1 = icmp ne i32 %b, 17
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Second block clobbers the flags, can't convert (easily).
+; CHECK: single_flagclobber
+; CHECK: cmp
+; CHECK: b.eq
+; CHECK: cmp
+; CHECK: b.gt
+define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %entry
+ %cmp1 = icmp slt i32 %b, 7
+ %mul = shl nsw i32 %b, 1
+ %add = add nsw i32 %b, 1
+ %cond = select i1 %cmp1, i32 %mul, i32 %add
+ %cmp2 = icmp slt i32 %cond, 17
+ br i1 %cmp2, label %if.then, label %if.end
+
+if.then: ; preds = %lor.lhs.false, %entry
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end: ; preds = %if.then, %lor.lhs.false
+ ret i32 7
+}
+
+; Second block clobbers the flags and ends with a tbz terminator.
+; CHECK: single_flagclobber_tbz
+; CHECK: cmp
+; CHECK: b.eq
+; CHECK: cmp
+; CHECK: tbz
+define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %entry
+ %cmp1 = icmp slt i32 %b, 7
+ %mul = shl nsw i32 %b, 1
+ %add = add nsw i32 %b, 1
+ %cond = select i1 %cmp1, i32 %mul, i32 %add
+ %and = and i32 %cond, 8
+ %cmp2 = icmp ne i32 %and, 0
+ br i1 %cmp2, label %if.then, label %if.end
+
+if.then: ; preds = %lor.lhs.false, %entry
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end: ; preds = %if.then, %lor.lhs.false
+ ret i32 7
+}
+
+; Speculatively execute division by zero.
+; The sdiv/udiv instructions do not trap when the divisor is zero, so they are
+; safe to speculate.
+; CHECK: speculate_division
+; CHECK-NOT: cmp
+; CHECK: sdiv
+; CHECK: cmp
+; CHECK-NEXT: ccmp
+define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+ %div = sdiv i32 %b, %a
+ %cmp1 = icmp slt i32 %div, 17
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
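+
+; Illustrative aside (general AArch64 background, not from the original test):
+; AArch64 integer division never traps; sdiv/udiv simply return 0 when the
+; divisor is zero, which is what makes it safe for the ccmp pass to speculate
+; the sdiv above the guarding compare in speculate_division.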
+
+; Floating point compare.
+; CHECK: single_fcmp
+; CHECK: cmp
+; CHECK-NOT: b.
+; CHECK: fccmp {{.*}}, #8, ge
+; CHECK: b.lt
+define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+ %conv = sitofp i32 %a to float
+ %div = fdiv float %b, %conv
+ %cmp1 = fcmp oge float %div, 1.700000e+01
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Chain multiple compares.
+; CHECK: multi_different
+; CHECK: cmp
+; CHECK: ccmp
+; CHECK: ccmp
+; CHECK: b.
+define void @multi_different(i32 %a, i32 %b, i32 %c) nounwind ssp {
+entry:
+ %cmp = icmp sgt i32 %a, %b
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+ %div = sdiv i32 %b, %a
+ %cmp1 = icmp eq i32 %div, 5
+ %cmp4 = icmp sgt i32 %div, %c
+ %or.cond = and i1 %cmp1, %cmp4
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; Convert a cbz in the head block.
+; CHECK: cbz_head
+; CHECK: cmp w0, #0
+; CHECK: ccmp
+define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp ne i32 %b, 17
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Check that the immediate operand is in range. The ccmp instruction encodes a
+; smaller range of immediates than subs/adds.
+; The ccmp immediates must be in the range 0-31.
+; CHECK: immediate_range
+; CHECK-NOT: ccmp
+define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ %cmp1 = icmp eq i32 %b, 32
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
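+
+; Illustrative aside (general AArch64 background, not from the original test):
+; the ccmp comparison value is a 5-bit unsigned immediate field, so only the
+; values 0-31 are encodable, whereas cmp/subs/adds accept a 12-bit immediate.
+; The constant 32 used above is therefore out of range for ccmp, and the
+; CHECK-NOT verifies that no ccmp is formed.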
+
+; Convert a cbz in the second block.
+; CHECK: cbz_second
+; CHECK: cmp w0, #0
+; CHECK: ccmp w1, #0, #0, ne
+; CHECK: b.eq
+define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp ne i32 %b, 0
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Convert a cbnz in the second block.
+; CHECK: cbnz_second
+; CHECK: cmp w0, #0
+; CHECK: ccmp w1, #0, #4, ne
+; CHECK: b.ne
+define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp eq i32 %b, 0
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+declare i32 @foo()
+
+%str1 = type { %str2 }
+%str2 = type { [24 x i8], i8*, i32, %str1*, i32, [4 x i8], %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, i8*, i8, i8*, %str1*, i8* }
+
+; Test case distilled from 126.gcc.
+; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor.
+; CHECK: build_modify_expr
+define void @build_modify_expr() nounwind ssp {
+entry:
+ switch i32 undef, label %sw.bb.i.i [
+ i32 69, label %if.end85
+ i32 70, label %if.end85
+ i32 71, label %if.end85
+ i32 72, label %if.end85
+ i32 73, label %if.end85
+ i32 105, label %if.end85
+ i32 106, label %if.end85
+ ]
+
+if.end85:
+ ret void
+
+sw.bb.i.i:
+ %ref.tr.i.i = phi %str1* [ %0, %sw.bb.i.i ], [ undef, %entry ]
+ %operands.i.i = getelementptr inbounds %str1* %ref.tr.i.i, i64 0, i32 0, i32 2
+ %arrayidx.i.i = bitcast i32* %operands.i.i to %str1**
+ %0 = load %str1** %arrayidx.i.i, align 8
+ %code1.i.i.phi.trans.insert = getelementptr inbounds %str1* %0, i64 0, i32 0, i32 0, i64 16
+ br label %sw.bb.i.i
+}
diff --git a/test/CodeGen/ARM64/coalesce-ext.ll b/test/CodeGen/ARM64/coalesce-ext.ll
new file mode 100644
index 0000000..9e8d08e
--- /dev/null
+++ b/test/CodeGen/ARM64/coalesce-ext.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-darwin < %s | FileCheck %s
+; Check that the peephole optimizer knows about sext and zext instructions.
+; CHECK: test1sext
+define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
+ %C = add i64 %A, %B
+ ; CHECK: add x[[SUM:[0-9]+]], x0, x1
+ %D = trunc i64 %C to i32
+ %E = shl i64 %C, 32
+ %F = ashr i64 %E, 32
+ ; CHECK: sxtw x[[EXT:[0-9]+]], x[[SUM]]
+ store volatile i64 %F, i64 *%P2
+ ; CHECK: str x[[EXT]]
+ store volatile i32 %D, i32* %P
+ ; Reuse low bits of extended register, don't extend live range of SUM.
+ ; CHECK: str w[[SUM]]
+ ret i32 %D
+}
diff --git a/test/CodeGen/ARM64/code-model-large-abs.ll b/test/CodeGen/ARM64/code-model-large-abs.ll
new file mode 100644
index 0000000..264da2d
--- /dev/null
+++ b/test/CodeGen/ARM64/code-model-large-abs.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large < %s | FileCheck %s
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
+
+define i8* @global_addr() {
+; CHECK-LABEL: global_addr:
+ ret i8* @var8
+ ; The movz/movk calculation should end up returned directly in x0.
+; CHECK: movz x0, #:abs_g3:var8
+; CHECK: movk x0, #:abs_g2_nc:var8
+; CHECK: movk x0, #:abs_g1_nc:var8
+; CHECK: movk x0, #:abs_g0_nc:var8
+; CHECK-NEXT: ret
+}
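+
+; Illustrative aside (general AArch64 background, not from the original test):
+; under the large code model the full 64-bit absolute address is built in four
+; 16-bit chunks, a movz for bits 63-48 (:abs_g3:) followed by movk for bits
+; 47-32 (:abs_g2_nc:), 31-16 (:abs_g1_nc:) and 15-0 (:abs_g0_nc:); the "_nc"
+; suffix marks the relocations that skip overflow checking.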
+
+define i8 @global_i8() {
+; CHECK-LABEL: global_i8:
+ %val = load i8* @var8
+ ret i8 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8
+; CHECK: ldrb w0, [x[[ADDR_REG]]]
+}
+
+define i16 @global_i16() {
+; CHECK-LABEL: global_i16:
+ %val = load i16* @var16
+ ret i16 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16
+; CHECK: ldrh w0, [x[[ADDR_REG]]]
+}
+
+define i32 @global_i32() {
+; CHECK-LABEL: global_i32:
+ %val = load i32* @var32
+ ret i32 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32
+; CHECK: ldr w0, [x[[ADDR_REG]]]
+}
+
+define i64 @global_i64() {
+; CHECK-LABEL: global_i64:
+ %val = load i64* @var64
+ ret i64 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64
+; CHECK: ldr x0, [x[[ADDR_REG]]]
+}
+
+define <2 x i64> @constpool() {
+; CHECK-LABEL: constpool:
+ ret <2 x i64> <i64 123456789, i64 987654321100>
+
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:[[CPADDR:.LCPI[0-9]+_[0-9]+]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:[[CPADDR]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:[[CPADDR]]
+; CHECK: ldr q0, [x[[ADDR_REG]]]
+}
diff --git a/test/CodeGen/ARM64/collect-loh-garbage-crash.ll b/test/CodeGen/ARM64/collect-loh-garbage-crash.ll
new file mode 100644
index 0000000..98cb625
--- /dev/null
+++ b/test/CodeGen/ARM64/collect-loh-garbage-crash.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=arm64-apple-ios -O3 -arm64-collect-loh -arm64-collect-loh-bb-only=true -arm64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s
+; Check that the LOH analysis does not crash when the analyzed chain
+; contains instructions that are filtered out.
+;
+; Before the fix for <rdar://problem/16041712>, these cases were removed
+; from the main container. Now, the deterministic container does not allow
+; removing arbitrary values, so we have to live with garbage values.
+; <rdar://problem/16041712>
+
+%"class.H4ISP::H4ISPDevice" = type { i32 (%"class.H4ISP::H4ISPDevice"*, i32, i8*, i8*)*, i8*, i32*, %"class.H4ISP::H4ISPCameraManager"* }
+
+%"class.H4ISP::H4ISPCameraManager" = type opaque
+
+declare i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"*)
+
+@pH4ISPDevice = hidden global %"class.H4ISP::H4ISPDevice"* null, align 8
+
+; CHECK-LABEL: _foo:
+; CHECK: ret
+; CHECK-NOT: .loh AdrpLdrGotLdr
+define void @foo() {
+entry:
+ br label %if.then83
+if.then83: ; preds = %if.end81
+ %tmp = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+ %call84 = call i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"* %tmp) #19
+ tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"()
+ %tmp2 = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+ tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x28}"()
+ %pCameraManager.i268 = getelementptr inbounds %"class.H4ISP::H4ISPDevice"* %tmp2, i64 0, i32 3
+ %tmp3 = load %"class.H4ISP::H4ISPCameraManager"** %pCameraManager.i268, align 8
+ %tobool.i269 = icmp eq %"class.H4ISP::H4ISPCameraManager"* %tmp3, null
+ br i1 %tobool.i269, label %if.then83, label %end
+end:
+ ret void
+}
+
diff --git a/test/CodeGen/ARM64/collect-loh-str.ll b/test/CodeGen/ARM64/collect-loh-str.ll
new file mode 100644
index 0000000..fc63f8b
--- /dev/null
+++ b/test/CodeGen/ARM64/collect-loh-str.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; Test case for <rdar://problem/15942912>.
+; AdrpAddStr cannot be used when the store uses the same
+; register as address and value. Indeed, the related
+; optimization, if applied, may completely remove the definition or
+; at least provide a wrong one (with the offset folded
+; into the definition).
+
+%struct.anon = type { i32*, i32** }
+
+@pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
+
+; CHECK-LABEL: _pptp_wan_init
+; CHECK: ret
+; CHECK-NOT: AdrpAddStr
+define i32 @pptp_wan_init() {
+entry:
+ store i32* null, i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), align 8
+ store i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), i32*** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 1), align 8
+ ret i32 0
+}
+
+
diff --git a/test/CodeGen/ARM64/collect-loh.ll b/test/CodeGen/ARM64/collect-loh.ll
new file mode 100644
index 0000000..08ab062
--- /dev/null
+++ b/test/CodeGen/ARM64/collect-loh.ll
@@ -0,0 +1,47 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+
+@a = internal unnamed_addr global i32 0, align 4
+@b = external global i32
+
+; Function Attrs: noinline nounwind ssp
+define void @foo(i32 %t) {
+entry:
+ %tmp = load i32* @a, align 4
+ %add = add nsw i32 %tmp, %t
+ store i32 %add, i32* @a, align 4
+ ret void
+}
+
+; Function Attrs: nounwind ssp
+; Testcase for <rdar://problem/15438605>: AdrpAdrp reuse is valid only when the first adrp
+; dominates the second.
+; The first adrp comes from the loading of 'a' and the second from the loading of 'b'.
+; 'a' is loaded in if.then, 'b' in if.end4, and if.then does not dominate if.end4.
+; CHECK-LABEL: _test
+; CHECK: ret
+; CHECK-NOT: .loh AdrpAdrp
+define i32 @test(i32 %t) {
+entry:
+ %cmp = icmp sgt i32 %t, 5
+ br i1 %cmp, label %if.then, label %if.end4
+
+if.then: ; preds = %entry
+ %tmp = load i32* @a, align 4
+ %add = add nsw i32 %tmp, %t
+ %cmp1 = icmp sgt i32 %add, 12
+ br i1 %cmp1, label %if.then2, label %if.end4
+
+if.then2: ; preds = %if.then
+ tail call void @foo(i32 %add)
+ %tmp1 = load i32* @a, align 4
+ br label %if.end4
+
+if.end4: ; preds = %if.then2, %if.then, %entry
+ %t.addr.0 = phi i32 [ %tmp1, %if.then2 ], [ %t, %if.then ], [ %t, %entry ]
+ %tmp2 = load i32* @b, align 4
+ %add5 = add nsw i32 %tmp2, %t.addr.0
+ tail call void @foo(i32 %add5)
+ %tmp3 = load i32* @b, align 4
+ %add6 = add nsw i32 %tmp3, %t.addr.0
+ ret i32 %add6
+}
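+
+; Illustrative aside (general background, not from the original test): a
+; ".loh AdrpAdrp A, B" directive is a Mach-O linker optimization hint stating
+; that the adrp at A and the adrp at B compute the same page address, so the
+; second adrp is redundant and can be dropped. As the comment above says, that
+; rewrite is only valid when the first adrp dominates the second, which is the
+; property this test checks.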
diff --git a/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S b/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S
new file mode 100644
index 0000000..250732d
--- /dev/null
+++ b/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S
@@ -0,0 +1,17 @@
+; RUN: llvm-mc -triple arm64-apple-darwin -filetype=obj -o /dev/null %s
+
+ .text
+ .globl _foo
+ .cfi_startproc
+_foo:
+ stp x29, x30, [sp, #-16]!
+ .cfi_adjust_cfa_offset 16
+
+ ldp x29, x30, [sp], #16
+ .cfi_adjust_cfa_offset -16
+ .cfi_restore x29
+ .cfi_restore x30
+
+ ret
+
+ .cfi_endproc
diff --git a/test/CodeGen/ARM64/complex-ret.ll b/test/CodeGen/ARM64/complex-ret.ll
new file mode 100644
index 0000000..93d50a5
--- /dev/null
+++ b/test/CodeGen/ARM64/complex-ret.ll
@@ -0,0 +1,7 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+define { i192, i192, i21, i192 } @foo(i192) {
+; CHECK-LABEL: foo:
+; CHECK: stp xzr, xzr, [x8]
+ ret { i192, i192, i21, i192 } {i192 0, i192 1, i21 2, i192 3}
+}
diff --git a/test/CodeGen/ARM64/convert-v2f64-v2i32.ll b/test/CodeGen/ARM64/convert-v2f64-v2i32.ll
new file mode 100644
index 0000000..1a07c98
--- /dev/null
+++ b/test/CodeGen/ARM64/convert-v2f64-v2i32.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+; CHECK: fptosi_1
+; CHECK: fcvtzs.2d
+; CHECK: xtn.2s
+; CHECK: ret
+define void @fptosi_1() nounwind noinline ssp {
+entry:
+ %0 = fptosi <2 x double> undef to <2 x i32>
+ store <2 x i32> %0, <2 x i32>* undef, align 8
+ ret void
+}
+
+; CHECK: fptoui_1
+; CHECK: fcvtzu.2d
+; CHECK: xtn.2s
+; CHECK: ret
+define void @fptoui_1() nounwind noinline ssp {
+entry:
+ %0 = fptoui <2 x double> undef to <2 x i32>
+ store <2 x i32> %0, <2 x i32>* undef, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/ARM64/convert-v2i32-v2f64.ll b/test/CodeGen/ARM64/convert-v2i32-v2f64.ll
new file mode 100644
index 0000000..63129a4
--- /dev/null
+++ b/test/CodeGen/ARM64/convert-v2i32-v2f64.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x double> @f1(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: f1:
+; CHECK: sshll.2d v0, v0, #0
+; CHECK-NEXT: scvtf.2d v0, v0
+; CHECK-NEXT: ret
+ %conv = sitofp <2 x i32> %v to <2 x double>
+ ret <2 x double> %conv
+}
+define <2 x double> @f2(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: f2:
+; CHECK: ushll.2d v0, v0, #0
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: ret
+ %conv = uitofp <2 x i32> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+; CHECK: autogen_SD19655
+; CHECK: scvtf
+; CHECK: ret
+define void @autogen_SD19655() {
+ %T = load <2 x i64>* undef
+ %F = sitofp <2 x i64> undef to <2 x float>
+ store <2 x float> %F, <2 x float>* undef
+ ret void
+}
+
diff --git a/test/CodeGen/ARM64/copy-tuple.ll b/test/CodeGen/ARM64/copy-tuple.ll
new file mode 100644
index 0000000..6325c3f
--- /dev/null
+++ b/test/CodeGen/ARM64/copy-tuple.ll
@@ -0,0 +1,146 @@
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
+
+; The main purpose of this test is to find out whether copyPhysReg can deal with
+; the memmove-like situation arising in tuples, where an early copy can clobber
+; the value needed by a later one if the tuples overlap.
+
+; We use dummy inline asm to force LLVM to generate a COPY between the registers
+; we want by clobbering all the others.
+
+define void @test_D1D2_from_D0D1(i8* %addr) #0 {
+; CHECK-LABEL: test_D1D2_from_D0D1:
+; CHECK: orr.8b v2, v1
+; CHECK: orr.8b v1, v0
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ ret void
+}
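+
+; Illustrative aside (not from the original test): copying the tuple D0_D1
+; into D1_D2 overlaps in D1, so the copies must be emitted high-to-low
+; (v2 <- v1 before v1 <- v0), exactly like memmove has to pick a copy
+; direction; the CHECK lines above verify that ordering.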
+
+define void @test_D0D1_from_D1D2(i8* %addr) #0 {
+; CHECK-LABEL: test_D0D1_from_D1D2:
+; CHECK: orr.8b v0, v1
+; CHECK: orr.8b v1, v2
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+ tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ ret void
+}
+
+define void @test_D0D1_from_D31D0(i8* %addr) #0 {
+; CHECK-LABEL: test_D0D1_from_D31D0:
+; CHECK: orr.8b v1, v0
+; CHECK: orr.8b v0, v31
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+ tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
+ tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ ret void
+}
+
+define void @test_D31D0_from_D0D1(i8* %addr) #0 {
+; CHECK-LABEL: test_D31D0_from_D0D1:
+; CHECK: orr.8b v31, v0
+; CHECK: orr.8b v0, v1
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
+ tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ ret void
+}
+
+define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 {
+; CHECK-LABEL: test_D2D3D4_from_D0D1D2:
+; CHECK: orr.8b v4, v2
+; CHECK: orr.8b v3, v1
+; CHECK: orr.8b v2, v0
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1
+ %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2
+
+ tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+ ret void
+}
+
+define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 {
+; CHECK-LABEL: test_Q0Q1Q2_from_Q1Q2Q3:
+; CHECK: orr.16b v0, v1
+; CHECK: orr.16b v1, v2
+; CHECK: orr.16b v2, v3
+entry:
+ %addr_v16i8 = bitcast i8* %addr to <16 x i8>*
+ %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+ %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
+ %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
+ %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
+ tail call void asm sideeffect "", "~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+ ret void
+}
+
+define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 {
+; CHECK-LABEL: test_Q1Q2Q3Q4_from_Q30Q31Q0Q1:
+; CHECK: orr.16b v4, v1
+; CHECK: orr.16b v3, v0
+; CHECK: orr.16b v2, v31
+; CHECK: orr.16b v1, v30
+ %addr_v16i8 = bitcast i8* %addr to <16 x i8>*
+ %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+ %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
+ %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
+ %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
+ %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3
+
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"()
+ tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+ ret void
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>*)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>*)
+
+declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
diff --git a/test/CodeGen/ARM64/crc32.ll b/test/CodeGen/ARM64/crc32.ll
new file mode 100644
index 0000000..609eb44
--- /dev/null
+++ b/test/CodeGen/ARM64/crc32.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+define i32 @test_crc32b(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32b:
+; CHECK: crc32b w0, w0, w1
+ %bits = zext i8 %next to i32
+ %val = call i32 @llvm.arm64.crc32b(i32 %cur, i32 %bits)
+ ret i32 %val
+}
+
+define i32 @test_crc32h(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32h:
+; CHECK: crc32h w0, w0, w1
+ %bits = zext i16 %next to i32
+ %val = call i32 @llvm.arm64.crc32h(i32 %cur, i32 %bits)
+ ret i32 %val
+}
+
+define i32 @test_crc32w(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32w:
+; CHECK: crc32w w0, w0, w1
+ %val = call i32 @llvm.arm64.crc32w(i32 %cur, i32 %next)
+ ret i32 %val
+}
+
+define i32 @test_crc32x(i32 %cur, i64 %next) {
+; CHECK-LABEL: test_crc32x:
+; CHECK: crc32x w0, w0, x1
+ %val = call i32 @llvm.arm64.crc32x(i32 %cur, i64 %next)
+ ret i32 %val
+}
+
+define i32 @test_crc32cb(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32cb:
+; CHECK: crc32cb w0, w0, w1
+ %bits = zext i8 %next to i32
+ %val = call i32 @llvm.arm64.crc32cb(i32 %cur, i32 %bits)
+ ret i32 %val
+}
+
+define i32 @test_crc32ch(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32ch:
+; CHECK: crc32ch w0, w0, w1
+ %bits = zext i16 %next to i32
+ %val = call i32 @llvm.arm64.crc32ch(i32 %cur, i32 %bits)
+ ret i32 %val
+}
+
+define i32 @test_crc32cw(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32cw:
+; CHECK: crc32cw w0, w0, w1
+ %val = call i32 @llvm.arm64.crc32cw(i32 %cur, i32 %next)
+ ret i32 %val
+}
+
+define i32 @test_crc32cx(i32 %cur, i64 %next) {
+; CHECK-LABEL: test_crc32cx:
+; CHECK: crc32cx w0, w0, x1
+ %val = call i32 @llvm.arm64.crc32cx(i32 %cur, i64 %next)
+ ret i32 %val
+}
+
+declare i32 @llvm.arm64.crc32b(i32, i32)
+declare i32 @llvm.arm64.crc32h(i32, i32)
+declare i32 @llvm.arm64.crc32w(i32, i32)
+declare i32 @llvm.arm64.crc32x(i32, i64)
+
+declare i32 @llvm.arm64.crc32cb(i32, i32)
+declare i32 @llvm.arm64.crc32ch(i32, i32)
+declare i32 @llvm.arm64.crc32cw(i32, i32)
+declare i32 @llvm.arm64.crc32cx(i32, i64)
diff --git a/test/CodeGen/ARM64/crypto.ll b/test/CodeGen/ARM64/crypto.ll
new file mode 100644
index 0000000..3804310
--- /dev/null
+++ b/test/CodeGen/ARM64/crypto.ll
@@ -0,0 +1,135 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s | FileCheck %s
+
+declare <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data)
+declare <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data)
+
+define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: test_aese:
+; CHECK: aese.16b v0, v1
+ %res = call <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: test_aesd:
+; CHECK: aesd.16b v0, v1
+ %res = call <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesmc(<16 x i8> %data) {
+; CHECK-LABEL: test_aesmc:
+; CHECK: aesmc.16b v0, v0
+ %res = call <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesimc(<16 x i8> %data) {
+; CHECK-LABEL: test_aesimc:
+; CHECK: aesimc.16b v0, v0
+ %res = call <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data)
+ ret <16 x i8> %res
+}
+
+declare <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare i32 @llvm.arm64.crypto.sha1h(i32 %hash_e)
+declare <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+declare <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+
+define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1c:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1c.4s q0, [[HASH_E]], v1
+ %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+; <rdar://problem/14742333> Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1
+define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1c_in_a_row:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1
+; CHECK-NOT: fmov
+; CHECK: sha1c.4s q0, s[[SHA1RES]], v1
+ %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ %extract = extractelement <4 x i32> %res, i32 0
+ %res2 = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk)
+ ret <4 x i32> %res2
+}
+
+define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1p:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1p.4s q0, [[HASH_E]], v1
+ %res = call <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1m:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1m.4s q0, [[HASH_E]], v1
+ %res = call <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+define i32 @test_sha1h(i32 %hash_e) {
+; CHECK-LABEL: test_sha1h:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]]
+; CHECK: fmov w0, [[RES]]
+ %res = call i32 @llvm.arm64.crypto.sha1h(i32 %hash_e)
+ ret i32 %res
+}
+
+define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) {
+; CHECK-LABEL: test_sha1su0:
+; CHECK: sha1su0.4s v0, v1, v2
+ %res = call <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) {
+; CHECK-LABEL: test_sha1su1:
+; CHECK: sha1su1.4s v0, v1
+ %res = call <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+declare <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+declare <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+declare <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+
+define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha256h:
+; CHECK: sha256h.4s q0, q1, v2
+ %res = call <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha256h2:
+; CHECK: sha256h2.4s q0, q1, v2
+
+ %res = call <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) {
+; CHECK-LABEL: test_sha256su0:
+; CHECK: sha256su0.4s v0, v1
+ %res = call <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
+; CHECK-LABEL: test_sha256su1:
+; CHECK: sha256su1.4s v0, v1, v2
+ %res = call <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+ ret <4 x i32> %res
+}
diff --git a/test/CodeGen/ARM64/cse.ll b/test/CodeGen/ARM64/cse.ll
new file mode 100644
index 0000000..d98bfd6
--- /dev/null
+++ b/test/CodeGen/ARM64/cse.ll
@@ -0,0 +1,59 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target triple = "arm64-apple-ios"
+
+; rdar://12462006
+; CSE between "icmp reg reg" and "sub reg reg".
+; Both can be in the same basic block or in different basic blocks.
+define i8* @t1(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: subs
+; CHECK-NOT: cmp
+; CHECK-NOT: sub
+; CHECK: b.ge
+; CHECK: sub
+; CHECK: sub
+; CHECK-NOT: sub
+; CHECK: ret
+ %0 = load i32* %offset, align 4
+ %cmp = icmp slt i32 %0, %size
+ %s = sub nsw i32 %0, %size
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %sub = sub nsw i32 %0, %size
+ %s2 = sub nsw i32 %s, %size
+ %s3 = sub nsw i32 %sub, %s2
+ store i32 %s3, i32* %offset, align 4
+ %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+ br label %return
+
+return:
+ %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
+}
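+
+; Illustrative aside (general AArch64 background, not from the original test):
+; "cmp" is an alias of "subs" writing to the zero register, so an icmp and a
+; sub of the same operands can be CSE'd into a single flag-setting subs; that
+; is why t1/t2 expect one subs and no separate cmp or redundant sub.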
+
+; CSE between "icmp reg imm" and "sub reg imm".
+define i8* @t2(i8* %base, i32* nocapture %offset) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: subs
+; CHECK-NOT: cmp
+; CHECK-NOT: sub
+; CHECK: b.lt
+; CHECK-NOT: sub
+; CHECK: ret
+ %0 = load i32* %offset, align 4
+ %cmp = icmp slt i32 %0, 1
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %sub = sub nsw i32 %0, 1
+ store i32 %sub, i32* %offset, align 4
+ %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+ br label %return
+
+return:
+ %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
+}
diff --git a/test/CodeGen/ARM64/csel.ll b/test/CodeGen/ARM64/csel.ll
new file mode 100644
index 0000000..cbf1769
--- /dev/null
+++ b/test/CodeGen/ARM64/csel.ll
@@ -0,0 +1,222 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-unknown-unknown"
+
+; CHECK: foo1
+; CHECK: csinc w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK: w[[REG]], eq
+define i32 @foo1(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+ %not.tobool = icmp ne i32 %c, 0
+ %add = zext i1 %not.tobool to i32
+ %b.add = add i32 %c, %b
+ %add1 = add i32 %b.add, %add
+ ret i32 %add1
+}
+
+; CHECK: foo2
+; CHECK: csneg w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK: w[[REG]], eq
+define i32 @foo2(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+ %mul = sub i32 0, %b
+ %tobool = icmp eq i32 %c, 0
+ %b.mul = select i1 %tobool, i32 %b, i32 %mul
+ %add = add nsw i32 %b.mul, %c
+ ret i32 %add
+}
+
+; CHECK: foo3
+; CHECK: csinv w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK: w[[REG]], eq
+define i32 @foo3(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+ %not.tobool = icmp ne i32 %c, 0
+ %xor = sext i1 %not.tobool to i32
+ %b.xor = xor i32 %xor, %b
+ %add = add nsw i32 %b.xor, %c
+ ret i32 %add
+}
+
+; rdar://11632325
+define i32@foo4(i32 %a) nounwind ssp {
+; CHECK: foo4
+; CHECK: csneg
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, -1
+ %neg = sub nsw i32 0, %a
+ %cond = select i1 %cmp, i32 %a, i32 %neg
+ ret i32 %cond
+}
+
+define i32@foo5(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: foo5
+; CHECK: subs
+; CHECK-NEXT: csneg
+; CHECK-NEXT: ret
+ %sub = sub nsw i32 %a, %b
+ %cmp = icmp sgt i32 %sub, -1
+ %sub3 = sub nsw i32 0, %sub
+ %cond = select i1 %cmp, i32 %sub, i32 %sub3
+ ret i32 %cond
+}
+
+; make sure we can handle a branch instruction in optimizeCompare.
+define i32@foo6(i32 %a, i32 %b) nounwind ssp {
+; CHECK: foo6
+; CHECK: b
+ %sub = sub nsw i32 %a, %b
+ %cmp = icmp sgt i32 %sub, 0
+ br i1 %cmp, label %l.if, label %l.else
+
+l.if:
+ ret i32 1
+
+l.else:
+ ret i32 %sub
+}
+
+; If CPSR is used multiple times and the V flag is used, we don't remove the cmp.
+define i32 @foo7(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: foo7:
+; CHECK: sub
+; CHECK-NEXT: adds
+; CHECK-NEXT: csneg
+; CHECK-NEXT: b
+ %sub = sub nsw i32 %a, %b
+ %cmp = icmp sgt i32 %sub, -1
+ %sub3 = sub nsw i32 0, %sub
+ %cond = select i1 %cmp, i32 %sub, i32 %sub3
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = icmp slt i32 %sub, -1
+ %sel = select i1 %cmp2, i32 %cond, i32 %a
+ ret i32 %sel
+
+if.else:
+ ret i32 %cond
+}
+
+define i32 @foo8(i32 %v, i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: foo8:
+; CHECK: cmp w0, #0
+; CHECK: csinv w0, w1, w2, ne
+ %tobool = icmp eq i32 %v, 0
+ %neg = xor i32 -1, %b
+ %cond = select i1 %tobool, i32 %neg, i32 %a
+ ret i32 %cond
+}
+
+define i32 @foo9(i32 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo9:
+; CHECK: cmp w0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: csinv w0, w[[REG]], w[[REG]], ne
+ %tobool = icmp ne i32 %v, 0
+ %cond = select i1 %tobool, i32 4, i32 -5
+ ret i32 %cond
+}
+
+define i64 @foo10(i64 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo10:
+; CHECK: cmp x0, #0
+; CHECK: orr x[[REG:[0-9]+]], xzr, #0x4
+; CHECK: csinv x0, x[[REG]], x[[REG]], ne
+ %tobool = icmp ne i64 %v, 0
+ %cond = select i1 %tobool, i64 4, i64 -5
+ ret i64 %cond
+}
+
+define i32 @foo11(i32 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo11:
+; CHECK: cmp w0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: csneg w0, w[[REG]], w[[REG]], ne
+ %tobool = icmp ne i32 %v, 0
+ %cond = select i1 %tobool, i32 4, i32 -4
+ ret i32 %cond
+}
+
+define i64 @foo12(i64 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo12:
+; CHECK: cmp x0, #0
+; CHECK: orr x[[REG:[0-9]+]], xzr, #0x4
+; CHECK: csneg x0, x[[REG]], x[[REG]], ne
+ %tobool = icmp ne i64 %v, 0
+ %cond = select i1 %tobool, i64 4, i64 -4
+ ret i64 %cond
+}
+
+define i32 @foo13(i32 %v, i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo13:
+; CHECK: cmp w0, #0
+; CHECK: csneg w0, w1, w2, ne
+ %tobool = icmp eq i32 %v, 0
+ %sub = sub i32 0, %b
+ %cond = select i1 %tobool, i32 %sub, i32 %a
+ ret i32 %cond
+}
+
+define i64 @foo14(i64 %v, i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo14:
+; CHECK: cmp x0, #0
+; CHECK: csneg x0, x1, x2, ne
+ %tobool = icmp eq i64 %v, 0
+ %sub = sub i64 0, %b
+ %cond = select i1 %tobool, i64 %sub, i64 %a
+ ret i64 %cond
+}
+
+define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo15:
+; CHECK: cmp w0, w1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: csinc w0, w[[REG]], w[[REG]], le
+ %cmp = icmp sgt i32 %a, %b
+ %. = select i1 %cmp, i32 2, i32 1
+ ret i32 %.
+}
+
+define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo16:
+; CHECK: cmp w0, w1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: csinc w0, w[[REG]], w[[REG]], gt
+ %cmp = icmp sgt i32 %a, %b
+ %. = select i1 %cmp, i32 1, i32 2
+ ret i32 %.
+}
+
+define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo17:
+; CHECK: cmp x0, x1
+; CHECK: orr x[[REG:[0-9]+]], xzr, #0x1
+; CHECK: csinc x0, x[[REG]], x[[REG]], le
+ %cmp = icmp sgt i64 %a, %b
+ %. = select i1 %cmp, i64 2, i64 1
+ ret i64 %.
+}
+
+define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo18:
+; CHECK: cmp x0, x1
+; CHECK: orr x[[REG:[0-9]+]], xzr, #0x1
+; CHECK: csinc x0, x[[REG]], x[[REG]], gt
+ %cmp = icmp sgt i64 %a, %b
+ %. = select i1 %cmp, i64 1, i64 2
+ ret i64 %.
+}
diff --git a/test/CodeGen/ARM64/cvt.ll b/test/CodeGen/ARM64/cvt.ll
new file mode 100644
index 0000000..b55a42f
--- /dev/null
+++ b/test/CodeGen/ARM64/cvt.ll
@@ -0,0 +1,401 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+;
+; Floating-point scalar convert to signed integer (to nearest with ties to away)
+;
+define i32 @fcvtas_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtas_1w1s:
+;CHECK: fcvtas w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtas_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtas_1x1s:
+;CHECK: fcvtas x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtas_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtas_1w1d:
+;CHECK: fcvtas w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtas_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtas_1x1d:
+;CHECK: fcvtas x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtas.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtas.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtas.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtas.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer
+;
+define i32 @fcvtau_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtau_1w1s:
+;CHECK: fcvtau w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtau_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtau_1x1s:
+;CHECK: fcvtau x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtau_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtau_1w1d:
+;CHECK: fcvtau w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtau_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtau_1x1d:
+;CHECK: fcvtau x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtau.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtau.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtau.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtau.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward -Inf)
+;
+define i32 @fcvtms_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtms_1w1s:
+;CHECK: fcvtms w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtms_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtms_1x1s:
+;CHECK: fcvtms x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtms_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtms_1w1d:
+;CHECK: fcvtms w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtms_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtms_1x1d:
+;CHECK: fcvtms x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtms.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtms.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtms.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtms.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward -Inf)
+;
+define i32 @fcvtmu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtmu_1w1s:
+;CHECK: fcvtmu w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtmu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtmu_1x1s:
+;CHECK: fcvtmu x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtmu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtmu_1w1d:
+;CHECK: fcvtmu w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtmu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtmu_1x1d:
+;CHECK: fcvtmu x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtmu.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtmu.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtmu.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtmu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (to nearest with ties to even)
+;
+define i32 @fcvtns_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtns_1w1s:
+;CHECK: fcvtns w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtns_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtns_1x1s:
+;CHECK: fcvtns x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtns_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtns_1w1d:
+;CHECK: fcvtns w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtns_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtns_1x1d:
+;CHECK: fcvtns x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtns.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtns.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtns.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtns.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (to nearest with ties to even)
+;
+define i32 @fcvtnu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtnu_1w1s:
+;CHECK: fcvtnu w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtnu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtnu_1x1s:
+;CHECK: fcvtnu x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtnu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtnu_1w1d:
+;CHECK: fcvtnu w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtnu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtnu_1x1d:
+;CHECK: fcvtnu x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtnu.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtnu.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtnu.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtnu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward +Inf)
+;
+define i32 @fcvtps_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtps_1w1s:
+;CHECK: fcvtps w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtps_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtps_1x1s:
+;CHECK: fcvtps x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtps_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtps_1w1d:
+;CHECK: fcvtps w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtps_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtps_1x1d:
+;CHECK: fcvtps x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtps.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtps.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtps.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtps.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward +Inf)
+;
+define i32 @fcvtpu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtpu_1w1s:
+;CHECK: fcvtpu w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtpu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtpu_1x1s:
+;CHECK: fcvtpu x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtpu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtpu_1w1d:
+;CHECK: fcvtpu w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtpu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtpu_1x1d:
+;CHECK: fcvtpu x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtpu.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtpu.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtpu.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtpu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward zero)
+;
+define i32 @fcvtzs_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzs_1w1s:
+;CHECK: fcvtzs w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtzs_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzs_1x1s:
+;CHECK: fcvtzs x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtzs_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzs_1w1d:
+;CHECK: fcvtzs w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtzs_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzs_1x1d:
+;CHECK: fcvtzs x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtzs.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtzs.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtzs.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtzs.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward zero)
+;
+define i32 @fcvtzu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzu_1w1s:
+;CHECK: fcvtzu w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtzu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzu_1x1s:
+;CHECK: fcvtzu x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtzu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzu_1w1d:
+;CHECK: fcvtzu w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtzu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzu_1x1d:
+;CHECK: fcvtzu x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.arm64.neon.fcvtzu.i32.f32(float) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtzu.i64.f32(float) nounwind readnone
+declare i32 @llvm.arm64.neon.fcvtzu.i32.f64(double) nounwind readnone
+declare i64 @llvm.arm64.neon.fcvtzu.i64.f64(double) nounwind readnone
diff --git a/test/CodeGen/ARM64/dagcombiner-convergence.ll b/test/CodeGen/ARM64/dagcombiner-convergence.ll
new file mode 100644
index 0000000..a45e313
--- /dev/null
+++ b/test/CodeGen/ARM64/dagcombiner-convergence.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -o /dev/null
+; rdar://10795250
+; DAGCombiner should converge.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-macosx10.8.0"
+
+define i64 @foo(i128 %Params.coerce, i128 %SelLocs.coerce) {
+entry:
+ %tmp = lshr i128 %Params.coerce, 61
+ %.tr38.i = trunc i128 %tmp to i64
+ %mul.i = and i64 %.tr38.i, 4294967288
+ %tmp1 = lshr i128 %SelLocs.coerce, 62
+ %.tr.i = trunc i128 %tmp1 to i64
+ %mul7.i = and i64 %.tr.i, 4294967292
+ %add.i = add i64 %mul7.i, %mul.i
+ %conv.i.i = and i64 %add.i, 4294967292
+ ret i64 %conv.i.i
+}
diff --git a/test/CodeGen/ARM64/dagcombiner-load-slicing.ll b/test/CodeGen/ARM64/dagcombiner-load-slicing.ll
new file mode 100644
index 0000000..0679014
--- /dev/null
+++ b/test/CodeGen/ARM64/dagcombiner-load-slicing.ll
@@ -0,0 +1,102 @@
+; RUN: llc -mtriple arm64-apple-ios -O3 -o - < %s | FileCheck %s
+; <rdar://problem/14477220>
+
+%class.Complex = type { float, float }
+%class.Complex_int = type { i32, i32 }
+%class.Complex_long = type { i64, i64 }
+
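+; Each test loads a complex value as one wide integer and splits it with
+; lshr/trunc; load slicing should replace that wide load with two narrow
+; loads (matched as an ldp) instead of keeping the wide load plus shifts.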
+; CHECK-LABEL: @test
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3
+; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], {{\[}}[[BASE]], #64]
+; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test(%class.Complex* nocapture %out, i64 %out_start) {
+entry:
+ %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+ %0 = bitcast %class.Complex* %arrayidx to i64*
+ %1 = load i64* %0, align 4
+ %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32
+ %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
+ %t0.sroa.2.0.extract.shift = lshr i64 %1, 32
+ %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
+ %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
+ %add = add i64 %out_start, 8
+ %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
+ %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
+ %4 = load float* %i.i, align 4
+ %add.i = fadd float %4, %2
+ %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
+ %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
+ %5 = load float* %r.i, align 4
+ %add5.i = fadd float %5, %3
+ %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
+ %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
+ store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
+ ret void
+}
+
+; CHECK-LABEL: @test_int
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3
+; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], {{\[}}[[BASE]], #64]
+; CHECK: add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test_int(%class.Complex_int* nocapture %out, i64 %out_start) {
+entry:
+ %arrayidx = getelementptr inbounds %class.Complex_int* %out, i64 %out_start
+ %0 = bitcast %class.Complex_int* %arrayidx to i64*
+ %1 = load i64* %0, align 4
+ %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32
+ %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to i32
+ %t0.sroa.2.0.extract.shift = lshr i64 %1, 32
+ %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
+ %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to i32
+ %add = add i64 %out_start, 8
+ %arrayidx2 = getelementptr inbounds %class.Complex_int* %out, i64 %add
+ %i.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 0
+ %4 = load i32* %i.i, align 4
+ %add.i = add i32 %4, %2
+ %retval.sroa.0.0.vec.insert.i = insertelement <2 x i32> undef, i32 %add.i, i32 0
+ %r.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 1
+ %5 = load i32* %r.i, align 4
+ %add5.i = add i32 %5, %3
+ %retval.sroa.0.4.vec.insert.i = insertelement <2 x i32> %retval.sroa.0.0.vec.insert.i, i32 %add5.i, i32 1
+ %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_int* %arrayidx to <2 x i32>*
+ store <2 x i32> %retval.sroa.0.4.vec.insert.i, <2 x i32>* %ref.tmp.sroa.0.0.cast, align 4
+ ret void
+}
+
+; CHECK-LABEL: @test_long
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4
+; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], {{\[}}[[BASE]], #128]
+; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test_long(%class.Complex_long* nocapture %out, i64 %out_start) {
+entry:
+ %arrayidx = getelementptr inbounds %class.Complex_long* %out, i64 %out_start
+ %0 = bitcast %class.Complex_long* %arrayidx to i128*
+ %1 = load i128* %0, align 4
+ %t0.sroa.0.0.extract.trunc = trunc i128 %1 to i64
+ %2 = bitcast i64 %t0.sroa.0.0.extract.trunc to i64
+ %t0.sroa.2.0.extract.shift = lshr i128 %1, 64
+ %t0.sroa.2.0.extract.trunc = trunc i128 %t0.sroa.2.0.extract.shift to i64
+ %3 = bitcast i64 %t0.sroa.2.0.extract.trunc to i64
+ %add = add i64 %out_start, 8
+ %arrayidx2 = getelementptr inbounds %class.Complex_long* %out, i64 %add
+ %i.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 0
+ %4 = load i64* %i.i, align 4
+ %add.i = add i64 %4, %2
+ %retval.sroa.0.0.vec.insert.i = insertelement <2 x i64> undef, i64 %add.i, i32 0
+ %r.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 1
+ %5 = load i64* %r.i, align 4
+ %add5.i = add i64 %5, %3
+ %retval.sroa.0.4.vec.insert.i = insertelement <2 x i64> %retval.sroa.0.0.vec.insert.i, i64 %add5.i, i32 1
+ %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_long* %arrayidx to <2 x i64>*
+ store <2 x i64> %retval.sroa.0.4.vec.insert.i, <2 x i64>* %ref.tmp.sroa.0.0.cast, align 4
+ ret void
+}
diff --git a/test/CodeGen/ARM64/dup.ll b/test/CodeGen/ARM64/dup.ll
new file mode 100644
index 0000000..e659575
--- /dev/null
+++ b/test/CodeGen/ARM64/dup.ll
@@ -0,0 +1,322 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+define <8 x i8> @v_dup8(i8 %A) nounwind {
+;CHECK-LABEL: v_dup8:
+;CHECK: dup.8b
+ %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
+ %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
+ ret <8 x i8> %tmp8
+}
+
+define <4 x i16> @v_dup16(i16 %A) nounwind {
+;CHECK-LABEL: v_dup16:
+;CHECK: dup.4h
+ %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
+ %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_dup32(i32 %A) nounwind {
+;CHECK-LABEL: v_dup32:
+;CHECK: dup.2s
+ %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
+ %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_dupfloat(float %A) nounwind {
+;CHECK-LABEL: v_dupfloat:
+;CHECK: dup.2s
+ %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
+ %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_dupQ8(i8 %A) nounwind {
+;CHECK-LABEL: v_dupQ8:
+;CHECK: dup.16b
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
+ %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
+ ret <16 x i8> %tmp16
+}
+
+define <8 x i16> @v_dupQ16(i16 %A) nounwind {
+;CHECK-LABEL: v_dupQ16:
+;CHECK: dup.8h
+ %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
+ %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
+ ret <8 x i16> %tmp8
+}
+
+define <4 x i32> @v_dupQ32(i32 %A) nounwind {
+;CHECK-LABEL: v_dupQ32:
+;CHECK: dup.4s
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
+ %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
+ ret <4 x i32> %tmp4
+}
+
+define <4 x float> @v_dupQfloat(float %A) nounwind {
+;CHECK-LABEL: v_dupQfloat:
+;CHECK: dup.4s
+ %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
+ %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
+ %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
+ %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
+ ret <4 x float> %tmp4
+}
+
+; Check to make sure it works with shuffles, too.
+
+define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
+;CHECK-LABEL: v_shuffledup8:
+;CHECK: dup.8b
+ %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
+;CHECK-LABEL: v_shuffledup16:
+;CHECK: dup.4h
+ %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
+;CHECK-LABEL: v_shuffledup32:
+;CHECK: dup.2s
+ %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_shuffledupfloat(float %A) nounwind {
+;CHECK-LABEL: v_shuffledupfloat:
+;CHECK: dup.2s
+ %tmp1 = insertelement <2 x float> undef, float %A, i32 0
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ8:
+;CHECK: dup.16b
+ %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ16:
+;CHECK: dup.8h
+ %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ32:
+;CHECK: dup.4s
+ %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
+;CHECK-LABEL: v_shuffledupQfloat:
+;CHECK: dup.4s
+ %tmp1 = insertelement <4 x float> undef, float %A, i32 0
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %tmp2
+}
+
+define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: vduplane8:
+;CHECK: dup.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: vduplane16:
+;CHECK: dup.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: vduplane32:
+;CHECK: dup.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
+;CHECK-LABEL: vduplanefloat:
+;CHECK: dup.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ8:
+;CHECK: dup.16b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ16:
+;CHECK: dup.8h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ32:
+;CHECK: dup.4s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
+;CHECK-LABEL: vduplaneQfloat:
+;CHECK: dup.4s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x float> %tmp2
+}
+
+define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: foo:
+;CHECK: dup.2d
+entry:
+ %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: bar:
+;CHECK: dup.2d
+entry:
+ %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x i64> %0
+}
+
+define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: baz:
+;CHECK: dup.2d
+entry:
+ %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %0
+}
+
+define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: qux:
+;CHECK: dup.2d
+entry:
+ %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %0
+}
+
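+; Non-splat build vectors cannot use dup; they are assembled with fmov + ins.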
+define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: f:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ret
+ %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
+ %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
+ ret <2 x i32> %vecinit1
+}
+
+define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: g:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ins.s v0[2], w1
+; CHECK-NEXT: ins.s v0[3], w0
+; CHECK-NEXT: ret
+ %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3
+ ret <4 x i32> %vecinit3
+}
+
+define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
+; CHECK-LABEL: h:
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ins.d v0[1], x1
+; CHECK-NEXT: ret
+ %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
+ %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
+ ret <2 x i64> %vecinit1
+}
+
+; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
+; the single value needed was of the same type as the vector. This is false if
+; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
+; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
+; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
+define <4 x i16> @test_build_illegal(<4 x i32> %in) {
+; CHECK-LABEL: test_build_illegal:
+; CHECK: umov.s [[WTMP:w[0-9]+]], v0[3]
+; CHECK: dup.4h v0, [[WTMP]]
+ %val = extractelement <4 x i32> %in, i32 3
+ %smallval = trunc i32 %val to i16
+ %vec = insertelement <4 x i16> undef, i16 %smallval, i32 3
+
+ ret <4 x i16> %vec
+}
+
+; We used to inherit an already extract_subvectored v4i16 from
+; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
+; the formation of an indexed-by-7 MLS.
+define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+; CHECK-LABEL: test_high_splat:
+; CHECK: mls.4h v0, v1, v2[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <4 x i16> %shuffle, %b
+ %sub = sub <4 x i16> %a, %mul
+ ret <4 x i16> %sub
+}
diff --git a/test/CodeGen/ARM64/early-ifcvt.ll b/test/CodeGen/ARM64/early-ifcvt.ll
new file mode 100644
index 0000000..a5c1e26
--- /dev/null
+++ b/test/CodeGen/ARM64/early-ifcvt.ll
@@ -0,0 +1,423 @@
+; RUN: llc < %s -stress-early-ifcvt | FileCheck %s
+target triple = "arm64-apple-macosx"
+
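+; Early if-conversion should speculate the short hammocks in these functions
+; and select the results with csel/csinc/csinv/csneg instead of branching.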
+; CHECK: mm2
+define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ br label %do.body
+
+; CHECK: do.body
+; Loop body has no branches before the backedge.
+; CHECK-NOT: LBB
+do.body:
+ %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ]
+ %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ]
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ]
+ %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ]
+ %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1
+ %0 = load i32* %p.addr.0, align 4
+ %cmp = icmp sgt i32 %0, %max.0
+ br i1 %cmp, label %do.cond, label %if.else
+
+if.else:
+ %cmp1 = icmp slt i32 %0, %min.0
+ %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0
+ br label %do.cond
+
+do.cond:
+ %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ]
+ %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ]
+; CHECK: cbnz
+ %dec = add i32 %n.addr.0, -1
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %do.end, label %do.body
+
+do.end:
+ %sub = sub nsw i32 %max.1, %min.1
+ ret i32 %sub
+}
+
+; CHECK-LABEL: fold_inc_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinc w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_inc_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %inc = add nsw i32 %x, 1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %y, %eq_bb ], [ %inc, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inc_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinc x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_inc_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %inc = add nsw i64 %x, 1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %y, %eq_bb ], [ %inc, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inc_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinc w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_inc_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %inc = add nsw i32 %x, 1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %inc, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inc_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinc x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_inc_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %inc = add nsw i64 %x, 1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %inc, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inv_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinv w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_inv_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %inv = xor i32 %x, -1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %y, %eq_bb ], [ %inv, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inv_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinv x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_inv_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %inv = xor i64 %x, -1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %y, %eq_bb ], [ %inv, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inv_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinv w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_inv_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %inv = xor i32 %x, -1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %inv, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inv_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinv x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_inv_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %inv = xor i64 %x, -1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %inv, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_neg_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csneg w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_neg_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %neg = sub nsw i32 0, %x
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %y, %eq_bb ], [ %neg, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_neg_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csneg x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_neg_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %neg = sub nsw i64 0, %x
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %y, %eq_bb ], [ %neg, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_neg_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csneg w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_neg_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %neg = sub nsw i32 0, %x
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %neg, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_neg_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csneg x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_neg_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %neg = sub nsw i64 0, %x
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %neg, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK: cbnz_32
+; CHECK: {{subs.*wzr,|cmp}} w2, #0
+; CHECK-NEXT: csel w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @cbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 0
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK: cbnz_64
+; CHECK: {{subs.*xzr,|cmp}} x2, #0
+; CHECK-NEXT: csel x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @cbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 0
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK: cbz_32
+; CHECK: {{subs.*wzr,|cmp}} w2, #0
+; CHECK-NEXT: csel w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @cbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp ne i32 %c, 0
+ br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK: cbz_64
+; CHECK: {{subs.*xzr,|cmp}} x2, #0
+; CHECK-NEXT: csel x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @cbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp ne i64 %c, 0
+ br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK: tbnz_32
+; CHECK: {{ands.*xzr,|tst}} x2, #0x80
+; CHECK-NEXT: csel w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %mask = and i32 %c, 128
+ %tobool = icmp eq i32 %mask, 0
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK: tbnz_64
+; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000
+; CHECK-NEXT: csel x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @tbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %mask = and i64 %c, 9223372036854775808
+ %tobool = icmp eq i64 %mask, 0
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK: tbz_32
+; CHECK: {{ands.*xzr,|tst}} x2, #0x80
+; CHECK-NEXT: csel w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %mask = and i32 %c, 128
+ %tobool = icmp ne i32 %mask, 0
+ br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK: tbz_64
+; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000
+; CHECK-NEXT: csel x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @tbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %mask = and i64 %c, 9223372036854775808
+ %tobool = icmp ne i64 %mask, 0
+ br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; This function from 175.vpr folds an ADDWri into a CSINC.
+; Remember to clear the kill flag on the ADDWri.
+define i32 @get_ytrack_to_xtracks() nounwind ssp {
+entry:
+ br label %for.body
+
+for.body:
+ %x0 = load i32* undef, align 4
+ br i1 undef, label %if.then.i146, label %is_sbox.exit155
+
+if.then.i146:
+ %add8.i143 = add nsw i32 0, %x0
+ %rem.i144 = srem i32 %add8.i143, %x0
+ %add9.i145 = add i32 %rem.i144, 1
+ br label %is_sbox.exit155
+
+is_sbox.exit155: ; preds = %if.then.i146, %for.body
+ %seg_offset.0.i151 = phi i32 [ %add9.i145, %if.then.i146 ], [ undef, %for.body ]
+ %idxprom15.i152 = sext i32 %seg_offset.0.i151 to i64
+ %arrayidx18.i154 = getelementptr inbounds i32* null, i64 %idxprom15.i152
+ %x1 = load i32* %arrayidx18.i154, align 4
+ br i1 undef, label %for.body51, label %for.body
+
+for.body51: ; preds = %is_sbox.exit155
+ call fastcc void @get_switch_type(i32 %x1, i32 undef, i16 signext undef, i16 signext undef, i16* undef)
+ unreachable
+}
+declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, i16* nocapture) nounwind ssp
diff --git a/test/CodeGen/ARM64/elf-calls.ll b/test/CodeGen/ARM64/elf-calls.ll
new file mode 100644
index 0000000..8c40203
--- /dev/null
+++ b/test/CodeGen/ARM64/elf-calls.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -triple=arm64-linux-gnu - -r | FileCheck %s --check-prefix=CHECK-OBJ
+
+declare void @callee()
+
+define void @caller() {
+ call void @callee()
+ ret void
+; CHECK-LABEL: caller:
+; CHECK: bl callee
+; CHECK-OBJ: R_AARCH64_CALL26 callee
+}
+
+define void @tail_caller() {
+ tail call void @callee()
+ ret void
+; CHECK-LABEL: tail_caller:
+; CHECK: b callee
+; CHECK-OBJ: R_AARCH64_JUMP26 callee
+}
diff --git a/test/CodeGen/ARM64/elf-constpool.ll b/test/CodeGen/ARM64/elf-constpool.ll
new file mode 100644
index 0000000..95d3343
--- /dev/null
+++ b/test/CodeGen/ARM64/elf-constpool.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -o - %s | FileCheck %s
+
+; O0 is checked for fast-isel purposes: it has a separate path which
+; creates a constant-pool entry for floating-point values.
+
+define double @needs_const() {
+ ret double 3.14159
+; CHECK: .LCPI0_0:
+
+; CHECK: adrp {{x[0-9]+}}, .LCPI0_0
+; CHECK: ldr d0, [{{x[0-9]+}}, :lo12:.LCPI0_0]
+}
diff --git a/test/CodeGen/ARM64/elf-globals.ll b/test/CodeGen/ARM64/elf-globals.ll
new file mode 100644
index 0000000..598c96a
--- /dev/null
+++ b/test/CodeGen/ARM64/elf-globals.ll
@@ -0,0 +1,115 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 | FileCheck %s --check-prefix=CHECK-FAST
+; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-FAST-PIC
+
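+; Non-PIC accesses use an adrp of the page plus a :lo12: offset, and the adrp
+; result is shared between the load and the store; PIC accesses go through
+; the GOT instead.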
+@var8 = external global i8, align 1
+@var16 = external global i16, align 2
+@var32 = external global i32, align 4
+@var64 = external global i64, align 8
+
+define i8 @test_i8(i8 %new) {
+ %val = load i8* @var8, align 1
+ store i8 %new, i8* @var8
+ ret i8 %val
+; CHECK-LABEL: test_i8:
+; CHECK: adrp x[[HIREG:[0-9]+]], var8
+; CHECK: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+; CHECK: strb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+
+; CHECK-PIC-LABEL: test_i8:
+; CHECK-PIC: adrp x[[HIREG:[0-9]+]], :got:var8
+; CHECK-PIC: ldr x[[VAR_ADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8]
+; CHECK-PIC: ldrb {{w[0-9]+}}, [x[[VAR_ADDR]]]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var8
+; CHECK-FAST: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+
+; CHECK-FAST-PIC: adrp x[[HIREG:[0-9]+]], :got:var8
+; CHECK-FAST-PIC: ldr x[[VARADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8]
+; CHECK-FAST-PIC: ldr {{w[0-9]+}}, [x[[VARADDR]]]
+}
+
+define i16 @test_i16(i16 %new) {
+ %val = load i16* @var16, align 2
+ store i16 %new, i16* @var16
+ ret i16 %val
+; CHECK-LABEL: test_i16:
+; CHECK: adrp x[[HIREG:[0-9]+]], var16
+; CHECK: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+; CHECK: strh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var16
+; CHECK-FAST: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+}
+
+define i32 @test_i32(i32 %new) {
+ %val = load i32* @var32, align 4
+ store i32 %new, i32* @var32
+ ret i32 %val
+; CHECK-LABEL: test_i32:
+; CHECK: adrp x[[HIREG:[0-9]+]], var32
+; CHECK: ldr {{w[0-9]+}}, [x[[HIREG]], :lo12:var32]
+; CHECK: str {{w[0-9]+}}, [x[[HIREG]], :lo12:var32]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var32
+; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var32
+}
+
+define i64 @test_i64(i64 %new) {
+ %val = load i64* @var64, align 8
+ store i64 %new, i64* @var64
+ ret i64 %val
+; CHECK-LABEL: test_i64:
+; CHECK: adrp x[[HIREG:[0-9]+]], var64
+; CHECK: ldr {{x[0-9]+}}, [x[[HIREG]], :lo12:var64]
+; CHECK: str {{x[0-9]+}}, [x[[HIREG]], :lo12:var64]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var64
+; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var64
+}
+
+define i64* @test_addr() {
+ ret i64* @var64
+; CHECK-LABEL: test_addr:
+; CHECK: adrp [[HIREG:x[0-9]+]], var64
+; CHECK: add x0, [[HIREG]], :lo12:var64
+
+; CHECK-FAST: adrp [[HIREG:x[0-9]+]], var64
+; CHECK-FAST: add x0, [[HIREG]], :lo12:var64
+}
+
+@hiddenvar = hidden global i32 0, align 4
+@protectedvar = protected global i32 0, align 4
+
+define i32 @test_vis() {
+ %lhs = load i32* @hiddenvar, align 4
+ %rhs = load i32* @protectedvar, align 4
+ %ret = add i32 %lhs, %rhs
+ ret i32 %ret
+; CHECK-PIC: adrp {{x[0-9]+}}, hiddenvar
+; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:hiddenvar]
+; CHECK-PIC: adrp {{x[0-9]+}}, protectedvar
+; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:protectedvar]
+}
+
+@var_default = external global [2 x i32]
+
+define i32 @test_default_align() {
+ %addr = getelementptr [2 x i32]* @var_default, i32 0, i32 0
+ %val = load i32* %addr
+ ret i32 %val
+; CHECK-LABEL: test_default_align:
+; CHECK: adrp x[[HIREG:[0-9]+]], var_default
+; CHECK: ldr w0, [x[[HIREG]], :lo12:var_default]
+}
+
+define i64 @test_default_unaligned() {
+ %addr = bitcast [2 x i32]* @var_default to i64*
+ %val = load i64* %addr
+ ret i64 %val
+; CHECK-LABEL: test_default_unaligned:
+; CHECK: adrp [[HIREG:x[0-9]+]], var_default
+; CHECK: add x[[ADDR:[0-9]+]], [[HIREG]], :lo12:var_default
+; CHECK: ldr x0, [x[[ADDR]]]
+}
diff --git a/test/CodeGen/ARM64/ext.ll b/test/CodeGen/ARM64/ext.ll
new file mode 100644
index 0000000..57d6e0c
--- /dev/null
+++ b/test/CodeGen/ARM64/ext.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd:
+;CHECK: {{ext.8b.*#3}}
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i8> %tmp3
+}
+
+define <8 x i8> @test_vextRd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRd:
+;CHECK: {{ext.8b.*#5}}
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextq:
+;CHECK: {{ext.16b.*3}}
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ ret <16 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextRq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRq:
+;CHECK: {{ext.16b.*7}}
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @test_vextd16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: test_vextd16:
+;CHECK: {{ext.8b.*#6}}
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i16> %tmp3
+}
+
+define <4 x i32> @test_vextq32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: test_vextq32:
+;CHECK: {{ext.16b.*12}}
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i32> %tmp3
+}
+
+; Undef shuffle indices should not prevent matching to VEXT:
+
+define <8 x i8> @test_vextd_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd_undef:
+;CHECK: {{ext.8b.*}}
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRq_undef:
+;CHECK: {{ext.16b.*#7}}
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 undef, i32 undef, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 undef, i32 6>
+ ret <16 x i8> %tmp3
+}
+
+; Tests for ReconstructShuffle function. Indices have to be carefully
+; chosen to reach lowering phase as a BUILD_VECTOR.
+
+; One vector needs vext, the other can be handled by extract_subvector.
+; Also checks that interleaving of sources is handled correctly.
+; Essence: a vext is used on %A, and something saner than a stack load/store
+; is used for the final result.
+define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_interleaved:
+;CHECK: ext.8b
+;CHECK: zip1.4h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 3, i32 8, i32 5, i32 9>
+ ret <4 x i16> %tmp3
+}
+
+; An undef in the shuffle list should still be optimizable
+define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_undef:
+;CHECK: zip1.4h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 undef, i32 8, i32 5, i32 9>
+ ret <4 x i16> %tmp3
+}
diff --git a/test/CodeGen/ARM64/extend-int-to-fp.ll b/test/CodeGen/ARM64/extend-int-to-fp.ll
new file mode 100644
index 0000000..599a697
--- /dev/null
+++ b/test/CodeGen/ARM64/extend-int-to-fp.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <4 x float> @foo(<4 x i16> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: ushll.4s v0, v0, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ret
+ %vcvt.i = uitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @bar(<4 x i16> %a) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sshll.4s v0, v0, #0
+; CHECK-NEXT: scvtf.4s v0, v0
+; CHECK-NEXT: ret
+ %vcvt.i = sitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
diff --git a/test/CodeGen/ARM64/extend.ll b/test/CodeGen/ARM64/extend.ll
new file mode 100644
index 0000000..4d20543
--- /dev/null
+++ b/test/CodeGen/ARM64/extend.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+@array = external global [0 x i32]
+
+define i64 @foo(i32 %i) {
+; CHECK: foo
+; CHECK: adrp x[[REG:[0-9]+]], _array@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _array@GOTPAGEOFF]
+; CHECK: ldrsw x0, [x[[REG1]], x0, sxtw #2]
+; CHECK: ret
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds [0 x i32]* @array, i64 0, i64 %idxprom
+ %tmp1 = load i32* %arrayidx, align 4
+ %conv = sext i32 %tmp1 to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/ARM64/extern-weak.ll b/test/CodeGen/ARM64/extern-weak.ll
new file mode 100644
index 0000000..a239403
--- /dev/null
+++ b/test/CodeGen/ARM64/extern-weak.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
+
+declare extern_weak i32 @var()
+
+define i32()* @foo() {
+; The usual ADRP/ADD pair can't be used for a weak reference because it must
+; evaluate to 0 if the symbol is undefined. We use a GOT entry instead.
+ ret i32()* @var
+
+; CHECK: adrp x[[VAR:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
+
+ ; In the large model, the usual relocations are absolute and can
+ ; materialise 0.
+; CHECK-LARGE: movz x0, #:abs_g3:var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:var
+}
+
+
+@arr_var = extern_weak global [10 x i32]
+
+define i32* @bar() {
+ %addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5
+; CHECK: adrp x[[ARR_VAR_HI:[0-9]+]], :got:arr_var
+; CHECK: ldr [[ARR_VAR:x[0-9]+]], [x[[ARR_VAR_HI]], :got_lo12:arr_var]
+; CHECK: add x0, [[ARR_VAR]], #20
+ ret i32* %addr
+
+ ; In the large model, the usual relocations are absolute and can
+ ; materialise 0.
+; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g2_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g1_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g0_nc:arr_var
+}
+
+@defined_weak_var = internal unnamed_addr global i32 0
+
+define i32* @wibble() {
+ ret i32* @defined_weak_var
+; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
+
+; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:defined_weak_var
+}
diff --git a/test/CodeGen/ARM64/extload-knownzero.ll b/test/CodeGen/ARM64/extload-knownzero.ll
new file mode 100644
index 0000000..14e5fd3
--- /dev/null
+++ b/test/CodeGen/ARM64/extload-knownzero.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://12771555
+
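+; The extending load already zeroes the upper bits, so the compare below must
+; not re-mask the value with 0xffff.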
+define void @foo(i16* %ptr, i32 %a) nounwind {
+entry:
+; CHECK-LABEL: foo:
+ %tmp1 = icmp ult i32 %a, 100
+ br i1 %tmp1, label %bb1, label %bb2
+bb1:
+; CHECK: %bb1
+; CHECK: ldrh [[REG:w[0-9]+]]
+ %tmp2 = load i16* %ptr, align 2
+ br label %bb2
+bb2:
+; CHECK: %bb2
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+; CHECK: cmp [[REG]], #23
+ %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
+ %cmp = icmp ult i16 %tmp3, 24
+ br i1 %cmp, label %bb3, label %exit
+bb3:
+ call void @bar() nounwind
+ br label %exit
+exit:
+ ret void
+}
+
+declare void @bar ()
diff --git a/test/CodeGen/ARM64/extract.ll b/test/CodeGen/ARM64/extract.ll
new file mode 100644
index 0000000..119751c
--- /dev/null
+++ b/test/CodeGen/ARM64/extract.ll
@@ -0,0 +1,58 @@
+; RUN: llc -arm64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: -march=arm64 | FileCheck %s
+
+define i64 @ror_i64(i64 %in) {
+; CHECK-LABEL: ror_i64:
+ %left = shl i64 %in, 19
+ %right = lshr i64 %in, 45
+ %val5 = or i64 %left, %right
+; CHECK: extr {{x[0-9]+}}, x0, x0, #45
+ ret i64 %val5
+}
+
+define i32 @ror_i32(i32 %in) {
+; CHECK-LABEL: ror_i32:
+ %left = shl i32 %in, 9
+ %right = lshr i32 %in, 23
+ %val5 = or i32 %left, %right
+; CHECK: extr {{w[0-9]+}}, w0, w0, #23
+ ret i32 %val5
+}
+
+define i32 @extr_i32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: extr_i32:
+ %left = shl i32 %lhs, 6
+ %right = lshr i32 %rhs, 26
+ %val = or i32 %left, %right
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{w[0-9]+}}, w0, w1, #26
+
+ ret i32 %val
+}
+
+define i64 @extr_i64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: extr_i64:
+ %right = lshr i64 %rhs, 40
+ %left = shl i64 %lhs, 24
+ %val = or i64 %right, %left
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{x[0-9]+}}, x0, x1, #40
+
+ ret i64 %val
+}
+
+; Regression test: a bad experimental pattern crept into git which optimised
+; this pattern to a single EXTR.
+define i32 @extr_regress(i32 %a, i32 %b) {
+; CHECK-LABEL: extr_regress:
+
+ %sh1 = shl i32 %a, 14
+ %sh2 = lshr i32 %b, 14
+ %val = or i32 %sh2, %sh1
+; CHECK-NOT: extr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, #{{[0-9]+}}
+
+ ret i32 %val
+; CHECK: ret
+}
diff --git a/test/CodeGen/ARM64/extract_subvector.ll b/test/CodeGen/ARM64/extract_subvector.ll
new file mode 100644
index 0000000..20c05fb
--- /dev/null
+++ b/test/CodeGen/ARM64/extract_subvector.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+; Extracting the upper half of a vector is an "ext.16b v0, v0, v0, #8" instruction.
+
+define <8 x i8> @v8i8(<16 x i8> %a) nounwind {
+; CHECK: v8i8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i8> %ret
+}
+
+define <4 x i16> @v4i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i16> %ret
+}
+
+define <2 x i32> @v2i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+ ret <2 x i32> %ret
+}
+
+define <1 x i64> @v1i64(<2 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+ ret <1 x i64> %ret
+}
+
+define <2 x float> @v2f32(<4 x float> %a) nounwind {
+; CHECK-LABEL: v2f32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+ ret <2 x float> %ret
+}
+
+define <1 x double> @v1f64(<2 x double> %a) nounwind {
+; CHECK-LABEL: v1f64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> <i32 1>
+ ret <1 x double> %ret
+}
diff --git a/test/CodeGen/ARM64/fast-isel-addr-offset.ll b/test/CodeGen/ARM64/fast-isel-addr-offset.ll
new file mode 100644
index 0000000..a4326dc
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-addr-offset.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@sortlist = common global [5001 x i32] zeroinitializer, align 16
+@sortlist2 = common global [5001 x i64] zeroinitializer, align 16
+
+; Load an address with an offset larger than the LDR immediate can handle
+define i32 @foo() nounwind {
+entry:
+; CHECK: @foo
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #20000
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr w0, [x[[REG3]]]
+; CHECK: ret
+ %0 = load i32* getelementptr inbounds ([5001 x i32]* @sortlist, i32 0, i64 5000), align 4
+ ret i32 %0
+}
+
+define i64 @foo2() nounwind {
+entry:
+; CHECK: @foo2
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #40000
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr x0, [x[[REG3]]]
+; CHECK: ret
+ %0 = load i64* getelementptr inbounds ([5001 x i64]* @sortlist2, i32 0, i64 5000), align 4
+ ret i64 %0
+}
+
+; Load an address with a ridiculously large offset.
+; rdar://12505553
+@pd2 = common global i8* null, align 8
+
+define signext i8 @foo3() nounwind ssp {
+entry:
+; CHECK: @foo3
+; CHECK: movz x[[REG:[0-9]+]], #2874, lsl #32
+; CHECK: movk x[[REG]], #29646, lsl #16
+; CHECK: movk x[[REG]], #12274
+ %0 = load i8** @pd2, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 12345678901234
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
diff --git a/test/CodeGen/ARM64/fast-isel-alloca.ll b/test/CodeGen/ARM64/fast-isel-alloca.ll
new file mode 100644
index 0000000..8bbee16
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-alloca.ll
@@ -0,0 +1,24 @@
+; This test should cause the TargetMaterializeAlloca to be invoked
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+%struct.S1Ty = type { i64 }
+%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
+
+define void @takeS1(%struct.S1Ty* %V) nounwind {
+entry:
+ %V.addr = alloca %struct.S1Ty*, align 8
+ store %struct.S1Ty* %V, %struct.S1Ty** %V.addr, align 8
+ ret void
+}
+
+define void @main() nounwind {
+entry:
+; CHECK: main
+; CHECK: mov x[[REG:[0-9]+]], sp
+; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
+; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+ %E = alloca %struct.S2Ty, align 4
+ %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
+ call void @takeS1(%struct.S1Ty* %B)
+ ret void
+}
diff --git a/test/CodeGen/ARM64/fast-isel-br.ll b/test/CodeGen/ARM64/fast-isel-br.ll
new file mode 100644
index 0000000..8fd32fd
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-br.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define void @branch1() nounwind uwtable ssp {
+ %x = alloca i32, align 4
+ store i32 0, i32* %x, align 4
+ %1 = load i32* %x, align 4
+ %2 = icmp ne i32 %1, 0
+ br i1 %2, label %3, label %4
+
+; <label>:3 ; preds = %0
+ br label %4
+
+; <label>:4 ; preds = %3, %0
+ ret void
+}
+
+define void @branch2() nounwind uwtable ssp {
+ %1 = alloca i32, align 4
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ store i32 0, i32* %1
+ store i32 1, i32* %y, align 4
+ store i32 1, i32* %x, align 4
+ store i32 0, i32* %z, align 4
+ %2 = load i32* %x, align 4
+ %3 = icmp ne i32 %2, 0
+ br i1 %3, label %4, label %5
+
+; <label>:4 ; preds = %0
+ store i32 0, i32* %1
+ br label %14
+
+; <label>:5 ; preds = %0
+ %6 = load i32* %y, align 4
+ %7 = icmp ne i32 %6, 0
+ br i1 %7, label %8, label %13
+
+; <label>:8 ; preds = %5
+ %9 = load i32* %z, align 4
+ %10 = icmp ne i32 %9, 0
+ br i1 %10, label %11, label %12
+
+; <label>:11 ; preds = %8
+ store i32 1, i32* %1
+ br label %14
+
+; <label>:12 ; preds = %8
+ store i32 0, i32* %1
+ br label %14
+
+; <label>:13 ; preds = %5
+ br label %14
+
+; <label>:14 ; preds = %4, %11, %12, %13
+ %15 = load i32* %1
+ ret void
+}
+
+define void @true_() nounwind uwtable ssp {
+; CHECK: @true_
+; CHECK: b LBB2_1
+ br i1 true, label %1, label %2
+
+; <label>:1
+; CHECK: LBB2_1
+ br label %2
+
+; <label>:2
+ ret void
+}
+
+define void @false_() nounwind uwtable ssp {
+; CHECK: @false_
+; CHECK: b LBB3_2
+ br i1 false, label %1, label %2
+
+; <label>:1
+ br label %2
+
+; <label>:2
+; CHECK: LBB3_2
+ ret void
+}
+
+define zeroext i8 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) {
+entry:
+ %a.addr = alloca i8, align 1
+ %b.addr = alloca i16, align 2
+ %c.addr = alloca i32, align 4
+ %d.addr = alloca i64, align 8
+ store i8 %a, i8* %a.addr, align 1
+ store i16 %b, i16* %b.addr, align 2
+ store i32 %c, i32* %c.addr, align 4
+ store i64 %d, i64* %d.addr, align 8
+ %0 = load i16* %b.addr, align 2
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: b.eq LBB4_2
+ %conv = trunc i16 %0 to i1
+ br i1 %conv, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo1()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %1 = load i32* %c.addr, align 4
+; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1
+; CHECK: subs w{{[0-9]+}}, w[[REG]], #0
+; CHECK: b.eq LBB4_4
+ %conv1 = trunc i32 %1 to i1
+ br i1 %conv1, label %if.then3, label %if.end4
+
+if.then3: ; preds = %if.end
+ call void @foo1()
+ br label %if.end4
+
+if.end4: ; preds = %if.then3, %if.end
+ %2 = load i64* %d.addr, align 8
+; CHECK: subs w{{[0-9]+}}, w{{[0-9]+}}, #0
+; CHECK: b.eq LBB4_6
+ %conv5 = trunc i64 %2 to i1
+ br i1 %conv5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.end4
+ call void @foo1()
+ br label %if.end8
+
+if.end8: ; preds = %if.then7, %if.end4
+ %3 = load i8* %a.addr, align 1
+ ret i8 %3
+}
+
+declare void @foo1()
+
+; rdar://15174028
+define i32 @trunc64(i64 %foo) nounwind {
+; CHECK: trunc64
+; CHECK: orr [[REG:x[0-9]+]], xzr, #0x1
+; CHECK: and [[REG2:x[0-9]+]], x0, [[REG]]
+; CHECK: mov x[[REG3:[0-9]+]], [[REG2]]
+; CHECK: and [[REG4:w[0-9]+]], w[[REG3]], #0x1
+; CHECK: subs {{w[0-9]+}}, [[REG4]], #0
+; CHECK: b.eq LBB5_2
+ %a = and i64 %foo, 1
+ %b = trunc i64 %a to i1
+ br i1 %b, label %if.then, label %if.else
+
+if.then:
+ ret i32 1
+
+if.else:
+ ret i32 0
+}
diff --git a/test/CodeGen/ARM64/fast-isel-call.ll b/test/CodeGen/ARM64/fast-isel-call.ll
new file mode 100644
index 0000000..be0ca68
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-call.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define void @call0() nounwind {
+entry:
+ ret void
+}
+
+define void @foo0() nounwind {
+entry:
+; CHECK: foo0
+; CHECK: bl _call0
+ call void @call0()
+ ret void
+}
+
+define i32 @call1(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ ret i32 %tmp
+}
+
+define i32 @foo1(i32 %a) nounwind {
+entry:
+; CHECK: foo1
+; CHECK: stur w0, [fp, #-4]
+; CHECK-NEXT: ldur w0, [fp, #-4]
+; CHECK-NEXT: bl _call1
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ %call = call i32 @call1(i32 %tmp)
+ ret i32 %call
+}
+
+define i32 @sext_(i8 %a, i16 %b) nounwind {
+entry:
+; CHECK: @sext_
+; CHECK: sxtb w0, w0
+; CHECK: sxth w1, w1
+; CHECK: bl _foo_sext_
+ call void @foo_sext_(i8 signext %a, i16 signext %b)
+ ret i32 0
+}
+
+declare void @foo_sext_(i8 %a, i16 %b)
+
+define i32 @zext_(i8 %a, i16 %b) nounwind {
+entry:
+; CHECK: @zext_
+; CHECK: uxtb w0, w0
+; CHECK: uxth w1, w1
+ call void @foo_zext_(i8 zeroext %a, i16 zeroext %b)
+ ret i32 0
+}
+
+declare void @foo_zext_(i8 %a, i16 %b)
+
+define i32 @t1(i32 %argc, i8** nocapture %argv) {
+entry:
+; CHECK: @t1
+; The last parameter will be passed on stack via i8.
+; CHECK: strb w{{[0-9]+}}, [sp]
+; CHECK-NEXT: bl _bar
+ %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70, i8 zeroext 28, i8 zeroext 39, i8 zeroext -41)
+ ret i32 0
+}
+
+declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+; Test materialization of integers. Target-independent selector handles this.
+define i32 @t2() {
+entry:
+; CHECK: @t2
+; CHECK: movz x0, #0
+; CHECK: orr w1, wzr, #0xfffffff8
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x3ff
+; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x2
+; CHECK: movz w[[REG3:[0-9]+]], #0
+; CHECK: orr w[[REG4:[0-9]+]], wzr, #0x1
+; CHECK: uxth w2, w[[REG]]
+; CHECK: sxtb w3, w[[REG2]]
+; CHECK: and w4, w[[REG3]], #0x1
+; CHECK: and w5, w[[REG4]], #0x1
+; CHECK: bl _func2
+ %call = call i32 @func2(i64 zeroext 0, i32 signext -8, i16 zeroext 1023, i8 signext -254, i1 zeroext 0, i1 zeroext 1)
+ ret i32 0
+}
+
+declare i32 @func2(i64 zeroext, i32 signext, i16 zeroext, i8 signext, i1 zeroext, i1 zeroext)
diff --git a/test/CodeGen/ARM64/fast-isel-conversion.ll b/test/CodeGen/ARM64/fast-isel-conversion.ll
new file mode 100644
index 0000000..4e62e33
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-conversion.ll
@@ -0,0 +1,416 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+;; Test various conversions.
+define zeroext i32 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: trunc_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldr x3, [sp]
+; CHECK: mov x0, x3
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: strb w0, [sp, #15]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: uxtb w0, w0
+; CHECK: add sp, sp, #16
+; CHECK: ret
+ %a.addr = alloca i8, align 1
+ %b.addr = alloca i16, align 2
+ %c.addr = alloca i32, align 4
+ %d.addr = alloca i64, align 8
+ store i8 %a, i8* %a.addr, align 1
+ store i16 %b, i16* %b.addr, align 2
+ store i32 %c, i32* %c.addr, align 4
+ store i64 %d, i64* %d.addr, align 8
+ %tmp = load i64* %d.addr, align 8
+ %conv = trunc i64 %tmp to i32
+ store i32 %conv, i32* %c.addr, align 4
+ %tmp1 = load i32* %c.addr, align 4
+ %conv2 = trunc i32 %tmp1 to i16
+ store i16 %conv2, i16* %b.addr, align 2
+ %tmp3 = load i16* %b.addr, align 2
+ %conv4 = trunc i16 %tmp3 to i8
+ store i8 %conv4, i8* %a.addr, align 1
+ %tmp5 = load i8* %a.addr, align 1
+ %conv6 = zext i8 %tmp5 to i32
+ ret i32 %conv6
+}
+
+define i64 @zext_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: zext_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: uxtb w0, w0
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: uxth w0, w0
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: uxtw x3, w0
+; CHECK: str x3, [sp]
+; CHECK: ldr x0, [sp], #16
+; CHECK: ret
+ %a.addr = alloca i8, align 1
+ %b.addr = alloca i16, align 2
+ %c.addr = alloca i32, align 4
+ %d.addr = alloca i64, align 8
+ store i8 %a, i8* %a.addr, align 1
+ store i16 %b, i16* %b.addr, align 2
+ store i32 %c, i32* %c.addr, align 4
+ store i64 %d, i64* %d.addr, align 8
+ %tmp = load i8* %a.addr, align 1
+ %conv = zext i8 %tmp to i16
+ store i16 %conv, i16* %b.addr, align 2
+ %tmp1 = load i16* %b.addr, align 2
+ %conv2 = zext i16 %tmp1 to i32
+ store i32 %conv2, i32* %c.addr, align 4
+ %tmp3 = load i32* %c.addr, align 4
+ %conv4 = zext i32 %tmp3 to i64
+ store i64 %conv4, i64* %d.addr, align 8
+ %tmp5 = load i64* %d.addr, align 8
+ ret i64 %tmp5
+}
+
+define i32 @zext_i1_i32(i1 zeroext %a) nounwind ssp {
+entry:
+; CHECK: @zext_i1_i32
+; CHECK: and w0, w0, #0x1
+ %conv = zext i1 %a to i32
+ ret i32 %conv;
+}
+
+define i64 @zext_i1_i64(i1 zeroext %a) nounwind ssp {
+entry:
+; CHECK: @zext_i1_i64
+; CHECK: and w0, w0, #0x1
+ %conv = zext i1 %a to i64
+ ret i64 %conv;
+}
+
+define i64 @sext_(i8 signext %a, i16 signext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: sext_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: sxtb w0, w0
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: sxth w0, w0
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: sxtw x3, w0
+; CHECK: str x3, [sp]
+; CHECK: ldr x0, [sp], #16
+; CHECK: ret
+ %a.addr = alloca i8, align 1
+ %b.addr = alloca i16, align 2
+ %c.addr = alloca i32, align 4
+ %d.addr = alloca i64, align 8
+ store i8 %a, i8* %a.addr, align 1
+ store i16 %b, i16* %b.addr, align 2
+ store i32 %c, i32* %c.addr, align 4
+ store i64 %d, i64* %d.addr, align 8
+ %tmp = load i8* %a.addr, align 1
+ %conv = sext i8 %tmp to i16
+ store i16 %conv, i16* %b.addr, align 2
+ %tmp1 = load i16* %b.addr, align 2
+ %conv2 = sext i16 %tmp1 to i32
+ store i32 %conv2, i32* %c.addr, align 4
+ %tmp3 = load i32* %c.addr, align 4
+ %conv4 = sext i32 %tmp3 to i64
+ store i64 %conv4, i64* %d.addr, align 8
+ %tmp5 = load i64* %d.addr, align 8
+ ret i64 %tmp5
+}
+
+; Test sext i8 to i64
+define i64 @sext_2(i8 signext %a) nounwind ssp {
+entry:
+; CHECK: sext_2
+; CHECK: sxtb x0, w0
+ %conv = sext i8 %a to i64
+ ret i64 %conv
+}
+
+; Test sext i1 to i32
+define i32 @sext_i1_i32(i1 signext %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i32
+; CHECK: sbfm w0, w0, #0, #0
+ %conv = sext i1 %a to i32
+ ret i32 %conv
+}
+
+; Test sext i1 to i16
+define signext i16 @sext_i1_i16(i1 %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i16
+; CHECK: sbfm w0, w0, #0, #0
+ %conv = sext i1 %a to i16
+ ret i16 %conv
+}
+
+; Test sext i1 to i8
+define signext i8 @sext_i1_i8(i1 %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i8
+; CHECK: sbfm w0, w0, #0, #0
+ %conv = sext i1 %a to i8
+ ret i8 %conv
+}
+
+; Test fpext
+define double @fpext_(float %a) nounwind ssp {
+entry:
+; CHECK: fpext_
+; CHECK: fcvt d0, s0
+ %conv = fpext float %a to double
+ ret double %conv
+}
+
+; Test fptrunc
+define float @fptrunc_(double %a) nounwind ssp {
+entry:
+; CHECK: fptrunc_
+; CHECK: fcvt s0, d0
+ %conv = fptrunc double %a to float
+ ret float %conv
+}
+
+; Test fptosi
+define i32 @fptosi_ws(float %a) nounwind ssp {
+entry:
+; CHECK: fptosi_ws
+; CHECK: fcvtzs w0, s0
+ %conv = fptosi float %a to i32
+ ret i32 %conv
+}
+
+; Test fptosi
+define i32 @fptosi_wd(double %a) nounwind ssp {
+entry:
+; CHECK: fptosi_wd
+; CHECK: fcvtzs w0, d0
+ %conv = fptosi double %a to i32
+ ret i32 %conv
+}
+
+; Test fptoui
+define i32 @fptoui_ws(float %a) nounwind ssp {
+entry:
+; CHECK: fptoui_ws
+; CHECK: fcvtzu w0, s0
+ %conv = fptoui float %a to i32
+ ret i32 %conv
+}
+
+; Test fptoui
+define i32 @fptoui_wd(double %a) nounwind ssp {
+entry:
+; CHECK: fptoui_wd
+; CHECK: fcvtzu w0, d0
+ %conv = fptoui double %a to i32
+ ret i32 %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i1(i1 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i1
+; CHECK: sbfm w0, w0, #0, #0
+; CHECK: scvtf s0, w0
+ %conv = sitofp i1 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i8(i8 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i8
+; CHECK: sxtb w0, w0
+; CHECK: scvtf s0, w0
+ %conv = sitofp i8 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i16(i16 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i16
+; CHECK: sxth w0, w0
+; CHECK: scvtf s0, w0
+ %conv = sitofp i16 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw(i32 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw
+; CHECK: scvtf s0, w0
+ %conv = sitofp i32 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sx(i64 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sx
+; CHECK: scvtf s0, x0
+ %conv = sitofp i64 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define double @sitofp_dw(i32 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_dw
+; CHECK: scvtf d0, w0
+ %conv = sitofp i32 %a to double
+ ret double %conv
+}
+
+; Test sitofp
+define double @sitofp_dx(i64 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_dx
+; CHECK: scvtf d0, x0
+ %conv = sitofp i64 %a to double
+ ret double %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i1(i1 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i1
+; CHECK: and w0, w0, #0x1
+; CHECK: ucvtf s0, w0
+ %conv = uitofp i1 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i8(i8 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i8
+; CHECK: uxtb w0, w0
+; CHECK: ucvtf s0, w0
+ %conv = uitofp i8 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i16(i16 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i16
+; CHECK: uxth w0, w0
+; CHECK: ucvtf s0, w0
+ %conv = uitofp i16 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw(i32 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw
+; CHECK: ucvtf s0, w0
+ %conv = uitofp i32 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sx(i64 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sx
+; CHECK: ucvtf s0, x0
+ %conv = uitofp i64 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define double @uitofp_dw(i32 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_dw
+; CHECK: ucvtf d0, w0
+ %conv = uitofp i32 %a to double
+ ret double %conv
+}
+
+; Test uitofp
+define double @uitofp_dx(i64 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_dx
+; CHECK: ucvtf d0, x0
+ %conv = uitofp i64 %a to double
+ ret double %conv
+}
+
+define i32 @i64_trunc_i32(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i32
+; CHECK: mov x1, x0
+ %conv = trunc i64 %a to i32
+ ret i32 %conv
+}
+
+define zeroext i16 @i64_trunc_i16(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i16
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xffff
+; CHECK: uxth w0, [[REG2]]
+ %conv = trunc i64 %a to i16
+ ret i16 %conv
+}
+
+define zeroext i8 @i64_trunc_i8(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i8
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xff
+; CHECK: uxtb w0, [[REG2]]
+ %conv = trunc i64 %a to i8
+ ret i8 %conv
+}
+
+define zeroext i1 @i64_trunc_i1(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i1
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0x1
+; CHECK: and w0, [[REG2]], #0x1
+ %conv = trunc i64 %a to i1
+ ret i1 %conv
+}
+
+; rdar://15101939
+define void @stack_trunc() nounwind {
+; CHECK: stack_trunc
+; CHECK: sub sp, sp, #16
+; CHECK: ldr [[REG:x[0-9]+]], [sp]
+; CHECK: mov x[[REG2:[0-9]+]], [[REG]]
+; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0xff
+; CHECK: strb [[REG3]], [sp, #15]
+; CHECK: add sp, sp, #16
+ %a = alloca i8, align 1
+ %b = alloca i64, align 8
+ %c = load i64* %b, align 8
+ %d = trunc i64 %c to i8
+ store i8 %d, i8* %a, align 1
+ ret void
+}
diff --git a/test/CodeGen/ARM64/fast-isel-fcmp.ll b/test/CodeGen/ARM64/fast-isel-fcmp.ll
new file mode 100644
index 0000000..cf71fab
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-fcmp.ll
@@ -0,0 +1,146 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define zeroext i1 @fcmp_float1(float %a) nounwind ssp {
+entry:
+; CHECK: @fcmp_float1
+; CHECK: fcmp s0, #0.0
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+ %cmp = fcmp une float %a, 0.000000e+00
+ ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_float2(float %a, float %b) nounwind ssp {
+entry:
+; CHECK: @fcmp_float2
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+ %cmp = fcmp une float %a, %b
+ ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_double1(double %a) nounwind ssp {
+entry:
+; CHECK: @fcmp_double1
+; CHECK: fcmp d0, #0.0
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+ %cmp = fcmp une double %a, 0.000000e+00
+ ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_double2(double %a, double %b) nounwind ssp {
+entry:
+; CHECK: @fcmp_double2
+; CHECK: fcmp d0, d1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+ %cmp = fcmp une double %a, %b
+ ret i1 %cmp
+}
+
+; Check each fcmp condition
+define float @fcmp_oeq(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_oeq
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ne
+ %cmp = fcmp oeq float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ogt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, le
+ %cmp = fcmp ogt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_oge(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, lt
+ %cmp = fcmp oge float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_olt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, pl
+ %cmp = fcmp olt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ole(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, hi
+ %cmp = fcmp ole float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ord(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, vs
+ %cmp = fcmp ord float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_uno(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, vc
+ %cmp = fcmp uno float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ugt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ls
+ %cmp = fcmp ugt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_uge(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, mi
+ %cmp = fcmp uge float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ult(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ge
+ %cmp = fcmp ult float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ule(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, gt
+ %cmp = fcmp ule float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_une(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK: csinc {{w[0-9]+}}, wzr, wzr, eq
+ %cmp = fcmp une float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
diff --git a/test/CodeGen/ARM64/fast-isel-gv.ll b/test/CodeGen/ARM64/fast-isel-gv.ll
new file mode 100644
index 0000000..cb3df14
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-gv.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Test load/store of global value from global offset table.
+@seed = common global i64 0, align 8
+
+define void @Initrand() nounwind {
+entry:
+; CHECK: @Initrand
+; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
+; CHECK: str x{{[0-9]+}}, [x[[REG2]]]
+ store i64 74755, i64* @seed, align 8
+ ret void
+}
+
+define i32 @Rand() nounwind {
+entry:
+; CHECK: @Rand
+; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
+; CHECK: movz x[[REG3:[0-9]+]], #1309
+; CHECK: ldr x[[REG4:[0-9]+]], [x[[REG2]]]
+; CHECK: mul x[[REG5:[0-9]+]], x[[REG4]], x[[REG3]]
+; CHECK: movz x[[REG6:[0-9]+]], #13849
+; CHECK: add x[[REG7:[0-9]+]], x[[REG5]], x[[REG6]]
+; CHECK: orr x[[REG8:[0-9]+]], xzr, #0xffff
+; CHECK: and x[[REG9:[0-9]+]], x[[REG7]], x[[REG8]]
+; CHECK: str x[[REG9]], [x[[REG]]]
+; CHECK: ldr x{{[0-9]+}}, [x[[REG]]]
+ %0 = load i64* @seed, align 8
+ %mul = mul nsw i64 %0, 1309
+ %add = add nsw i64 %mul, 13849
+ %and = and i64 %add, 65535
+ store i64 %and, i64* @seed, align 8
+ %1 = load i64* @seed, align 8
+ %conv = trunc i64 %1 to i32
+ ret i32 %conv
+}
diff --git a/test/CodeGen/ARM64/fast-isel-icmp.ll b/test/CodeGen/ARM64/fast-isel-icmp.ll
new file mode 100644
index 0000000..22af542
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-icmp.ll
@@ -0,0 +1,214 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
+entry:
+; CHECK: icmp_eq_imm
+; CHECK: cmp w0, #31
+; CHECK: csinc w0, wzr, wzr, ne
+ %cmp = icmp eq i32 %a, 31
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
+entry:
+; CHECK: icmp_eq_neg_imm
+; CHECK: cmn w0, #7
+; CHECK: csinc w0, wzr, wzr, ne
+ %cmp = icmp eq i32 %a, -7
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_eq(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, ne
+ %cmp = icmp eq i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ne
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, eq
+ %cmp = icmp ne i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ugt
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, ls
+ %cmp = icmp ugt i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_uge
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, cc
+ %cmp = icmp uge i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ult
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, cs
+ %cmp = icmp ult i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ule
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, hi
+ %cmp = icmp ule i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sgt
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, le
+ %cmp = icmp sgt i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sge
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, lt
+ %cmp = icmp sge i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_slt
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, ge
+ %cmp = icmp slt i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sle
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, gt
+ %cmp = icmp sle i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
+entry:
+; CHECK: icmp_i64
+; CHECK: cmp x0, x1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, gt
+ %cmp = icmp sle i64 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define zeroext i1 @icmp_eq_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq_i16
+; CHECK: sxth w0, w0
+; CHECK: sxth w1, w1
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, ne
+ %cmp = icmp eq i16 %a, %b
+ ret i1 %cmp
+}
+
+define zeroext i1 @icmp_eq_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq_i8
+; CHECK: sxtb w0, w0
+; CHECK: sxtb w1, w1
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, ne
+ %cmp = icmp eq i8 %a, %b
+ ret i1 %cmp
+}
+
+define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
+entry:
+; CHECK: icmp_i16_unsigned
+; CHECK: uxth w0, w0
+; CHECK: uxth w1, w1
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, cs
+ %cmp = icmp ult i16 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+define i32 @icmp_i8_signed(i8 %a, i8 %b) nounwind {
+entry:
+; CHECK: @icmp_i8_signed
+; CHECK: sxtb w0, w0
+; CHECK: sxtb w1, w1
+; CHECK: cmp w0, w1
+; CHECK: csinc w0, wzr, wzr, le
+ %cmp = icmp sgt i8 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+
+define i32 @icmp_i16_signed_const(i16 %a) nounwind {
+entry:
+; CHECK: icmp_i16_signed_const
+; CHECK: sxth w0, w0
+; CHECK: cmn w0, #233
+; CHECK: csinc w0, wzr, wzr, ge
+; CHECK: and w0, w0, #0x1
+ %cmp = icmp slt i16 %a, -233
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+define i32 @icmp_i8_signed_const(i8 %a) nounwind {
+entry:
+; CHECK: icmp_i8_signed_const
+; CHECK: sxtb w0, w0
+; CHECK: cmp w0, #124
+; CHECK: csinc w0, wzr, wzr, le
+; CHECK: and w0, w0, #0x1
+ %cmp = icmp sgt i8 %a, 124
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+define i32 @icmp_i1_unsigned_const(i1 %a) nounwind {
+entry:
+; CHECK: icmp_i1_unsigned_const
+; CHECK: and w0, w0, #0x1
+; CHECK: cmp w0, #0
+; CHECK: csinc w0, wzr, wzr, cs
+; CHECK: and w0, w0, #0x1
+ %cmp = icmp ult i1 %a, 0
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
diff --git a/test/CodeGen/ARM64/fast-isel-indirectbr.ll b/test/CodeGen/ARM64/fast-isel-indirectbr.ll
new file mode 100644
index 0000000..70335ac
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-indirectbr.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@fn.table = internal global [2 x i8*] [i8* blockaddress(@fn, %ZERO), i8* blockaddress(@fn, %ONE)], align 8
+
+define i32 @fn(i32 %target) nounwind {
+entry:
+; CHECK: @fn
+ %retval = alloca i32, align 4
+ %target.addr = alloca i32, align 4
+ store i32 %target, i32* %target.addr, align 4
+ %0 = load i32* %target.addr, align 4
+ %idxprom = zext i32 %0 to i64
+ %arrayidx = getelementptr inbounds [2 x i8*]* @fn.table, i32 0, i64 %idxprom
+ %1 = load i8** %arrayidx, align 8
+ br label %indirectgoto
+
+ZERO: ; preds = %indirectgoto
+; CHECK: LBB0_1
+ store i32 0, i32* %retval
+ br label %return
+
+ONE: ; preds = %indirectgoto
+; CHECK: LBB0_2
+ store i32 1, i32* %retval
+ br label %return
+
+return: ; preds = %ONE, %ZERO
+ %2 = load i32* %retval
+ ret i32 %2
+
+indirectgoto: ; preds = %entry
+; CHECK: ldr x0, [sp]
+; CHECK: br x0
+ %indirect.goto.dest = phi i8* [ %1, %entry ]
+ indirectbr i8* %indirect.goto.dest, [label %ZERO, label %ONE]
+}
diff --git a/test/CodeGen/ARM64/fast-isel-intrinsic.ll b/test/CodeGen/ARM64/fast-isel-intrinsic.ll
new file mode 100644
index 0000000..6443d82
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-intrinsic.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios | FileCheck %s --check-prefix=ARM64
+
+@message = global [80 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 16
+@temp = common global [80 x i8] zeroinitializer, align 16
+
+define void @t1() {
+; ARM64: t1
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x0, x8, _message@PAGEOFF
+; ARM64: movz w9, #0
+; ARM64: movz x2, #80
+; ARM64: uxtb w1, w9
+; ARM64: bl _memset
+ call void @llvm.memset.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i8 0, i64 80, i32 16, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+define void @t2() {
+; ARM64: t2
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x1, x8, _message@PAGEOFF
+; ARM64: movz x2, #80
+; ARM64: bl _memcpy
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 80, i32 16, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
+
+define void @t3() {
+; ARM64: t3
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x1, x8, _message@PAGEOFF
+; ARM64: movz x2, #20
+; ARM64: bl _memmove
+ call void @llvm.memmove.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 20, i32 16, i1 false)
+ ret void
+}
+
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
+
+define void @t4() {
+; ARM64: t4
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr x10, [x9]
+; ARM64: str x10, [x8]
+; ARM64: ldr x10, [x9, #8]
+; ARM64: str x10, [x8, #8]
+; ARM64: ldrb w11, [x9, #16]
+; ARM64: strb w11, [x8, #16]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 17, i32 16, i1 false)
+ ret void
+}
+
+define void @t5() {
+; ARM64: t5
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr x10, [x9]
+; ARM64: str x10, [x8]
+; ARM64: ldr x10, [x9, #8]
+; ARM64: str x10, [x8, #8]
+; ARM64: ldrb w11, [x9, #16]
+; ARM64: strb w11, [x8, #16]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 17, i32 8, i1 false)
+ ret void
+}
+
+define void @t6() {
+; ARM64: t6
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr w10, [x9]
+; ARM64: str w10, [x8]
+; ARM64: ldr w10, [x9, #4]
+; ARM64: str w10, [x8, #4]
+; ARM64: ldrb w10, [x9, #8]
+; ARM64: strb w10, [x8, #8]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 9, i32 4, i1 false)
+ ret void
+}
+
+define void @t7() {
+; ARM64: t7
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldrh w10, [x9]
+; ARM64: strh w10, [x8]
+; ARM64: ldrh w10, [x9, #2]
+; ARM64: strh w10, [x8, #2]
+; ARM64: ldrh w10, [x9, #4]
+; ARM64: strh w10, [x8, #4]
+; ARM64: ldrb w10, [x9, #6]
+; ARM64: strb w10, [x8, #6]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 7, i32 2, i1 false)
+ ret void
+}
+
+define void @t8() {
+; ARM64: t8
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldrb w10, [x9]
+; ARM64: strb w10, [x8]
+; ARM64: ldrb w10, [x9, #1]
+; ARM64: strb w10, [x8, #1]
+; ARM64: ldrb w10, [x9, #2]
+; ARM64: strb w10, [x8, #2]
+; ARM64: ldrb w10, [x9, #3]
+; ARM64: strb w10, [x8, #3]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 4, i32 1, i1 false)
+ ret void
+}
diff --git a/test/CodeGen/ARM64/fast-isel-materialize.ll b/test/CodeGen/ARM64/fast-isel-materialize.ll
new file mode 100644
index 0000000..fa2daf7
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-materialize.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Materialize using fmov
+define void @float_(float* %value) {
+; CHECK: @float_
+; CHECK: fmov s0, #1.250000e+00
+ store float 1.250000e+00, float* %value, align 4
+ ret void
+}
+
+define void @double_(double* %value) {
+; CHECK: @double_
+; CHECK: fmov d0, #1.250000e+00
+ store double 1.250000e+00, double* %value, align 8
+ ret void
+}
+
+; Materialize from constant pool
+define float @float_cp() {
+; CHECK: @float_cp
+ ret float 0x400921FB60000000
+}
+
+define double @double_cp() {
+; CHECK: @double_cp
+ ret double 0x400921FB54442D18
+}
diff --git a/test/CodeGen/ARM64/fast-isel-noconvert.ll b/test/CodeGen/ARM64/fast-isel-noconvert.ll
new file mode 100644
index 0000000..3517970
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-noconvert.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=arm64-apple-ios -O0 %s -o - | FileCheck %s
+
+; Fast-isel can't do vector conversions yet, but it was emitting some highly
+; suspect UCVTFUWDri MachineInstrs.
+define <4 x float> @test_uitofp(<4 x i32> %in) {
+; CHECK-LABEL: test_uitofp:
+; CHECK: ucvtf.4s v0, v0
+
+ %res = uitofp <4 x i32> %in to <4 x float>
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_sitofp(<2 x i32> %in) {
+; CHECK-LABEL: test_sitofp:
+; CHECK: sshll.2d [[EXT:v[0-9]+]], v0, #0
+; CHECK: scvtf.2d v0, [[EXT]]
+
+ %res = sitofp <2 x i32> %in to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x i32> @test_fptoui(<2 x float> %in) {
+; CHECK-LABEL: test_fptoui:
+; CHECK: fcvtzu.2s v0, v0
+
+ %res = fptoui <2 x float> %in to <2 x i32>
+ ret <2 x i32> %res
+}
+
+define <2 x i64> @test_fptosi(<2 x double> %in) {
+; CHECK-LABEL: test_fptosi:
+; CHECK: fcvtzs.2d v0, v0
+
+ %res = fptosi <2 x double> %in to <2 x i64>
+ ret <2 x i64> %res
+} \ No newline at end of file
diff --git a/test/CodeGen/ARM64/fast-isel-rem.ll b/test/CodeGen/ARM64/fast-isel-rem.ll
new file mode 100644
index 0000000..0c68401
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-rem.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i32 @t1(i32 %a, i32 %b) {
+; CHECK: @t1
+; CHECK: sdiv w2, w0, w1
+; CHECK: msub w2, w2, w1, w0
+ %1 = srem i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @t2(i64 %a, i64 %b) {
+; CHECK: @t2
+; CHECK: sdiv x2, x0, x1
+; CHECK: msub x2, x2, x1, x0
+ %1 = srem i64 %a, %b
+ ret i64 %1
+}
+
+define i32 @t3(i32 %a, i32 %b) {
+; CHECK: @t3
+; CHECK: udiv w2, w0, w1
+; CHECK: msub w2, w2, w1, w0
+ %1 = urem i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @t4(i64 %a, i64 %b) {
+; CHECK: @t4
+; CHECK: udiv x2, x0, x1
+; CHECK: msub x2, x2, x1, x0
+ %1 = urem i64 %a, %b
+ ret i64 %1
+}
diff --git a/test/CodeGen/ARM64/fast-isel-ret.ll b/test/CodeGen/ARM64/fast-isel-ret.ll
new file mode 100644
index 0000000..d91fd28
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-ret.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+;; Test returns.
+define void @t0() nounwind ssp {
+entry:
+; CHECK: t0
+; CHECK: ret
+ ret void
+}
+
+define i32 @t1(i32 %a) nounwind ssp {
+entry:
+; CHECK: t1
+; CHECK: str w0, [sp, #12]
+; CHECK-NEXT: ldr w0, [sp, #12]
+; CHECK: ret
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ ret i32 %tmp
+}
+
+define i64 @t2(i64 %a) nounwind ssp {
+entry:
+; CHECK: t2
+; CHECK: str x0, [sp, #8]
+; CHECK-NEXT: ldr x0, [sp, #8]
+; CHECK: ret
+ %a.addr = alloca i64, align 8
+ store i64 %a, i64* %a.addr, align 8
+ %tmp = load i64* %a.addr, align 8
+ ret i64 %tmp
+}
+
+define signext i16 @ret_i16(i16 signext %a) nounwind {
+entry:
+; CHECK: @ret_i16
+; CHECK: sxth w0, w0
+ %a.addr = alloca i16, align 1
+ store i16 %a, i16* %a.addr, align 1
+ %0 = load i16* %a.addr, align 1
+ ret i16 %0
+}
+
+define signext i8 @ret_i8(i8 signext %a) nounwind {
+entry:
+; CHECK: @ret_i8
+; CHECK: sxtb w0, w0
+ %a.addr = alloca i8, align 1
+ store i8 %a, i8* %a.addr, align 1
+ %0 = load i8* %a.addr, align 1
+ ret i8 %0
+}
+
+define signext i1 @ret_i1(i1 signext %a) nounwind {
+entry:
+; CHECK: @ret_i1
+; CHECK: and w0, w0, #0x1
+ %a.addr = alloca i1, align 1
+ store i1 %a, i1* %a.addr, align 1
+ %0 = load i1* %a.addr, align 1
+ ret i1 %0
+}
diff --git a/test/CodeGen/ARM64/fast-isel-select.ll b/test/CodeGen/ARM64/fast-isel-select.ll
new file mode 100644
index 0000000..1cc207f
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-select.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i32 @t1(i32 %c) nounwind readnone {
+entry:
+; CHECK: @t1
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
+ %0 = icmp sgt i32 %c, 1
+ %1 = select i1 %0, i32 123, i32 357
+ ret i32 %1
+}
+
+define i64 @t2(i32 %c) nounwind readnone {
+entry:
+; CHECK: @t2
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
+ %0 = icmp sgt i32 %c, 1
+ %1 = select i1 %0, i64 123, i64 357
+ ret i64 %1
+}
+
+define i32 @t3(i1 %c, i32 %a, i32 %b) nounwind readnone {
+entry:
+; CHECK: @t3
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
+ %0 = select i1 %c, i32 %a, i32 %b
+ ret i32 %0
+}
+
+define i64 @t4(i1 %c, i64 %a, i64 %b) nounwind readnone {
+entry:
+; CHECK: @t4
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
+ %0 = select i1 %c, i64 %a, i64 %b
+ ret i64 %0
+}
+
+define float @t5(i1 %c, float %a, float %b) nounwind readnone {
+entry:
+; CHECK: @t5
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: fcsel s0, s0, s1, ne
+ %0 = select i1 %c, float %a, float %b
+ ret float %0
+}
+
+define double @t6(i1 %c, double %a, double %b) nounwind readnone {
+entry:
+; CHECK: @t6
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: fcsel d0, d0, d1, ne
+ %0 = select i1 %c, double %a, double %b
+ ret double %0
+}
diff --git a/test/CodeGen/ARM64/fast-isel.ll b/test/CodeGen/ARM64/fast-isel.ll
new file mode 100644
index 0000000..ba718d3
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel.ll
@@ -0,0 +1,95 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define void @t0(i32 %a) nounwind {
+entry:
+; CHECK: t0
+; CHECK: str {{w[0-9]+}}, [sp, #12]
+; CHECK-NEXT: ldr [[REGISTER:w[0-9]+]], [sp, #12]
+; CHECK-NEXT: str [[REGISTER]], [sp, #12]
+; CHECK: ret
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr
+ %tmp = load i32* %a.addr
+ store i32 %tmp, i32* %a.addr
+ ret void
+}
+
+define void @t1(i64 %a) nounwind {
+; CHECK: t1
+; CHECK: str {{x[0-9]+}}, [sp, #8]
+; CHECK-NEXT: ldr [[REGISTER:x[0-9]+]], [sp, #8]
+; CHECK-NEXT: str [[REGISTER]], [sp, #8]
+; CHECK: ret
+ %a.addr = alloca i64, align 4
+ store i64 %a, i64* %a.addr
+ %tmp = load i64* %a.addr
+ store i64 %tmp, i64* %a.addr
+ ret void
+}
+
+define zeroext i1 @i1(i1 %a) nounwind {
+entry:
+; CHECK: @i1
+; CHECK: and w0, w0, #0x1
+; CHECK: strb w0, [sp, #15]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: and w0, w0, #0x1
+; CHECK: and w0, w0, #0x1
+; CHECK: add sp, sp, #16
+; CHECK: ret
+ %a.addr = alloca i1, align 1
+ store i1 %a, i1* %a.addr, align 1
+ %0 = load i1* %a.addr, align 1
+ ret i1 %0
+}
+
+define i32 @t2(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK: ret
+ %0 = getelementptr i32 *%ptr, i32 -1
+ %1 = load i32* %0, align 4
+ ret i32 %1
+}
+
+define i32 @t3(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldur w0, [x0, #-256]
+; CHECK: ret
+ %0 = getelementptr i32 *%ptr, i32 -64
+ %1 = load i32* %0, align 4
+ ret i32 %1
+}
+
+define void @t4(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: movz w8, #0
+; CHECK: stur w8, [x0, #-4]
+; CHECK: ret
+ %0 = getelementptr i32 *%ptr, i32 -1
+ store i32 0, i32* %0, align 4
+ ret void
+}
+
+define void @t5(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: movz w8, #0
+; CHECK: stur w8, [x0, #-256]
+; CHECK: ret
+ %0 = getelementptr i32 *%ptr, i32 -64
+ store i32 0, i32* %0, align 4
+ ret void
+}
+
+define void @t6() nounwind {
+; CHECK: t6
+; CHECK: brk #1
+ tail call void @llvm.trap()
+ ret void
+}
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM64/fastcc-tailcall.ll b/test/CodeGen/ARM64/fastcc-tailcall.ll
new file mode 100644
index 0000000..8a744c5
--- /dev/null
+++ b/test/CodeGen/ARM64/fastcc-tailcall.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define void @caller(i32* nocapture %p, i32 %a, i32 %b) nounwind optsize ssp {
+; CHECK-NOT: stp
+; CHECK: b {{_callee|callee}}
+; CHECK-NOT: ldp
+; CHECK: ret
+ %1 = icmp eq i32 %b, 0
+ br i1 %1, label %3, label %2
+
+ tail call fastcc void @callee(i32* %p, i32 %a) optsize
+ br label %3
+
+ ret void
+}
+
+define internal fastcc void @callee(i32* nocapture %p, i32 %a) nounwind optsize noinline ssp {
+ store volatile i32 %a, i32* %p, align 4, !tbaa !0
+ ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll b/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll
new file mode 100644
index 0000000..af9fe05
--- /dev/null
+++ b/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll
@@ -0,0 +1,18 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
+; RUN: llc -mtriple=arm64-apple-darwin %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+ %ptr.addr = alloca i8*, align 8
+ %add = add i8 64, 64 ; 0x40 + 0x40
+ %0 = load i8** %ptr.addr, align 8
+
+ ; CHECK-LABEL: _gep_promotion:
+ ; CHECK: ldrb {{[a-z][0-9]+}}, {{\[[a-z][0-9]+\]}}
+ %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
+
diff --git a/test/CodeGen/ARM64/fcmp-opt.ll b/test/CodeGen/ARM64/fcmp-opt.ll
new file mode 100644
index 0000000..17412dd
--- /dev/null
+++ b/test/CodeGen/ARM64/fcmp-opt.ll
@@ -0,0 +1,173 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+; rdar://10263824
+
+define i1 @fcmp_float1(float %a) nounwind ssp {
+entry:
+; CHECK: @fcmp_float1
+; CHECK: fcmp s0, #0.0
+; CHECK: csinc w0, wzr, wzr, eq
+ %cmp = fcmp une float %a, 0.000000e+00
+ ret i1 %cmp
+}
+
+define i1 @fcmp_float2(float %a, float %b) nounwind ssp {
+entry:
+; CHECK: @fcmp_float2
+; CHECK: fcmp s0, s1
+; CHECK: csinc w0, wzr, wzr, eq
+ %cmp = fcmp une float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @fcmp_double1(double %a) nounwind ssp {
+entry:
+; CHECK: @fcmp_double1
+; CHECK: fcmp d0, #0.0
+; CHECK: csinc w0, wzr, wzr, eq
+ %cmp = fcmp une double %a, 0.000000e+00
+ ret i1 %cmp
+}
+
+define i1 @fcmp_double2(double %a, double %b) nounwind ssp {
+entry:
+; CHECK: @fcmp_double2
+; CHECK: fcmp d0, d1
+; CHECK: csinc w0, wzr, wzr, eq
+ %cmp = fcmp une double %a, %b
+ ret i1 %cmp
+}
+
+; Check each fcmp condition
+define float @fcmp_oeq(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_oeq
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ne
+ %cmp = fcmp oeq float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ogt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, le
+ %cmp = fcmp ogt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_oge(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, lt
+ %cmp = fcmp oge float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_olt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, pl
+ %cmp = fcmp olt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ole(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, hi
+ %cmp = fcmp ole float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ord(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, vs
+ %cmp = fcmp ord float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_uno(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, vc
+ %cmp = fcmp uno float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ugt(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ls
+ %cmp = fcmp ugt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_uge(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, mi
+ %cmp = fcmp uge float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ult(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ge
+ %cmp = fcmp ult float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ule(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, gt
+ %cmp = fcmp ule float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_une(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+ %cmp = fcmp une float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+; Possible opportunity for improvement. See comment in
+; ARM64TargetLowering::LowerSETCC()
+define float @fcmp_one(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_one
+; fcmp s0, s1
+; orr w0, wzr, #0x1
+; csel w1, w0, wzr, mi
+; csel w0, w0, wzr, gt
+ %cmp = fcmp one float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+; Possible opportunity for improvement. See comment in
+; ARM64TargetLowering::LowerSETCC()
+define float @fcmp_ueq(float %a, float %b) nounwind ssp {
+; CHECK: @fcmp_ueq
+; CHECK: fcmp s0, s1
+; orr w0, wzr, #0x1
+; CHECK: csel [[REG1:w[0-9]]], [[REG2:w[0-9]+]], wzr, eq
+; CHECK: csel {{w[0-9]+}}, [[REG2]], [[REG1]], vs
+ %cmp = fcmp ueq float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
diff --git a/test/CodeGen/ARM64/fcopysign.ll b/test/CodeGen/ARM64/fcopysign.ll
new file mode 100644
index 0000000..094ce7a
--- /dev/null
+++ b/test/CodeGen/ARM64/fcopysign.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; rdar://9332258
+
+define float @test1(float %x, float %y) nounwind {
+entry:
+; CHECK-LABEL: test1:
+; CHECK: movi.4s v2, #128, lsl #24
+; CHECK: bit.16b v0, v1, v2
+ %0 = tail call float @copysignf(float %x, float %y) nounwind readnone
+ ret float %0
+}
+
+define double @test2(double %x, double %y) nounwind {
+entry:
+; CHECK-LABEL: test2:
+; CHECK: movi.2d v2, #0
+; CHECK: fneg.2d v2, v2
+; CHECK: bit.16b v0, v1, v2
+ %0 = tail call double @copysign(double %x, double %y) nounwind readnone
+ ret double %0
+}
+
+; rdar://9545768
+define double @test3(double %a, float %b, float %c) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: fcvt d1, s1
+; CHECK: fneg.2d v2, v{{[0-9]+}}
+; CHECK: bit.16b v0, v1, v2
+ %tmp1 = fadd float %b, %c
+ %tmp2 = fpext float %tmp1 to double
+ %tmp = tail call double @copysign( double %a, double %tmp2 ) nounwind readnone
+ ret double %tmp
+}
+
+define float @test4() nounwind {
+entry:
+; CHECK-LABEL: test4:
+; CHECK: fcvt s0, d0
+; CHECK: movi.4s v[[CONST:[0-9]+]], #128, lsl #24
+; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]]
+ %0 = tail call double (...)* @bar() nounwind
+ %1 = fptrunc double %0 to float
+ %2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone
+ %3 = fadd float %1, %2
+ ret float %3
+}
+
+declare double @bar(...)
+declare double @copysign(double, double) nounwind readnone
+declare float @copysignf(float, float) nounwind readnone
diff --git a/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll b/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll
new file mode 100644
index 0000000..77981f2
--- /dev/null
+++ b/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+; DAGCombine to transform a conversion of an extract_vector_elt to an
+; extract_vector_elt of a conversion, which saves a round trip of copies
+; of the value to a GPR and back to an FPR.
+; rdar://11855286
+define double @foo0(<2 x i64> %a) nounwind {
+; CHECK: scvtf.2d [[REG:v[0-9]+]], v0, #9
+; CHECK-NEXT: ins.d v0[0], [[REG]][1]
+ %vecext = extractelement <2 x i64> %a, i32 1
+ %fcvt_n = tail call double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
+ ret double %fcvt_n
+}
+
+declare double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/fmadd.ll b/test/CodeGen/ARM64/fmadd.ll
new file mode 100644
index 0000000..d00aaef
--- /dev/null
+++ b/test/CodeGen/ARM64/fmadd.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+
+define float @fma32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fma32:
+; CHECK: fmadd
+ %0 = tail call float @llvm.fma.f32(float %a, float %b, float %c)
+ ret float %0
+}
+
+define float @fnma32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fnma32:
+; CHECK: fnmadd
+ %0 = tail call float @llvm.fma.f32(float %a, float %b, float %c)
+ %mul = fmul float %0, -1.000000e+00
+ ret float %mul
+}
+
+define float @fms32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fms32:
+; CHECK: fmsub
+ %mul = fmul float %b, -1.000000e+00
+ %0 = tail call float @llvm.fma.f32(float %a, float %mul, float %c)
+ ret float %0
+}
+
+define float @fms32_com(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fms32_com:
+; CHECK: fmsub
+ %mul = fmul float %b, -1.000000e+00
+ %0 = tail call float @llvm.fma.f32(float %mul, float %a, float %c)
+ ret float %0
+}
+
+define float @fnms32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fnms32:
+; CHECK: fnmsub
+ %mul = fmul float %c, -1.000000e+00
+ %0 = tail call float @llvm.fma.f32(float %a, float %b, float %mul)
+ ret float %0
+}
+
+define double @fma64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fma64:
+; CHECK: fmadd
+entry:
+ %0 = tail call double @llvm.fma.f64(double %a, double %b, double %c)
+ ret double %0
+}
+
+define double @fnma64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fnma64:
+; CHECK: fnmadd
+entry:
+ %0 = tail call double @llvm.fma.f64(double %a, double %b, double %c)
+ %mul = fmul double %0, -1.000000e+00
+ ret double %mul
+}
+
+define double @fms64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fms64:
+; CHECK: fmsub
+entry:
+ %mul = fmul double %b, -1.000000e+00
+ %0 = tail call double @llvm.fma.f64(double %a, double %mul, double %c)
+ ret double %0
+}
+
+define double @fms64_com(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fms64_com:
+; CHECK: fmsub
+entry:
+ %mul = fmul double %b, -1.000000e+00
+ %0 = tail call double @llvm.fma.f64(double %mul, double %a, double %c)
+ ret double %0
+}
+
+define double @fnms64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fnms64:
+; CHECK: fnmsub
+entry:
+ %mul = fmul double %c, -1.000000e+00
+ %0 = tail call double @llvm.fma.f64(double %a, double %b, double %mul)
+ ret double %0
+}
+
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
diff --git a/test/CodeGen/ARM64/fmax.ll b/test/CodeGen/ARM64/fmax.ll
new file mode 100644
index 0000000..53ecf86
--- /dev/null
+++ b/test/CodeGen/ARM64/fmax.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=arm64 -enable-no-nans-fp-math < %s | FileCheck %s
+
+define double @test_direct(float %in) #1 {
+entry:
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %longer = fpext float %in to double
+ %val = select i1 %cmp, double 0.000000e+00, double %longer
+ ret double %val
+
+; CHECK: fmax
+}
+
+define double @test_cross(float %in) #1 {
+entry:
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %longer = fpext float %in to double
+ %val = select i1 %cmp, double %longer, double 0.000000e+00
+ ret double %val
+
+; CHECK: fmin
+}
diff --git a/test/CodeGen/ARM64/fminv.ll b/test/CodeGen/ARM64/fminv.ll
new file mode 100644
index 0000000..ca706d8
--- /dev/null
+++ b/test/CodeGen/ARM64/fminv.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+define float @test_fminv_v2f32(<2 x float> %in) {
+; CHECK: test_fminv_v2f32:
+; CHECK: fminp s0, v0.2s
+ %min = call float @llvm.arm64.neon.fminv.f32.v2f32(<2 x float> %in)
+ ret float %min
+}
+
+define float @test_fminv_v4f32(<4 x float> %in) {
+; CHECK: test_fminv_v4f32:
+; CHECK: fminv s0, v0.4s
+ %min = call float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float> %in)
+ ret float %min
+}
+
+define double @test_fminv_v2f64(<2 x double> %in) {
+; CHECK: test_fminv_v2f64:
+; CHECK: fminp d0, v0.2d
+ %min = call double @llvm.arm64.neon.fminv.f64.v2f64(<2 x double> %in)
+ ret double %min
+}
+
+declare float @llvm.arm64.neon.fminv.f32.v2f32(<2 x float>)
+declare float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float>)
+declare double @llvm.arm64.neon.fminv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxv_v2f32:
+; CHECK: fmaxp s0, v0.2s
+ %max = call float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float> %in)
+ ret float %max
+}
+
+define float @test_fmaxv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxv_v4f32:
+; CHECK: fmaxv s0, v0.4s
+ %max = call float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float> %in)
+ ret float %max
+}
+
+define double @test_fmaxv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxv_v2f64:
+; CHECK: fmaxp d0, v0.2d
+ %max = call double @llvm.arm64.neon.fmaxv.f64.v2f64(<2 x double> %in)
+ ret double %max
+}
+
+declare float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float>)
+declare float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float>)
+declare double @llvm.arm64.neon.fmaxv.f64.v2f64(<2 x double>)
+
+define float @test_fminnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fminnmv_v2f32:
+; CHECK: fminnmp s0, v0.2s
+ %minnm = call float @llvm.arm64.neon.fminnmv.f32.v2f32(<2 x float> %in)
+ ret float %minnm
+}
+
+define float @test_fminnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fminnmv_v4f32:
+; CHECK: fminnmv s0, v0.4s
+ %minnm = call float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float> %in)
+ ret float %minnm
+}
+
+define double @test_fminnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fminnmv_v2f64:
+; CHECK: fminnmp d0, v0.2d
+ %minnm = call double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+ ret double %minnm
+}
+
+declare float @llvm.arm64.neon.fminnmv.f32.v2f32(<2 x float>)
+declare float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float>)
+declare double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxnmv_v2f32:
+; CHECK: fmaxnmp s0, v0.2s
+ %maxnm = call float @llvm.arm64.neon.fmaxnmv.f32.v2f32(<2 x float> %in)
+ ret float %maxnm
+}
+
+define float @test_fmaxnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxnmv_v4f32:
+; CHECK: fmaxnmv s0, v0.4s
+ %maxnm = call float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float> %in)
+ ret float %maxnm
+}
+
+define double @test_fmaxnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxnmv_v2f64:
+; CHECK: fmaxnmp d0, v0.2d
+ %maxnm = call double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+ ret double %maxnm
+}
+
+declare float @llvm.arm64.neon.fmaxnmv.f32.v2f32(<2 x float>)
+declare float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float>)
+declare double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/ARM64/fmuladd.ll b/test/CodeGen/ARM64/fmuladd.ll
new file mode 100644
index 0000000..174d830
--- /dev/null
+++ b/test/CodeGen/ARM64/fmuladd.ll
@@ -0,0 +1,88 @@
+; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define float @test_f32(float* %A, float* %B, float* %C) nounwind {
+;CHECK-LABEL: test_f32:
+;CHECK: fmadd
+;CHECK-NOT: fmadd
+ %tmp1 = load float* %A
+ %tmp2 = load float* %B
+ %tmp3 = load float* %C
+ %tmp4 = call float @llvm.fmuladd.f32(float %tmp1, float %tmp2, float %tmp3)
+ ret float %tmp4
+}
+
+define <2 x float> @test_v2f32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: test_v2f32:
+;CHECK: fmla.2s
+;CHECK-NOT: fmla.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
+ ret <2 x float> %tmp4
+}
+
+define <4 x float> @test_v4f32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: test_v4f32:
+;CHECK: fmla.4s
+;CHECK-NOT: fmla.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
+ ret <4 x float> %tmp4
+}
+
+define <8 x float> @test_v8f32(<8 x float>* %A, <8 x float>* %B, <8 x float>* %C) nounwind {
+;CHECK-LABEL: test_v8f32:
+;CHECK: fmla.4s
+;CHECK: fmla.4s
+;CHECK-NOT: fmla.4s
+ %tmp1 = load <8 x float>* %A
+ %tmp2 = load <8 x float>* %B
+ %tmp3 = load <8 x float>* %C
+ %tmp4 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %tmp1, <8 x float> %tmp2, <8 x float> %tmp3)
+ ret <8 x float> %tmp4
+}
+
+define double @test_f64(double* %A, double* %B, double* %C) nounwind {
+;CHECK-LABEL: test_f64:
+;CHECK: fmadd
+;CHECK-NOT: fmadd
+ %tmp1 = load double* %A
+ %tmp2 = load double* %B
+ %tmp3 = load double* %C
+ %tmp4 = call double @llvm.fmuladd.f64(double %tmp1, double %tmp2, double %tmp3)
+ ret double %tmp4
+}
+
+define <2 x double> @test_v2f64(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: test_v2f64:
+;CHECK: fmla.2d
+;CHECK-NOT: fmla.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = load <2 x double>* %C
+ %tmp4 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
+ ret <2 x double> %tmp4
+}
+
+define <4 x double> @test_v4f64(<4 x double>* %A, <4 x double>* %B, <4 x double>* %C) nounwind {
+;CHECK-LABEL: test_v4f64:
+;CHECK: fmla.2d
+;CHECK: fmla.2d
+;CHECK-NOT: fmla.2d
+ %tmp1 = load <4 x double>* %A
+ %tmp2 = load <4 x double>* %B
+ %tmp3 = load <4 x double>* %C
+ %tmp4 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %tmp1, <4 x double> %tmp2, <4 x double> %tmp3)
+ ret <4 x double> %tmp4
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) nounwind readnone
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+declare double @llvm.fmuladd.f64(double, double, double) nounwind readnone
+declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
diff --git a/test/CodeGen/ARM64/fold-address.ll b/test/CodeGen/ARM64/fold-address.ll
new file mode 100644
index 0000000..96cc3e9
--- /dev/null
+++ b/test/CodeGen/ARM64/fold-address.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -O2 -mtriple=arm64-apple-darwin | FileCheck %s
+
+%0 = type opaque
+%struct.CGRect = type { %struct.CGPoint, %struct.CGSize }
+%struct.CGPoint = type { double, double }
+%struct.CGSize = type { double, double }
+
+@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
+define hidden %struct.CGRect @nofold(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: nofold:
+; CHECK: add x[[REG:[0-9]+]], x0, x{{[0-9]+}}
+; CHECK: ldp d0, d1, [x[[REG]]]
+; CHECK: ldp d2, d3, [x[[REG]], #16]
+; CHECK: ret
+ %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+ %0 = bitcast %0* %self to i8*
+ %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+ %add.ptr10.0 = bitcast i8* %add.ptr to double*
+ %tmp11 = load double* %add.ptr10.0, align 8
+ %add.ptr.sum = add i64 %ivar, 8
+ %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+ %1 = bitcast i8* %add.ptr10.1 to double*
+ %tmp12 = load double* %1, align 8
+ %add.ptr.sum17 = add i64 %ivar, 16
+ %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17
+ %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+ %tmp = load double* %add.ptr4.1.0, align 8
+ %add.ptr4.1.sum = add i64 %ivar, 24
+ %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum
+ %2 = bitcast i8* %add.ptr4.1.1 to double*
+ %tmp5 = load double* %2, align 8
+ %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+ %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+ %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+ %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+ %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+ %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+ ret %struct.CGRect %insert3
+}
+
+define hidden %struct.CGRect @fold(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: fold:
+; CHECK: ldr d0, [x0, x{{[0-9]+}}]
+; CHECK-NOT: add x0, x0, x1
+; CHECK: ret
+ %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+ %0 = bitcast %0* %self to i8*
+ %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+ %add.ptr10.0 = bitcast i8* %add.ptr to double*
+ %tmp11 = load double* %add.ptr10.0, align 8
+ %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %ivar
+ %1 = bitcast i8* %add.ptr10.1 to double*
+ %tmp12 = load double* %1, align 8
+ %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %ivar
+ %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+ %tmp = load double* %add.ptr4.1.0, align 8
+ %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %ivar
+ %2 = bitcast i8* %add.ptr4.1.1 to double*
+ %tmp5 = load double* %2, align 8
+ %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+ %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+ %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+ %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+ %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+ %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+ ret %struct.CGRect %insert3
+}
+
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!4 = metadata !{}
diff --git a/test/CodeGen/ARM64/fold-lsl.ll b/test/CodeGen/ARM64/fold-lsl.ll
new file mode 100644
index 0000000..a856c96
--- /dev/null
+++ b/test/CodeGen/ARM64/fold-lsl.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+;
+; <rdar://problem/14486451>
+
+%struct.a = type [256 x i16]
+%struct.b = type [256 x i32]
+%struct.c = type [256 x i64]
+
+define i16 @load_halfword(%struct.a* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_halfword:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ldrh w0, [x0, [[REG]], lsl #1]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.a* %ctx, i64 0, i64 %idxprom83
+ %result = load i16* %arrayidx86, align 2
+ ret i16 %result
+}
+
+define i32 @load_word(%struct.b* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_word:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ldr w0, [x0, [[REG]], lsl #2]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.b* %ctx, i64 0, i64 %idxprom83
+ %result = load i32* %arrayidx86, align 4
+ ret i32 %result
+}
+
+define i64 @load_doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_doubleword:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ldr x0, [x0, [[REG]], lsl #3]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.c* %ctx, i64 0, i64 %idxprom83
+ %result = load i64* %arrayidx86, align 8
+ ret i64 %result
+}
+
+define void @store_halfword(%struct.a* %ctx, i32 %xor72, i16 %val) nounwind {
+; CHECK-LABEL: store_halfword:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: strh w2, [x0, [[REG]], lsl #1]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.a* %ctx, i64 0, i64 %idxprom83
+ store i16 %val, i16* %arrayidx86, align 8
+ ret void
+}
+
+define void @store_word(%struct.b* %ctx, i32 %xor72, i32 %val) nounwind {
+; CHECK-LABEL: store_word:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: str w2, [x0, [[REG]], lsl #2]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.b* %ctx, i64 0, i64 %idxprom83
+ store i32 %val, i32* %arrayidx86, align 8
+ ret void
+}
+
+define void @store_doubleword(%struct.c* %ctx, i32 %xor72, i64 %val) nounwind {
+; CHECK-LABEL: store_doubleword:
+; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: str x2, [x0, [[REG]], lsl #3]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.c* %ctx, i64 0, i64 %idxprom83
+ store i64 %val, i64* %arrayidx86, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM64/fp-imm.ll b/test/CodeGen/ARM64/fp-imm.ll
new file mode 100644
index 0000000..6e271e0
--- /dev/null
+++ b/test/CodeGen/ARM64/fp-imm.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; CHECK: literal8
+; CHECK: .quad 4614256656552045848
+define double @foo() {
+; CHECK: _foo:
+; CHECK: adrp x[[REG:[0-9]+]], lCPI0_0@PAGE
+; CHECK: ldr d0, [x[[REG]], lCPI0_0@PAGEOFF]
+; CHECK-NEXT: ret
+ ret double 0x400921FB54442D18
+}
+
+; CHECK: literal4
+; CHECK: .long 1078530011
+define float @bar() {
+; CHECK: _bar:
+; CHECK: adrp x[[REG:[0-9]+]], lCPI1_0@PAGE
+; CHECK: ldr s0, [x[[REG]], lCPI1_0@PAGEOFF]
+; CHECK-NEXT: ret
+ ret float 0x400921FB60000000
+}
+
+; CHECK: literal16
+; CHECK: .quad 0
+; CHECK: .quad 0
+define fp128 @baz() {
+; CHECK: _baz:
+; CHECK: adrp x[[REG:[0-9]+]], lCPI2_0@PAGE
+; CHECK: ldr q0, [x[[REG]], lCPI2_0@PAGEOFF]
+; CHECK-NEXT: ret
+ ret fp128 0xL00000000000000000000000000000000
+}
diff --git a/test/CodeGen/ARM64/fp.ll b/test/CodeGen/ARM64/fp.ll
new file mode 100644
index 0000000..08b1b67
--- /dev/null
+++ b/test/CodeGen/ARM64/fp.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define float @t1(i1 %a, float %b, float %c) nounwind {
+; CHECK: t1
+; CHECK: fcsel s0, s0, s1, ne
+ %sel = select i1 %a, float %b, float %c
+ ret float %sel
+}
diff --git a/test/CodeGen/ARM64/fp128-folding.ll b/test/CodeGen/ARM64/fp128-folding.ll
new file mode 100644
index 0000000..6a7d203
--- /dev/null
+++ b/test/CodeGen/ARM64/fp128-folding.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=arm64 -verify-machineinstrs < %s | FileCheck %s
+declare void @bar(i8*, i8*, i32*)
+
+; SelectionDAG used to try to fold some fp128 operations using the ppc_fp128 type,
+; which is not supported.
+
+define fp128 @test_folding() {
+; CHECK-LABEL: test_folding:
+ %l = alloca i32
+ store i32 42, i32* %l
+ %val = load i32* %l
+ %fpval = sitofp i32 %val to fp128
+ ; If the value is loaded from a constant pool into an fp128, it's been folded
+ ; successfully.
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}},
+ ret fp128 %fpval
+}
diff --git a/test/CodeGen/ARM64/fp128.ll b/test/CodeGen/ARM64/fp128.ll
new file mode 100644
index 0000000..21eb893
--- /dev/null
+++ b/test/CodeGen/ARM64/fp128.ll
@@ -0,0 +1,274 @@
+; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+@lhs = global fp128 zeroinitializer, align 16
+@rhs = global fp128 zeroinitializer, align 16
+
+define fp128 @test_add() {
+; CHECK-LABEL: test_add:
+
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+ %val = fadd fp128 %lhs, %rhs
+; CHECK: bl __addtf3
+ ret fp128 %val
+}
+
+define fp128 @test_sub() {
+; CHECK-LABEL: test_sub:
+
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+ %val = fsub fp128 %lhs, %rhs
+; CHECK: bl __subtf3
+ ret fp128 %val
+}
+
+define fp128 @test_mul() {
+; CHECK-LABEL: test_mul:
+
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+ %val = fmul fp128 %lhs, %rhs
+; CHECK: bl __multf3
+ ret fp128 %val
+}
+
+define fp128 @test_div() {
+; CHECK-LABEL: test_div:
+
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+ %val = fdiv fp128 %lhs, %rhs
+; CHECK: bl __divtf3
+ ret fp128 %val
+}
+
+@var32 = global i32 0
+@var64 = global i64 0
+
+define void @test_fptosi() {
+; CHECK-LABEL: test_fptosi:
+ %val = load fp128* @lhs, align 16
+
+ %val32 = fptosi fp128 %val to i32
+ store i32 %val32, i32* @var32
+; CHECK: bl __fixtfsi
+
+ %val64 = fptosi fp128 %val to i64
+ store i64 %val64, i64* @var64
+; CHECK: bl __fixtfdi
+
+ ret void
+}
+
+define void @test_fptoui() {
+; CHECK-LABEL: test_fptoui:
+ %val = load fp128* @lhs, align 16
+
+ %val32 = fptoui fp128 %val to i32
+ store i32 %val32, i32* @var32
+; CHECK: bl __fixunstfsi
+
+ %val64 = fptoui fp128 %val to i64
+ store i64 %val64, i64* @var64
+; CHECK: bl __fixunstfdi
+
+ ret void
+}
+
+define void @test_sitofp() {
+; CHECK-LABEL: test_sitofp:
+
+ %src32 = load i32* @var32
+ %val32 = sitofp i32 %src32 to fp128
+ store volatile fp128 %val32, fp128* @lhs
+; CHECK: bl __floatsitf
+
+ %src64 = load i64* @var64
+ %val64 = sitofp i64 %src64 to fp128
+ store volatile fp128 %val64, fp128* @lhs
+; CHECK: bl __floatditf
+
+ ret void
+}
+
+define void @test_uitofp() {
+; CHECK-LABEL: test_uitofp:
+
+ %src32 = load i32* @var32
+ %val32 = uitofp i32 %src32 to fp128
+ store volatile fp128 %val32, fp128* @lhs
+; CHECK: bl __floatunsitf
+
+ %src64 = load i64* @var64
+ %val64 = uitofp i64 %src64 to fp128
+ store volatile fp128 %val64, fp128* @lhs
+; CHECK: bl __floatunditf
+
+ ret void
+}
+
+define i1 @test_setcc1() {
+; CHECK-LABEL: test_setcc1:
+
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+; Technically, everything after the call to __letf2 is redundant, but we'll let
+; LLVM have its fun for now.
+ %val = fcmp ole fp128 %lhs, %rhs
+; CHECK: bl __letf2
+; CHECK: cmp w0, #0
+; CHECK: csinc w0, wzr, wzr, gt
+
+ ret i1 %val
+; CHECK: ret
+}
+
+define i1 @test_setcc2() {
+; CHECK-LABEL: test_setcc2:
+
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+ %val = fcmp ugt fp128 %lhs, %rhs
+; CHECK: bl __gttf2
+; CHECK: cmp w0, #0
+; CHECK: csinc [[GT:w[0-9]+]], wzr, wzr, le
+
+; CHECK: bl __unordtf2
+; CHECK: cmp w0, #0
+; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+; CHECK: orr w0, [[UNORDERED]], [[GT]]
+
+ ret i1 %val
+; CHECK: ret
+}
+
+define i32 @test_br_cc() {
+; CHECK-LABEL: test_br_cc:
+
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
+
+ ; olt == !uge, which is what LLVM unfortunately "optimizes" this to.
+ %cond = fcmp olt fp128 %lhs, %rhs
+; CHECK: bl __getf2
+; CHECK: cmp w0, #0
+; CHECK: csinc [[OGE:w[0-9]+]], wzr, wzr, lt
+
+; CHECK: bl __unordtf2
+; CHECK: cmp w0, #0
+; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+
+; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]]
+; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]]
+ br i1 %cond, label %iftrue, label %iffalse
+
+iftrue:
+ ret i32 42
+; CHECK-NEXT: BB#
+; CHECK-NEXT: movz w0, #42
+; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]]
+
+iffalse:
+ ret i32 29
+; CHECK: [[RET29]]:
+; CHECK-NEXT: movz w0, #29
+; CHECK-NEXT: [[REALRET]]:
+; CHECK: ret
+}
+
+define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) {
+; CHECK-LABEL: test_select:
+
+ %val = select i1 %cond, fp128 %lhs, fp128 %rhs
+ store fp128 %val, fp128* @lhs, align 16
+; CHECK: and [[BIT:w[0-9]+]], w0, #0x1
+; CHECK: cmp [[BIT]], #0
+; CHECK-NEXT: b.eq [[IFFALSE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: BB#
+; CHECK-NEXT: orr v[[VAL:[0-9]+]].16b, v0.16b, v0.16b
+; CHECK-NEXT: [[IFFALSE]]:
+; CHECK: str q[[VAL]], [{{x[0-9]+}}, :lo12:lhs]
+ ret void
+; CHECK: ret
+}
+
+@varfloat = global float 0.0, align 4
+@vardouble = global double 0.0, align 8
+
+define void @test_round() {
+; CHECK-LABEL: test_round:
+
+ %val = load fp128* @lhs, align 16
+
+ %float = fptrunc fp128 %val to float
+ store float %float, float* @varfloat, align 4
+; CHECK: bl __trunctfsf2
+; CHECK: str s0, [{{x[0-9]+}}, :lo12:varfloat]
+
+ %double = fptrunc fp128 %val to double
+ store double %double, double* @vardouble, align 8
+; CHECK: bl __trunctfdf2
+; CHECK: str d0, [{{x[0-9]+}}, :lo12:vardouble]
+
+ ret void
+}
+
+define void @test_extend() {
+; CHECK-LABEL: test_extend:
+
+ %val = load fp128* @lhs, align 16
+
+ %float = load float* @varfloat
+ %fromfloat = fpext float %float to fp128
+ store volatile fp128 %fromfloat, fp128* @lhs, align 16
+; CHECK: bl __extendsftf2
+; CHECK: str q0, [{{x[0-9]+}}, :lo12:lhs]
+
+ %double = load double* @vardouble
+ %fromdouble = fpext double %double to fp128
+ store volatile fp128 %fromdouble, fp128* @lhs, align 16
+; CHECK: bl __extenddftf2
+; CHECK: str q0, [{{x[0-9]+}}, :lo12:lhs]
+
+ ret void
+; CHECK: ret
+}
+
+define fp128 @test_neg(fp128 %in) {
+; CHECK: [[MINUS0:.LCPI[0-9]+_0]]:
+; Make sure the weird hex constant below *is* -0.0
+; CHECK-NEXT: fp128 -0
+
+; CHECK-LABEL: test_neg:
+
+ ; Could in principle be optimized to fneg, which we can't select; this makes
+ ; sure that doesn't happen.
+ %ret = fsub fp128 0xL00000000000000008000000000000000, %in
+; CHECK: orr v1.16b, v0.16b, v0.16b
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:[[MINUS0]]]
+; CHECK: bl __subtf3
+
+ ret fp128 %ret
+; CHECK: ret
+}
diff --git a/test/CodeGen/ARM64/frame-index.ll b/test/CodeGen/ARM64/frame-index.ll
new file mode 100644
index 0000000..4a91ff3
--- /dev/null
+++ b/test/CodeGen/ARM64/frame-index.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+; rdar://11935841
+
+define void @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: add x{{[0-9]+}}, sp
+; CHECK: stp x28, x27, [sp, #-16]!
+ %v = alloca [288 x i32], align 4
+ unreachable
+}
diff --git a/test/CodeGen/ARM64/frameaddr.ll b/test/CodeGen/ARM64/frameaddr.ll
new file mode 100644
index 0000000..d0635ad
--- /dev/null
+++ b/test/CodeGen/ARM64/frameaddr.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i8* @t() nounwind {
+entry:
+; CHECK-LABEL: t:
+; CHECK: stp fp, lr, [sp, #-16]!
+; CHECK: mov fp, sp
+; CHECK: mov x0, fp
+; CHECK: ldp fp, lr, [sp], #16
+; CHECK: ret
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/global-address.ll b/test/CodeGen/ARM64/global-address.ll
new file mode 100644
index 0000000..005f414
--- /dev/null
+++ b/test/CodeGen/ARM64/global-address.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; rdar://9618644
+
+@G = external global i32
+
+define i32 @test(i32 %off) nounwind {
+; CHECK-LABEL: test:
+; CHECK: adrp x[[REG:[0-9]+]], _G@GOTPAGE
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _G@GOTPAGEOFF]
+; CHECK: add w0, w[[REG2]], w0
+ %tmp = ptrtoint i32* @G to i32
+ %tmp1 = add i32 %tmp, %off
+ ret i32 %tmp1
+}
diff --git a/test/CodeGen/ARM64/hello.ll b/test/CodeGen/ARM64/hello.ll
new file mode 100644
index 0000000..f870fff
--- /dev/null
+++ b/test/CodeGen/ARM64/hello.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX
+
+; CHECK-LABEL: main:
+; CHECK: stp fp, lr, [sp, #-16]!
+; CHECK-NEXT: mov fp, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: stur wzr, [fp, #-4]
+; CHECK: adrp x0, L_.str@PAGE
+; CHECK: add x0, x0, L_.str@PAGEOFF
+; CHECK-NEXT: bl _puts
+; CHECK-NEXT: mov sp, fp
+; CHECK-NEXT: ldp fp, lr, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-LINUX-LABEL: main:
+; CHECK-LINUX: stp fp, lr, [sp, #-16]!
+; CHECK-LINUX-NEXT: mov fp, sp
+; CHECK-LINUX-NEXT: sub sp, sp, #16
+; CHECK-LINUX-NEXT: stur wzr, [fp, #-4]
+; CHECK-LINUX: adrp x0, .L.str
+; CHECK-LINUX: add x0, x0, :lo12:.L.str
+; CHECK-LINUX-NEXT: bl puts
+; CHECK-LINUX-NEXT: mov sp, fp
+; CHECK-LINUX-NEXT: ldp fp, lr, [sp], #16
+; CHECK-LINUX-NEXT: ret
+
+@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
+
+define i32 @main() nounwind ssp {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %call = call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0))
+ ret i32 %call
+}
+
+declare i32 @puts(i8*)
diff --git a/test/CodeGen/ARM64/i16-subreg-extract.ll b/test/CodeGen/ARM64/i16-subreg-extract.ll
new file mode 100644
index 0000000..fc2e8b5
--- /dev/null
+++ b/test/CodeGen/ARM64/i16-subreg-extract.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @foo(<4 x i16>* %__a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: umov.h w{{[0-9]+}}, v{{[0-9]+}}[0]
+ %tmp18 = load <4 x i16>* %__a, align 8
+ %vget_lane = extractelement <4 x i16> %tmp18, i32 0
+ %conv = zext i16 %vget_lane to i32
+ %mul = mul nsw i32 3, %conv
+ ret i32 %mul
+}
+
diff --git a/test/CodeGen/ARM64/icmp-opt.ll b/test/CodeGen/ARM64/icmp-opt.ll
new file mode 100644
index 0000000..f88399b
--- /dev/null
+++ b/test/CodeGen/ARM64/icmp-opt.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; Optimize (x > -1) to (x >= 0) etc.
+; Optimize (cmp (add / sub), 0): eliminate the subs used to update the flags
+; for comparison only.
+; rdar://10233472
+
+define i32 @t1(i64 %a) nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: movn
+; CHECK: cmp x0, #0
+; CHECK: csinc w0, wzr, wzr, lt
+ %cmp = icmp sgt i64 %a, -1
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
diff --git a/test/CodeGen/ARM64/illegal-float-ops.ll b/test/CodeGen/ARM64/illegal-float-ops.ll
new file mode 100644
index 0000000..9a35fe5
--- /dev/null
+++ b/test/CodeGen/ARM64/illegal-float-ops.ll
@@ -0,0 +1,295 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+@varfloat = global float 0.0
+@vardouble = global double 0.0
+@varfp128 = global fp128 zeroinitializer
+
+declare float @llvm.cos.f32(float)
+declare double @llvm.cos.f64(double)
+declare fp128 @llvm.cos.f128(fp128)
+
+define void @test_cos(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_cos:
+
+ %cosfloat = call float @llvm.cos.f32(float %float)
+ store float %cosfloat, float* @varfloat
+; CHECK: bl cosf
+
+ %cosdouble = call double @llvm.cos.f64(double %double)
+ store double %cosdouble, double* @vardouble
+; CHECK: bl cos
+
+ %cosfp128 = call fp128 @llvm.cos.f128(fp128 %fp128)
+ store fp128 %cosfp128, fp128* @varfp128
+; CHECK: bl cosl
+
+ ret void
+}
+
+declare float @llvm.exp.f32(float)
+declare double @llvm.exp.f64(double)
+declare fp128 @llvm.exp.f128(fp128)
+
+define void @test_exp(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_exp:
+
+ %expfloat = call float @llvm.exp.f32(float %float)
+ store float %expfloat, float* @varfloat
+; CHECK: bl expf
+
+ %expdouble = call double @llvm.exp.f64(double %double)
+ store double %expdouble, double* @vardouble
+; CHECK: bl exp
+
+ %expfp128 = call fp128 @llvm.exp.f128(fp128 %fp128)
+ store fp128 %expfp128, fp128* @varfp128
+; CHECK: bl expl
+
+ ret void
+}
+
+declare float @llvm.exp2.f32(float)
+declare double @llvm.exp2.f64(double)
+declare fp128 @llvm.exp2.f128(fp128)
+
+define void @test_exp2(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_exp2:
+
+ %exp2float = call float @llvm.exp2.f32(float %float)
+ store float %exp2float, float* @varfloat
+; CHECK: bl exp2f
+
+ %exp2double = call double @llvm.exp2.f64(double %double)
+ store double %exp2double, double* @vardouble
+; CHECK: bl exp2
+
+ %exp2fp128 = call fp128 @llvm.exp2.f128(fp128 %fp128)
+ store fp128 %exp2fp128, fp128* @varfp128
+; CHECK: bl exp2l
+ ret void
+
+}
+
+declare float @llvm.log.f32(float)
+declare double @llvm.log.f64(double)
+declare fp128 @llvm.log.f128(fp128)
+
+define void @test_log(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log:
+
+ %logfloat = call float @llvm.log.f32(float %float)
+ store float %logfloat, float* @varfloat
+; CHECK: bl logf
+
+ %logdouble = call double @llvm.log.f64(double %double)
+ store double %logdouble, double* @vardouble
+; CHECK: bl log
+
+ %logfp128 = call fp128 @llvm.log.f128(fp128 %fp128)
+ store fp128 %logfp128, fp128* @varfp128
+; CHECK: bl logl
+
+ ret void
+}
+
+declare float @llvm.log2.f32(float)
+declare double @llvm.log2.f64(double)
+declare fp128 @llvm.log2.f128(fp128)
+
+define void @test_log2(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log2:
+
+ %log2float = call float @llvm.log2.f32(float %float)
+ store float %log2float, float* @varfloat
+; CHECK: bl log2f
+
+ %log2double = call double @llvm.log2.f64(double %double)
+ store double %log2double, double* @vardouble
+; CHECK: bl log2
+
+ %log2fp128 = call fp128 @llvm.log2.f128(fp128 %fp128)
+ store fp128 %log2fp128, fp128* @varfp128
+; CHECK: bl log2l
+ ret void
+
+}
+
+declare float @llvm.log10.f32(float)
+declare double @llvm.log10.f64(double)
+declare fp128 @llvm.log10.f128(fp128)
+
+define void @test_log10(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log10:
+
+ %log10float = call float @llvm.log10.f32(float %float)
+ store float %log10float, float* @varfloat
+; CHECK: bl log10f
+
+ %log10double = call double @llvm.log10.f64(double %double)
+ store double %log10double, double* @vardouble
+; CHECK: bl log10
+
+ %log10fp128 = call fp128 @llvm.log10.f128(fp128 %fp128)
+ store fp128 %log10fp128, fp128* @varfp128
+; CHECK: bl log10l
+
+ ret void
+}
+
+declare float @llvm.sin.f32(float)
+declare double @llvm.sin.f64(double)
+declare fp128 @llvm.sin.f128(fp128)
+
+define void @test_sin(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_sin:
+
+ %sinfloat = call float @llvm.sin.f32(float %float)
+ store float %sinfloat, float* @varfloat
+; CHECK: bl sinf
+
+ %sindouble = call double @llvm.sin.f64(double %double)
+ store double %sindouble, double* @vardouble
+; CHECK: bl sin
+
+ %sinfp128 = call fp128 @llvm.sin.f128(fp128 %fp128)
+ store fp128 %sinfp128, fp128* @varfp128
+; CHECK: bl sinl
+ ret void
+
+}
+
+declare float @llvm.pow.f32(float, float)
+declare double @llvm.pow.f64(double, double)
+declare fp128 @llvm.pow.f128(fp128, fp128)
+
+define void @test_pow(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_pow:
+
+ %powfloat = call float @llvm.pow.f32(float %float, float %float)
+ store float %powfloat, float* @varfloat
+; CHECK: bl powf
+
+ %powdouble = call double @llvm.pow.f64(double %double, double %double)
+ store double %powdouble, double* @vardouble
+; CHECK: bl pow
+
+ %powfp128 = call fp128 @llvm.pow.f128(fp128 %fp128, fp128 %fp128)
+ store fp128 %powfp128, fp128* @varfp128
+; CHECK: bl powl
+
+ ret void
+}
+
+declare float @llvm.powi.f32(float, i32)
+declare double @llvm.powi.f64(double, i32)
+declare fp128 @llvm.powi.f128(fp128, i32)
+
+define void @test_powi(float %float, double %double, i32 %exponent, fp128 %fp128) {
+; CHECK-LABEL: test_powi:
+
+ %powifloat = call float @llvm.powi.f32(float %float, i32 %exponent)
+ store float %powifloat, float* @varfloat
+; CHECK: bl __powisf2
+
+ %powidouble = call double @llvm.powi.f64(double %double, i32 %exponent)
+ store double %powidouble, double* @vardouble
+; CHECK: bl __powidf2
+
+ %powifp128 = call fp128 @llvm.powi.f128(fp128 %fp128, i32 %exponent)
+ store fp128 %powifp128, fp128* @varfp128
+; CHECK: bl __powitf2
+ ret void
+
+}
+
+define void @test_frem(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_frem:
+
+ %fremfloat = frem float %float, %float
+ store float %fremfloat, float* @varfloat
+; CHECK: bl fmodf
+
+ %fremdouble = frem double %double, %double
+ store double %fremdouble, double* @vardouble
+; CHECK: bl fmod
+
+ %fremfp128 = frem fp128 %fp128, %fp128
+ store fp128 %fremfp128, fp128* @varfp128
+; CHECK: bl fmodl
+
+ ret void
+}
+
+declare fp128 @llvm.fma.f128(fp128, fp128, fp128)
+
+define void @test_fma(fp128 %fp128) {
+; CHECK-LABEL: test_fma:
+
+ %fmafp128 = call fp128 @llvm.fma.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+ store fp128 %fmafp128, fp128* @varfp128
+; CHECK: bl fmal
+
+ ret void
+}
+
+declare fp128 @llvm.fmuladd.f128(fp128, fp128, fp128)
+
+define void @test_fmuladd(fp128 %fp128) {
+; CHECK-LABEL: test_fmuladd:
+
+ %fmuladdfp128 = call fp128 @llvm.fmuladd.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+ store fp128 %fmuladdfp128, fp128* @varfp128
+; CHECK-NOT: bl fmal
+; CHECK: bl __multf3
+; CHECK: bl __addtf3
+
+ ret void
+}
+
+define i32 @test_fptosi32(fp128 %a) {
+; CHECK-LABEL: test_fptosi32:
+; CHECK: bl __fixtfsi
+ %conv.i = fptosi fp128 %a to i32
+ %b = add nsw i32 %conv.i, 48
+ ret i32 %b
+}
+
+define i64 @test_fptosi64(fp128 %a) {
+; CHECK-LABEL: test_fptosi64:
+; CHECK: bl __fixtfdi
+ %conv.i = fptosi fp128 %a to i64
+ %b = add nsw i64 %conv.i, 48
+ ret i64 %b
+}
+
+define i128 @test_fptosi128(fp128 %a) {
+; CHECK-LABEL: test_fptosi128:
+; CHECK: bl __fixtfti
+ %conv.i = fptosi fp128 %a to i128
+ %b = add nsw i128 %conv.i, 48
+ ret i128 %b
+}
+
+define i32 @test_fptoui32(fp128 %a) {
+; CHECK-LABEL: test_fptoui32:
+; CHECK: bl __fixunstfsi
+ %conv.i = fptoui fp128 %a to i32
+ %b = add nsw i32 %conv.i, 48
+ ret i32 %b
+}
+
+define i64 @test_fptoui64(fp128 %a) {
+; CHECK-LABEL: test_fptoui64:
+; CHECK: bl __fixunstfdi
+ %conv.i = fptoui fp128 %a to i64
+ %b = add nsw i64 %conv.i, 48
+ ret i64 %b
+}
+
+define i128 @test_fptoui128(fp128 %a) {
+; CHECK-LABEL: test_fptoui128:
+; CHECK: bl __fixunstfti
+ %conv.i = fptoui fp128 %a to i128
+ %b = add nsw i128 %conv.i, 48
+ ret i128 %b
+}
diff --git a/test/CodeGen/ARM64/indexed-memory.ll b/test/CodeGen/ARM64/indexed-memory.ll
new file mode 100644
index 0000000..e390ed7
--- /dev/null
+++ b/test/CodeGen/ARM64/indexed-memory.ll
@@ -0,0 +1,351 @@
+; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s
+
+define void @store64(i64** nocapture %out, i64 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store64:
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #8
+; CHECK: ret
+ %tmp = load i64** %out, align 8
+ %incdec.ptr = getelementptr inbounds i64* %tmp, i64 1
+ store i64 %spacing, i64* %tmp, align 4
+ store i64* %incdec.ptr, i64** %out, align 8
+ ret void
+}
+
+define void @store32(i32** nocapture %out, i32 %index, i32 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store32:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #4
+; CHECK: ret
+ %tmp = load i32** %out, align 8
+ %incdec.ptr = getelementptr inbounds i32* %tmp, i64 1
+ store i32 %spacing, i32* %tmp, align 4
+ store i32* %incdec.ptr, i32** %out, align 8
+ ret void
+}
+
+define void @store16(i16** nocapture %out, i16 %index, i16 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store16:
+; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}], #2
+; CHECK: ret
+ %tmp = load i16** %out, align 8
+ %incdec.ptr = getelementptr inbounds i16* %tmp, i64 1
+ store i16 %spacing, i16* %tmp, align 4
+ store i16* %incdec.ptr, i16** %out, align 8
+ ret void
+}
+
+define void @store8(i8** nocapture %out, i8 %index, i8 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store8:
+; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}], #1
+; CHECK: ret
+ %tmp = load i8** %out, align 8
+ %incdec.ptr = getelementptr inbounds i8* %tmp, i64 1
+ store i8 %spacing, i8* %tmp, align 4
+ store i8* %incdec.ptr, i8** %out, align 8
+ ret void
+}
+
+define void @truncst64to32(i32** nocapture %out, i32 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to32:
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #4
+; CHECK: ret
+ %tmp = load i32** %out, align 8
+ %incdec.ptr = getelementptr inbounds i32* %tmp, i64 1
+ %trunc = trunc i64 %spacing to i32
+ store i32 %trunc, i32* %tmp, align 4
+ store i32* %incdec.ptr, i32** %out, align 8
+ ret void
+}
+
+define void @truncst64to16(i16** nocapture %out, i16 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to16:
+; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}], #2
+; CHECK: ret
+ %tmp = load i16** %out, align 8
+ %incdec.ptr = getelementptr inbounds i16* %tmp, i64 1
+ %trunc = trunc i64 %spacing to i16
+ store i16 %trunc, i16* %tmp, align 4
+ store i16* %incdec.ptr, i16** %out, align 8
+ ret void
+}
+
+define void @truncst64to8(i8** nocapture %out, i8 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to8:
+; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}], #1
+; CHECK: ret
+ %tmp = load i8** %out, align 8
+ %incdec.ptr = getelementptr inbounds i8* %tmp, i64 1
+ %trunc = trunc i64 %spacing to i8
+ store i8 %trunc, i8* %tmp, align 4
+ store i8* %incdec.ptr, i8** %out, align 8
+ ret void
+}
+
+
+define void @storef32(float** nocapture %out, float %index, float %spacing) nounwind noinline ssp {
+; CHECK-LABEL: storef32:
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #4
+; CHECK: ret
+ %tmp = load float** %out, align 8
+ %incdec.ptr = getelementptr inbounds float* %tmp, i64 1
+ store float %spacing, float* %tmp, align 4
+ store float* %incdec.ptr, float** %out, align 8
+ ret void
+}
+
+define void @storef64(double** nocapture %out, double %index, double %spacing) nounwind noinline ssp {
+; CHECK-LABEL: storef64:
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #8
+; CHECK: ret
+ %tmp = load double** %out, align 8
+ %incdec.ptr = getelementptr inbounds double* %tmp, i64 1
+ store double %spacing, double* %tmp, align 4
+ store double* %incdec.ptr, double** %out, align 8
+ ret void
+}
+
+define double * @pref64(double** nocapture %out, double %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pref64:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str d0, [x0, #32]!
+; CHECK-NEXT: ret
+ %tmp = load double** %out, align 8
+ %ptr = getelementptr inbounds double* %tmp, i64 4
+ store double %spacing, double* %ptr, align 4
+ ret double *%ptr
+}
+
+define float * @pref32(float** nocapture %out, float %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pref32:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str s0, [x0, #12]!
+; CHECK-NEXT: ret
+ %tmp = load float** %out, align 8
+ %ptr = getelementptr inbounds float* %tmp, i64 3
+ store float %spacing, float* %ptr, align 4
+ ret float *%ptr
+}
+
+define i64 * @pre64(i64** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre64:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str x1, [x0, #16]!
+; CHECK-NEXT: ret
+ %tmp = load i64** %out, align 8
+ %ptr = getelementptr inbounds i64* %tmp, i64 2
+ store i64 %spacing, i64* %ptr, align 4
+ ret i64 *%ptr
+}
+
+define i32 * @pre32(i32** nocapture %out, i32 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre32:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str w1, [x0, #8]!
+; CHECK-NEXT: ret
+ %tmp = load i32** %out, align 8
+ %ptr = getelementptr inbounds i32* %tmp, i64 2
+ store i32 %spacing, i32* %ptr, align 4
+ ret i32 *%ptr
+}
+
+define i16 * @pre16(i16** nocapture %out, i16 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre16:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: strh w1, [x0, #4]!
+; CHECK-NEXT: ret
+ %tmp = load i16** %out, align 8
+ %ptr = getelementptr inbounds i16* %tmp, i64 2
+ store i16 %spacing, i16* %ptr, align 4
+ ret i16 *%ptr
+}
+
+define i8 * @pre8(i8** nocapture %out, i8 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre8:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: strb w1, [x0, #2]!
+; CHECK-NEXT: ret
+ %tmp = load i8** %out, align 8
+ %ptr = getelementptr inbounds i8* %tmp, i64 2
+ store i8 %spacing, i8* %ptr, align 4
+ ret i8 *%ptr
+}
+
+define i32 * @pretrunc64to32(i32** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to32:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str w1, [x0, #8]!
+; CHECK-NEXT: ret
+ %tmp = load i32** %out, align 8
+ %ptr = getelementptr inbounds i32* %tmp, i64 2
+ %trunc = trunc i64 %spacing to i32
+ store i32 %trunc, i32* %ptr, align 4
+ ret i32 *%ptr
+}
+
+define i16 * @pretrunc64to16(i16** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to16:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: strh w1, [x0, #4]!
+; CHECK-NEXT: ret
+ %tmp = load i16** %out, align 8
+ %ptr = getelementptr inbounds i16* %tmp, i64 2
+ %trunc = trunc i64 %spacing to i16
+ store i16 %trunc, i16* %ptr, align 4
+ ret i16 *%ptr
+}
+
+define i8 * @pretrunc64to8(i8** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to8:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: strb w1, [x0, #2]!
+; CHECK-NEXT: ret
+ %tmp = load i8** %out, align 8
+ %ptr = getelementptr inbounds i8* %tmp, i64 2
+ %trunc = trunc i64 %spacing to i8
+ store i8 %trunc, i8* %ptr, align 4
+ ret i8 *%ptr
+}
+
+;-----
+; Pre-indexed loads
+;-----
+define double* @preidxf64(double* %src, double* %out) {
+; CHECK-LABEL: preidxf64:
+; CHECK: ldr d0, [x0, #8]!
+; CHECK: str d0, [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds double* %src, i64 1
+ %tmp = load double* %ptr, align 4
+ store double %tmp, double* %out, align 4
+ ret double* %ptr
+}
+
+define float* @preidxf32(float* %src, float* %out) {
+; CHECK-LABEL: preidxf32:
+; CHECK: ldr s0, [x0, #4]!
+; CHECK: str s0, [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds float* %src, i64 1
+ %tmp = load float* %ptr, align 4
+ store float %tmp, float* %out, align 4
+ ret float* %ptr
+}
+
+define i64* @preidx64(i64* %src, i64* %out) {
+; CHECK-LABEL: preidx64:
+; CHECK: ldr x[[REG:[0-9]+]], [x0, #8]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i64* %src, i64 1
+ %tmp = load i64* %ptr, align 4
+ store i64 %tmp, i64* %out, align 4
+ ret i64* %ptr
+}
+
+define i32* @preidx32(i32* %src, i32* %out) {
+; CHECK: ldr w[[REG:[0-9]+]], [x0, #4]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i32* %src, i64 1
+ %tmp = load i32* %ptr, align 4
+ store i32 %tmp, i32* %out, align 4
+ ret i32* %ptr
+}
+
+define i16* @preidx16zext32(i16* %src, i32* %out) {
+; CHECK: ldrh w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i16* %src, i64 1
+ %tmp = load i16* %ptr, align 4
+ %ext = zext i16 %tmp to i32
+ store i32 %ext, i32* %out, align 4
+ ret i16* %ptr
+}
+
+define i16* @preidx16zext64(i16* %src, i64* %out) {
+; CHECK: ldrh w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i16* %src, i64 1
+ %tmp = load i16* %ptr, align 4
+ %ext = zext i16 %tmp to i64
+ store i64 %ext, i64* %out, align 4
+ ret i16* %ptr
+}
+
+define i8* @preidx8zext32(i8* %src, i32* %out) {
+; CHECK: ldrb w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i8* %src, i64 1
+ %tmp = load i8* %ptr, align 4
+ %ext = zext i8 %tmp to i32
+ store i32 %ext, i32* %out, align 4
+ ret i8* %ptr
+}
+
+define i8* @preidx8zext64(i8* %src, i64* %out) {
+; CHECK: ldrb w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i8* %src, i64 1
+ %tmp = load i8* %ptr, align 4
+ %ext = zext i8 %tmp to i64
+ store i64 %ext, i64* %out, align 4
+ ret i8* %ptr
+}
+
+define i32* @preidx32sext64(i32* %src, i64* %out) {
+; CHECK: ldrsw x[[REG:[0-9]+]], [x0, #4]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i32* %src, i64 1
+ %tmp = load i32* %ptr, align 4
+ %ext = sext i32 %tmp to i64
+ store i64 %ext, i64* %out, align 8
+ ret i32* %ptr
+}
+
+define i16* @preidx16sext32(i16* %src, i32* %out) {
+; CHECK: ldrsh w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i16* %src, i64 1
+ %tmp = load i16* %ptr, align 4
+ %ext = sext i16 %tmp to i32
+ store i32 %ext, i32* %out, align 4
+ ret i16* %ptr
+}
+
+define i16* @preidx16sext64(i16* %src, i64* %out) {
+; CHECK: ldrsh x[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i16* %src, i64 1
+ %tmp = load i16* %ptr, align 4
+ %ext = sext i16 %tmp to i64
+ store i64 %ext, i64* %out, align 4
+ ret i16* %ptr
+}
+
+define i8* @preidx8sext32(i8* %src, i32* %out) {
+; CHECK: ldrsb w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i8* %src, i64 1
+ %tmp = load i8* %ptr, align 4
+ %ext = sext i8 %tmp to i32
+ store i32 %ext, i32* %out, align 4
+ ret i8* %ptr
+}
+
+define i8* @preidx8sext64(i8* %src, i64* %out) {
+; CHECK: ldrsb x[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i8* %src, i64 1
+ %tmp = load i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ store i64 %ext, i64* %out, align 4
+ ret i8* %ptr
+}
diff --git a/test/CodeGen/ARM64/inline-asm-error-I.ll b/test/CodeGen/ARM64/inline-asm-error-I.ll
new file mode 100644
index 0000000..a7aaf9e
--- /dev/null
+++ b/test/CodeGen/ARM64/inline-asm-error-I.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'I'
+
+define i32 @constraint_I(i32 %i, i32 %j) nounwind ssp {
+entry:
+ %0 = tail call i32 asm sideeffect "add $0, $1, $2", "=r,r,I"(i32 %i, i32 4097) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM64/inline-asm-error-J.ll b/test/CodeGen/ARM64/inline-asm-error-J.ll
new file mode 100644
index 0000000..077e1b8
--- /dev/null
+++ b/test/CodeGen/ARM64/inline-asm-error-J.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'J'
+
+define i32 @constraint_J(i32 %i, i32 %j) nounwind ssp {
+entry:
+ %0 = tail call i32 asm sideeffect "sub $0, $1, $2", "=r,r,J"(i32 %i, i32 2) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM64/inline-asm-error-K.ll b/test/CodeGen/ARM64/inline-asm-error-K.ll
new file mode 100644
index 0000000..2a7f961
--- /dev/null
+++ b/test/CodeGen/ARM64/inline-asm-error-K.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'K'
+
+define i32 @constraint_K(i32 %i, i32 %j) nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "eor $0, $1, $2", "=r,r,K"(i32 %i, i32 -1) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM64/inline-asm-error-L.ll b/test/CodeGen/ARM64/inline-asm-error-L.ll
new file mode 100644
index 0000000..1701943
--- /dev/null
+++ b/test/CodeGen/ARM64/inline-asm-error-L.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'L'
+
+define i32 @constraint_L(i32 %i, i32 %j) nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "eor $0, $1, $2", "=r,r,L"(i32 %i, i64 -1) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM64/inline-asm-error-M.ll b/test/CodeGen/ARM64/inline-asm-error-M.ll
new file mode 100644
index 0000000..952bf60
--- /dev/null
+++ b/test/CodeGen/ARM64/inline-asm-error-M.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'M'
+
+define i32 @constraint_M(i32 %i, i32 %j) nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "movk $0, $1", "=r,M"(i32 305418240) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM64/inline-asm-error-N.ll b/test/CodeGen/ARM64/inline-asm-error-N.ll
new file mode 100644
index 0000000..b4a199f
--- /dev/null
+++ b/test/CodeGen/ARM64/inline-asm-error-N.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'N'
+
+define i32 @constraint_N(i32 %i, i32 %j) nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "movk $0, $1", "=r,N"(i64 1311761352401879040) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll b/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll
new file mode 100644
index 0000000..6bfce8f
--- /dev/null
+++ b/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll
@@ -0,0 +1,11 @@
+; RUN: not llc < %s -march=arm64 2>&1 | FileCheck %s
+
+
+; The 'z' constraint allocates either xzr or wzr, but obviously an input of 1 is
+; incompatible.
+define void @test_bad_zero_reg() {
+ tail call void asm sideeffect "USE($0)", "z"(i32 1) nounwind
+; CHECK: error: invalid operand for inline asm constraint 'z'
+
+ ret void
+}
diff --git a/test/CodeGen/ARM64/inline-asm.ll b/test/CodeGen/ARM64/inline-asm.ll
new file mode 100644
index 0000000..e645078
--- /dev/null
+++ b/test/CodeGen/ARM64/inline-asm.ll
@@ -0,0 +1,230 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -no-integrated-as | FileCheck %s
+
+; rdar://9167275
+
+define i32 @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: mov {{w[0-9]+}}, 7
+ %0 = tail call i32 asm "mov ${0:w}, 7", "=r"() nounwind
+ ret i32 %0
+}
+
+define i64 @t2() nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: mov {{x[0-9]+}}, 7
+ %0 = tail call i64 asm "mov $0, 7", "=r"() nounwind
+ ret i64 %0
+}
+
+define i64 @t3() nounwind ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: mov {{w[0-9]+}}, 7
+ %0 = tail call i64 asm "mov ${0:w}, 7", "=r"() nounwind
+ ret i64 %0
+}
+
+; rdar://9281206
+
+define void @t4(i64 %op) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: mov x0, {{x[0-9]+}}; svc #0
+ %0 = tail call i64 asm sideeffect "mov x0, $1; svc #0;", "=r,r,r,~{x0}"(i64 %op, i64 undef) nounwind
+ ret void
+}
+
+; rdar://9394290
+
+define float @t5(float %x) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ %0 = tail call float asm "fadd ${0:s}, ${0:s}, ${0:s}", "=w,0"(float %x) nounwind
+ ret float %0
+}
+
+; rdar://9553599
+
+define zeroext i8 @t6(i8* %src) nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: ldtrb {{w[0-9]+}}, [{{x[0-9]+}}]
+ %0 = tail call i8 asm "ldtrb ${0:w}, [$1]", "=r,r"(i8* %src) nounwind
+ ret i8 %0
+}
+
+define void @t7(i8* %f, i32 %g) nounwind {
+entry:
+ %f.addr = alloca i8*, align 8
+ store i8* %f, i8** %f.addr, align 8
+ ; CHECK-LABEL: t7:
+ ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+ call void asm "str ${1:w}, $0", "=*Q,r"(i8** %f.addr, i32 %g) nounwind
+ ret void
+}
+
+; rdar://10258229
+; ARM64TargetLowering::getRegForInlineAsmConstraint() should recognize 'v'
+; registers.
+define void @t8() nounwind ssp {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: stp {{d[0-9]+}}, {{d[0-9]+}}, [sp, #-16]
+ tail call void asm sideeffect "nop", "~{v8}"() nounwind
+ ret void
+}
+
+define i32 @constraint_I(i32 %i, i32 %j) nounwind {
+entry:
+ ; CHECK-LABEL: constraint_I:
+ %0 = tail call i32 asm sideeffect "add ${0:w}, ${1:w}, $2", "=r,r,I"(i32 %i, i32 16773120) nounwind
+ ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #16773120
+ %1 = tail call i32 asm sideeffect "add ${0:w}, ${1:w}, $2", "=r,r,I"(i32 %i, i32 4096) nounwind
+ ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #4096
+ ret i32 %1
+}
+
+define i32 @constraint_J(i32 %i, i32 %j) nounwind {
+entry:
+ ; CHECK-LABEL: constraint_J:
+ %0 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -16773120) nounwind
+ ; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #4278194176
+ %1 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -1) nounwind
+ ; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #4294967295
+ ret i32 %1
+}
+
+define i32 @constraint_KL(i32 %i, i32 %j) nounwind {
+entry:
+ ; CHECK-LABEL: constraint_KL:
+ %0 = tail call i32 asm sideeffect "eor ${0:w}, ${1:w}, $2", "=r,r,K"(i32 %i, i32 255) nounwind
+ ; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, #255
+ %1 = tail call i32 asm sideeffect "eor ${0:w}, ${1:w}, $2", "=r,r,L"(i32 %i, i64 16711680) nounwind
+ ; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, #16711680
+ ret i32 %1
+}
+
+define i32 @constraint_MN(i32 %i, i32 %j) nounwind {
+entry:
+ ; CHECK-LABEL: constraint_MN:
+ %0 = tail call i32 asm sideeffect "movk ${0:w}, $1", "=r,M"(i32 65535) nounwind
+ ; CHECK: movk {{w[0-9]+}}, #65535
+ %1 = tail call i32 asm sideeffect "movz ${0:w}, $1", "=r,N"(i64 0) nounwind
+ ; CHECK: movz {{w[0-9]+}}, #0
+ ret i32 %1
+}
+
+define void @t9() nounwind {
+entry:
+ ; CHECK-LABEL: t9:
+ %data = alloca <2 x double>, align 16
+ %0 = load <2 x double>* %data, align 16
+ call void asm sideeffect "mov.2d v4, $0\0A", "w,~{v4}"(<2 x double> %0) nounwind
+ ; CHECK: mov.2d v4, {{v[0-9]+}}
+ ret void
+}
+
+define void @t10() nounwind {
+entry:
+ ; CHECK-LABEL: t10:
+ %data = alloca <2 x float>, align 8
+ %a = alloca [2 x float], align 4
+ %arraydecay = getelementptr inbounds [2 x float]* %a, i32 0, i32 0
+ %0 = load <2 x float>* %data, align 8
+ call void asm sideeffect "ldr ${1:q}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
+ call void asm sideeffect "ldr ${1:d}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
+ call void asm sideeffect "ldr ${1:s}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}]
+ call void asm sideeffect "ldr ${1:h}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{h[0-9]+}}, [{{x[0-9]+}}]
+ call void asm sideeffect "ldr ${1:b}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{b[0-9]+}}, [{{x[0-9]+}}]
+ ret void
+}
+
+define void @t11() nounwind {
+entry:
+ ; CHECK-LABEL: t11:
+ %a = alloca i32, align 4
+ %0 = load i32* %a, align 4
+ call void asm sideeffect "mov ${1:x}, ${0:x}\0A", "r,i"(i32 %0, i32 0) nounwind
+ ; CHECK: mov xzr, {{x[0-9]+}}
+ %1 = load i32* %a, align 4
+ call void asm sideeffect "mov ${1:w}, ${0:w}\0A", "r,i"(i32 %1, i32 0) nounwind
+ ; CHECK: mov wzr, {{w[0-9]+}}
+ ret void
+}
+
+define void @t12() nounwind {
+entry:
+ ; CHECK-LABEL: t12:
+ %data = alloca <4 x float>, align 16
+ %0 = load <4 x float>* %data, align 16
+ call void asm sideeffect "mov.2d v4, $0\0A", "x,~{v4}"(<4 x float> %0) nounwind
+ ; CHECK: mov.2d v4, {{v([0-9])|(1[0-5])}}
+ ret void
+}
+
+define void @t13() nounwind {
+entry:
+ ; CHECK-LABEL: t13:
+ tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 1311673391471656960) nounwind
+ ; CHECK: mov x4, #1311673391471656960
+ tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 -4662) nounwind
+ ; CHECK: mov x4, #-4662
+ tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 4660) nounwind
+ ; CHECK: mov x4, #4660
+ call void asm sideeffect "mov x4, $0\0A", "N"(i64 -71777214294589696) nounwind
+ ; CHECK: mov x4, #-71777214294589696
+ ret void
+}
+
+define void @t14() nounwind {
+entry:
+ ; CHECK-LABEL: t14:
+ tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 305397760) nounwind
+ ; CHECK: mov w4, #305397760
+ tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 -4662) nounwind
+ ; CHECK: mov w4, #4294962634
+ tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 4660) nounwind
+ ; CHECK: mov w4, #4660
+ call void asm sideeffect "mov w4, $0\0A", "M"(i32 -16711936) nounwind
+ ; CHECK: mov w4, #4278255360
+ ret void
+}
+
+define void @t15() nounwind {
+entry:
+ ; CHECK-LABEL: t15:
+ %0 = tail call double asm sideeffect "fmov $0, d8", "=r"() nounwind
+ ; CHECK: fmov {{x[0-9]+}}, d8
+ ret void
+}
+
+; rdar://problem/14285178
+
+define void @test_zero_reg(i32* %addr) {
+; CHECK-LABEL: test_zero_reg:
+
+ tail call void asm sideeffect "USE($0)", "z"(i32 0) nounwind
+; CHECK: USE(xzr)
+
+ tail call void asm sideeffect "USE(${0:w})", "zr"(i32 0)
+; CHECK: USE(wzr)
+
+ tail call void asm sideeffect "USE(${0:w})", "zr"(i32 1)
+; CHECK: orr [[VAL1:w[0-9]+]], wzr, #0x1
+; CHECK: USE([[VAL1]])
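+; The nonzero input above cannot be satisfied by the zero register, so the
+; checks expect the constant 1 to be materialized into a scratch w register
+; (orr against wzr) before it is used.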
+
+ tail call void asm sideeffect "USE($0), USE($1)", "z,z"(i32 0, i32 0) nounwind
+; CHECK: USE(xzr), USE(xzr)
+
+ tail call void asm sideeffect "USE($0), USE(${1:w})", "z,z"(i32 0, i32 0) nounwind
+; CHECK: USE(xzr), USE(wzr)
+
+ ret void
+}
diff --git a/test/CodeGen/ARM64/join-reserved.ll b/test/CodeGen/ARM64/join-reserved.ll
new file mode 100644
index 0000000..e99168b
--- /dev/null
+++ b/test/CodeGen/ARM64/join-reserved.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+target triple = "arm64-apple-macosx10"
+
+; Make sure that a store to [sp] uses sp directly as the base register;
+; no copy of sp into another register is necessary.
+; <rdar://problem/11492712>
+; CHECK-LABEL: g:
+; CHECK: str xzr, [sp]
+; CHECK: bl
+; CHECK: ret
+define void @g() nounwind ssp {
+entry:
+ tail call void (i32, ...)* @f(i32 0, i32 0) nounwind
+ ret void
+}
+
+declare void @f(i32, ...)
diff --git a/test/CodeGen/ARM64/jumptable.ll b/test/CodeGen/ARM64/jumptable.ll
new file mode 100644
index 0000000..4635cfe
--- /dev/null
+++ b/test/CodeGen/ARM64/jumptable.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-LINUX
+; <rdar://11417675>
+
+define void @sum(i32* %to) {
+entry:
+ switch i32 undef, label %exit [
+ i32 1, label %bb1
+ i32 2, label %bb2
+ i32 3, label %bb3
+ i32 4, label %bb4
+ ]
+bb1:
+ store i32 undef, i32* %to
+ br label %exit
+bb2:
+ store i32 undef, i32* %to
+ br label %exit
+bb3:
+ store i32 undef, i32* %to
+ br label %exit
+bb4:
+ store i32 undef, i32* %to
+ br label %exit
+exit:
+ ret void
+}
+
+; CHECK-LABEL: sum:
+; CHECK: adrp {{x[0-9]+}}, LJTI0_0@PAGE
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, LJTI0_0@PAGEOFF
+
+; CHECK-LINUX-LABEL: sum:
+; CHECK-LINUX: adrp {{x[0-9]+}}, .LJTI0_0
+; CHECK-LINUX: add {{x[0-9]+}}, {{x[0-9]+}}, :lo12:.LJTI0_0
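+
+; The two prefixes reflect the different jump-table addressing conventions:
+; the Darwin (MachO) output addresses the table with the @PAGE/@PAGEOFF
+; relocation pair, while the Linux (ELF) output uses adrp plus a :lo12:
+; low-12-bit relocation.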
diff --git a/test/CodeGen/ARM64/ld1.ll b/test/CodeGen/ARM64/ld1.ll
new file mode 100644
index 0000000..61836a1
--- /dev/null
+++ b/test/CodeGen/ARM64/ld1.ll
@@ -0,0 +1,1345 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+
+define %struct.__neon_int8x8x2_t @ld2_8b(i8* %A) nounwind {
+; CHECK-LABEL: ld2_8b
+; Make sure we are loading into the result registers defined by the ABI (i.e., v0, v1)
+; and from the function argument register, also defined by the ABI (i.e., x0).
+; CHECK ld2.8b { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x2_t %tmp2
+}
+
+define %struct.__neon_int8x8x3_t @ld3_8b(i8* %A) nounwind {
+; CHECK-LABEL: ld3_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3.8b { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x3_t %tmp2
+}
+
+define %struct.__neon_int8x8x4_t @ld4_8b(i8* %A) nounwind {
+; CHECK-LABEL: ld4_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4.8b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x4_t %tmp2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly
+
+%struct.__neon_int8x16x2_t = type { <16 x i8>, <16 x i8> }
+%struct.__neon_int8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.__neon_int8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }
+
+define %struct.__neon_int8x16x2_t @ld2_16b(i8* %A) nounwind {
+; CHECK-LABEL: ld2_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2.16b { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x2_t %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3_16b(i8* %A) nounwind {
+; CHECK-LABEL: ld3_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3.16b { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x3_t %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4_16b(i8* %A) nounwind {
+; CHECK-LABEL: ld4_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4.16b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x4_t %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly
+
+%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+
+define %struct.__neon_int16x4x2_t @ld2_4h(i16* %A) nounwind {
+; CHECK-LABEL: ld2_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2.4h { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x2_t %tmp2
+}
+
+define %struct.__neon_int16x4x3_t @ld3_4h(i16* %A) nounwind {
+; CHECK-LABEL: ld3_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3.4h { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x3_t %tmp2
+}
+
+define %struct.__neon_int16x4x4_t @ld4_4h(i16* %A) nounwind {
+; CHECK-LABEL: ld4_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4.4h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x4_t %tmp2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly
+
+%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
+
+define %struct.__neon_int16x8x2_t @ld2_8h(i16* %A) nounwind {
+; CHECK-LABEL: ld2_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2.8h { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x2_t %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3_8h(i16* %A) nounwind {
+; CHECK-LABEL: ld3_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3.8h { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x3_t %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4_8h(i16* %A) nounwind {
+; CHECK-LABEL: ld4_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4.8h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x4_t %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly
+
+%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
+%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+
+define %struct.__neon_int32x2x2_t @ld2_2s(i32* %A) nounwind {
+; CHECK-LABEL: ld2_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2.2s { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x2_t %tmp2
+}
+
+define %struct.__neon_int32x2x3_t @ld3_2s(i32* %A) nounwind {
+; CHECK-LABEL: ld3_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3.2s { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x3_t %tmp2
+}
+
+define %struct.__neon_int32x2x4_t @ld4_2s(i32* %A) nounwind {
+; CHECK-LABEL: ld4_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4.2s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x4_t %tmp2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly
+
+%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
+
+define %struct.__neon_int32x4x2_t @ld2_4s(i32* %A) nounwind {
+; CHECK-LABEL: ld2_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2.4s { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x2_t %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3_4s(i32* %A) nounwind {
+; CHECK-LABEL: ld3_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3.4s { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x3_t %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4_4s(i32* %A) nounwind {
+; CHECK-LABEL: ld4_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4.4s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x4_t %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly
+
+%struct.__neon_int64x2x2_t = type { <2 x i64>, <2 x i64> }
+%struct.__neon_int64x2x3_t = type { <2 x i64>, <2 x i64>, <2 x i64> }
+%struct.__neon_int64x2x4_t = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }
+
+define %struct.__neon_int64x2x2_t @ld2_2d(i64* %A) nounwind {
+; CHECK-LABEL: ld2_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2.2d { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x2_t %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3_2d(i64* %A) nounwind {
+; CHECK-LABEL: ld3_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3.2d { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x3_t %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4_2d(i64* %A) nounwind {
+; CHECK-LABEL: ld4_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4.2d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x4_t %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly
+
+%struct.__neon_int64x1x2_t = type { <1 x i64>, <1 x i64> }
+%struct.__neon_int64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> }
+%struct.__neon_int64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
+
+
+define %struct.__neon_int64x1x2_t @ld2_1di64(i64* %A) nounwind {
+; CHECK-LABEL: ld2_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK ld1.1d { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x2_t %tmp2
+}
+
+define %struct.__neon_int64x1x3_t @ld3_1di64(i64* %A) nounwind {
+; CHECK-LABEL: ld3_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK ld1.1d { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x3_t %tmp2
+}
+
+define %struct.__neon_int64x1x4_t @ld4_1di64(i64* %A) nounwind {
+; CHECK-LABEL: ld4_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x4_t %tmp2
+}
+
+
+declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly
+
+%struct.__neon_float64x1x2_t = type { <1 x double>, <1 x double> }
+%struct.__neon_float64x1x3_t = type { <1 x double>, <1 x double>, <1 x double> }
+%struct.__neon_float64x1x4_t = type { <1 x double>, <1 x double>, <1 x double>, <1 x double> }
+
+
+define %struct.__neon_float64x1x2_t @ld2_1df64(double* %A) nounwind {
+; CHECK-LABEL: ld2_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK ld1.1d { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A)
+ ret %struct.__neon_float64x1x2_t %tmp2
+}
+
+define %struct.__neon_float64x1x3_t @ld3_1df64(double* %A) nounwind {
+; CHECK-LABEL: ld3_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK ld1.1d { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A)
+ ret %struct.__neon_float64x1x3_t %tmp2
+}
+
+define %struct.__neon_float64x1x4_t @ld4_1df64(double* %A) nounwind {
+; CHECK-LABEL: ld4_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A)
+ ret %struct.__neon_float64x1x4_t %tmp2
+}
+
+declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double*) nounwind readonly
+
+
+define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_16b
+; CHECK ld2.b { v0, v1 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A)
+ ret %struct.__neon_int8x16x2_t %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_16b
+; CHECK ld3.b { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A)
+ ret %struct.__neon_int8x16x3_t %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_16b
+; CHECK ld4.b { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A)
+ ret %struct.__neon_int8x16x4_t %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_8h
+; CHECK ld2.h { v0, v1 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A)
+ ret %struct.__neon_int16x8x2_t %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_8h
+; CHECK ld3.h { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A)
+ ret %struct.__neon_int16x8x3_t %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_8h
+; CHECK ld4.h { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A)
+ ret %struct.__neon_int16x8x4_t %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_4s
+; CHECK ld2.s { v0, v1 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A)
+ ret %struct.__neon_int32x4x2_t %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_4s
+; CHECK ld3.s { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A)
+ ret %struct.__neon_int32x4x3_t %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_4s
+; CHECK ld4.s { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A)
+ ret %struct.__neon_int32x4x4_t %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_2d
+; CHECK ld2.d { v0, v1 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A)
+ ret %struct.__neon_int64x2x2_t %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_2d
+; CHECK ld3.d { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A)
+ ret %struct.__neon_int64x2x3_t %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_2d
+; CHECK ld4.d { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A)
+ ret %struct.__neon_int64x2x4_t %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
+define <8 x i8> @ld1r_8b(i8* %bar) {
+; CHECK: ld1r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.8b { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+ ret <8 x i8> %tmp9
+}
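+; The chain of insertelement operations above builds a splat of the loaded
+; byte; the check expects it to be matched to a single load-and-replicate
+; (ld1r) instruction rather than a scalar load followed by per-lane inserts.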
+
+define <16 x i8> @ld1r_16b(i8* %bar) {
+; CHECK: ld1r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.16b { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+ %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+ ret <16 x i8> %tmp17
+}
+
+define <4 x i16> @ld1r_4h(i16* %bar) {
+; CHECK: ld1r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4h { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+ ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @ld1r_8h(i16* %bar) {
+; CHECK: ld1r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.8h { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+ ret <8 x i16> %tmp9
+}
+
+define <2 x i32> @ld1r_2s(i32* %bar) {
+; CHECK: ld1r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @ld1r_4s(i32* %bar) {
+; CHECK: ld1r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ld1r_2d(i64* %bar) {
+; CHECK: ld1r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+ ret <2 x i64> %tmp3
+}
+
+define %struct.__neon_int8x8x2_t @ld2r_8b(i8* %A) nounwind {
+; CHECK: ld2r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.8b { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x2_t %tmp2
+}
+
+define %struct.__neon_int8x8x3_t @ld3r_8b(i8* %A) nounwind {
+; CHECK: ld3r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.8b { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x3_t %tmp2
+}
+
+define %struct.__neon_int8x8x4_t @ld4r_8b(i8* %A) nounwind {
+; CHECK: ld4r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.8b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x4_t %tmp2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld2r_16b(i8* %A) nounwind {
+; CHECK: ld2r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.16b { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x2_t %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3r_16b(i8* %A) nounwind {
+; CHECK: ld3r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.16b { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x3_t %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4r_16b(i8* %A) nounwind {
+; CHECK: ld4r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.16b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x4_t %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
+
+define %struct.__neon_int16x4x2_t @ld2r_4h(i16* %A) nounwind {
+; CHECK: ld2r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.4h { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x2_t %tmp2
+}
+
+define %struct.__neon_int16x4x3_t @ld3r_4h(i16* %A) nounwind {
+; CHECK: ld3r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.4h { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x3_t %tmp2
+}
+
+define %struct.__neon_int16x4x4_t @ld4r_4h(i16* %A) nounwind {
+; CHECK: ld4r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.4h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x4_t %tmp2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2r_8h(i16* %A) nounwind {
+; CHECK: ld2r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.8h { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x2_t %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3r_8h(i16* %A) nounwind {
+; CHECK: ld3r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.8h { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x3_t %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4r_8h(i16* %A) nounwind {
+; CHECK: ld4r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.8h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x4_t %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
+
+define %struct.__neon_int32x2x2_t @ld2r_2s(i32* %A) nounwind {
+; CHECK: ld2r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.2s { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x2_t %tmp2
+}
+
+define %struct.__neon_int32x2x3_t @ld3r_2s(i32* %A) nounwind {
+; CHECK: ld3r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.2s { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x3_t %tmp2
+}
+
+define %struct.__neon_int32x2x4_t @ld4r_2s(i32* %A) nounwind {
+; CHECK: ld4r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.2s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x4_t %tmp2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2r_4s(i32* %A) nounwind {
+; CHECK: ld2r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.4s { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x2_t %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3r_4s(i32* %A) nounwind {
+; CHECK: ld3r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.4s { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x3_t %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4r_4s(i32* %A) nounwind {
+; CHECK: ld4r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.4s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x4_t %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
+
+define %struct.__neon_int64x1x2_t @ld2r_1d(i64* %A) nounwind {
+; CHECK: ld2r_1d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.1d { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x2_t %tmp2
+}
+
+define %struct.__neon_int64x1x3_t @ld3r_1d(i64* %A) nounwind {
+; CHECK: ld3r_1d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.1d { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x3_t %tmp2
+}
+
+define %struct.__neon_int64x1x4_t @ld4r_1d(i64* %A) nounwind {
+; CHECK: ld4r_1d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.1d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x4_t %tmp2
+}
+
+declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2r_2d(i64* %A) nounwind {
+; CHECK: ld2r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld2r.2d { v0, v1 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x2_t %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3r_2d(i64* %A) nounwind {
+; CHECK: ld3r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld3r.2d { v0, v1, v2 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x3_t %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4r_2d(i64* %A) nounwind {
+; CHECK: ld4r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK ld4r.2d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT ret
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x4_t %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
+
+define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) {
+; CHECK-LABEL: ld1_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> %V, i8 %tmp1, i32 0
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @ld1_8h(<8 x i16> %V, i16* %bar) {
+; CHECK-LABEL: ld1_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> %V, i16 %tmp1, i32 0
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @ld1_4s(<4 x i32> %V, i32* %bar) {
+; CHECK-LABEL: ld1_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> %V, i32 %tmp1, i32 0
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @ld1_4s_float(<4 x float> %V, float* %bar) {
+; CHECK-LABEL: ld1_4s_float:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> %V, float %tmp1, i32 0
+ ret <4 x float> %tmp2
+}
+
+define <2 x i64> @ld1_2d(<2 x i64> %V, i64* %bar) {
+; CHECK-LABEL: ld1_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> %V, i64 %tmp1, i32 0
+ ret <2 x i64> %tmp2
+}
+
+define <2 x double> @ld1_2d_double(<2 x double> %V, double* %bar) {
+; CHECK-LABEL: ld1_2d_double:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> %V, double %tmp1, i32 0
+ ret <2 x double> %tmp2
+}
+
+define <1 x i64> @ld1_1d(<1 x i64>* %p) {
+; CHECK-LABEL: ld1_1d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ldr [[REG:d[0-9]+]], [x0]
+; CHECK-NEXT: ret
+ %tmp = load <1 x i64>* %p, align 8
+ ret <1 x i64> %tmp
+}
+
+define <8 x i8> @ld1_8b(<8 x i8> %V, i8* %bar) {
+; CHECK-LABEL: ld1_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> %V, i8 %tmp1, i32 0
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @ld1_4h(<4 x i16> %V, i16* %bar) {
+; CHECK-LABEL: ld1_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> %V, i16 %tmp1, i32 0
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @ld1_2s(<2 x i32> %V, i32* %bar) {
+; CHECK-LABEL: ld1_2s:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> %V, i32 %tmp1, i32 0
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @ld1_2s_float(<2 x float> %V, float* %bar) {
+; CHECK-LABEL: ld1_2s_float:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> %V, float %tmp1, i32 0
+ ret <2 x float> %tmp2
+}
+
+
+; Add rdar://13098923 test case: vld1_dup_u32 doesn't generate ld1r.2s
+define void @ld1r_2s_from_dup(i8* nocapture %a, i8* nocapture %b, i16* nocapture %diff) nounwind ssp {
+entry:
+; CHECK: ld1r_2s_from_dup
+; CHECK: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0]
+; CHECK-NEXT: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1]
+; CHECK-NEXT: usubl.8h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
+; CHECK-NEXT: str d[[RESREGNUM]], [x2]
+; CHECK-NEXT: ret
+ %tmp = bitcast i8* %a to i32*
+ %tmp1 = load i32* %tmp, align 4
+ %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
+ %lane = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp3 = bitcast <2 x i32> %lane to <8 x i8>
+ %tmp4 = bitcast i8* %b to i32*
+ %tmp5 = load i32* %tmp4, align 4
+ %tmp6 = insertelement <2 x i32> undef, i32 %tmp5, i32 0
+ %lane1 = shufflevector <2 x i32> %tmp6, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp7 = bitcast <2 x i32> %lane1 to <8 x i8>
+ %vmovl.i.i = zext <8 x i8> %tmp3 to <8 x i16>
+ %vmovl.i4.i = zext <8 x i8> %tmp7 to <8 x i16>
+ %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i4.i
+ %tmp8 = bitcast <8 x i16> %sub.i to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %tmp8, <2 x i64> undef, <1 x i32> zeroinitializer
+ %tmp9 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %tmp10 = bitcast i16* %diff to <4 x i16>*
+ store <4 x i16> %tmp9, <4 x i16>* %tmp10, align 8
+ ret void
+}
+
+; Tests for rdar://11947069: vld1_dup_* and vld1q_dup_* code gen is suboptimal
+define <4 x float> @ld1r_4s_float(float* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_4s_float
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp = load float* %x, align 4
+ %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp2 = insertelement <4 x float> %tmp1, float %tmp, i32 1
+ %tmp3 = insertelement <4 x float> %tmp2, float %tmp, i32 2
+ %tmp4 = insertelement <4 x float> %tmp3, float %tmp, i32 3
+ ret <4 x float> %tmp4
+}
+
+define <2 x float> @ld1r_2s_float(float* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_2s_float
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp = load float* %x, align 4
+ %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0
+ %tmp2 = insertelement <2 x float> %tmp1, float %tmp, i32 1
+ ret <2 x float> %tmp2
+}
+
+define <2 x double> @ld1r_2d_double(double* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_2d_double
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp = load double* %x, align 4
+ %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0
+ %tmp2 = insertelement <2 x double> %tmp1, double %tmp, i32 1
+ ret <2 x double> %tmp2
+}
+
+define <1 x double> @ld1r_1d_double(double* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_1d_double
+; Make sure we are using the operands defined by the ABI
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT ret
+ %tmp = load double* %x, align 4
+ %tmp1 = insertelement <1 x double> undef, double %tmp, i32 0
+ ret <1 x double> %tmp1
+}
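+; For the <1 x double> cases there is only a single lane, so no replicate is
+; needed; a plain scalar ldr of the d register is the expected lowering, as
+; the check above reflects.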
+
+define <4 x float> @ld1r_4s_float_shuff(float* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_4s_float_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp = load float* %x, align 4
+ %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0
+ %lane = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %lane
+}
+
+define <2 x float> @ld1r_2s_float_shuff(float* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_2s_float_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp = load float* %x, align 4
+ %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0
+ %lane = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %lane
+}
+
+define <2 x double> @ld1r_2d_double_shuff(double* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_2d_double_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT ret
+ %tmp = load double* %x, align 4
+ %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0
+ %lane = shufflevector <2 x double> %tmp1, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %lane
+}
+
+define <1 x double> @ld1r_1d_double_shuff(double* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_1d_double_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT ret
+ %tmp = load double* %x, align 4
+ %tmp1 = insertelement <1 x double> undef, double %tmp, i32 0
+ %lane = shufflevector <1 x double> %tmp1, <1 x double> undef, <1 x i32> zeroinitializer
+ ret <1 x double> %lane
+}
+
+%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
+%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
+%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
+
+declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x2_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x8x2_t %val
+}
+
+define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x2_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x4x2_t %val
+}
+
+define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x2_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x2_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x2_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x1x2_t %val
+}
+
+define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x2_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x1x2_t %val
+}
+
+
+%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
+%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
+%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+
+%struct.__neon_float64x2x2_t = type { <2 x double>, <2 x double> }
+%struct.__neon_float64x2x3_t = type { <2 x double>, <2 x double>, <2 x double> }
+%struct.__neon_float64x2x4_t = type { <2 x double>, <2 x double>, <2 x double>, <2 x double> }
+
+declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x2_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x16x2_t %val
+}
+
+define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x2_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x8x2_t %val
+}
+
+define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x2_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x4x2_t %val
+}
+
+define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x2_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x4x2_t %val
+}
+
+define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x2_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x2x2_t %val
+}
+
+define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x2_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x2x2_t %val
+}
+
+declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x3_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x8x3_t %val
+}
+
+define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x3_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x4x3_t %val
+}
+
+define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x3_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x3_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x3_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x1x3_t %val
+}
+
+define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x3_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x1x3_t %val
+}
+
+declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x3_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x16x3_t %val
+}
+
+define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x3_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x8x3_t %val
+}
+
+define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x3_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x4x3_t %val
+}
+
+define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x3_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x4x3_t %val
+}
+
+define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x3_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x2x3_t %val
+}
+
+define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x3_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x2x3_t %val
+}
+
+declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x4_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x8x4_t %val
+}
+
+define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x4_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x4x4_t %val
+}
+
+define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x4_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x4_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x4_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x1x4_t %val
+}
+
+define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x4_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x1x4_t %val
+}
+
+declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x4_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x16x4_t %val
+}
+
+define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x4_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x8x4_t %val
+}
+
+define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x4_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x4x4_t %val
+}
+
+define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x4_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x4x4_t %val
+}
+
+define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x4_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x2x4_t %val
+}
+
+define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x4_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x2x4_t %val
+}
diff --git a/test/CodeGen/ARM64/ldp.ll b/test/CodeGen/ARM64/ldp.ll
new file mode 100644
index 0000000..9444385
--- /dev/null
+++ b/test/CodeGen/ARM64/ldp.ll
@@ -0,0 +1,149 @@
+; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\
+; RUN: -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s
+
+; CHECK: ldp_int
+; CHECK: ldp
+define i32 @ldp_int(i32* %p) nounwind {
+ %tmp = load i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32* %p, i64 1
+ %tmp1 = load i32* %add.ptr, align 4
+ %add = add nsw i32 %tmp1, %tmp
+ ret i32 %add
+}
+
+; CHECK: ldp_long
+; CHECK: ldp
+define i64 @ldp_long(i64* %p) nounwind {
+ %tmp = load i64* %p, align 8
+ %add.ptr = getelementptr inbounds i64* %p, i64 1
+ %tmp1 = load i64* %add.ptr, align 8
+ %add = add nsw i64 %tmp1, %tmp
+ ret i64 %add
+}
+
+; CHECK: ldp_float
+; CHECK: ldp
+define float @ldp_float(float* %p) nounwind {
+ %tmp = load float* %p, align 4
+ %add.ptr = getelementptr inbounds float* %p, i64 1
+ %tmp1 = load float* %add.ptr, align 4
+ %add = fadd float %tmp, %tmp1
+ ret float %add
+}
+
+; CHECK: ldp_double
+; CHECK: ldp
+define double @ldp_double(double* %p) nounwind {
+ %tmp = load double* %p, align 8
+ %add.ptr = getelementptr inbounds double* %p, i64 1
+ %tmp1 = load double* %add.ptr, align 8
+ %add = fadd double %tmp, %tmp1
+ ret double %add
+}
+
+; Test the load/store optimizer: combine two ldur loads into an ldp, when appropriate.
+define i32 @ldur_int(i32* %a) nounwind {
+; LDUR_CHK: ldur_int
+; LDUR_CHK: ldp [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8]
+; LDUR_CHK-NEXT: add w{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i32* %a, i32 -1
+ %tmp1 = load i32* %p1, align 2
+ %p2 = getelementptr inbounds i32* %a, i32 -2
+ %tmp2 = load i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
+
+define i64 @ldur_long(i64* %a) nounwind ssp {
+; LDUR_CHK: ldur_long
+; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16]
+; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %a, i64 -1
+ %tmp1 = load i64* %p1, align 2
+ %p2 = getelementptr inbounds i64* %a, i64 -2
+ %tmp2 = load i64* %p2, align 2
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define float @ldur_float(float* %a) {
+; LDUR_CHK: ldur_float
+; LDUR_CHK: ldp [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8]
+; LDUR_CHK-NEXT: add s{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds float* %a, i64 -1
+ %tmp1 = load float* %p1, align 2
+ %p2 = getelementptr inbounds float* %a, i64 -2
+ %tmp2 = load float* %p2, align 2
+ %tmp3 = fadd float %tmp1, %tmp2
+ ret float %tmp3
+}
+
+define double @ldur_double(double* %a) {
+; LDUR_CHK: ldur_double
+; LDUR_CHK: ldp [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16]
+; LDUR_CHK-NEXT: add d{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds double* %a, i64 -1
+ %tmp1 = load double* %p1, align 2
+ %p2 = getelementptr inbounds double* %a, i64 -2
+ %tmp2 = load double* %p2, align 2
+ %tmp3 = fadd double %tmp1, %tmp2
+ ret double %tmp3
+}
+
+; Now check some boundary conditions
+define i64 @pairUpBarelyIn(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpBarelyIn
+; LDUR_CHK-NOT: ldur
+; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256]
+; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %a, i64 -31
+ %tmp1 = load i64* %p1, align 2
+ %p2 = getelementptr inbounds i64* %a, i64 -32
+ %tmp2 = load i64* %p2, align 2
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define i64 @pairUpBarelyOut(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpBarelyOut
+; LDUR_CHK-NOT: ldp
+; Don't be fragile about which loads or manipulations of the base register
+; are used---just check that there isn't an ldp before the add
+; LDUR_CHK: add
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %a, i64 -32
+ %tmp1 = load i64* %p1, align 2
+ %p2 = getelementptr inbounds i64* %a, i64 -33
+ %tmp2 = load i64* %p2, align 2
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define i64 @pairUpNotAligned(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpNotAligned
+; LDUR_CHK-NOT: ldp
+; LDUR_CHK: ldur
+; LDUR_CHK-NEXT: ldur
+; LDUR_CHK-NEXT: add
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %a, i64 -18
+ %bp1 = bitcast i64* %p1 to i8*
+ %bp1p1 = getelementptr inbounds i8* %bp1, i64 1
+ %dp1 = bitcast i8* %bp1p1 to i64*
+ %tmp1 = load i64* %dp1, align 1
+
+ %p2 = getelementptr inbounds i64* %a, i64 -17
+ %bp2 = bitcast i64* %p2 to i8*
+ %bp2p1 = getelementptr inbounds i8* %bp2, i64 1
+ %dp2 = bitcast i8* %bp2p1 to i64*
+ %tmp2 = load i64* %dp2, align 1
+
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
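The LDUR_CHK cases above exercise the AArch64 load/store optimizer, which merges two
adjacent unscaled loads into one pair load. A minimal sketch of the rewrite the CHECK
lines expect, with register numbers chosen only for illustration:

    ldur w8, [x0, #-4]        ; load at byte offset -4
    ldur w9, [x0, #-8]        ; load at byte offset -8
  becomes
    ldp  w9, w8, [x0, #-8]    ; one paired load covering both words

The -256 boundary in pairUpBarelyIn/pairUpBarelyOut appears to match the signed 9-bit
byte-offset range of ldur, -256..+255: indices -31/-32 (byte offsets -248/-256) both fit
and can be paired, while index -33 (offset -264) does not. That reading of the boundary
is inferred from the offsets used in the tests, not something they state explicitly.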
diff --git a/test/CodeGen/ARM64/ldur.ll b/test/CodeGen/ARM64/ldur.ll
new file mode 100644
index 0000000..2848c06
--- /dev/null
+++ b/test/CodeGen/ARM64/ldur.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i64 @_f0(i64* %p) {
+; CHECK: f0:
+; CHECK: ldur x0, [x0, #-8]
+; CHECK-NEXT: ret
+ %tmp = getelementptr inbounds i64* %p, i64 -1
+ %ret = load i64* %tmp, align 2
+ ret i64 %ret
+}
+define i32 @_f1(i32* %p) {
+; CHECK: f1:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK-NEXT: ret
+ %tmp = getelementptr inbounds i32* %p, i64 -1
+ %ret = load i32* %tmp, align 2
+ ret i32 %ret
+}
+define i16 @_f2(i16* %p) {
+; CHECK: f2:
+; CHECK: ldurh w0, [x0, #-2]
+; CHECK-NEXT: ret
+ %tmp = getelementptr inbounds i16* %p, i64 -1
+ %ret = load i16* %tmp, align 2
+ ret i16 %ret
+}
+define i8 @_f3(i8* %p) {
+; CHECK: f3:
+; CHECK: ldurb w0, [x0, #-1]
+; CHECK-NEXT: ret
+ %tmp = getelementptr inbounds i8* %p, i64 -1
+ %ret = load i8* %tmp, align 2
+ ret i8 %ret
+}
+
+define i64 @zext32(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext32:
+; CHECK: ldur w0, [x0, #-12]
+; CHECK-NEXT: ret
+ %p = getelementptr inbounds i8* %a, i64 -12
+ %tmp1 = bitcast i8* %p to i32*
+ %tmp2 = load i32* %tmp1, align 4
+ %ret = zext i32 %tmp2 to i64
+
+ ret i64 %ret
+}
+define i64 @zext16(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext16:
+; CHECK: ldurh w0, [x0, #-12]
+; CHECK-NEXT: ret
+ %p = getelementptr inbounds i8* %a, i64 -12
+ %tmp1 = bitcast i8* %p to i16*
+ %tmp2 = load i16* %tmp1, align 2
+ %ret = zext i16 %tmp2 to i64
+
+ ret i64 %ret
+}
+define i64 @zext8(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext8:
+; CHECK: ldurb w0, [x0, #-12]
+; CHECK-NEXT: ret
+ %p = getelementptr inbounds i8* %a, i64 -12
+ %tmp2 = load i8* %p, align 1
+ %ret = zext i8 %tmp2 to i64
+
+ ret i64 %ret
+}
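These functions force negative byte offsets because the plain ldr immediate form only
encodes an unsigned, size-scaled offset, so an access like [x0, #-8] has to use the
unscaled ldur form instead. A rough summary of the two encodings, assuming the usual
A64 load-immediate formats:

    ldr  x1, [x0, #8]     ; scaled unsigned 12-bit offset, 0..32760 for 8-byte loads
    ldur x1, [x0, #-8]    ; unscaled signed 9-bit offset, -256..+255

The #-12 offsets in zext32/zext16/zext8 fall in the same unscaled range, which is why
ldur/ldurh/ldurb are expected rather than ldr/ldrh/ldrb.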
diff --git a/test/CodeGen/ARM64/ldxr-stxr.ll b/test/CodeGen/ARM64/ldxr-stxr.ll
new file mode 100644
index 0000000..d50ba94
--- /dev/null
+++ b/test/CodeGen/ARM64/ldxr-stxr.ll
@@ -0,0 +1,143 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s
+
+%0 = type { i64, i64 }
+
+define i128 @f0(i8* %p) nounwind readonly {
+; CHECK-LABEL: f0:
+; CHECK: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %ldrexd = tail call %0 @llvm.arm64.ldxp(i8* %p)
+ %0 = extractvalue %0 %ldrexd, 1
+ %1 = extractvalue %0 %ldrexd, 0
+ %2 = zext i64 %0 to i128
+ %3 = zext i64 %1 to i128
+ %shl = shl nuw i128 %2, 64
+ %4 = or i128 %shl, %3
+ ret i128 %4
+}
+
+define i32 @f1(i8* %ptr, i128 %val) nounwind {
+; CHECK-LABEL: f1:
+; CHECK: stxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %tmp4 = trunc i128 %val to i64
+ %tmp6 = lshr i128 %val, 64
+ %tmp7 = trunc i128 %tmp6 to i64
+ %strexd = tail call i32 @llvm.arm64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
+ ret i32 %strexd
+}
+
+declare %0 @llvm.arm64.ldxp(i8*) nounwind
+declare i32 @llvm.arm64.stxp(i64, i64, i8*) nounwind
+
+@var = global i64 0, align 8
+
+define void @test_load_i8(i8* %addr) {
+; CHECK-LABEL: test_load_i8:
+; CHECK: ldxrb w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.arm64.ldxr.p0i8(i8* %addr)
+ %shortval = trunc i64 %val to i8
+ %extval = zext i8 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_i16(i16* %addr) {
+; CHECK-LABEL: test_load_i16:
+; CHECK: ldxrh w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.arm64.ldxr.p0i16(i16* %addr)
+ %shortval = trunc i64 %val to i16
+ %extval = zext i16 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_i32(i32* %addr) {
+; CHECK-LABEL: test_load_i32:
+; CHECK: ldxr w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.arm64.ldxr.p0i32(i32* %addr)
+ %shortval = trunc i64 %val to i32
+ %extval = zext i32 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_i64(i64* %addr) {
+; CHECK-LABEL: test_load_i64:
+; CHECK: ldxr x[[LOADVAL:[0-9]+]], [x0]
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.arm64.ldxr.p0i64(i64* %addr)
+ store i64 %val, i64* @var, align 8
+ ret void
+}
+
+
+declare i64 @llvm.arm64.ldxr.p0i8(i8*) nounwind
+declare i64 @llvm.arm64.ldxr.p0i16(i16*) nounwind
+declare i64 @llvm.arm64.ldxr.p0i32(i32*) nounwind
+declare i64 @llvm.arm64.ldxr.p0i64(i64*) nounwind
+
+define i32 @test_store_i8(i32, i8 %val, i8* %addr) {
+; CHECK-LABEL: test_store_i8:
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: stxrb w0, w1, [x2]
+ %extval = zext i8 %val to i64
+ %res = call i32 @llvm.arm64.stxr.p0i8(i64 %extval, i8* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i16(i32, i16 %val, i16* %addr) {
+; CHECK-LABEL: test_store_i16:
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: stxrh w0, w1, [x2]
+ %extval = zext i16 %val to i64
+ %res = call i32 @llvm.arm64.stxr.p0i16(i64 %extval, i16* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i32(i32, i32 %val, i32* %addr) {
+; CHECK-LABEL: test_store_i32:
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: stxr w0, w1, [x2]
+ %extval = zext i32 %val to i64
+ %res = call i32 @llvm.arm64.stxr.p0i32(i64 %extval, i32* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i64(i32, i64 %val, i64* %addr) {
+; CHECK-LABEL: test_store_i64:
+; CHECK: stxr w0, x1, [x2]
+ %res = call i32 @llvm.arm64.stxr.p0i64(i64 %val, i64* %addr)
+ ret i32 %res
+}
+
+declare i32 @llvm.arm64.stxr.p0i8(i64, i8*) nounwind
+declare i32 @llvm.arm64.stxr.p0i16(i64, i16*) nounwind
+declare i32 @llvm.arm64.stxr.p0i32(i64, i32*) nounwind
+declare i32 @llvm.arm64.stxr.p0i64(i64, i64*) nounwind
+
+; CHECK: test_clear:
+; CHECK: clrex
+define void @test_clear() {
+ call void @llvm.arm64.clrex()
+ ret void
+}
+
+declare void @llvm.arm64.clrex() nounwind
+
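The intrinsics above map directly onto the exclusive load/store instructions; a
store-exclusive returns 0 on success and 1 if the reservation was lost. A minimal sketch
of how they are typically combined into a retry loop; the function below is illustrative
only and not part of the test:

    define i32 @atomic_add_example(i32* %addr, i32 %delta) {
    entry:
      br label %retry
    retry:
      %old.ext = call i64 @llvm.arm64.ldxr.p0i32(i32* %addr)              ; load-exclusive
      %old = trunc i64 %old.ext to i32
      %new = add i32 %old, %delta
      %new.ext = zext i32 %new to i64
      %status = call i32 @llvm.arm64.stxr.p0i32(i64 %new.ext, i32* %addr) ; 0 = success
      %failed = icmp ne i32 %status, 0
      br i1 %failed, label %retry, label %done
    done:
      ret i32 %old
    }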
diff --git a/test/CodeGen/ARM64/leaf-compact-unwind.ll b/test/CodeGen/ARM64/leaf-compact-unwind.ll
new file mode 100644
index 0000000..0a58717
--- /dev/null
+++ b/test/CodeGen/ARM64/leaf-compact-unwind.ll
@@ -0,0 +1,161 @@
+; Use the -disable-cfi flag so that we get the compact unwind info in the
+; emitted assembly. Compact unwind info is omitted when CFI directives
+; are emitted.
+;
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios -disable-cfi < %s | FileCheck %s
+;
+; rdar://13070556
+
+@bar = common global i32 0, align 4
+
+; Leaf function with no stack allocation and no saving/restoring
+; of non-volatile registers.
+define i32 @foo1(i32 %a) #0 {
+entry:
+ %add = add nsw i32 %a, 42
+ ret i32 %add
+}
+
+; Leaf function with stack allocation but no saving/restoring
+; of non-volatile registers.
+define i32 @foo2(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #0 {
+entry:
+ %stack = alloca [36 x i32], align 4
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.body ]
+ %arrayidx = getelementptr inbounds [36 x i32]* %stack, i64 0, i64 %indvars.iv19
+ %0 = trunc i64 %indvars.iv19 to i32
+ store i32 %0, i32* %arrayidx, align 4, !tbaa !0
+ %indvars.iv.next20 = add i64 %indvars.iv19, 1
+ %lftr.wideiv21 = trunc i64 %indvars.iv.next20 to i32
+ %exitcond22 = icmp eq i32 %lftr.wideiv21, 36
+ br i1 %exitcond22, label %for.body4, label %for.body
+
+for.body4: ; preds = %for.body, %for.body4
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body ]
+ %z1.016 = phi i32 [ %add, %for.body4 ], [ 0, %for.body ]
+ %arrayidx6 = getelementptr inbounds [36 x i32]* %stack, i64 0, i64 %indvars.iv
+ %1 = load i32* %arrayidx6, align 4, !tbaa !0
+ %add = add nsw i32 %1, %z1.016
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 36
+ br i1 %exitcond, label %for.end9, label %for.body4
+
+for.end9: ; preds = %for.body4
+ ret i32 %add
+}
+
+; Leaf function with no stack allocation but with saving restoring of
+; non-volatile registers.
+define i32 @foo3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #1 {
+entry:
+ %0 = load volatile i32* @bar, align 4, !tbaa !0
+ %1 = load volatile i32* @bar, align 4, !tbaa !0
+ %2 = load volatile i32* @bar, align 4, !tbaa !0
+ %3 = load volatile i32* @bar, align 4, !tbaa !0
+ %4 = load volatile i32* @bar, align 4, !tbaa !0
+ %5 = load volatile i32* @bar, align 4, !tbaa !0
+ %6 = load volatile i32* @bar, align 4, !tbaa !0
+ %7 = load volatile i32* @bar, align 4, !tbaa !0
+ %8 = load volatile i32* @bar, align 4, !tbaa !0
+ %9 = load volatile i32* @bar, align 4, !tbaa !0
+ %10 = load volatile i32* @bar, align 4, !tbaa !0
+ %11 = load volatile i32* @bar, align 4, !tbaa !0
+ %12 = load volatile i32* @bar, align 4, !tbaa !0
+ %13 = load volatile i32* @bar, align 4, !tbaa !0
+ %14 = load volatile i32* @bar, align 4, !tbaa !0
+ %15 = load volatile i32* @bar, align 4, !tbaa !0
+ %16 = load volatile i32* @bar, align 4, !tbaa !0
+ %17 = load volatile i32* @bar, align 4, !tbaa !0
+ %factor = mul i32 %h, -2
+ %factor56 = mul i32 %g, -2
+ %factor57 = mul i32 %f, -2
+ %factor58 = mul i32 %e, -2
+ %factor59 = mul i32 %d, -2
+ %factor60 = mul i32 %c, -2
+ %factor61 = mul i32 %b, -2
+ %sum = add i32 %1, %0
+ %sum62 = add i32 %sum, %2
+ %sum63 = add i32 %sum62, %3
+ %sum64 = add i32 %sum63, %4
+ %sum65 = add i32 %sum64, %5
+ %sum66 = add i32 %sum65, %6
+ %sum67 = add i32 %sum66, %7
+ %sum68 = add i32 %sum67, %8
+ %sum69 = add i32 %sum68, %9
+ %sum70 = add i32 %sum69, %10
+ %sum71 = add i32 %sum70, %11
+ %sum72 = add i32 %sum71, %12
+ %sum73 = add i32 %sum72, %13
+ %sum74 = add i32 %sum73, %14
+ %sum75 = add i32 %sum74, %15
+ %sum76 = add i32 %sum75, %16
+ %sub10 = sub i32 %17, %sum76
+ %sub11 = add i32 %sub10, %factor
+ %sub12 = add i32 %sub11, %factor56
+ %sub13 = add i32 %sub12, %factor57
+ %sub14 = add i32 %sub13, %factor58
+ %sub15 = add i32 %sub14, %factor59
+ %sub16 = add i32 %sub15, %factor60
+ %add = add i32 %sub16, %factor61
+ ret i32 %add
+}
+
+; Leaf function with stack allocation and saving/restoring of non-volatile
+; registers.
+define i32 @foo4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #0 {
+entry:
+ %stack = alloca [128 x i32], align 4
+ %0 = zext i32 %a to i64
+ br label %for.body
+
+for.cond2.preheader: ; preds = %for.body
+ %1 = sext i32 %f to i64
+ br label %for.body4
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %for.body ]
+ %2 = add nsw i64 %indvars.iv22, %0
+ %arrayidx = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %indvars.iv22
+ %3 = trunc i64 %2 to i32
+ store i32 %3, i32* %arrayidx, align 4, !tbaa !0
+ %indvars.iv.next23 = add i64 %indvars.iv22, 1
+ %lftr.wideiv25 = trunc i64 %indvars.iv.next23 to i32
+ %exitcond26 = icmp eq i32 %lftr.wideiv25, 128
+ br i1 %exitcond26, label %for.cond2.preheader, label %for.body
+
+for.body4: ; preds = %for.body4, %for.cond2.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond2.preheader ], [ %indvars.iv.next, %for.body4 ]
+ %z1.018 = phi i32 [ 0, %for.cond2.preheader ], [ %add8, %for.body4 ]
+ %4 = add nsw i64 %indvars.iv, %1
+ %arrayidx7 = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %4
+ %5 = load i32* %arrayidx7, align 4, !tbaa !0
+ %add8 = add nsw i32 %5, %z1.018
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.end11, label %for.body4
+
+for.end11: ; preds = %for.body4
+ ret i32 %add8
+}
+
+attributes #0 = { readnone "target-cpu"="cyclone" }
+attributes #1 = { "target-cpu"="cyclone" }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+
+; CHECK: .section __LD,__compact_unwind,regular,debug
+; CHECK: .quad _foo1 ; Range Start
+; CHECK: .long 33554432 ; Compact Unwind Encoding: 0x2000000
+; CHECK: .quad _foo2 ; Range Start
+; CHECK: .long 33591296 ; Compact Unwind Encoding: 0x2009000
+; CHECK: .quad _foo3 ; Range Start
+; CHECK: .long 33570831 ; Compact Unwind Encoding: 0x200400f
+; CHECK: .quad _foo4 ; Range Start
+; CHECK: .long 33689616 ; Compact Unwind Encoding: 0x2021010
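One way to read those encodings, assuming the UNWIND_ARM64_MODE_FRAMELESS layout from
compact_unwind_encoding.h (mode in bits 24..27, stack size in 16-byte units in bits
12..23, saved register-pair flags in the low bits); this breakdown is an inference from
those constants, not something the test itself asserts:

    0x2000000   frameless, no stack, no saved pairs              (foo1)
    0x2009000   frameless, 0x009 * 16 = 144 bytes of stack       (foo2: 36 x i32)
    0x200400f   frameless, 0x004 * 16 = 64 bytes, 4 pair flags   (foo3: callee-saved spills only)
    0x2021010   frameless, 0x021 * 16 = 528 bytes, 1 pair flag   (foo4: 128 x i32 plus one pair)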
diff --git a/test/CodeGen/ARM64/leaf.ll b/test/CodeGen/ARM64/leaf.ll
new file mode 100644
index 0000000..d3b2031
--- /dev/null
+++ b/test/CodeGen/ARM64/leaf.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+; rdar://12829704
+
+define void @t8() nounwind ssp {
+; CHECK-LABEL: t8:
+; CHECK-NOT: stp fp, lr, [sp, #-16]!
+; CHECK-NOT: mov fp, sp
+; CHECK: nop
+; CHECK-NOT: mov sp, fp
+; CHECK-NOT: ldp fp, lr, [sp], #16
+ tail call void asm sideeffect "nop", "~{v8}"() nounwind
+ ret void
+}
diff --git a/test/CodeGen/ARM64/lit.local.cfg b/test/CodeGen/ARM64/lit.local.cfg
new file mode 100644
index 0000000..48af100
--- /dev/null
+++ b/test/CodeGen/ARM64/lit.local.cfg
@@ -0,0 +1,11 @@
+import re
+
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'ARM64' in targets:
+ config.unsupported = True
+
+# For now we don't test arm64-win32.
+if re.search(r'cygwin|mingw32|win32', config.target_triple):
+ config.unsupported = True
diff --git a/test/CodeGen/ARM64/long-shift.ll b/test/CodeGen/ARM64/long-shift.ll
new file mode 100644
index 0000000..6f37044
--- /dev/null
+++ b/test/CodeGen/ARM64/long-shift.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+
+define i128 @shl(i128 %r, i128 %s) nounwind readnone {
+; CHECK-LABEL: shl:
+; CHECK: lslv [[XREG_0:x[0-9]+]], x1, x2
+; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
+; CHECK-NEXT: lsrv [[XREG_3:x[0-9]+]], x0, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]]
+; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64
+; CHECK-NEXT: lslv [[XREG_5:x[0-9]+]], x0, [[XREG_4]]
+; CHECK-NEXT: cmp [[XREG_4]], #0
+; CHECK-NEXT: csel x1, [[XREG_5]], [[XREG_6]], ge
+; CHECK-NEXT: lslv [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
+; CHECK-NEXT: csel x0, xzr, [[SMALLSHIFT_LO]], ge
+; CHECK-NEXT: ret
+
+ %shl = shl i128 %r, %s
+ ret i128 %shl
+}
+
+define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
+; CHECK: ashr:
+; CHECK: lsrv [[XREG_0:x[0-9]+]], x0, x2
+; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
+; CHECK-NEXT: lslv [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
+; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
+; CHECK-NEXT: asrv [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
+; CHECK-NEXT: cmp [[XREG_5]], #0
+; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge
+; CHECK-NEXT: asrv [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63
+; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge
+; CHECK-NEXT: ret
+
+ %shr = ashr i128 %r, %s
+ ret i128 %shr
+}
+
+define i128 @lshr(i128 %r, i128 %s) nounwind readnone {
+; CHECK: lshr:
+; CHECK: lsrv [[XREG_0:x[0-9]+]], x0, x2
+; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
+; CHECK-NEXT: lslv [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
+; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
+; CHECK-NEXT: lsrv [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
+; CHECK-NEXT: cmp [[XREG_5]], #0
+; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge
+; CHECK-NEXT: lsrv [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge
+; CHECK-NEXT: ret
+
+ %shr = lshr i128 %r, %s
+ ret i128 %shr
+}
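The CHECK sequences above follow the standard split expansion of a 128-bit shift into
64-bit halves. For shl with halves lo/hi and amount s (the names are mine, for
readability; what happens at s == 0 depends on the modulo-64 behaviour of the variable
shifts and is not exercised here):

    0 < s < 64 :  hi' = (hi << s) | (lo >> (64 - s))   lo' = lo << s
    s >= 64    :  hi' = lo << (s - 64)                  lo' = 0

The two csel instructions select between the small-shift and large-shift results based
on the sign of s - 64. lshr mirrors this on the low half, and ashr does the same but
fills the vacated high half with the sign, which is the asr x1, #63 feeding the final csel.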
diff --git a/test/CodeGen/ARM64/memcpy-inline.ll b/test/CodeGen/ARM64/memcpy-inline.ll
new file mode 100644
index 0000000..26f5166
--- /dev/null
+++ b/test/CodeGen/ARM64/memcpy-inline.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+
+%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
+
+@src = external global %struct.x
+@dst = external global %struct.x
+
+@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1
+@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1
+@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1
+@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1
+@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1
+@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1
+@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16
+
+define i32 @t0() {
+entry:
+; CHECK-LABEL: t0:
+; CHECK: ldrb [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #10]
+; CHECK: strb [[REG0]], [x[[BASEREG2:[0-9]+]], #10]
+; CHECK: ldrh [[REG1:w[0-9]+]], [x[[BASEREG]], #8]
+; CHECK: strh [[REG1]], [x[[BASEREG2]], #8]
+; CHECK: ldr [[REG2:x[0-9]+]],
+; CHECK: str [[REG2]],
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false)
+ ret i32 0
+}
+
+define void @t1(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
+; CHECK: stur [[DEST]], [x0, #15]
+; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
+; CHECK: str [[DEST]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false)
+ ret void
+}
+
+define void @t2(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: movz [[REG3:w[0-9]+]]
+; CHECK: movk [[REG3]],
+; CHECK: str [[REG3]], [x0, #32]
+; CHECK: ldp [[DEST1:q[0-9]+]], [[DEST2:q[0-9]+]], [x{{[0-9]+}}]
+; CHECK: stp [[DEST1]], [[DEST2]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
+ ret void
+}
+
+define void @t3(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16]
+; CHECK: str [[REG4]], [x0, #16]
+; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
+; CHECK: str [[DEST]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
+ ret void
+}
+
+define void @t4(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: orr [[REG5:w[0-9]+]], wzr, #0x20
+; CHECK: strh [[REG5]], [x0, #16]
+; CHECK: ldr [[REG6:q[0-9]+]], [x{{[0-9]+}}]
+; CHECK: str [[REG6]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
+ ret void
+}
+
+define void @t5(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: strb wzr, [x0, #6]
+; CHECK: movz [[REG7:w[0-9]+]], #21587
+; CHECK: strh [[REG7]], [x0, #4]
+; CHECK: movz [[REG8:w[0-9]+]],
+; CHECK: movk [[REG8]],
+; CHECK: str [[REG8]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
+ ret void
+}
+
+define void @t6() nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6]
+; CHECK: stur [[REG9]], [x{{[0-9]+}}, #6]
+; CHECK: ldr
+; CHECK: str
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false)
+ ret void
+}
+
+%struct.Foo = type { i32, i32, i32, i32 }
+
+define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
+entry:
+; CHECK: t7
+; CHECK: ldr [[REG10:q[0-9]+]], [x1]
+; CHECK: str [[REG10]], [x0]
+ %0 = bitcast %struct.Foo* %a to i8*
+ %1 = bitcast %struct.Foo* %b to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
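Several of these expansions rely on overlapping wide accesses rather than a tail of
small ones; t1, for example, copies 31 bytes with just two 16-byte SIMD accesses, the
second one shifted so the ranges overlap in the middle. A rough picture of what the
CHECK lines for t1 expect (the source base register is whatever holds @.str1; the
register choice here is illustrative):

    ldur q0, [x1, #15]    ; bytes 15..30 of the source
    stur q0, [x0, #15]
    ldr  q1, [x1]         ; bytes 0..15
    str  q1, [x0]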
diff --git a/test/CodeGen/ARM64/memset-inline.ll b/test/CodeGen/ARM64/memset-inline.ll
new file mode 100644
index 0000000..2e237f4
--- /dev/null
+++ b/test/CodeGen/ARM64/memset-inline.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define void @t1(i8* nocapture %c) nounwind optsize {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: str wzr, [x0, #8]
+; CHECK: str xzr, [x0]
+ call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
+ ret void
+}
+
+define void @t2() nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: strh wzr, [sp, #32]
+; CHECK: stp xzr, xzr, [sp, #16]
+; CHECK: str xzr, [sp, #8]
+ %buf = alloca [26 x i8], align 1
+ %0 = getelementptr inbounds [26 x i8]* %buf, i32 0, i32 0
+ call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
+ call void @something(i8* %0) nounwind
+ ret void
+}
+
+declare void @something(i8*) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/ARM64/memset-to-bzero.ll b/test/CodeGen/ARM64/memset-to-bzero.ll
new file mode 100644
index 0000000..b28122c
--- /dev/null
+++ b/test/CodeGen/ARM64/memset-to-bzero.ll
@@ -0,0 +1,101 @@
+; RUN: llc %s -march arm64 -o - | FileCheck %s
+; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
+
+; CHECK: @fct1
+; For small size (<= 256), we do not change memset to bzero.
+; CHECK: memset
+define void @fct1(i8* nocapture %ptr) {
+entry:
+ tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+; CHECK: @fct2
+; When the size is bigger than 256, change into bzero.
+; CHECK: bzero
+define void @fct2(i8* nocapture %ptr) {
+entry:
+ tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false)
+ ret void
+}
+
+; CHECK: @fct3
+; For unknown size, change to bzero.
+; CHECK: bzero
+define void @fct3(i8* nocapture %ptr, i32 %unknown) {
+entry:
+ %conv = sext i32 %unknown to i64
+ tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i32 1, i1 false)
+ ret void
+}
+
+; CHECK: @fct4
+; Size <= 256, no change.
+; CHECK: memset
+define void @fct4(i8* %ptr) {
+entry:
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
+ ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+declare i64 @llvm.objectsize.i64(i8*, i1)
+
+; CHECK: @fct5
+; Size > 256, change.
+; CHECK: bzero
+define void @fct5(i8* %ptr) {
+entry:
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
+ ret void
+}
+
+; CHECK: @fct6
+; Size = unknown, change.
+; CHECK: bzero
+define void @fct6(i8* %ptr, i32 %unknown) {
+entry:
+ %conv = sext i32 %unknown to i64
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 %conv, i64 %tmp)
+ ret void
+}
+
+; The following functions check that memset is not turned into bzero
+; when the stored constant is non-zero, regardless of the size.
+
+; CHECK: @fct7
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct7(i8* %ptr) {
+entry:
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
+ ret void
+}
+
+; CHECK: @fct8
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct8(i8* %ptr) {
+entry:
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp)
+ ret void
+}
+
+; CHECK: @fct9
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct9(i8* %ptr, i32 %unknown) {
+entry:
+ %conv = sext i32 %unknown to i64
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 %conv, i64 %tmp)
+ ret void
+}
diff --git a/test/CodeGen/ARM64/movi.ll b/test/CodeGen/ARM64/movi.ll
new file mode 100644
index 0000000..8fceccc
--- /dev/null
+++ b/test/CodeGen/ARM64/movi.ll
@@ -0,0 +1,202 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+;==--------------------------------------------------------------------------==
+; Tests for MOV-immediate implemented with ORR-immediate.
+;==--------------------------------------------------------------------------==
+
+; 64-bit immed with 32-bit pattern size, rotated by 0.
+define i64 @test64_32_rot0() nounwind {
+; CHECK: test64_32_rot0
+; CHECK: orr x0, xzr, #0x700000007
+ ret i64 30064771079
+}
+
+; 64-bit immed with 32-bit pattern size, rotated by 2.
+define i64 @test64_32_rot2() nounwind {
+; CHECK: test64_32_rot2
+; CHECK: orr x0, xzr, #0xc0000003c0000003
+ ret i64 13835058071388291075
+}
+
+; 64-bit immed with 4-bit pattern size, rotated by 3.
+define i64 @test64_4_rot3() nounwind {
+; CHECK: test64_4_rot3
+; CHECK: orr x0, xzr, #0xeeeeeeeeeeeeeeee
+ ret i64 17216961135462248174
+}
+
+; 32-bit immed with 32-bit pattern size, rotated by 16.
+define i32 @test32_32_rot16() nounwind {
+; CHECK: test32_32_rot16
+; CHECK: orr w0, wzr, #0xff0000
+ ret i32 16711680
+}
+
+; 32-bit immed with 2-bit pattern size, rotated by 1.
+define i32 @test32_2_rot1() nounwind {
+; CHECK: test32_2_rot1
+; CHECK: orr w0, wzr, #0xaaaaaaaa
+ ret i32 2863311530
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for MOVZ with MOVK.
+;==--------------------------------------------------------------------------==
+
+define i32 @movz() nounwind {
+; CHECK: movz
+; CHECK: movz w0, #5
+ ret i32 5
+}
+
+define i64 @movz_3movk() nounwind {
+; CHECK: movz_3movk
+; CHECK: movz x0, #5, lsl #48
+; CHECK-NEXT: movk x0, #4660, lsl #32
+; CHECK-NEXT: movk x0, #43981, lsl #16
+; CHECK-NEXT: movk x0, #22136
+ ret i64 1427392313513592
+}
+
+define i64 @movz_movk_skip1() nounwind {
+; CHECK: movz_movk_skip1
+; CHECK: movz x0, #5, lsl #32
+; CHECK-NEXT: movk x0, #17185, lsl #16
+ ret i64 22601072640
+}
+
+define i64 @movz_skip1_movk() nounwind {
+; CHECK: movz_skip1_movk
+; CHECK: movz x0, #34388, lsl #32
+; CHECK-NEXT: movk x0, #4660
+ ret i64 147695335379508
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for MOVN with MOVK.
+;==--------------------------------------------------------------------------==
+
+define i64 @movn() nounwind {
+; CHECK: movn
+; CHECK: movn x0, #41
+ ret i64 -42
+}
+
+define i64 @movn_skip1_movk() nounwind {
+; CHECK: movn_skip1_movk
+; CHECK: movn x0, #41, lsl #32
+; CHECK-NEXT: movk x0, #4660
+ ret i64 -176093720012
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for ORR with MOVK.
+;==--------------------------------------------------------------------------==
+; rdar://14987673
+
+define i64 @orr_movk1() nounwind {
+; CHECK: orr_movk1
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #57005, lsl #16
+ ret i64 72056498262245120
+}
+
+define i64 @orr_movk2() nounwind {
+; CHECK: orr_movk2
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #57005, lsl #48
+ ret i64 -2400982650836746496
+}
+
+define i64 @orr_movk3() nounwind {
+; CHECK: orr_movk3
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #57005, lsl #32
+ ret i64 72020953688702720
+}
+
+define i64 @orr_movk4() nounwind {
+; CHECK: orr_movk4
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #57005
+ ret i64 72056494543068845
+}
+
+; rdar://14987618
+define i64 @orr_movk5() nounwind {
+; CHECK: orr_movk5
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #57005, lsl #16
+ ret i64 -71777214836900096
+}
+
+define i64 @orr_movk6() nounwind {
+; CHECK: orr_movk6
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #57005, lsl #48
+ ret i64 -2400982647117578496
+}
+
+define i64 @orr_movk7() nounwind {
+; CHECK: orr_movk7
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #57005, lsl #48
+ ret i64 -2400982646575268096
+}
+
+define i64 @orr_movk8() nounwind {
+; CHECK: orr_movk8
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #57005
+; CHECK: movk x0, #57005, lsl #48
+ ret i64 -2400982646575276371
+}
+
+; rdar://14987715
+define i64 @orr_movk9() nounwind {
+; CHECK: orr_movk9
+; CHECK: orr x0, xzr, #0xffffff000000000
+; CHECK: movk x0, #65280
+; CHECK: movk x0, #57005, lsl #16
+ ret i64 1152921439623315200
+}
+
+define i64 @orr_movk10() nounwind {
+; CHECK: orr_movk10
+; CHECK: orr x0, xzr, #0xfffffffffffff00
+; CHECK: movk x0, #57005, lsl #16
+ ret i64 1152921504047824640
+}
+
+define i64 @orr_movk11() nounwind {
+; CHECK: orr_movk11
+; CHECK: orr x0, xzr, #0xfff00000000000ff
+; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #65535, lsl #32
+ ret i64 -4222125209747201
+}
+
+define i64 @orr_movk12() nounwind {
+; CHECK: orr_movk12
+; CHECK: orr x0, xzr, #0xfff00000000000ff
+; CHECK: movk x0, #57005, lsl #32
+ ret i64 -4258765016661761
+}
+
+define i64 @orr_movk13() nounwind {
+; CHECK: orr_movk13
+; CHECK: orr x0, xzr, #0xfffff000000
+; CHECK: movk x0, #57005
+; CHECK: movk x0, #57005, lsl #48
+ ret i64 -2401245434149282131
+}
+
+; rdar://13944082
+define i64 @g() nounwind {
+; CHECK: g
+; CHECK: movz x0, #65535, lsl #48
+; CHECK: movk x0, #2
+entry:
+ ret i64 -281474976710654
+}
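As a worked example of the two encodings being tested (my arithmetic, not part of the
test): 30064771079 is 0x0000000700000007, i.e. the 32-bit chunk 0x00000007 repeated
across the register, which is why it fits a single logical-immediate ORR. By contrast,
1427392313513592 is 0x00051234ABCD5678, which has no such repeating pattern, so it is
built 16 bits at a time:

    movz x0, #5, lsl #48        ; 0x0005000000000000
    movk x0, #4660, lsl #32     ; insert 0x1234
    movk x0, #43981, lsl #16    ; insert 0xabcd
    movk x0, #22136             ; insert 0x5678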
diff --git a/test/CodeGen/ARM64/mul.ll b/test/CodeGen/ARM64/mul.ll
new file mode 100644
index 0000000..2e7986d
--- /dev/null
+++ b/test/CodeGen/ARM64/mul.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; rdar://9296808
+; rdar://9349137
+
+define i128 @t1(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: umulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = zext i64 %a to i128
+ %tmp2 = zext i64 %b to i128
+ %tmp3 = mul i128 %tmp1, %tmp2
+ ret i128 %tmp3
+}
+
+define i128 @t2(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: smulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = sext i64 %a to i128
+ %tmp2 = sext i64 %b to i128
+ %tmp3 = mul i128 %tmp1, %tmp2
+ ret i128 %tmp3
+}
+
+define i64 @t3(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: umull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = zext i32 %a to i64
+ %tmp2 = zext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define i64 @t4(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: smull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = sext i32 %a to i64
+ %tmp2 = sext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define i64 @t5(i32 %a, i32 %b, i64 %c) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: umaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = zext i32 %a to i64
+ %tmp2 = zext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ %tmp4 = add i64 %c, %tmp3
+ ret i64 %tmp4
+}
+
+define i64 @t6(i32 %a, i32 %b, i64 %c) nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = sext i32 %a to i64
+ %tmp2 = sext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ %tmp4 = sub i64 %c, %tmp3
+ ret i64 %tmp4
+}
+
+define i64 @t7(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t7:
+; CHECK: umnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = zext i32 %a to i64
+ %tmp2 = zext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ %tmp4 = sub i64 0, %tmp3
+ ret i64 %tmp4
+}
+
+define i64 @t8(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: smnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = sext i32 %a to i64
+ %tmp2 = sext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ %tmp4 = sub i64 0, %tmp3
+ ret i64 %tmp4
+}
diff --git a/test/CodeGen/ARM64/neg.ll b/test/CodeGen/ARM64/neg.ll
new file mode 100644
index 0000000..659ce98
--- /dev/null
+++ b/test/CodeGen/ARM64/neg.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+define i32 @test_neg_i32(i32 %in) {
+; CHECK-LABEL: test_neg_i32:
+; CHECK: neg w0, w0
+ %res = sub i32 0, %in
+ ret i32 %res
+}
+
+define i64 @test_neg_i64(i64 %in) {
+; CHECK-LABEL: test_neg_i64:
+; CHECK: neg x0, x0
+ %res = sub i64 0, %in
+ ret i64 %res
+}
+
+define <8 x i8> @test_neg_v8i8(<8 x i8> %in) {
+; CHECK-LABEL: test_neg_v8i8:
+; CHECK: neg v0.8b, v0.8b
+ %res = sub <8 x i8> zeroinitializer, %in
+ ret <8 x i8> %res
+}
+
+define <4 x i16> @test_neg_v4i16(<4 x i16> %in) {
+; CHECK-LABEL: test_neg_v4i16:
+; CHECK: neg v0.4h, v0.4h
+ %res = sub <4 x i16> zeroinitializer, %in
+ ret <4 x i16> %res
+}
+
+define <2 x i32> @test_neg_v2i32(<2 x i32> %in) {
+; CHECK-LABEL: test_neg_v2i32:
+; CHECK: neg v0.2s, v0.2s
+ %res = sub <2 x i32> zeroinitializer, %in
+ ret <2 x i32> %res
+}
+
+define <16 x i8> @test_neg_v16i8(<16 x i8> %in) {
+; CHECK-LABEL: test_neg_v16i8:
+; CHECK: neg v0.16b, v0.16b
+ %res = sub <16 x i8> zeroinitializer, %in
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @test_neg_v8i16(<8 x i16> %in) {
+; CHECK-LABEL: test_neg_v8i16:
+; CHECK: neg v0.8h, v0.8h
+ %res = sub <8 x i16> zeroinitializer, %in
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @test_neg_v4i32(<4 x i32> %in) {
+; CHECK-LABEL: test_neg_v4i32:
+; CHECK: neg v0.4s, v0.4s
+ %res = sub <4 x i32> zeroinitializer, %in
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @test_neg_v2i64(<2 x i64> %in) {
+; CHECK-LABEL: test_neg_v2i64:
+; CHECK: neg v0.2d, v0.2d
+ %res = sub <2 x i64> zeroinitializer, %in
+ ret <2 x i64> %res
+}
+
+define <1 x i64> @test_neg_v1i64(<1 x i64> %in) {
+; CHECK-LABEL: test_neg_v1i64:
+; CHECK: neg d0, d0
+ %res = sub <1 x i64> zeroinitializer, %in
+ ret <1 x i64> %res
+}
diff --git a/test/CodeGen/ARM64/neon-compare-instructions.ll b/test/CodeGen/ARM64/neon-compare-instructions.ll
new file mode 100644
index 0000000..55f7b99
--- /dev/null
+++ b/test/CodeGen/ARM64/neon-compare-instructions.ll
@@ -0,0 +1,1191 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s
+
+define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp eq <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp eq <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp eq <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp eq <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp eq <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp eq <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp eq <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp sgt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp sgt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp sgt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp sgt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp sgt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp sgt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp sgt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp slt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp slt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp slt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp slt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp slt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp slt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp slt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp sge <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp sge <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp sge <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp sge <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp sge <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp sge <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp sge <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp sle <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp sle <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp sle <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp sle <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp sle <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp sle <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp sle <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ugt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ugt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp ugt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp ugt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp ugt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp ugt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp ugt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp ult <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp ult <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp ult <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ult <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ult <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ult <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ult <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp uge <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp uge <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp uge <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp uge <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp uge <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp uge <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp uge <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp ule <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp ule <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp ule <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ule <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ule <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ule <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ule <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmeqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp eq <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp eq <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp eq <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp eq <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp eq <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp eq <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp eq <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgez8xi8(<8 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp sge <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgez16xi8(<16 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp sge <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgez4xi16(<4 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp sge <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgez8xi16(<8 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp sge <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgez2xi32(<2 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp sge <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgez4xi32(<4 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp sge <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp sge <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgtz8xi8(<8 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp sgt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgtz16xi8(<16 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp sgt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgtz4xi16(<4 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp sgt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgtz8xi16(<8 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp sgt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgtz2xi32(<2 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp sgt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgtz4xi32(<4 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp sgt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgtz2xi64(<2 x i64> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp sgt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlez8xi8(<8 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp sle <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlez16xi8(<16 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp sle <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlez4xi16(<4 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp sle <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlez8xi16(<8 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp sle <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlez2xi32(<2 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp sle <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlez4xi32(<4 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp sle <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlez2xi64(<2 x i64> %A) {
+;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp sle <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmltz8xi8(<8 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp slt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmltz16xi8(<16 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp slt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmltz4xi16(<4 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp slt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmltz8xi16(<8 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp slt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmltz2xi32(<2 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp slt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmltz4xi32(<4 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp slt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp slt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
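+; There is no compare-not-equal instruction, so the "ne" comparisons below are
+; expected to select as a cmeq against #0 followed by a vector not.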
+define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
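+; Unlike the signed compares, cmhs/cmhi have no immediate #0 form, so a zero
+; vector is expected to be materialised with movi first.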
+define <8 x i8> @cmhsz8xi8(<8 x i8> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, v[[ZERO]].8b
+ %tmp3 = icmp uge <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhsz16xi8(<16 x i8> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, v[[ZERO]].16b
+ %tmp3 = icmp uge <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhsz4xi16(<4 x i16> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v[[ZERO]].4h
+ %tmp3 = icmp uge <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhsz8xi16(<8 x i16> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v[[ZERO]].8h
+ %tmp3 = icmp uge <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhsz2xi32(<2 x i32> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, v[[ZERO]].2s
+ %tmp3 = icmp uge <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhsz4xi32(<4 x i32> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, v[[ZERO]].4s
+ %tmp3 = icmp uge <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, v[[ZERO]].2d
+ %tmp3 = icmp uge <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmhiz8xi8(<8 x i8> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, v[[ZERO]].8b
+ %tmp3 = icmp ugt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhiz16xi8(<16 x i8> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, v[[ZERO]].16b
+ %tmp3 = icmp ugt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhiz4xi16(<4 x i16> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v[[ZERO]].4h
+ %tmp3 = icmp ugt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhiz8xi16(<8 x i16> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v[[ZERO]].8h
+ %tmp3 = icmp ugt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhiz2xi32(<2 x i32> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, v[[ZERO]].2s
+ %tmp3 = icmp ugt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhiz4xi32(<4 x i32> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, v[[ZERO]].4s
+ %tmp3 = icmp ugt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, v[[ZERO]].2d
+ %tmp3 = icmp ugt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v[[ZERO]].8b, v0.8b
+ %tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v[[ZERO]].16b, v0.16b
+ %tmp3 = icmp ule <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
+ %tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v[[ZERO]].8h, v0.8h
+ %tmp3 = icmp ule <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
+ %tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlsz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v[[ZERO]].4s, v0.4s
+ %tmp3 = icmp ule <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlsz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v[[ZERO]].2d, v0.2d
+ %tmp3 = icmp ule <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmloz8xi8(<8 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v[[ZERO]].8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ult <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmloz16xi8(<16 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v[[ZERO]].16b, v0.16b
+ %tmp3 = icmp ult <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmloz4xi16(<4 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
+ %tmp3 = icmp ult <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmloz8xi16(<8 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v[[ZERO]].8h, v0.8h
+ %tmp3 = icmp ult <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmloz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
+ %tmp3 = icmp ult <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmloz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v[[ZERO]].4s, v0.4s
+ %tmp3 = icmp ult <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v[[ZERO]].2d, v0.2d
+ %tmp3 = icmp ult <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <1 x i64> @cmeqz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmeqz_v1i64:
+; CHECK: cmeq d0, d0, #0
+ %tst = icmp eq <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmgez_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmgez_v1i64:
+; CHECK: cmge d0, d0, #0
+ %tst = icmp sge <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmgtz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmgtz_v1i64:
+; CHECK: cmgt d0, d0, #0
+ %tst = icmp sgt <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmlez_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmlez_v1i64:
+; CHECK: cmle d0, d0, #0
+ %tst = icmp sle <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmltz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmltz_v1i64:
+; CHECK: cmlt d0, d0, #0
+ %tst = icmp slt <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmeqz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmeqz_v1f64:
+; CHECK: fcmeq d0, d0, #0
+ %tst = fcmp oeq <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgez_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmgez_v1f64:
+; CHECK: fcmge d0, d0, #0
+ %tst = fcmp oge <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgtz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmgtz_v1f64:
+; CHECK: fcmgt d0, d0, #0
+ %tst = fcmp ogt <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmlez_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmlez_v1f64:
+; CHECK: fcmle d0, d0, #0
+ %tst = fcmp ole <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmltz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmltz_v1f64:
+; CHECK: fcmlt d0, d0, #0
+ %tst = fcmp olt <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
diff --git a/test/CodeGen/ARM64/patchpoint.ll b/test/CodeGen/ARM64/patchpoint.ll
new file mode 100644
index 0000000..993e3eb
--- /dev/null
+++ b/test/CodeGen/ARM64/patchpoint.ll
@@ -0,0 +1,163 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=0 | FileCheck %s
+
+; Trivial patchpoint codegen
+;
+define i64 @trivial_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: trivial_patchpoint_codegen:
+; CHECK: movz x16, #57005, lsl #32
+; CHECK-NEXT: movk x16, #48879, lsl #16
+; CHECK-NEXT: movk x16, #51966
+; CHECK-NEXT: blr x16
+; CHECK: movz x16, #57005, lsl #32
+; CHECK-NEXT: movk x16, #48879, lsl #16
+; CHECK-NEXT: movk x16, #51967
+; CHECK-NEXT: blr x16
+; CHECK: ret
+ %resolveCall2 = inttoptr i64 244837814094590 to i8*
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %resolveCall3 = inttoptr i64 244837814094591 to i8*
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ ret i64 %result
+}
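+; Note: the movz/movk immediates above are simply the 16-bit chunks of the
+; patchpoint targets: 244837814094590 is 0xDEADBEEFCAFE (0xDEAD = 57005,
+; 0xBEEF = 48879, 0xCAFE = 51966) and 244837814094591 is 0xDEADBEEFCAFF,
+; hence the final movk #51967 in the second sequence.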
+
+; Caller frame metadata with stackmaps. This should not be optimized
+; as a leaf function.
+;
+; CHECK-LABEL: caller_meta_leaf
+; CHECK: mov fp, sp
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK: Ltmp
+; CHECK: mov sp, fp
+; CHECK: ret
+
+define void @caller_meta_leaf() {
+entry:
+ %metadata = alloca i64, i32 3, align 8
+ store i64 11, i64* %metadata
+ store i64 12, i64* %metadata
+ store i64 13, i64* %metadata
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+ ret void
+}
+
+; Test the webkit_jscc calling convention.
+; One argument will be passed in a register, the other will be pushed on the stack.
+; Return value in x0.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK: Ltmp
+; CHECK: str x{{.+}}, [sp]
+; CHECK-NEXT: mov x0, x{{.+}}
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #65535, lsl #32
+; CHECK-NEXT: movk x16, #57005, lsl #16
+; CHECK-NEXT: movk x16, #48879
+; CHECK-NEXT: blr x16
+ %resolveCall2 = inttoptr i64 281474417671919 to i8*
+ %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+ %resolveCall3 = inttoptr i64 244837814038255 to i8*
+ tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+ ret void
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
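+; As the stored offsets below suggest, each i64 stack argument takes an 8-byte
+; slot and each i32 a 4-byte slot, so 2, 4 and 6 land at [sp], [sp, #16] and
+; [sp, #24], while the undef arguments leave gaps that are never written.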
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK: Ltmp
+; CHECK: orr x{{.+}}, xzr, #0x6
+; CHECK-NEXT: str x{{.+}}, [sp, #24]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
+; CHECK-NEXT: str w{{.+}}, [sp, #16]
+; CHECK-NEXT: orr x{{.+}}, xzr, #0x2
+; CHECK-NEXT: str x{{.+}}, [sp]
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #65535, lsl #32
+; CHECK-NEXT: movk x16, #57005, lsl #16
+; CHECK-NEXT: movk x16, #48879
+; CHECK-NEXT: blr x16
+ %call = inttoptr i64 281474417671919 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+ ret i64 %result
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK: Ltmp
+; CHECK: movz x{{.+}}, #10
+; CHECK-NEXT: str x{{.+}}, [sp, #48]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x8
+; CHECK-NEXT: str w{{.+}}, [sp, #36]
+; CHECK-NEXT: orr x{{.+}}, xzr, #0x6
+; CHECK-NEXT: str x{{.+}}, [sp, #24]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
+; CHECK-NEXT: str w{{.+}}, [sp, #16]
+; CHECK-NEXT: orr x{{.+}}, xzr, #0x2
+; CHECK-NEXT: str x{{.+}}, [sp]
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #65535, lsl #32
+; CHECK-NEXT: movk x16, #57005, lsl #16
+; CHECK-NEXT: movk x16, #48879
+; CHECK-NEXT: blr x16
+ %call = inttoptr i64 281474417671919 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+ ret i64 %result
+}
+
+; Test patchpoints reusing the same TargetConstant.
+; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
+; There is no way to verify this, since it depends on memory allocation.
+; But I think it's useful to include as a working example.
+define i64 @testLowerConstant(i64 %arg, i64 %tmp2, i64 %tmp10, i64* %tmp33, i64 %tmp79) {
+entry:
+ %tmp80 = add i64 %tmp79, -16
+ %tmp81 = inttoptr i64 %tmp80 to i64*
+ %tmp82 = load i64* %tmp81, align 8
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ %tmp83 = load i64* %tmp33, align 8
+ %tmp84 = add i64 %tmp83, -24
+ %tmp85 = inttoptr i64 %tmp84 to i64*
+ %tmp86 = load i64* %tmp85, align 8
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ ret i64 10
+}
+
+; Test small patchpoints that don't emit calls.
+define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: small_patchpoint_codegen:
+; CHECK: Ltmp
+; CHECK: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: ldp
+; CHECK-NEXT: ret
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
+ ret void
+}
+
+; Test that scratch registers are spilled around patchpoints
+; CHECK: InlineAsm End
+; CHECK-NEXT: mov x{{[0-9]+}}, x16
+; CHECK-NEXT: mov x{{[0-9]+}}, x17
+; CHECK-NEXT: Ltmp
+; CHECK-NEXT: nop
+define void @clobberScratch(i32* %p) {
+ %v = load i32* %p
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+ store i32 %v, i32* %p
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/ARM64/platform-reg.ll b/test/CodeGen/ARM64/platform-reg.ll
new file mode 100644
index 0000000..651c793
--- /dev/null
+++ b/test/CodeGen/ARM64/platform-reg.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+; x18 is reserved as a platform register on Darwin but not on other
+; systems. Create loads of register pressure and make sure this is respected.
+
+; Also, fp must always refer to a valid frame record, even if it's not the one
+; of the current function, so it shouldn't be used either.
+
+@var = global [30 x i64] zeroinitializer
+
+define void @keep_live() {
+ %val = load volatile [30 x i64]* @var
+ store volatile [30 x i64] %val, [30 x i64]* @var
+
+; CHECK: ldr x18
+; CHECK: str x18
+
+; CHECK-DARWIN-NOT: ldr fp
+; CHECK-DARWIN-NOT: ldr x18
+; CHECK-DARWIN: Spill
+; CHECK-DARWIN-NOT: ldr fp
+; CHECK-DARWIN-NOT: ldr x18
+; CHECK-DARWIN: ret
+ ret void
+}
diff --git a/test/CodeGen/ARM64/popcnt.ll b/test/CodeGen/ARM64/popcnt.ll
new file mode 100644
index 0000000..9bbba09c
--- /dev/null
+++ b/test/CodeGen/ARM64/popcnt.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
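+; With AdvSIMD available, ctpop is expected to lower to: move the GPR value
+; into a vector register, count the set bits per byte with cnt.8b, then sum
+; the byte counts with uaddlv.8b.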
+define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
+ %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %cnt
+; CHECK: fmov s0, w0
+; CHECK: cnt.8b v0, v0
+; CHECK: uaddlv.8b h0, v0
+; CHECK: fmov w0, s0
+; CHECK: ret
+}
+
+define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
+ %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
+ ret i64 %cnt
+; CHECK: fmov d0, x0
+; CHECK: cnt.8b v0, v0
+; CHECK: uaddlv.8b h0, v0
+; CHECK: fmov w0, s0
+; CHECK: ret
+}
+
+; Do not use AdvSIMD when -mno-implicit-float is specified.
+; rdar://9473858
+
+define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
+ %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %cnt
+; CHECK-LABEL: cnt32:
+; CHECK-NOT: 16b
+; CHECK: ret
+}
+
+define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
+ %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
+ ret i64 %cnt
+; CHECK-LABEL: cnt64:
+; CHECK-NOT: 16b
+; CHECK: ret
+}
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
diff --git a/test/CodeGen/ARM64/prefetch.ll b/test/CodeGen/ARM64/prefetch.ll
new file mode 100644
index 0000000..b2e06ed
--- /dev/null
+++ b/test/CodeGen/ARM64/prefetch.ll
@@ -0,0 +1,88 @@
+; RUN: llc %s -march arm64 -o - | FileCheck %s
+
+@a = common global i32* null, align 8
+
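+; The llvm.prefetch operands map onto the hint mnemonics checked below: the
+; second argument selects pld (0, read) versus pst (1, write), and the third
+; (locality 0-3) selects the strm, l3keep, l2keep and l1keep variants. The
+; stack slot uses the unscaled prfum form, the array accesses use prfm.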
+define void @test(i32 %i, i32 %j) nounwind ssp {
+entry:
+ ; CHECK: @test
+ %j.addr = alloca i32, align 4
+ store i32 %j, i32* %j.addr, align 4, !tbaa !0
+ %tmp = bitcast i32* %j.addr to i8*
+ ; CHECK: prfum pldl1strm
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 0, i32 1)
+ ; CHECK: prfum pldl3keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 1, i32 1)
+ ; CHECK: prfum pldl2keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 2, i32 1)
+ ; CHECK: prfum pldl1keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 1)
+
+ ; CHECK: prfum pstl1strm
+ call void @llvm.prefetch(i8* %tmp, i32 1, i32 0, i32 1)
+ ; CHECK: prfum pstl3keep
+ call void @llvm.prefetch(i8* %tmp, i32 1, i32 1, i32 1)
+ ; CHECK: prfum pstl2keep
+ call void @llvm.prefetch(i8* %tmp, i32 1, i32 2, i32 1)
+ ; CHECK: prfum pstl1keep
+ call void @llvm.prefetch(i8* %tmp, i32 1, i32 3, i32 1)
+
+ %tmp1 = load i32* %j.addr, align 4, !tbaa !0
+ %add = add nsw i32 %tmp1, %i
+ %idxprom = sext i32 %add to i64
+ %tmp2 = load i32** @a, align 8, !tbaa !3
+ %arrayidx = getelementptr inbounds i32* %tmp2, i64 %idxprom
+ %tmp3 = bitcast i32* %arrayidx to i8*
+
+ ; CHECK: prfm pldl1strm
+ call void @llvm.prefetch(i8* %tmp3, i32 0, i32 0, i32 1)
+ %tmp4 = load i32** @a, align 8, !tbaa !3
+ %arrayidx3 = getelementptr inbounds i32* %tmp4, i64 %idxprom
+ %tmp5 = bitcast i32* %arrayidx3 to i8*
+
+ ; CHECK: prfm pldl3keep
+ call void @llvm.prefetch(i8* %tmp5, i32 0, i32 1, i32 1)
+ %tmp6 = load i32** @a, align 8, !tbaa !3
+ %arrayidx6 = getelementptr inbounds i32* %tmp6, i64 %idxprom
+ %tmp7 = bitcast i32* %arrayidx6 to i8*
+
+ ; CHECK: prfm pldl2keep
+ call void @llvm.prefetch(i8* %tmp7, i32 0, i32 2, i32 1)
+ %tmp8 = load i32** @a, align 8, !tbaa !3
+ %arrayidx9 = getelementptr inbounds i32* %tmp8, i64 %idxprom
+ %tmp9 = bitcast i32* %arrayidx9 to i8*
+
+ ; CHECK: prfm pldl1keep
+ call void @llvm.prefetch(i8* %tmp9, i32 0, i32 3, i32 1)
+ %tmp10 = load i32** @a, align 8, !tbaa !3
+ %arrayidx12 = getelementptr inbounds i32* %tmp10, i64 %idxprom
+ %tmp11 = bitcast i32* %arrayidx12 to i8*
+
+ ; CHECK: prfm pstl1strm
+ call void @llvm.prefetch(i8* %tmp11, i32 1, i32 0, i32 1)
+ %tmp12 = load i32** @a, align 8, !tbaa !3
+ %arrayidx15 = getelementptr inbounds i32* %tmp12, i64 %idxprom
+ %tmp13 = bitcast i32* %arrayidx15 to i8*
+
+ ; CHECK: prfm pstl3keep
+ call void @llvm.prefetch(i8* %tmp13, i32 1, i32 1, i32 1)
+ %tmp14 = load i32** @a, align 8, !tbaa !3
+ %arrayidx18 = getelementptr inbounds i32* %tmp14, i64 %idxprom
+ %tmp15 = bitcast i32* %arrayidx18 to i8*
+
+ ; CHECK: prfm pstl2keep
+ call void @llvm.prefetch(i8* %tmp15, i32 1, i32 2, i32 1)
+ %tmp16 = load i32** @a, align 8, !tbaa !3
+ %arrayidx21 = getelementptr inbounds i32* %tmp16, i64 %idxprom
+ %tmp17 = bitcast i32* %arrayidx21 to i8*
+
+ ; CHECK: prfm pstl1keep
+ call void @llvm.prefetch(i8* %tmp17, i32 1, i32 3, i32 1)
+ ret void
+}
+
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/ARM64/promote-const.ll b/test/CodeGen/ARM64/promote-const.ll
new file mode 100644
index 0000000..4a336db
--- /dev/null
+++ b/test/CodeGen/ARM64/promote-const.ll
@@ -0,0 +1,255 @@
+; Disable machine CSE to stress the different paths of the algorithm.
+; Otherwise, we always fall into the simple case, i.e., only one definition.
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-stress-promote-const | FileCheck -check-prefix=PROMOTED %s
+; The REGULAR run just checks that the inputs passed to promote const expose
+; the appropriate patterns.
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-promote-const=false | FileCheck -check-prefix=REGULAR %s
+
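+; "Promotion" here means the constant is hoisted into a global
+; (__PromotedConst...) addressed via adrp/add, rather than being loaded from a
+; per-use constant-pool entry (lCP...).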
+%struct.uint8x16x4_t = type { [4 x <16 x i8>] }
+
+; Constant is a structure
+define %struct.uint8x16x4_t @test1() {
+; PROMOTED-LABEL: test1:
+; The promote-constant pass has created one big constant for the whole structure
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], __PromotedConst@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], __PromotedConst@PAGEOFF
+; Destination registers are defined by the ABI
+; PROMOTED-NEXT: ldp q0, q1, {{\[}}[[BASEADDR]]]
+; PROMOTED-NEXT: ldp q2, q3, {{\[}}[[BASEADDR]], #32]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test1:
+; Regular access is quite bad: it performs 4 loads, one for each chunk of
+; the structure.
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; Destination registers are defined by the ABI
+; REGULAR: ldr q0, {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR: ldr q1, {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR2:x[0-9]+]], [[CSTLABEL2:lCP.*]]@PAGE
+; REGULAR: ldr q2, {{\[}}[[PAGEADDR2]], [[CSTLABEL2]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR3:x[0-9]+]], [[CSTLABEL3:lCP.*]]@PAGE
+; REGULAR: ldr q3, {{\[}}[[PAGEADDR3]], [[CSTLABEL3]]@PAGEOFF]
+; REGULAR-NEXT: ret
+entry:
+ ret %struct.uint8x16x4_t { [4 x <16 x i8>] [<16 x i8> <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>, <16 x i8> <i8 32, i8 124, i8 121, i8 120, i8 8, i8 117, i8 -56, i8 113, i8 -76, i8 110, i8 -53, i8 107, i8 7, i8 105, i8 103, i8 102>, <16 x i8> <i8 -24, i8 99, i8 -121, i8 97, i8 66, i8 95, i8 24, i8 93, i8 6, i8 91, i8 12, i8 89, i8 39, i8 87, i8 86, i8 85>, <16 x i8> <i8 -104, i8 83, i8 -20, i8 81, i8 81, i8 80, i8 -59, i8 78, i8 73, i8 77, i8 -37, i8 75, i8 122, i8 74, i8 37, i8 73>] }
+}
+
+; Two different uses of the same constant in the same basic block
+define <16 x i8> @test2(<16 x i8> %arg) {
+entry:
+; PROMOTED-LABEL: test2:
+; In stress mode, constant vectors are promoted
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1:__PromotedConst[0-9]+]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: mla.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test2:
+; Regular access is strictly the same as promoted access.
+; The difference is that the address (and thus the space in memory) is not
+; shared between constants
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: mla.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: ret
+ %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %mul.i = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %add.i9 = add <16 x i8> %add.i, %mul.i
+ ret <16 x i8> %add.i9
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; where one dominates the other.
+define <16 x i8> @test3(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test3:
+; In stress mode, constant vectors are promoted
+; Since the constant is the same as in the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: cbnz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV2:__PromotedConst[0-9]+]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV2]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM]], {{\[}}[[BASEADDR]]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: mul.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; PROMOTED-NEXT: add.16b v0, v0, [[DESTV]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test3:
+; Regular mode does not eliminate common subexpressions on its own.
+; In other words, the same load appears several times.
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL1:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL1]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: cbz w0, [[LABELelse:LBB.*]]
+; Next BB
+; Redundant load
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL1]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL1]]@PAGEOFF]
+; REGULAR-NEXT: b [[LABELend:LBB.*]]
+; Next BB
+; REGULAR-NEXT: [[LABELelse]]
+; REGULAR-NEXT: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL2:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL2]]@PAGEOFF]
+; Next BB
+; REGULAR-NEXT: [[LABELend]]:
+; REGULAR-NEXT: mul.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; REGULAR-NEXT: add.16b v0, v0, [[DESTV]]
+; REGULAR-NEXT: ret
+entry:
+ %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %tobool = icmp eq i32 %path, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %mul.i13 = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ br label %if.end
+
+if.else: ; preds = %entry
+ %mul.i = mul <16 x i8> %add.i, <i8 -24, i8 99, i8 -121, i8 97, i8 66, i8 95, i8 24, i8 93, i8 6, i8 91, i8 12, i8 89, i8 39, i8 87, i8 86, i8 85>
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %ret2.0 = phi <16 x i8> [ %mul.i13, %if.then ], [ %mul.i, %if.else ]
+ %add.i12 = add <16 x i8> %add.i, %ret2.0
+ ret <16 x i8> %add.i12
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; where neither dominates the other.
+define <16 x i8> @test4(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test4:
+; In stress mode, constant vectors are promoted
+; Since the constant is the same as in the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: mul.16b v0, v0, v[[REGNUM]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: ret
+
+
+; REGULAR-LABEL: test4:
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL3:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL3]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; Redundant expression
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL3]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL3]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: mul.16b v0, v0, v[[REGNUM]]
+; Next BB
+; REGULAR-NEXT: [[LABEL]]:
+; REGULAR-NEXT: ret
+entry:
+ %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %tobool = icmp eq i32 %path, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %mul.i = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %ret.0 = phi <16 x i8> [ %mul.i, %if.then ], [ %add.i, %entry ]
+ ret <16 x i8> %ret.0
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; one of which is in a phi.
+define <16 x i8> @test5(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test5:
+; In stress mode, constant vectors are promoted
+; Since the constant is the same as in the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; PROMOTED-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: add.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; PROMOTED-NEXT: mul.16b v[[REGNUM]], [[DESTV]], v[[REGNUM]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: mul.16b [[TMP1:v[0-9]+]], v[[REGNUM]], v[[REGNUM]]
+; PROMOTED-NEXT: mul.16b [[TMP2:v[0-9]+]], [[TMP1]], [[TMP1]]
+; PROMOTED-NEXT: mul.16b [[TMP3:v[0-9]+]], [[TMP2]], [[TMP2]]
+; PROMOTED-NEXT: mul.16b v0, [[TMP3]], [[TMP3]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test5:
+; REGULAR: cbz w0, [[LABELelse:LBB.*]]
+; Next BB
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR-NEXT: add.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; REGULAR-NEXT: mul.16b v[[DESTREGNUM:[0-9]+]], [[DESTV]], v[[REGNUM]]
+; REGULAR-NEXT: b [[LABELend:LBB.*]]
+; Next BB
+; REGULAR-NEXT: [[LABELelse]]
+; REGULAR-NEXT: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[DESTREGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; Next BB
+; REGULAR-NEXT: [[LABELend]]:
+; REGULAR-NEXT: mul.16b [[TMP1:v[0-9]+]], v[[DESTREGNUM]], v[[DESTREGNUM]]
+; REGULAR-NEXT: mul.16b [[TMP2:v[0-9]+]], [[TMP1]], [[TMP1]]
+; REGULAR-NEXT: mul.16b [[TMP3:v[0-9]+]], [[TMP2]], [[TMP2]]
+; REGULAR-NEXT: mul.16b v0, [[TMP3]], [[TMP3]]
+; REGULAR-NEXT: ret
+entry:
+ %tobool = icmp eq i32 %path, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %mul.i26 = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %ret.0 = phi <16 x i8> [ %mul.i26, %if.then ], [ <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>, %entry ]
+ %mul.i25 = mul <16 x i8> %ret.0, %ret.0
+ %mul.i24 = mul <16 x i8> %mul.i25, %mul.i25
+ %mul.i23 = mul <16 x i8> %mul.i24, %mul.i24
+ %mul.i = mul <16 x i8> %mul.i23, %mul.i23
+ ret <16 x i8> %mul.i
+}
+
+define void @accessBig(i64* %storage) {
+; PROMOTED-LABEL: accessBig:
+; PROMOTED: adrp
+; PROMOTED: ret
+ %addr = bitcast i64* %storage to <1 x i80>*
+ store <1 x i80> <i80 483673642326615442599424>, <1 x i80>* %addr
+ ret void
+}
+
+define void @asmStatement() {
+; PROMOTED-LABEL: asmStatement:
+; PROMOTED-NOT: adrp
+; PROMOTED: ret
+ call void asm sideeffect "bfxil w0, w0, $0, $1", "i,i"(i32 28, i32 4)
+ ret void
+}
+
diff --git a/test/CodeGen/ARM64/redzone.ll b/test/CodeGen/ARM64/redzone.ll
new file mode 100644
index 0000000..b89d7b1
--- /dev/null
+++ b/test/CodeGen/ARM64/redzone.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s
+
+define i32 @foo(i32 %a, i32 %b) nounwind ssp {
+; CHECK-LABEL: foo:
+; CHECK-NOT: sub sp, sp
+; CHECK: ret
+ %a.addr = alloca i32, align 4
+ %b.addr = alloca i32, align 4
+ %x = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 %b, i32* %b.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ %tmp1 = load i32* %b.addr, align 4
+ %add = add nsw i32 %tmp, %tmp1
+ store i32 %add, i32* %x, align 4
+ %tmp2 = load i32* %x, align 4
+ ret i32 %tmp2
+}
diff --git a/test/CodeGen/ARM64/register-offset-addressing.ll b/test/CodeGen/ARM64/register-offset-addressing.ll
new file mode 100644
index 0000000..c273602
--- /dev/null
+++ b/test/CodeGen/ARM64/register-offset-addressing.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
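+; The index is scaled by the i16 element size, hence the expected lsl #1
+; before the register-offset load.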
+define i8 @t1(i16* %a, i64 %b) {
+; CHECK: t1
+; CHECK: lsl [[REG:x[0-9]+]], x1, #1
+; CHECK: ldrb w0, [x0, [[REG]]]
+; CHECK: ret
+ %tmp1 = getelementptr inbounds i16* %a, i64 %b
+ %tmp2 = load i16* %tmp1
+ %tmp3 = trunc i16 %tmp2 to i8
+ ret i8 %tmp3
+}
diff --git a/test/CodeGen/ARM64/register-pairing.ll b/test/CodeGen/ARM64/register-pairing.ll
new file mode 100644
index 0000000..4de80d2
--- /dev/null
+++ b/test/CodeGen/ARM64/register-pairing.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+;
+; rdar://14075006
+
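+; Even though the inline asm only clobbers every other callee-saved register,
+; the prologue/epilogue are expected to save and restore them in adjacent
+; stp/ldp pairs.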
+define void @odd() nounwind {
+; CHECK-LABEL: odd:
+; CHECK: stp d15, d14, [sp, #-144]!
+; CHECK: stp d13, d12, [sp, #16]
+; CHECK: stp d11, d10, [sp, #32]
+; CHECK: stp d9, d8, [sp, #48]
+; CHECK: stp x28, x27, [sp, #64]
+; CHECK: stp x26, x25, [sp, #80]
+; CHECK: stp x24, x23, [sp, #96]
+; CHECK: stp x22, x21, [sp, #112]
+; CHECK: stp x20, x19, [sp, #128]
+; CHECK: movz x0, #42
+; CHECK: ldp x20, x19, [sp, #128]
+; CHECK: ldp x22, x21, [sp, #112]
+; CHECK: ldp x24, x23, [sp, #96]
+; CHECK: ldp x26, x25, [sp, #80]
+; CHECK: ldp x28, x27, [sp, #64]
+; CHECK: ldp d9, d8, [sp, #48]
+; CHECK: ldp d11, d10, [sp, #32]
+; CHECK: ldp d13, d12, [sp, #16]
+; CHECK: ldp d15, d14, [sp], #144
+ call void asm sideeffect "mov x0, #42", "~{x0},~{x19},~{x21},~{x23},~{x25},~{x27},~{d8},~{d10},~{d12},~{d14}"() nounwind
+ ret void
+}
+
+define void @even() nounwind {
+; CHECK-LABEL: even:
+; CHECK: stp d15, d14, [sp, #-144]!
+; CHECK: stp d13, d12, [sp, #16]
+; CHECK: stp d11, d10, [sp, #32]
+; CHECK: stp d9, d8, [sp, #48]
+; CHECK: stp x28, x27, [sp, #64]
+; CHECK: stp x26, x25, [sp, #80]
+; CHECK: stp x24, x23, [sp, #96]
+; CHECK: stp x22, x21, [sp, #112]
+; CHECK: stp x20, x19, [sp, #128]
+; CHECK: movz x0, #42
+; CHECK: ldp x20, x19, [sp, #128]
+; CHECK: ldp x22, x21, [sp, #112]
+; CHECK: ldp x24, x23, [sp, #96]
+; CHECK: ldp x26, x25, [sp, #80]
+; CHECK: ldp x28, x27, [sp, #64]
+; CHECK: ldp d9, d8, [sp, #48]
+; CHECK: ldp d11, d10, [sp, #32]
+; CHECK: ldp d13, d12, [sp, #16]
+; CHECK: ldp d15, d14, [sp], #144
+ call void asm sideeffect "mov x0, #42", "~{x0},~{x20},~{x22},~{x24},~{x26},~{x28},~{d9},~{d11},~{d13},~{d15}"() nounwind
+ ret void
+}
diff --git a/test/CodeGen/ARM64/regress-f128csel-flags.ll b/test/CodeGen/ARM64/regress-f128csel-flags.ll
new file mode 100644
index 0000000..a1daf03
--- /dev/null
+++ b/test/CodeGen/ARM64/regress-f128csel-flags.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=arm64 -verify-machineinstrs < %s | FileCheck %s
+
+; We used to not mark NZCV as being used in the continuation basic-block
+; when lowering a 128-bit "select" to branches. This meant a subsequent use
+; of the same flags gave an internal fault here.
+
+declare void @foo(fp128)
+
+define double @test_f128csel_flags(i32 %lhs, fp128 %a, fp128 %b, double %l, double %r) nounwind {
+; CHECK: test_f128csel_flags
+
+ %tst = icmp ne i32 %lhs, 42
+ %val = select i1 %tst, fp128 %a, fp128 %b
+; CHECK: cmp w0, #42
+; CHECK: b.eq {{.?LBB0}}
+
+ call void @foo(fp128 %val)
+ %retval = select i1 %tst, double %l, double %r
+
+ ; It's also reasonably important that the actual fcsel comes before the
+ ; function call since bl may corrupt NZCV. We were doing the right thing anyway,
+; but we may as well test it while we're here.
+; CHECK: fcsel {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, ne
+; CHECK: bl {{_?foo}}
+
+ ret double %retval
+}
diff --git a/test/CodeGen/ARM64/regress-interphase-shift.ll b/test/CodeGen/ARM64/regress-interphase-shift.ll
new file mode 100644
index 0000000..fddf591
--- /dev/null
+++ b/test/CodeGen/ARM64/regress-interphase-shift.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+; This is mostly a "don't assert" test. The type of the RHS of a shift depended
+; on the phase of legalization, which led to the creation of an unexpected and
+; unselectable "rotr" node: (i32 (rotr i32, i64)).
+
+define void @foo(i64* nocapture %d) {
+; CHECK-LABEL: foo:
+; CHECK: rorv
+ %tmp = load i64* undef, align 8
+ %sub397 = sub i64 0, %tmp
+ %and398 = and i64 %sub397, 4294967295
+ %shr404 = lshr i64 %and398, 0
+ %or405 = or i64 0, %shr404
+ %xor406 = xor i64 %or405, 0
+ %xor417 = xor i64 0, %xor406
+ %xor428 = xor i64 0, %xor417
+ %sub430 = sub i64 %xor417, 0
+ %and431 = and i64 %sub430, 4294967295
+ %and432 = and i64 %xor428, 31
+ %sub433 = sub i64 32, %and432
+ %shl434 = shl i64 %and431, %sub433
+ %shr437 = lshr i64 %and431, %and432
+ %or438 = or i64 %shl434, %shr437
+ %xor439 = xor i64 %or438, %xor428
+ %sub441 = sub i64 %xor439, 0
+ store i64 %sub441, i64* %d, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM64/return-vector.ll b/test/CodeGen/ARM64/return-vector.ll
new file mode 100644
index 0000000..9457d8b
--- /dev/null
+++ b/test/CodeGen/ARM64/return-vector.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; 2x64 vector should be returned in Q0.
+
+define <2 x double> @test(<2 x double>* %p) nounwind {
+; CHECK: test
+; CHECK: ldr q0, [x0]
+; CHECK: ret
+ %tmp1 = load <2 x double>* %p, align 16
+ ret <2 x double> %tmp1
+}
diff --git a/test/CodeGen/ARM64/returnaddr.ll b/test/CodeGen/ARM64/returnaddr.ll
new file mode 100644
index 0000000..e06ce90
--- /dev/null
+++ b/test/CodeGen/ARM64/returnaddr.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i8* @rt0(i32 %x) nounwind readnone {
+entry:
+; CHECK-LABEL: rt0:
+; CHECK: mov x0, lr
+; CHECK: ret
+ %0 = tail call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+}
+
+define i8* @rt2() nounwind readnone {
+entry:
+; CHECK-LABEL: rt2:
+; CHECK: stp fp, lr, [sp, #-16]!
+; CHECK: mov fp, sp
+; CHECK: ldr x[[REG:[0-9]+]], [fp]
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]]]
+; CHECK: ldr x0, [x[[REG2]], #8]
+; CHECK: ldp fp, lr, [sp], #16
+; CHECK: ret
+ %0 = tail call i8* @llvm.returnaddress(i32 2)
+ ret i8* %0
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/rev.ll b/test/CodeGen/ARM64/rev.ll
new file mode 100644
index 0000000..867d5b3
--- /dev/null
+++ b/test/CodeGen/ARM64/rev.ll
@@ -0,0 +1,221 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @test_rev_w(i32 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev_w:
+; CHECK: rev w0, w0
+ %0 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %0
+}
+
+define i64 @test_rev_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev_x:
+; CHECK: rev x0, x0
+ %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %0
+}
+
+declare i32 @llvm.bswap.i32(i32) nounwind readnone
+declare i64 @llvm.bswap.i64(i64) nounwind readnone
+
+define i32 @test_rev16_w(i32 %X) nounwind {
+entry:
+; CHECK-LABEL: test_rev16_w:
+; CHECK: rev16 w0, w0
+ %tmp1 = lshr i32 %X, 8
+ %X15 = bitcast i32 %X to i32
+ %tmp4 = shl i32 %X15, 8
+ %tmp2 = and i32 %tmp1, 16711680
+ %tmp5 = and i32 %tmp4, -16777216
+ %tmp9 = and i32 %tmp1, 255
+ %tmp13 = and i32 %tmp4, 65280
+ %tmp6 = or i32 %tmp5, %tmp2
+ %tmp10 = or i32 %tmp6, %tmp13
+ %tmp14 = or i32 %tmp10, %tmp9
+ ret i32 %tmp14
+}
+
+define i64 @test_rev16_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev16_x:
+; CHECK: rev16 x0, x0
+ %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+ %1 = lshr i64 %0, 16
+ %2 = shl i64 %0, 48
+ %3 = or i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @test_rev32_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev32_x:
+; CHECK: rev32 x0, x0
+ %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+ %1 = lshr i64 %0, 32
+ %2 = shl i64 %0, 32
+ %3 = or i64 %1, %2
+ ret i64 %3
+}
+
+define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D8:
+;CHECK: rev64.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D16:
+;CHECK: rev64.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D32:
+;CHECK: rev64.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Df:
+;CHECK: rev64.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q8:
+;CHECK: rev64.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q16:
+;CHECK: rev64.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q32:
+;CHECK: rev64.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Qf:
+;CHECK: rev64.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x float> %tmp2
+}
+
+define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev32D8:
+;CHECK: rev32.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32D16:
+;CHECK: rev32.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i16> %tmp2
+}
+
+define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q8:
+;CHECK: rev32.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q16:
+;CHECK: rev32.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev16D8:
+;CHECK: rev16.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev16Q8:
+;CHECK: rev16.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x i8> %tmp2
+}
+
+; Undef shuffle indices should not prevent matching to VREV:
+
+define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D8_undef:
+;CHECK: rev64.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q16_undef:
+;CHECK: rev32.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
+ ret <8 x i16> %tmp2
+}
+
+; vrev <4 x i16> should use REV32 and not REV64
+define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
+; CHECK-LABEL: test_vrev64:
+; CHECK: ldr [[DEST:q[0-9]+]],
+; CHECK: st1.h
+; CHECK: st1.h
+entry:
+ %0 = bitcast <4 x i16>* %source to <8 x i16>*
+ %tmp2 = load <8 x i16>* %0, align 4
+ %tmp3 = extractelement <8 x i16> %tmp2, i32 6
+ %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
+ %tmp9 = extractelement <8 x i16> %tmp2, i32 5
+ %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
+ store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
+ ret void
+}
+
+; Test vrev of float4
+define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
+; CHECK: float_vrev64
+; CHECK: ldr [[DEST:q[0-9]+]],
+; CHECK: rev64.4s
+entry:
+ %0 = bitcast float* %source to <4 x float>*
+ %tmp2 = load <4 x float>* %0, align 4
+ %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
+ %arrayidx8 = getelementptr inbounds <4 x float>* %dest, i32 11
+ store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/ARM64/rounding.ll b/test/CodeGen/ARM64/rounding.ll
new file mode 100644
index 0000000..7ff65c3
--- /dev/null
+++ b/test/CodeGen/ARM64/rounding.ll
@@ -0,0 +1,208 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-ios6.0.0"
+
+; CHECK: test1
+; CHECK: frintx
+; CHECK: frintm
+define float @test1(float %a) #0 {
+entry:
+ %call = tail call float @floorf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @floorf(float) nounwind readnone
+
+; CHECK: test2
+; CHECK: frintx
+; CHECK: frintm
+define double @test2(double %a) #0 {
+entry:
+ %call = tail call double @floor(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @floor(double) nounwind readnone
+
+; CHECK: test3
+; CHECK: frinti
+define float @test3(float %a) #0 {
+entry:
+ %call = tail call float @nearbyintf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @nearbyintf(float) nounwind readnone
+
+; CHECK: test4
+; CHECK: frinti
+define double @test4(double %a) #0 {
+entry:
+ %call = tail call double @nearbyint(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @nearbyint(double) nounwind readnone
+
+; CHECK: test5
+; CHECK: frintx
+; CHECK: frintp
+define float @test5(float %a) #0 {
+entry:
+ %call = tail call float @ceilf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @ceilf(float) nounwind readnone
+
+; CHECK: test6
+; CHECK: frintx
+; CHECK: frintp
+define double @test6(double %a) #0 {
+entry:
+ %call = tail call double @ceil(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @ceil(double) nounwind readnone
+
+; CHECK: test7
+; CHECK: frintx
+define float @test7(float %a) #0 {
+entry:
+ %call = tail call float @rintf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @rintf(float) nounwind readnone
+
+; CHECK: test8
+; CHECK: frintx
+define double @test8(double %a) #0 {
+entry:
+ %call = tail call double @rint(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @rint(double) nounwind readnone
+
+; CHECK: test9
+; CHECK: frintx
+; CHECK: frintz
+define float @test9(float %a) #0 {
+entry:
+ %call = tail call float @truncf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @truncf(float) nounwind readnone
+
+; CHECK: test10
+; CHECK: frintx
+; CHECK: frintz
+define double @test10(double %a) #0 {
+entry:
+ %call = tail call double @trunc(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @trunc(double) nounwind readnone
+
+; CHECK: test11
+; CHECK: frintx
+; CHECK: frinta
+define float @test11(float %a) #0 {
+entry:
+ %call = tail call float @roundf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @roundf(float %a) nounwind readnone
+
+; CHECK: test12
+; CHECK: frintx
+; CHECK: frinta
+define double @test12(double %a) #0 {
+entry:
+ %call = tail call double @round(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @round(double %a) nounwind readnone
+
+; CHECK: test13
+; CHECK-NOT: frintx
+; CHECK: frintm
+define float @test13(float %a) #1 {
+entry:
+ %call = tail call float @floorf(float %a) nounwind readnone
+ ret float %call
+}
+
+; CHECK: test14
+; CHECK-NOT: frintx
+; CHECK: frintm
+define double @test14(double %a) #1 {
+entry:
+ %call = tail call double @floor(double %a) nounwind readnone
+ ret double %call
+}
+
+; CHECK: test15
+; CHECK-NOT: frintx
+; CHECK: frintp
+define float @test15(float %a) #1 {
+entry:
+ %call = tail call float @ceilf(float %a) nounwind readnone
+ ret float %call
+}
+
+; CHECK: test16
+; CHECK-NOT: frintx
+; CHECK: frintp
+define double @test16(double %a) #1 {
+entry:
+ %call = tail call double @ceil(double %a) nounwind readnone
+ ret double %call
+}
+
+; CHECK: test17
+; CHECK-NOT: frintx
+; CHECK: frintz
+define float @test17(float %a) #1 {
+entry:
+ %call = tail call float @truncf(float %a) nounwind readnone
+ ret float %call
+}
+
+; CHECK: test18
+; CHECK-NOT: frintx
+; CHECK: frintz
+define double @test18(double %a) #1 {
+entry:
+ %call = tail call double @trunc(double %a) nounwind readnone
+ ret double %call
+}
+
+; CHECK: test19
+; CHECK-NOT: frintx
+; CHECK: frinta
+define float @test19(float %a) #1 {
+entry:
+ %call = tail call float @roundf(float %a) nounwind readnone
+ ret float %call
+}
+
+; CHECK: test20
+; CHECK-NOT: frintx
+; CHECK: frinta
+define double @test20(double %a) #1 {
+entry:
+ %call = tail call double @round(double %a) nounwind readnone
+ ret double %call
+}
+
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/ARM64/scaled_iv.ll b/test/CodeGen/ARM64/scaled_iv.ll
new file mode 100644
index 0000000..987373e
--- /dev/null
+++ b/test/CodeGen/ARM64/scaled_iv.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
+; Scaling factors in addressing modes are costly.
+; Make loop-reduce prefer unscaled accesses.
+; <rdar://problem/13806271>
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+; CHECK: @mulDouble
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ 0, %entry ]
+; Only one induction variable should have been generated.
+; CHECK-NOT: phi
+ %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = add nsw i64 %indvars.iv, -1
+ %arrayidx = getelementptr inbounds double* %b, i64 %tmp
+ %tmp1 = load double* %arrayidx, align 8
+; The induction variable should carry the scaling factor: 1 * 8 = 8.
+; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds double* %c, i64 %indvars.iv.next
+ %tmp2 = load double* %arrayidx2, align 8
+ %mul = fmul double %tmp1, %tmp2
+ %arrayidx4 = getelementptr inbounds double* %a, i64 %indvars.iv
+ store double %mul, double* %arrayidx4, align 8
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; Comparison should be 19 * 8 = 152.
+; CHECK: icmp eq i32 {{%[^,]+}}, 152
+ %exitcond = icmp eq i32 %lftr.wideiv, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
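
To make the expected numbers above concrete, here is a source-level sketch of the loop (an illustration, not part of the patch): with 8-byte doubles, loop-reduce should step the rewritten induction variable by 1 * 8 = 8 bytes per iteration and exit once it reaches 19 * 8 = 152.

void mulDouble(double *a, double *b, double *c) {
  /* runs for i = 1 .. 19, matching the icmp against 20 in the IR */
  for (long i = 1; i != 20; ++i)
    a[i] = b[i - 1] * c[i + 1];
}
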
diff --git a/test/CodeGen/ARM64/scvt.ll b/test/CodeGen/ARM64/scvt.ll
new file mode 100644
index 0000000..b4d4add
--- /dev/null
+++ b/test/CodeGen/ARM64/scvt.ll
@@ -0,0 +1,830 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; rdar://13082402
+
+define float @t1(i32* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldr s0, [x0]
+; CHECK: scvtf s0, s0
+ %tmp1 = load i32* %src, align 4
+ %tmp2 = sitofp i32 %tmp1 to float
+ ret float %tmp2
+}
+
+define float @t2(i32* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldr s0, [x0]
+; CHECK: ucvtf s0, s0
+ %tmp1 = load i32* %src, align 4
+ %tmp2 = uitofp i32 %tmp1 to float
+ ret float %tmp2
+}
+
+define double @t3(i64* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldr d0, [x0]
+; CHECK: scvtf d0, d0
+ %tmp1 = load i64* %src, align 4
+ %tmp2 = sitofp i64 %tmp1 to double
+ ret double %tmp2
+}
+
+define double @t4(i64* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: ldr d0, [x0]
+; CHECK: ucvtf d0, d0
+ %tmp1 = load i64* %src, align 4
+ %tmp2 = uitofp i64 %tmp1 to double
+ ret double %tmp2
+}
+
+; rdar://13136456
+define double @t5(i32* nocapture %src) nounwind ssp optsize {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: ldr [[REG:w[0-9]+]], [x0]
+; CHECK: scvtf d0, [[REG]]
+ %tmp1 = load i32* %src, align 4
+ %tmp2 = sitofp i32 %tmp1 to double
+ ret double %tmp2
+}
+
+; Check that we load into an FP register when we want to convert into a
+; floating-point value.
+; This is much faster than loading into a GPR and then making the
+; GPR -> FPR conversion.
+; <rdar://problem/14599607>
+;
+; Check the following patterns for signed/unsigned:
+; 1. load with scaled imm to float.
+; 2. load with scaled register to float.
+; 3. load with scaled imm to double.
+; 4. load with scaled register to double.
+; 5. load with unscaled imm to float.
+; 6. load with unscaled imm to double.
+; With load sizes of 8, 16, 32, and 64 bits.
+
+; ********* 1. load with scaled imm to float. *********
+define float @fct1(i8* nocapture %sp0) {
+; CHECK-LABEL: fct1:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct2(i16* nocapture %sp0) {
+; CHECK-LABEL: fct2:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct3(i32* nocapture %sp0) {
+; CHECK-LABEL: fct3:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @fct4(i64* nocapture %sp0) {
+; CHECK-LABEL: fct4:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; ********* 2. load with scaled register to float. *********
+define float @fct5(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct5:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct6(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct6:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct7(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct7:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @fct8(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct8:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+
+; ********* 3. load with scaled imm to double. *********
+define double @fct9(i8* nocapture %sp0) {
+; CHECK-LABEL: fct9:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct10(i16* nocapture %sp0) {
+; CHECK-LABEL: fct10:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct11(i32* nocapture %sp0) {
+; CHECK-LABEL: fct11:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct12(i64* nocapture %sp0) {
+; CHECK-LABEL: fct12:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; ********* 4. load with scaled register to double. *********
+define double @fct13(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct13:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct14(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct14:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct15(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct15:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct16(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct16:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; ********* 5. load with unscaled imm to float. *********
+define float @fct17(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: fct17:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct18(i16* nocapture %sp0) {
+; CHECK-LABEL: fct18:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i16* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i16*
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct19(i32* nocapture %sp0) {
+; CHECK-LABEL: fct19:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i32* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i32*
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @fct20(i64* nocapture %sp0) {
+; CHECK-LABEL: fct20:
+; CHECK: ldur x[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i64* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i64*
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+
+}
+
+; ********* 6. load with unscaled imm to double. *********
+define double @fct21(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: fct21:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct22(i16* nocapture %sp0) {
+; CHECK-LABEL: fct22:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i16* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i16*
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct23(i32* nocapture %sp0) {
+; CHECK-LABEL: fct23:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i32* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i32*
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct24(i64* nocapture %sp0) {
+; CHECK-LABEL: fct24:
+; CHECK: ldur d[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i64* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i64*
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+
+}
+
+; ********* 1s. load with scaled imm to float. *********
+define float @sfct1(i8* nocapture %sp0) {
+; CHECK-LABEL: sfct1:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct2(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct2:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct3(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct3:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @sfct4(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct4:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; ********* 2s. load with scaled register to float. *********
+define float @sfct5(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct5:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct6(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct6:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct7(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct7:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @sfct8(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct8:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; ********* 3s. load with scaled imm to double. *********
+define double @sfct9(i8* nocapture %sp0) {
+; CHECK-LABEL: sfct9:
+; CHECK: ldrsb w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct10(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct10:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct11(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct11:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct12(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct12:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; ********* 4s. load with scaled register to double. *********
+define double @sfct13(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct13:
+; CHECK: ldrsb w[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct14(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct14:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct15(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct15:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct16(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct16:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; ********* 5s. load with unscaled imm to float. *********
+define float @sfct17(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: sfct17:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct18(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct18:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i16* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i16*
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct19(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct19:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i32* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i32*
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @sfct20(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct20:
+; CHECK: ldur x[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i64* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i64*
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+
+}
+
+; ********* 6s. load with unscaled imm to double. *********
+define double @sfct21(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: sfct21:
+; CHECK: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct22(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct22:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i16* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i16*
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct23(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct23:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i32* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i32*
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct24(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct24:
+; CHECK: ldur d[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i64* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i64*
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+
+}
+
+; Check that we do not use the SSHLL code sequence when code size is a concern.
+define float @codesize_sfct17(i8* nocapture %sp0) optsize {
+entry:
+; CHECK-LABEL: codesize_sfct17:
+; CHECK: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define double @codesize_sfct11(i32* nocapture %sp0) minsize {
+; CHECK-LABEL: codesize_sfct11:
+; CHECK: ldr w[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; Adding fp128 custom lowering makes these a little fragile since we have to
+; return the correct mix of Legal/Expand from the custom method.
+;
+; rdar://problem/14991489
+
+define float @float_from_i128(i128 %in) {
+; CHECK-LABEL: float_from_i128:
+; CHECK: bl {{_?__floatuntisf}}
+ %conv = uitofp i128 %in to float
+ ret float %conv
+}
+
+define double @double_from_i128(i128 %in) {
+; CHECK-LABEL: double_from_i128:
+; CHECK: bl {{_?__floattidf}}
+ %conv = sitofp i128 %in to double
+ ret double %conv
+}
+
+define fp128 @fp128_from_i128(i128 %in) {
+; CHECK-LABEL: fp128_from_i128:
+; CHECK: bl {{_?__floatuntitf}}
+ %conv = uitofp i128 %in to fp128
+ ret fp128 %conv
+}
+
+define i128 @i128_from_float(float %in) {
+; CHECK-LABEL: i128_from_float
+; CHECK: bl {{_?__fixsfti}}
+ %conv = fptosi float %in to i128
+ ret i128 %conv
+}
+
+define i128 @i128_from_double(double %in) {
+; CHECK-LABEL: i128_from_double
+; CHECK: bl {{_?__fixunsdfti}}
+ %conv = fptoui double %in to i128
+ ret i128 %conv
+}
+
+define i128 @i128_from_fp128(fp128 %in) {
+; CHECK-LABEL: i128_from_fp128
+; CHECK: bl {{_?__fixtfti}}
+ %conv = fptosi fp128 %in to i128
+ ret i128 %conv
+}
+
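
As a source-level illustration of the comment block near the top of this file (not part of the patch), pattern 1 looks roughly like the functions below: the byte should be loaded straight into an FP/SIMD register and converted there, rather than loaded into a GPR and moved across.

float fct1(const unsigned char *sp0) {
  float v = sp0[1];   /* expected: ldr b..., [x0, #1] then ucvtf on the FP unit */
  return v * v;
}

float sfct1(const signed char *sp0) {
  float v = sp0[1];   /* signed variant: needs a sign extension before scvtf */
  return v * v;
}
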
diff --git a/test/CodeGen/ARM64/shifted-sext.ll b/test/CodeGen/ARM64/shifted-sext.ll
new file mode 100644
index 0000000..e553be5
--- /dev/null
+++ b/test/CodeGen/ARM64/shifted-sext.ll
@@ -0,0 +1,277 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+;
+; <rdar://problem/13820218>
+
+define signext i16 @extendedLeftShiftcharToshortBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToshortBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #28, #7
+ %inc = add i8 %a, 1
+ %conv1 = sext i8 %inc to i32
+ %shl = shl nsw i32 %conv1, 4
+ %conv2 = trunc i32 %shl to i16
+ ret i16 %conv2
+}
+
+define signext i16 @extendedRightShiftcharToshortBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToshortBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #4, #7
+ %inc = add i8 %a, 1
+ %conv1 = sext i8 %inc to i32
+ %shr4 = lshr i32 %conv1, 4
+ %conv2 = trunc i32 %shr4 to i16
+ ret i16 %conv2
+}
+
+define signext i16 @extendedLeftShiftcharToshortBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToshortBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #24, #7
+ %inc = add i8 %a, 1
+ %conv1 = sext i8 %inc to i32
+ %shl = shl nsw i32 %conv1, 8
+ %conv2 = trunc i32 %shl to i16
+ ret i16 %conv2
+}
+
+define signext i16 @extendedRightShiftcharToshortBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToshortBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxtb [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #8
+ %inc = add i8 %a, 1
+ %conv1 = sext i8 %inc to i32
+ %shr4 = lshr i32 %conv1, 8
+ %conv2 = trunc i32 %shr4 to i16
+ ret i16 %conv2
+}
+
+define i32 @extendedLeftShiftcharTointBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #28, #7
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i32
+ %shl = shl nsw i32 %conv, 4
+ ret i32 %shl
+}
+
+define i32 @extendedRightShiftcharTointBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #4, #7
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i32
+ %shr = ashr i32 %conv, 4
+ ret i32 %shr
+}
+
+define i32 @extendedLeftShiftcharTointBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharTointBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #24, #7
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i32
+ %shl = shl nsw i32 %conv, 8
+ ret i32 %shl
+}
+
+define i32 @extendedRightShiftcharTointBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharTointBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxtb [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #8
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i32
+ %shr = ashr i32 %conv, 8
+ ret i32 %shr
+}
+
+define i64 @extendedLeftShiftcharToint64By4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #60, #7
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i64
+ %shl = shl nsw i64 %conv, 4
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftcharToint64By4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #4, #7
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i64
+ %shr = ashr i64 %conv, 4
+ ret i64 %shr
+}
+
+define i64 @extendedLeftShiftcharToint64By8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToint64By8:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #56, #7
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i64
+ %shl = shl nsw i64 %conv, 8
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftcharToint64By8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToint64By8:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxtb x[[REG]], x[[REG]]
+; CHECK: asr x0, x[[REG]], #8
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i64
+ %shr = ashr i64 %conv, 8
+ ret i64 %shr
+}
+
+define i32 @extendedLeftShiftshortTointBy4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #28, #15
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i32
+ %shl = shl nsw i32 %conv, 4
+ ret i32 %shl
+}
+
+define i32 @extendedRightShiftshortTointBy4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfm w0, [[REG]], #4, #15
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i32
+ %shr = ashr i32 %conv, 4
+ ret i32 %shr
+}
+
+define i32 @extendedLeftShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortTointBy16:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: lsl w0, [[REG]], #16
+ %inc = add i16 %a, 1
+ %conv2 = zext i16 %inc to i32
+ %shl = shl nuw i32 %conv2, 16
+ ret i32 %shl
+}
+
+define i32 @extendedRightShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortTointBy16:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxth [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #16
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i32
+ %shr = ashr i32 %conv, 16
+ ret i32 %shr
+}
+
+define i64 @extendedLeftShiftshortToint64By4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #60, #15
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i64
+ %shl = shl nsw i64 %conv, 4
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftshortToint64By4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #4, #15
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i64
+ %shr = ashr i64 %conv, 4
+ ret i64 %shr
+}
+
+define i64 @extendedLeftShiftshortToint64By16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortToint64By16:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #48, #15
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i64
+ %shl = shl nsw i64 %conv, 16
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftshortToint64By16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortToint64By16:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxth x[[REG]], x[[REG]]
+; CHECK: asr x0, x[[REG]], #16
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i64
+ %shr = ashr i64 %conv, 16
+ ret i64 %shr
+}
+
+define i64 @extendedLeftShiftintToint64By4(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftintToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #60, #31
+ %inc = add nsw i32 %a, 1
+ %conv = sext i32 %inc to i64
+ %shl = shl nsw i64 %conv, 4
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftintToint64By4(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftintToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfm x0, x[[REG]], #4, #31
+ %inc = add nsw i32 %a, 1
+ %conv = sext i32 %inc to i64
+ %shr = ashr i64 %conv, 4
+ ret i64 %shr
+}
+
+define i64 @extendedLeftShiftintToint64By32(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftintToint64By32:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: lsl x0, x[[REG]], #32
+ %inc = add nsw i32 %a, 1
+ %conv2 = zext i32 %inc to i64
+ %shl = shl nuw i64 %conv2, 32
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftintToint64By32(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftintToint64By32:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxtw x[[REG]], x[[REG]]
+; CHECK: asr x0, x[[REG]], #32
+ %inc = add nsw i32 %a, 1
+ %conv = sext i32 %inc to i64
+ %shr = ashr i64 %conv, 32
+ ret i64 %shr
+}
diff --git a/test/CodeGen/ARM64/simd-scalar-to-vector.ll b/test/CodeGen/ARM64/simd-scalar-to-vector.ll
new file mode 100644
index 0000000..6c0b840
--- /dev/null
+++ b/test/CodeGen/ARM64/simd-scalar-to-vector.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -O0 | FileCheck %s --check-prefix=CHECK-FAST
+
+define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp {
+; CHECK: uaddlv.16b h0, v0
+; CHECK: rshrn.8b v0, v0, #4
+; CHECK: dup.16b v0, v0[0]
+; CHECK: ret
+
+; CHECK-FAST: uaddlv.16b
+; CHECK-FAST: rshrn.8b
+; CHECK-FAST: dup.16b
+ %tmp = tail call i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind
+ %tmp1 = trunc i32 %tmp to i16
+ %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
+ %tmp3 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4)
+ %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp4
+}
+
+declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM64/simplest-elf.ll b/test/CodeGen/ARM64/simplest-elf.ll
new file mode 100644
index 0000000..1254365
--- /dev/null
+++ b/test/CodeGen/ARM64/simplest-elf.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj < %s | llvm-objdump - -r -d --triple=arm64-linux-gnu | FileCheck --check-prefix=CHECK-ELF %s
+
+define void @foo() nounwind {
+ ret void
+}
+
+ ; Check source looks ELF-like: no leading underscore, comments with //
+; CHECK: foo: // @foo
+; CHECK: ret
+
+ ; Similarly make sure ELF output works and is vaguely sane: aarch64 target
+ ; machine with correct section & symbol names.
+; CHECK-ELF: file format ELF64-aarch64
+
+; CHECK-ELF: Disassembly of section .text
+; CHECK-ELF-LABEL: foo:
+; CHECK-ELF: ret
diff --git a/test/CodeGen/ARM64/sincos.ll b/test/CodeGen/ARM64/sincos.ll
new file mode 100644
index 0000000..06157b2
--- /dev/null
+++ b/test/CodeGen/ARM64/sincos.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7 | FileCheck %s --check-prefix CHECK-IOS
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix CHECK-LINUX
+
+; Combine sin / cos into a single call.
+; rdar://12856873
+
+define float @test1(float %x) nounwind {
+entry:
+; CHECK-IOS-LABEL: test1:
+; CHECK-IOS: bl ___sincosf_stret
+; CHECK-IOS: fadd s0, s0, s1
+
+; CHECK-LINUX-LABEL: test1:
+; CHECK-LINUX: bl sinf
+; CHECK-LINUX: bl cosf
+
+ %call = tail call float @sinf(float %x) nounwind readnone
+ %call1 = tail call float @cosf(float %x) nounwind readnone
+ %add = fadd float %call, %call1
+ ret float %add
+}
+
+define double @test2(double %x) nounwind {
+entry:
+; CHECK-IOS-LABEL: test2:
+; CHECK-IOS: bl ___sincos_stret
+; CHECK-IOS: fadd d0, d0, d1
+
+; CHECK-LINUX-LABEL: test2:
+; CHECK-LINUX: bl sin
+; CHECK-LINUX: bl cos
+
+ %call = tail call double @sin(double %x) nounwind readnone
+ %call1 = tail call double @cos(double %x) nounwind readnone
+ %add = fadd double %call, %call1
+ ret double %add
+}
+
+declare float @sinf(float) readonly
+declare double @sin(double) readonly
+declare float @cosf(float) readonly
+declare double @cos(double) readonly
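
The C-level pattern being tested is simply the following (illustration only, not part of the patch):

#include <math.h>

float  test1(float x)  { return sinf(x) + cosf(x); }  /* one __sincosf_stret call on iOS */
double test2(double x) { return sin(x)  + cos(x);  }  /* one __sincos_stret call on iOS */
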
diff --git a/test/CodeGen/ARM64/sitofp-combine-chains.ll b/test/CodeGen/ARM64/sitofp-combine-chains.ll
new file mode 100644
index 0000000..10b433b
--- /dev/null
+++ b/test/CodeGen/ARM64/sitofp-combine-chains.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+; ARM64ISelLowering.cpp was creating a new (floating-point) load for efficiency
+; but not updating the chain successors of the old one. As a result, the two memory
+; operations in this function both ended up as direct successors of the EntryToken
+; and could be reordered.
+
+@var = global i32 0, align 4
+
+define float @foo() {
+; CHECK-LABEL: foo:
+ ; Load must come before we clobber @var
+; CHECK: adrp x[[VARBASE:[0-9]+]], {{_?var}}
+; CHECK: ldr [[SREG:s[0-9]+]], [x[[VARBASE]],
+; CHECK: str wzr, [x[[VARBASE]],
+
+ %val = load i32* @var, align 4
+ store i32 0, i32* @var, align 4
+
+ %fltval = sitofp i32 %val to float
+ ret float %fltval
+}
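
A source-level sketch of the ordering constraint (illustration only, not part of the patch): the read of var has to stay ordered before the store that clobbers it, even though the backend re-creates the load as a floating-point load.

int var;

float foo(void) {
  int val = var;       /* must observe the old value... */
  var = 0;             /* ...before this store clobbers it */
  return (float)val;   /* sitofp fed by the FP-register load */
}
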
diff --git a/test/CodeGen/ARM64/sli-sri-opt.ll b/test/CodeGen/ARM64/sli-sri-opt.ll
new file mode 100644
index 0000000..725dcd5
--- /dev/null
+++ b/test/CodeGen/ARM64/sli-sri-opt.ll
@@ -0,0 +1,41 @@
+; RUN: llc -arm64-shift-insert-generation=true -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood:
+; CHECK: sli.16b v0, v1, #3
+ %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+ %vshl_n = shl <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %result = or <16 x i8> %and.i, %vshl_n
+ store <16 x i8> %result, <16 x i8>* %dest, align 16
+ ret void
+}
+
+define void @testLeftBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad:
+; CHECK-NOT: sli
+ %and.i = and <16 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
+ %vshl_n = shl <16 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %result = or <16 x i8> %and.i, %vshl_n
+ store <16 x i8> %result, <16 x i8>* %dest, align 16
+ ret void
+}
+
+define void @testRightGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightGood:
+; CHECK: sri.16b v0, v1, #3
+ %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+ %vshl_n = lshr <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %result = or <16 x i8> %and.i, %vshl_n
+ store <16 x i8> %result, <16 x i8>* %dest, align 16
+ ret void
+}
+
+define void @testRightBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightBad:
+; CHECK-NOT: sri
+ %and.i = and <16 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
+ %vshl_n = lshr <16 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %result = or <16 x i8> %and.i, %vshl_n
+ store <16 x i8> %result, <16 x i8>* %dest, align 16
+ ret void
+}
diff --git a/test/CodeGen/ARM64/smaxv.ll b/test/CodeGen/ARM64/smaxv.ll
new file mode 100644
index 0000000..4f6e01b
--- /dev/null
+++ b/test/CodeGen/ARM64/smaxv.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
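+; Note: smaxv reduces all lanes of the source vector to a single signed maximum
+; held in a SIMD scalar register, which is then moved to a GPR (smov for
+; sub-word element types, fmov for 32-bit elements), as checked below.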
+
+define signext i8 @test_vmaxv_s8(<8 x i8> %a1) {
+; CHECK: test_vmaxv_s8
+; CHECK: smaxv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8> %a1)
+ %0 = trunc i32 %vmaxv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vmaxv_s16(<4 x i16> %a1) {
+; CHECK: test_vmaxv_s16
+; CHECK: smaxv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16> %a1)
+ %0 = trunc i32 %vmaxv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vmaxv_s32(<2 x i32> %a1) {
+; CHECK: test_vmaxv_s32
+; 2 x i32 is not supported by the ISA, so this is a special case
+; CHECK: smaxp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32> %a1)
+ ret i32 %vmaxv.i
+}
+
+define signext i8 @test_vmaxvq_s8(<16 x i8> %a1) {
+; CHECK: test_vmaxvq_s8
+; CHECK: smaxv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8> %a1)
+ %0 = trunc i32 %vmaxv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vmaxvq_s16(<8 x i16> %a1) {
+; CHECK: test_vmaxvq_s16
+; CHECK: smaxv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16> %a1)
+ %0 = trunc i32 %vmaxv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vmaxvq_s32(<4 x i32> %a1) {
+; CHECK: test_vmaxvq_s32
+; CHECK: smaxv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32> %a1)
+ ret i32 %vmaxv.i
+}
+
+declare i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8>)
+
diff --git a/test/CodeGen/ARM64/sminv.ll b/test/CodeGen/ARM64/sminv.ll
new file mode 100644
index 0000000..a246868
--- /dev/null
+++ b/test/CodeGen/ARM64/sminv.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define signext i8 @test_vminv_s8(<8 x i8> %a1) {
+; CHECK: test_vminv_s8
+; CHECK: sminv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8> %a1)
+ %0 = trunc i32 %vminv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vminv_s16(<4 x i16> %a1) {
+; CHECK: test_vminv_s16
+; CHECK: sminv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16> %a1)
+ %0 = trunc i32 %vminv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vminv_s32(<2 x i32> %a1) {
+; CHECK: test_vminv_s32
+; 2 x i32 is not supported by the ISA, so this is a special case
+; CHECK: sminp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32> %a1)
+ ret i32 %vminv.i
+}
+
+define signext i8 @test_vminvq_s8(<16 x i8> %a1) {
+; CHECK: test_vminvq_s8
+; CHECK: sminv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8> %a1)
+ %0 = trunc i32 %vminv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vminvq_s16(<8 x i16> %a1) {
+; CHECK: test_vminvq_s16
+; CHECK: sminv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16> %a1)
+ %0 = trunc i32 %vminv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vminvq_s32(<4 x i32> %a1) {
+; CHECK: test_vminvq_s32
+; CHECK: sminv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32> %a1)
+ ret i32 %vminv.i
+}
+
+declare i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8>)
+
diff --git a/test/CodeGen/ARM64/spill-lr.ll b/test/CodeGen/ARM64/spill-lr.ll
new file mode 100644
index 0000000..fb6588e
--- /dev/null
+++ b/test/CodeGen/ARM64/spill-lr.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s
+@bar = common global i32 0, align 4
+
+; Leaf function which uses all callee-saved registers and allocates >= 256 bytes on the stack.
+; This will cause processFunctionBeforeCalleeSavedScan() to spill LR as an additional scratch
+; register.
+;
+; This is a crash-only regression test for rdar://15124582.
+define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) nounwind {
+entry:
+ %stack = alloca [128 x i32], align 4
+ %0 = bitcast [128 x i32]* %stack to i8*
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %idxprom
+ store i32 %b, i32* %arrayidx, align 4
+ %1 = load volatile i32* @bar, align 4
+ %2 = load volatile i32* @bar, align 4
+ %3 = load volatile i32* @bar, align 4
+ %4 = load volatile i32* @bar, align 4
+ %5 = load volatile i32* @bar, align 4
+ %6 = load volatile i32* @bar, align 4
+ %7 = load volatile i32* @bar, align 4
+ %8 = load volatile i32* @bar, align 4
+ %9 = load volatile i32* @bar, align 4
+ %10 = load volatile i32* @bar, align 4
+ %11 = load volatile i32* @bar, align 4
+ %12 = load volatile i32* @bar, align 4
+ %13 = load volatile i32* @bar, align 4
+ %14 = load volatile i32* @bar, align 4
+ %15 = load volatile i32* @bar, align 4
+ %16 = load volatile i32* @bar, align 4
+ %17 = load volatile i32* @bar, align 4
+ %18 = load volatile i32* @bar, align 4
+ %19 = load volatile i32* @bar, align 4
+ %20 = load volatile i32* @bar, align 4
+ %idxprom1 = sext i32 %c to i64
+ %arrayidx2 = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %idxprom1
+ %21 = load i32* %arrayidx2, align 4
+ %factor = mul i32 %h, -2
+ %factor67 = mul i32 %g, -2
+ %factor68 = mul i32 %f, -2
+ %factor69 = mul i32 %e, -2
+ %factor70 = mul i32 %d, -2
+ %factor71 = mul i32 %c, -2
+ %factor72 = mul i32 %b, -2
+ %sum = add i32 %2, %1
+ %sum73 = add i32 %sum, %3
+ %sum74 = add i32 %sum73, %4
+ %sum75 = add i32 %sum74, %5
+ %sum76 = add i32 %sum75, %6
+ %sum77 = add i32 %sum76, %7
+ %sum78 = add i32 %sum77, %8
+ %sum79 = add i32 %sum78, %9
+ %sum80 = add i32 %sum79, %10
+ %sum81 = add i32 %sum80, %11
+ %sum82 = add i32 %sum81, %12
+ %sum83 = add i32 %sum82, %13
+ %sum84 = add i32 %sum83, %14
+ %sum85 = add i32 %sum84, %15
+ %sum86 = add i32 %sum85, %16
+ %sum87 = add i32 %sum86, %17
+ %sum88 = add i32 %sum87, %18
+ %sum89 = add i32 %sum88, %19
+ %sum90 = add i32 %sum89, %20
+ %sub15 = sub i32 %21, %sum90
+ %sub16 = add i32 %sub15, %factor
+ %sub17 = add i32 %sub16, %factor67
+ %sub18 = add i32 %sub17, %factor68
+ %sub19 = add i32 %sub18, %factor69
+ %sub20 = add i32 %sub19, %factor70
+ %sub21 = add i32 %sub20, %factor71
+ %add = add i32 %sub21, %factor72
+ ret i32 %add
+}
diff --git a/test/CodeGen/ARM64/spill.ll b/test/CodeGen/ARM64/spill.ll
new file mode 100644
index 0000000..9173c87
--- /dev/null
+++ b/test/CodeGen/ARM64/spill.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs
+
+; CHECK: fpr128
+; CHECK: ld1.2d
+; CHECK: str q
+; CHECK: inlineasm
+; CHECK: ldr q
+; CHECK: st1.2d
+define void @fpr128(<4 x float>* %p) nounwind ssp {
+entry:
+ %x = load <4 x float>* %p, align 16
+ call void asm sideeffect "; inlineasm", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15},~{q16},~{q17},~{q18},~{q19},~{q20},~{q21},~{q22},~{q23},~{q24},~{q25},~{q26},~{q27},~{q28},~{q29},~{q30},~{q31},~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind
+ store <4 x float> %x, <4 x float>* %p, align 16
+ ret void
+}
diff --git a/test/CodeGen/ARM64/st1.ll b/test/CodeGen/ARM64/st1.ll
new file mode 100644
index 0000000..b9aafc6
--- /dev/null
+++ b/test/CodeGen/ARM64/st1.ll
@@ -0,0 +1,676 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+define void @st1lane_16b(<16 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane_16b
+; CHECK: st1.b
+ %tmp = extractelement <16 x i8> %A, i32 1
+ store i8 %tmp, i8* %D
+ ret void
+}
+
+define void @st1lane_8h(<8 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane_8h
+; CHECK: st1.h
+ %tmp = extractelement <8 x i16> %A, i32 1
+ store i16 %tmp, i16* %D
+ ret void
+}
+
+define void @st1lane_4s(<4 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane_4s
+; CHECK: st1.s
+ %tmp = extractelement <4 x i32> %A, i32 1
+ store i32 %tmp, i32* %D
+ ret void
+}
+
+define void @st1lane_4s_float(<4 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_4s_float
+; CHECK: st1.s
+ %tmp = extractelement <4 x float> %A, i32 1
+ store float %tmp, float* %D
+ ret void
+}
+
+define void @st1lane_2d(<2 x i64> %A, i64* %D) {
+; CHECK-LABEL: st1lane_2d
+; CHECK: st1.d
+ %tmp = extractelement <2 x i64> %A, i32 1
+ store i64 %tmp, i64* %D
+ ret void
+}
+
+define void @st1lane_2d_double(<2 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane_2d_double
+; CHECK: st1.d
+ %tmp = extractelement <2 x double> %A, i32 1
+ store double %tmp, double* %D
+ ret void
+}
+
+define void @st1lane_8b(<8 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane_8b
+; CHECK: st1.b
+ %tmp = extractelement <8 x i8> %A, i32 1
+ store i8 %tmp, i8* %D
+ ret void
+}
+
+define void @st1lane_4h(<4 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane_4h
+; CHECK: st1.h
+ %tmp = extractelement <4 x i16> %A, i32 1
+ store i16 %tmp, i16* %D
+ ret void
+}
+
+define void @st1lane_2s(<2 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane_2s
+; CHECK: st1.s
+ %tmp = extractelement <2 x i32> %A, i32 1
+ store i32 %tmp, i32* %D
+ ret void
+}
+
+define void @st1lane_2s_float(<2 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_2s_float
+; CHECK: st1.s
+ %tmp = extractelement <2 x float> %A, i32 1
+ store float %tmp, float* %D
+ ret void
+}
+
+define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
+; CHECK-LABEL: st2lane_16b
+; CHECK: st2.b
+ call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
+ ret void
+}
+
+define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) {
+; CHECK-LABEL: st2lane_8h
+; CHECK: st2.h
+ call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
+ ret void
+}
+
+define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) {
+; CHECK-LABEL: st2lane_4s
+; CHECK: st2.s
+ call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
+ ret void
+}
+
+define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) {
+; CHECK-LABEL: st2lane_2d
+; CHECK: st2.d
+ call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) {
+; CHECK-LABEL: st3lane_16b
+; CHECK: st3.b
+ call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
+ ret void
+}
+
+define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) {
+; CHECK-LABEL: st3lane_8h
+; CHECK: st3.h
+ call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
+ ret void
+}
+
+define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) {
+; CHECK-LABEL: st3lane_4s
+; CHECK: st3.s
+ call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
+ ret void
+}
+
+define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) {
+; CHECK-LABEL: st3lane_2d
+; CHECK: st3.d
+ call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) {
+; CHECK-LABEL: st4lane_16b
+; CHECK: st4.b
+ call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
+ ret void
+}
+
+define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) {
+; CHECK-LABEL: st4lane_8h
+; CHECK: st4.h
+ call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
+ ret void
+}
+
+define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) {
+; CHECK-LABEL: st4lane_4s
+; CHECK: st4.s
+ call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
+ ret void
+}
+
+define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) {
+; CHECK-LABEL: st4lane_2d
+; CHECK: st4.d
+ call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+
+define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
+; CHECK-LABEL: st2_8b
+; CHECK: st2.8b
+ call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
+ ret void
+}
+
+define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind {
+; CHECK-LABEL: st3_8b
+; CHECK: st3.8b
+ call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
+ ret void
+}
+
+define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
+; CHECK-LABEL: st4_8b
+; CHECK: st4.8b
+ call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+
+define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
+; CHECK-LABEL: st2_16b
+; CHECK: st2.16b
+ call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
+ ret void
+}
+
+define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind {
+; CHECK-LABEL: st3_16b
+; CHECK: st3.16b
+ call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
+ ret void
+}
+
+define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
+; CHECK-LABEL: st4_16b
+; CHECK: st4.16b
+ call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+
+define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
+; CHECK-LABEL: st2_4h
+; CHECK: st2.4h
+ call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
+ ret void
+}
+
+define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind {
+; CHECK-LABEL: st3_4h
+; CHECK: st3.4h
+ call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
+ ret void
+}
+
+define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
+; CHECK-LABEL: st4_4h
+; CHECK: st4.4h
+ call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+
+define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
+; CHECK-LABEL: st2_8h
+; CHECK: st2.8h
+ call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
+ ret void
+}
+
+define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind {
+; CHECK-LABEL: st3_8h
+; CHECK: st3.8h
+ call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
+ ret void
+}
+
+define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
+; CHECK-LABEL: st4_8h
+; CHECK: st4.8h
+ call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+
+define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
+; CHECK-LABEL: st2_2s
+; CHECK: st2.2s
+ call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
+ ret void
+}
+
+define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind {
+; CHECK-LABEL: st3_2s
+; CHECK: st3.2s
+ call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
+ ret void
+}
+
+define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
+; CHECK-LABEL: st4_2s
+; CHECK: st4.2s
+ call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+
+define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
+; CHECK-LABEL: st2_4s
+; CHECK: st2.4s
+ call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
+ ret void
+}
+
+define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind {
+; CHECK-LABEL: st3_4s
+; CHECK: st3.4s
+ call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
+ ret void
+}
+
+define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
+; CHECK-LABEL: st4_4s
+; CHECK: st4.4s
+ call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+
+define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind {
+; CHECK-LABEL: st2_1d
+; CHECK: st1.1d
+ call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
+ ret void
+}
+
+define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind {
+; CHECK-LABEL: st3_1d
+; CHECK: st1.1d
+ call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
+ ret void
+}
+
+define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind {
+; CHECK-LABEL: st4_1d
+; CHECK: st1.1d
+ call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+
+define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
+; CHECK-LABEL: st2_2d
+; CHECK: st2.2d
+ call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
+ ret void
+}
+
+define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind {
+; CHECK-LABEL: st3_2d
+; CHECK: st3.2d
+ call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
+ ret void
+}
+
+define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
+; CHECK-LABEL: st4_2d
+; CHECK: st4.2d
+ call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+
+declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x2_v8i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) {
+; CHECK-LABEL: st1_x2_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr)
+ ret void
+}
+
+define void @st1_x2_v4i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) {
+; CHECK-LABEL: st1_x2_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr)
+ ret void
+}
+
+define void @st1_x2_v2i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) {
+; CHECK-LABEL: st1_x2_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr)
+ ret void
+}
+
+define void @st1_x2_v2f32(<2 x float> %A, <2 x float> %B, float* %addr) {
+; CHECK-LABEL: st1_x2_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr)
+ ret void
+}
+
+define void @st1_x2_v1i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) {
+; CHECK-LABEL: st1_x2_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr)
+ ret void
+}
+
+define void @st1_x2_v1f64(<1 x double> %A, <1 x double> %B, double* %addr) {
+; CHECK-LABEL: st1_x2_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x2_v16i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) {
+; CHECK-LABEL: st1_x2_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr)
+ ret void
+}
+
+define void @st1_x2_v8i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) {
+; CHECK-LABEL: st1_x2_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr)
+ ret void
+}
+
+define void @st1_x2_v4i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) {
+; CHECK-LABEL: st1_x2_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr)
+ ret void
+}
+
+define void @st1_x2_v4f32(<4 x float> %A, <4 x float> %B, float* %addr) {
+; CHECK-LABEL: st1_x2_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr)
+ ret void
+}
+
+define void @st1_x2_v2i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) {
+; CHECK-LABEL: st1_x2_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
+ ret void
+}
+
+define void @st1_x2_v2f64(<2 x double> %A, <2 x double> %B, double* %addr) {
+; CHECK-LABEL: st1_x2_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) {
+; CHECK-LABEL: st1_x3_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr)
+ ret void
+}
+
+define void @st1_x3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) {
+; CHECK-LABEL: st1_x3_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr)
+ ret void
+}
+
+define void @st1_x3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) {
+; CHECK-LABEL: st1_x3_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr)
+ ret void
+}
+
+define void @st1_x3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) {
+; CHECK-LABEL: st1_x3_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr)
+ ret void
+}
+
+define void @st1_x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) {
+; CHECK-LABEL: st1_x3_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr)
+ ret void
+}
+
+define void @st1_x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) {
+; CHECK-LABEL: st1_x3_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) {
+; CHECK-LABEL: st1_x3_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr)
+ ret void
+}
+
+define void @st1_x3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) {
+; CHECK-LABEL: st1_x3_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr)
+ ret void
+}
+
+define void @st1_x3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) {
+; CHECK-LABEL: st1_x3_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr)
+ ret void
+}
+
+define void @st1_x3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) {
+; CHECK-LABEL: st1_x3_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr)
+ ret void
+}
+
+define void @st1_x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) {
+; CHECK-LABEL: st1_x3_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr)
+ ret void
+}
+
+define void @st1_x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) {
+; CHECK-LABEL: st1_x3_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr)
+ ret void
+}
+
+
+declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) {
+; CHECK-LABEL: st1_x4_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr)
+ ret void
+}
+
+define void @st1_x4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) {
+; CHECK-LABEL: st1_x4_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr)
+ ret void
+}
+
+define void @st1_x4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) {
+; CHECK-LABEL: st1_x4_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr)
+ ret void
+}
+
+define void @st1_x4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) {
+; CHECK-LABEL: st1_x4_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr)
+ ret void
+}
+
+define void @st1_x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) {
+; CHECK-LABEL: st1_x4_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr)
+ ret void
+}
+
+define void @st1_x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) {
+; CHECK-LABEL: st1_x4_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr)
+ ret void
+}
+
+declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) {
+; CHECK-LABEL: st1_x4_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr)
+ ret void
+}
+
+define void @st1_x4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) {
+; CHECK-LABEL: st1_x4_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr)
+ ret void
+}
+
+define void @st1_x4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) {
+; CHECK-LABEL: st1_x4_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr)
+ ret void
+}
+
+define void @st1_x4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) {
+; CHECK-LABEL: st1_x4_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr)
+ ret void
+}
+
+define void @st1_x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) {
+; CHECK-LABEL: st1_x4_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr)
+ ret void
+}
+
+define void @st1_x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) {
+; CHECK-LABEL: st1_x4_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr)
+ ret void
+}
diff --git a/test/CodeGen/ARM64/stack-no-frame.ll b/test/CodeGen/ARM64/stack-no-frame.ll
new file mode 100644
index 0000000..b5970c0
--- /dev/null
+++ b/test/CodeGen/ARM64/stack-no-frame.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
+
+@global = global [20 x i64] zeroinitializer, align 8
+
+; The following function has enough locals to need some restoring, but not a
+; frame record. In an intermediate frame refactoring, prologue and epilogue were
+; inconsistent about how much to move SP.
+define void @test_stack_no_frame() {
+; CHECK: test_stack_no_frame
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+ %local = alloca [20 x i64]
+ %val = load volatile [20 x i64]* @global, align 8
+ store volatile [20 x i64] %val, [20 x i64]* %local, align 8
+
+ %val2 = load volatile [20 x i64]* %local, align 8
+ store volatile [20 x i64] %val2, [20 x i64]* @global, align 8
+
+; CHECK: add sp, sp, #[[STACKSIZE]]
+ ret void
+}
diff --git a/test/CodeGen/ARM64/stackmap.ll b/test/CodeGen/ARM64/stackmap.ll
new file mode 100644
index 0000000..2c7c6ae
--- /dev/null
+++ b/test/CodeGen/ARM64/stackmap.ll
@@ -0,0 +1,288 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+;
+; Note: Print verbose stackmaps using -debug-only=stackmaps.
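+; Location kinds assumed in the checks below (stackmap format version 1):
+; 1 = Register, 2 = Direct, 3 = Indirect [Reg + Offset], 4 = Constant,
+; 5 = ConstantIndex into the Large Constants table.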
+
+; We are not getting the correct stack alignment when cross compiling for arm64.
+; So specify a datalayout here.
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 11
+; Num LargeConstants
+; CHECK-NEXT: .long 2
+; Num Callsites
+; CHECK-NEXT: .long 11
+
+; Functions and stack size
+; CHECK-NEXT: .quad _constantargs
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _osrinline
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad _osrcold
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _propertyRead
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _propertyWrite
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _jsVoidCall
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _jsIntCall
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _spilledValue
+; CHECK-NEXT: .quad 160
+; CHECK-NEXT: .quad _spilledStackMapValue
+; CHECK-NEXT: .quad 128
+; CHECK-NEXT: .quad _liveConstant
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _clobberLR
+; CHECK-NEXT: .quad 112
+
+; Num LargeConstants
+; CHECK-NEXT: .quad 4294967295
+; CHECK-NEXT: .quad 4294967296
+
+; Constant arguments
+;
+; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .long L{{.*}}-_constantargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 4
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 65535
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 65536
+; SmallConstant
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
+; LargeConstant at index 0
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 1
+
+define void @constantargs() {
+entry:
+ %0 = inttoptr i64 244837814094590 to i8*
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ ret void
+}
+
+; Inline OSR Exit
+;
+; CHECK-LABEL: .long L{{.*}}-_osrinline
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @osrinline(i64 %a, i64 %b) {
+entry:
+ ; Runtime void->void call.
+ call void inttoptr (i64 244837814094590 to void ()*)()
+ ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+ ret void
+}
+
+; Cold OSR Exit
+;
+; 2 live variables in register.
+;
+; CHECK-LABEL: .long L{{.*}}-_osrcold
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @osrcold(i64 %a, i64 %b) {
+entry:
+ %test = icmp slt i64 %a, %b
+ br i1 %test, label %ret, label %cold
+cold:
+  ; OSR patchpoint with 20-byte nop-slide and 2 live vars.
+ %thunk = inttoptr i64 244837814094590 to i8*
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
+ unreachable
+ret:
+ ret void
+}
+
+; Property Read
+; CHECK-LABEL: .long L{{.*}}-_propertyRead
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+;
+; FIXME: There are currently no stackmap entries. After moving to
+; AnyRegCC, we will have entries for the object and return value.
+define i64 @propertyRead(i64* %obj) {
+entry:
+ %resolveRead = inttoptr i64 244837814094590 to i8*
+ %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
+ %add = add i64 %result, 3
+ ret i64 %add
+}
+
+; Property Write
+; CHECK-LABEL: .long L{{.*}}-_propertyWrite
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
+entry:
+ %resolveWrite = inttoptr i64 244837814094590 to i8*
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ ret void
+}
+
+; Void JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-LABEL: .long L{{.*}}-_jsVoidCall
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+ %resolveCall = inttoptr i64 244837814094590 to i8*
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ ret void
+}
+
+; i64 JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-LABEL: .long L{{.*}}-_jsIntCall
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+ %resolveCall = inttoptr i64 244837814094590 to i8*
+ %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %add = add i64 %result, 3
+ ret i64 %add
+}
+
+; Spilled stack map values.
+;
+; Verify 28 stack map entries.
+;
+; CHECK-LABEL: .long L{{.*}}-_spilledValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 28
+;
+; Check that at least one is a spilled entry relative to the frame pointer (x29).
+; Location: Indirect FP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
+entry:
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+ ret void
+}
+
+; Spilled stack map values.
+;
+; Verify 30 stack map entries.
+;
+; CHECK-LABEL: .long L{{.*}}-_spilledStackMapValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 30
+;
+; Check that at least one is a spilled entry relative to the frame pointer (x29).
+; Location: Indirect FP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) {
+entry:
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
+ ret void
+}
+
+
+; Map a constant value.
+;
+; CHECK-LABEL: .long L{{.*}}-_liveConstant
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 33
+
+define void @liveConstant() {
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
+ ret void
+}
+
+; Map a value when LR is the only free register.
+;
+; CHECK-LABEL: .long L{{.*}}-_clobberLR
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: Indirect FP (r29) - offset
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -{{[0-9]+}}
+define void @clobberLR(i32 %a) {
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x31}"() nounwind
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/ARM64/stacksave.ll b/test/CodeGen/ARM64/stacksave.ll
new file mode 100644
index 0000000..a79e99b
--- /dev/null
+++ b/test/CodeGen/ARM64/stacksave.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -verify-coalescing
+; <rdar://problem/11522048>
+target triple = "arm64-apple-macosx10.8.0"
+
+; Verify that we can handle spilling the stack pointer without attempting to
+; spill it directly.
+; CHECK: f
+; CHECK: mov [[X0:x[0-9]+]], sp
+; CHECK: str [[X0]]
+; CHECK: inlineasm
+define void @f() nounwind ssp {
+entry:
+ %savedstack = call i8* @llvm.stacksave() nounwind
+ call void asm sideeffect "; inlineasm", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind
+ call void @llvm.stackrestore(i8* %savedstack) nounwind
+ ret void
+}
+
+declare i8* @llvm.stacksave() nounwind
+declare void @llvm.stackrestore(i8*) nounwind
diff --git a/test/CodeGen/ARM64/stp.ll b/test/CodeGen/ARM64/stp.ll
new file mode 100644
index 0000000..eacf093
--- /dev/null
+++ b/test/CodeGen/ARM64/stp.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=arm64 -arm64-stp-suppress=false -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\
+; RUN: -verify-machineinstrs | FileCheck -check-prefix=STUR_CHK %s
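+; The first RUN line checks that adjacent scaled stores are merged into stp
+; (CHECK); the second enables merging of unscaled, negative-offset stores and
+; checks the resulting stp under the STUR_CHK prefix.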
+
+; CHECK: stp_int
+; CHECK: stp w0, w1, [x2]
+define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
+ store i32 %a, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32* %p, i64 1
+ store i32 %b, i32* %add.ptr, align 4
+ ret void
+}
+
+; CHECK: stp_long
+; CHECK: stp x0, x1, [x2]
+define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
+ store i64 %a, i64* %p, align 8
+ %add.ptr = getelementptr inbounds i64* %p, i64 1
+ store i64 %b, i64* %add.ptr, align 8
+ ret void
+}
+
+; CHECK: stp_float
+; CHECK: stp s0, s1, [x0]
+define void @stp_float(float %a, float %b, float* nocapture %p) nounwind {
+ store float %a, float* %p, align 4
+ %add.ptr = getelementptr inbounds float* %p, i64 1
+ store float %b, float* %add.ptr, align 4
+ ret void
+}
+
+; CHECK: stp_double
+; CHECK: stp d0, d1, [x0]
+define void @stp_double(double %a, double %b, double* nocapture %p) nounwind {
+ store double %a, double* %p, align 8
+ %add.ptr = getelementptr inbounds double* %p, i64 1
+ store double %b, double* %add.ptr, align 8
+ ret void
+}
+
+; Test the load/store optimizer: combine sturs into an stp, if appropriate.
+define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
+; STUR_CHK: stur_int
+; STUR_CHK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8]
+; STUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i32* %p, i32 -1
+ store i32 %a, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32* %p, i32 -2
+ store i32 %b, i32* %p2, align 2
+ ret void
+}
+
+define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
+; STUR_CHK: stur_long
+; STUR_CHK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16]
+; STUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %p, i32 -1
+ store i64 %a, i64* %p1, align 2
+ %p2 = getelementptr inbounds i64* %p, i32 -2
+ store i64 %b, i64* %p2, align 2
+ ret void
+}
+
+define void @stur_float(float %a, float %b, float* nocapture %p) nounwind {
+; STUR_CHK: stur_float
+; STUR_CHK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8]
+; STUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds float* %p, i32 -1
+ store float %a, float* %p1, align 2
+ %p2 = getelementptr inbounds float* %p, i32 -2
+ store float %b, float* %p2, align 2
+ ret void
+}
+
+define void @stur_double(double %a, double %b, double* nocapture %p) nounwind {
+; STUR_CHK: stur_double
+; STUR_CHK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16]
+; STUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds double* %p, i32 -1
+ store double %a, double* %p1, align 2
+ %p2 = getelementptr inbounds double* %p, i32 -2
+ store double %b, double* %p2, align 2
+ ret void
+}
+
+define void @splat_v4i32(i32 %v, i32 *%p) {
+entry:
+
+; CHECK-LABEL: splat_v4i32
+; CHECK-DAG: stp w0, w0, [x1]
+; CHECK-DAG: stp w0, w0, [x1, #8]
+; CHECK: ret
+
+ %p17 = insertelement <4 x i32> undef, i32 %v, i32 0
+ %p18 = insertelement <4 x i32> %p17, i32 %v, i32 1
+ %p19 = insertelement <4 x i32> %p18, i32 %v, i32 2
+ %p20 = insertelement <4 x i32> %p19, i32 %v, i32 3
+ %p21 = bitcast i32* %p to <4 x i32>*
+ store <4 x i32> %p20, <4 x i32>* %p21, align 4
+ ret void
+}
diff --git a/test/CodeGen/ARM64/strict-align.ll b/test/CodeGen/ARM64/strict-align.ll
new file mode 100644
index 0000000..e392172
--- /dev/null
+++ b/test/CodeGen/ARM64/strict-align.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -arm64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT
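+; With -arm64-strict-align, the under-aligned accesses below must be expanded
+; into narrower aligned loads recombined with orr (CHECK-STRICT); by default a
+; single unaligned load is emitted (CHECK).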
+
+define i32 @f0(i32* nocapture %p) nounwind {
+; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
+; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
+; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
+; CHECK-STRICT: ret
+
+; CHECK: ldr w0, [x0]
+; CHECK: ret
+ %tmp = load i32* %p, align 2
+ ret i32 %tmp
+}
+
+define i64 @f1(i64* nocapture %p) nounwind {
+; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
+; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
+; CHECK-STRICT: ret
+
+; CHECK: ldr x0, [x0]
+; CHECK: ret
+ %tmp = load i64* %p, align 4
+ ret i64 %tmp
+}
diff --git a/test/CodeGen/ARM64/stur.ll b/test/CodeGen/ARM64/stur.ll
new file mode 100644
index 0000000..8326bba
--- /dev/null
+++ b/test/CodeGen/ARM64/stur.ll
@@ -0,0 +1,98 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
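+; stur/sturh/sturb are the unscaled-immediate store forms, expected here for
+; the negative (or otherwise unscalable) offsets produced by the GEPs below.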
+%struct.X = type <{ i32, i64, i64 }>
+
+define void @foo1(i32* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: stur w1, [x0, #-4]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i64 %val to i32
+ %ptr = getelementptr inbounds i32* %p, i64 -1
+ store i32 %tmp1, i32* %ptr, align 4
+ ret void
+}
+define void @foo2(i16* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK: sturh w1, [x0, #-2]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i64 %val to i16
+ %ptr = getelementptr inbounds i16* %p, i64 -1
+ store i16 %tmp1, i16* %ptr, align 2
+ ret void
+}
+define void @foo3(i8* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK: sturb w1, [x0, #-1]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i64 %val to i8
+ %ptr = getelementptr inbounds i8* %p, i64 -1
+ store i8 %tmp1, i8* %ptr, align 1
+ ret void
+}
+define void @foo4(i16* %p, i32 %val) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK: sturh w1, [x0, #-2]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i32 %val to i16
+ %ptr = getelementptr inbounds i16* %p, i32 -1
+ store i16 %tmp1, i16* %ptr, align 2
+ ret void
+}
+define void @foo5(i8* %p, i32 %val) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK: sturb w1, [x0, #-1]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i32 %val to i8
+ %ptr = getelementptr inbounds i8* %p, i32 -1
+ store i8 %tmp1, i8* %ptr, align 1
+ ret void
+}
+
+define void @foo(%struct.X* nocapture %p) nounwind optsize ssp {
+; CHECK-LABEL: foo:
+; CHECK-NOT: str
+; CHECK: stur xzr, [x0, #12]
+; CHECK-NEXT: stur xzr, [x0, #4]
+; CHECK-NEXT: ret
+ %B = getelementptr inbounds %struct.X* %p, i64 0, i32 1
+ %val = bitcast i64* %B to i8*
+ call void @llvm.memset.p0i8.i64(i8* %val, i8 0, i64 16, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+; Unaligned 16b stores are split into 8b stores for performance.
+; radar://15424193
+
+; CHECK-LABEL: unaligned:
+; CHECK-NOT: str q0
+; CHECK: str d[[REG:[0-9]+]], [x0]
+; CHECK: ext.16b v[[REG2:[0-9]+]], v[[REG]], v[[REG]], #8
+; CHECK: str d[[REG2]], [x0, #8]
+define void @unaligned(<4 x i32>* %p, <4 x i32> %v) nounwind {
+ store <4 x i32> %v, <4 x i32>* %p, align 4
+ ret void
+}
+
+; CHECK-LABEL: aligned:
+; CHECK: str q0
+define void @aligned(<4 x i32>* %p, <4 x i32> %v) nounwind {
+ store <4 x i32> %v, <4 x i32>* %p
+ ret void
+}
+
+; Don't split one- and two-byte-aligned stores.
+; radar://16349308
+
+; CHECK-LABEL: twobytealign:
+; CHECK: str q0
+define void @twobytealign(<4 x i32>* %p, <4 x i32> %v) nounwind {
+ store <4 x i32> %v, <4 x i32>* %p, align 2
+ ret void
+}
+; CHECK-LABEL: onebytealign:
+; CHECK: str q0
+define void @onebytealign(<4 x i32>* %p, <4 x i32> %v) nounwind {
+ store <4 x i32> %v, <4 x i32>* %p, align 1
+ ret void
+}
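; Editorial sketch, not part of this commit: the same unscaled negative-offset
; store pattern as foo1 above, one element further back. A single
; "stur w1, [x0, #-8]" would be the expected lowering; that expectation and
; the function name are assumptions for illustration.
define void @foo1_sketch(i32* %p, i64 %val) nounwind {
  %tmp1 = trunc i64 %val to i32
  %ptr = getelementptr inbounds i32* %p, i64 -2
  store i32 %tmp1, i32* %ptr, align 4
  ret void
}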
diff --git a/test/CodeGen/ARM64/subvector-extend.ll b/test/CodeGen/ARM64/subvector-extend.ll
new file mode 100644
index 0000000..ad2f06c
--- /dev/null
+++ b/test/CodeGen/ARM64/subvector-extend.ll
@@ -0,0 +1,141 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+; Test efficient codegen of vector extends up from legal types to 128-bit
+; and 256-bit vector types.
+
+;-----
+; Vectors of i16.
+;-----
+define <8 x i16> @func1(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: func1:
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <8 x i8> %v0 to <8 x i16>
+ ret <8 x i16> %r
+}
+
+define <8 x i16> @func2(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: func2:
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <8 x i8> %v0 to <8 x i16>
+ ret <8 x i16> %r
+}
+
+define <16 x i16> @func3(<16 x i8> %v0) nounwind {
+; CHECK-LABEL: func3:
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <16 x i8> %v0 to <16 x i16>
+ ret <16 x i16> %r
+}
+
+define <16 x i16> @func4(<16 x i8> %v0) nounwind {
+; CHECK-LABEL: func4:
+; CHECK-NEXT: sshll2.8h v1, v0, #0
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <16 x i8> %v0 to <16 x i16>
+ ret <16 x i16> %r
+}
+
+;-----
+; Vectors of i32.
+;-----
+
+define <4 x i32> @afunc1(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc1:
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <4 x i16> %v0 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @afunc2(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc2:
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <4 x i16> %v0 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <8 x i32> @afunc3(<8 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc3:
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <8 x i16> %v0 to <8 x i32>
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @afunc4(<8 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc4:
+; CHECK-NEXT: sshll2.4s v1, v0, #0
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <8 x i16> %v0 to <8 x i32>
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @bfunc1(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: bfunc1:
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <8 x i8> %v0 to <8 x i32>
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @bfunc2(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: bfunc2:
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: sshll2.4s v1, v0, #0
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <8 x i8> %v0 to <8 x i32>
+ ret <8 x i32> %r
+}
+
+;-----
+; Vectors of i64.
+;-----
+
+define <4 x i64> @zfunc1(<4 x i32> %v0) nounwind {
+; CHECK-LABEL: zfunc1:
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <4 x i32> %v0 to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @zfunc2(<4 x i32> %v0) nounwind {
+; CHECK-LABEL: zfunc2:
+; CHECK-NEXT: sshll2.2d v1, v0, #0
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <4 x i32> %v0 to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @bfunc3(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: bfunc3:
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <4 x i16> %v0 to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @cfunc4(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: cfunc4:
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: sshll2.2d v1, v0, #0
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <4 x i16> %v0 to <4 x i64>
+ ret <4 x i64> %r
+}
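; Editorial sketch, not part of this commit: the single-step 128-bit case for
; i64 lanes, which the file above only exercises as part of the 256-bit
; results. A lone ushll.2d (shift #0) would be the expected lowering; that is
; an assumption, as is the function name.
define <2 x i64> @zfunc0_sketch(<2 x i32> %v0) nounwind {
  %r = zext <2 x i32> %v0 to <2 x i64>
  ret <2 x i64> %r
}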
diff --git a/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll b/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll
new file mode 100644
index 0000000..4ab2bee
--- /dev/null
+++ b/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; rdar://13214163 - Make sure we generate a correct lookup table for the TBL
+; instruction when the element size of the vector is not 8 bits. We were
+; getting both the endianness and the element indexing wrong.
+define <8 x i16> @foo(<8 x i16> %a) nounwind readnone {
+; CHECK: .section __TEXT,__literal16,16byte_literals
+; CHECK: .align 4
+; CHECK:lCPI0_0:
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 8 ; 0x8
+; CHECK: .byte 9 ; 0x9
+; CHECK: .byte 8 ; 0x8
+; CHECK: .byte 9 ; 0x9
+; CHECK: .byte 8 ; 0x8
+; CHECK: .byte 9 ; 0x9
+; CHECK: .byte 8 ; 0x8
+; CHECK: .byte 9 ; 0x9
+; CHECK: .section __TEXT,__text,regular,pure_instructions
+; CHECK: .globl _foo
+; CHECK: .align 2
+; CHECK:_foo: ; @foo
+; CHECK: adrp [[BASE:x[0-9]+]], lCPI0_0@PAGE
+; CHECK: ldr q[[REG:[0-9]+]], {{\[}}[[BASE]], lCPI0_0@PAGEOFF]
+; CHECK: tbl.16b v0, { v0 }, v[[REG]]
+; CHECK: ret
+
+ %val = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %val
+}
diff --git a/test/CodeGen/ARM64/tbl.ll b/test/CodeGen/ARM64/tbl.ll
new file mode 100644
index 0000000..e1edd21
--- /dev/null
+++ b/test/CodeGen/ARM64/tbl.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+; CHECK: tbl1_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+; CHECK: tbl1_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+; CHECK: tbl2_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK: tbl2_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbl3_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbl3_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbl4_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbl4_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+; CHECK: tbx1_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+; CHECK: tbx1_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbx2_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbx2_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbx3_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx3.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbx3_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+; CHECK: tbx4_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+; CHECK: tbx4_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
diff --git a/test/CodeGen/ARM64/this-return.ll b/test/CodeGen/ARM64/this-return.ll
new file mode 100644
index 0000000..30f5b9b
--- /dev/null
+++ b/test/CodeGen/ARM64/this-return.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+%struct.A = type { i8 }
+%struct.B = type { i32 }
+%struct.C = type { %struct.B }
+%struct.D = type { %struct.B }
+%struct.E = type { %struct.B, %struct.B }
+
+declare %struct.A* @A_ctor_base(%struct.A* returned)
+declare %struct.B* @B_ctor_base(%struct.B* returned, i32)
+declare %struct.B* @B_ctor_complete(%struct.B* returned, i32)
+
+declare %struct.A* @A_ctor_base_nothisret(%struct.A*)
+declare %struct.B* @B_ctor_base_nothisret(%struct.B*, i32)
+declare %struct.B* @B_ctor_complete_nothisret(%struct.B*, i32)
+
+define %struct.C* @C_ctor_base(%struct.C* returned %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_base:
+; CHECK-NOT: mov {{x[0-9]+}}, x0
+; CHECK: bl {{_?A_ctor_base}}
+; CHECK-NOT: mov x0, {{x[0-9]+}}
+; CHECK: b {{_?B_ctor_base}}
+ %0 = bitcast %struct.C* %this to %struct.A*
+ %call = tail call %struct.A* @A_ctor_base(%struct.A* %0)
+ %1 = getelementptr inbounds %struct.C* %this, i32 0, i32 0
+ %call2 = tail call %struct.B* @B_ctor_base(%struct.B* %1, i32 %x)
+ ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_base_nothisret(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_base_nothisret:
+; CHECK: mov [[SAVETHIS:x[0-9]+]], x0
+; CHECK: bl {{_?A_ctor_base_nothisret}}
+; CHECK: mov x0, [[SAVETHIS]]
+; CHECK-NOT: b {{_?B_ctor_base_nothisret}}
+ %0 = bitcast %struct.C* %this to %struct.A*
+ %call = tail call %struct.A* @A_ctor_base_nothisret(%struct.A* %0)
+ %1 = getelementptr inbounds %struct.C* %this, i32 0, i32 0
+ %call2 = tail call %struct.B* @B_ctor_base_nothisret(%struct.B* %1, i32 %x)
+ ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_complete(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_complete:
+; CHECK: b {{_?C_ctor_base}}
+ %call = tail call %struct.C* @C_ctor_base(%struct.C* %this, i32 %x)
+ ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_complete_nothisret(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_complete_nothisret:
+; CHECK-NOT: b {{_?C_ctor_base_nothisret}}
+ %call = tail call %struct.C* @C_ctor_base_nothisret(%struct.C* %this, i32 %x)
+ ret %struct.C* %this
+}
+
+define %struct.D* @D_ctor_base(%struct.D* %this, i32 %x) {
+entry:
+; CHECK-LABEL: D_ctor_base:
+; CHECK-NOT: mov {{x[0-9]+}}, x0
+; CHECK: bl {{_?B_ctor_complete}}
+; CHECK-NOT: mov x0, {{x[0-9]+}}
+; CHECK: b {{_?B_ctor_complete}}
+ %b = getelementptr inbounds %struct.D* %this, i32 0, i32 0
+ %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+ %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+ ret %struct.D* %this
+}
+
+define %struct.E* @E_ctor_base(%struct.E* %this, i32 %x) {
+entry:
+; CHECK-LABEL: E_ctor_base:
+; CHECK-NOT: b {{_?B_ctor_complete}}
+ %b = getelementptr inbounds %struct.E* %this, i32 0, i32 0
+ %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+ %b2 = getelementptr inbounds %struct.E* %this, i32 0, i32 1
+ %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b2, i32 %x)
+ ret %struct.E* %this
+}
diff --git a/test/CodeGen/ARM64/tls-darwin.ll b/test/CodeGen/ARM64/tls-darwin.ll
new file mode 100644
index 0000000..5e8ec33
--- /dev/null
+++ b/test/CodeGen/ARM64/tls-darwin.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+
+@var = thread_local global i8 0
+
+; N.b. x0 must be the result of the first load (i.e. the address of the
+; descriptor) when tlv_get_addr is called. Likewise the result is returned in
+; x0.
+define i8 @get_var() {
+; CHECK-LABEL: get_var:
+; CHECK: adrp x[[TLVPDESC_SLOT_HI:[0-9]+]], _var@TLVPPAGE
+; CHECK: ldr x0, [x[[TLVPDESC_SLOT_HI]], _var@TLVPPAGEOFF]
+; CHECK: ldr [[TLV_GET_ADDR:x[0-9]+]], [x0]
+; CHECK: blr [[TLV_GET_ADDR]]
+; CHECK: ldrb w0, [x0]
+
+ %val = load i8* @var, align 1
+ ret i8 %val
+}
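; Editorial sketch, not part of this commit: a store through the same
; thread-local variable. The same adrp/ldr/blr descriptor sequence followed by
; an strb to the returned address would be expected, but that sequence is an
; assumption here rather than something the commit asserts.
define void @set_var(i8 %newval) {
  store i8 %newval, i8* @var, align 1
  ret void
}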
diff --git a/test/CodeGen/ARM64/tls-dynamic-together.ll b/test/CodeGen/ARM64/tls-dynamic-together.ll
new file mode 100644
index 0000000..3daae62
--- /dev/null
+++ b/test/CodeGen/ARM64/tls-dynamic-together.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+
+; If the .tlsdesccall and blr parts are emitted completely separately (even with
+; glue) then LLVM will separate them quite happily (with a spill at O0, hence
+; the option). This is definitely wrong, so we make sure they are emitted
+; together.
+
+@general_dynamic_var = external thread_local global i32
+
+define i32 @test_generaldynamic() {
+; CHECK-LABEL: test_generaldynamic:
+
+ %val = load i32* @general_dynamic_var
+ ret i32 %val
+
+; CHECK: .tlsdesccall general_dynamic_var
+; CHECK-NEXT: blr {{x[0-9]+}}
+}
diff --git a/test/CodeGen/ARM64/tls-dynamics.ll b/test/CodeGen/ARM64/tls-dynamics.ll
new file mode 100644
index 0000000..e8a83fd
--- /dev/null
+++ b/test/CodeGen/ARM64/tls-dynamics.ll
@@ -0,0 +1,135 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+
+@general_dynamic_var = external thread_local global i32
+
+define i32 @test_generaldynamic() {
+; CHECK-LABEL: test_generaldynamic:
+
+ %val = load i32* @general_dynamic_var
+ ret i32 %val
+
+ ; FIXME: the adrp instructions are redundant (if harmless).
+; CHECK: adrp [[TLSDESC_HI:x[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: add x0, [[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
+; CHECK: .tlsdesccall general_dynamic_var
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
+; CHECK: ldr w0, [x[[TP]], x0]
+
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+
+}
+
+define i32* @test_generaldynamic_addr() {
+; CHECK-LABEL: test_generaldynamic_addr:
+
+ ret i32* @general_dynamic_var
+
+ ; FIXME: the adrp instructions are redundant (if harmless).
+; CHECK: adrp [[TLSDESC_HI:x[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: add x0, [[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
+; CHECK: .tlsdesccall general_dynamic_var
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK: add x0, [[TP]], x0
+
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+}
+
+@local_dynamic_var = external thread_local(localdynamic) global i32
+
+define i32 @test_localdynamic() {
+; CHECK-LABEL: test_localdynamic:
+
+ %val = load i32* @local_dynamic_var
+ ret i32 %val
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
+; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
+
+; CHECK: add x[[TPREL:[0-9]+]], x0, [[DTP_OFFSET]]
+
+; CHECK: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+
+; CHECK: ldr w0, [x[[TPIDR]], x[[TPREL]]]
+
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+
+}
+
+define i32* @test_localdynamic_addr() {
+; CHECK-LABEL: test_localdynamic_addr:
+
+ ret i32* @local_dynamic_var
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
+; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
+
+; CHECK: add [[TPREL:x[0-9]+]], x0, [[DTP_OFFSET]]
+
+; CHECK: mrs [[TPIDR:x[0-9]+]], TPIDR_EL0
+
+; CHECK: add x0, [[TPIDR]], [[TPREL]]
+
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+
+}
+
+; The entire point of the local-dynamic access model is to have a single call to
+; the expensive resolver. Make sure we achieve that goal.
+
+@local_dynamic_var2 = external thread_local(localdynamic) global i32
+
+define i32 @test_localdynamic_deduplicate() {
+; CHECK-LABEL: test_localdynamic_deduplicate:
+
+ %val = load i32* @local_dynamic_var
+ %val2 = load i32* @local_dynamic_var2
+
+ %sum = add i32 %val, %val2
+ ret i32 %sum
+
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK-NEXT: blr [[CALLEE]]
+
+; CHECK-NOT: _TLS_MODULE_BASE_
+
+; CHECK: ret
+}
diff --git a/test/CodeGen/ARM64/tls-execs.ll b/test/CodeGen/ARM64/tls-execs.ll
new file mode 100644
index 0000000..f0130d8
--- /dev/null
+++ b/test/CodeGen/ARM64/tls-execs.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+
+@initial_exec_var = external thread_local(initialexec) global i32
+
+define i32 @test_initial_exec() {
+; CHECK-LABEL: test_initial_exec:
+ %val = load i32* @initial_exec_var
+
+; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var
+; CHECK: ldr x[[TP_OFFSET:[0-9]+]], [x[[GOTADDR]], :gottprel_lo12:initial_exec_var]
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
+; CHECK: ldr w0, [x[[TP]], x[[TP_OFFSET]]]
+
+; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21
+; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC
+
+ ret i32 %val
+}
+
+define i32* @test_initial_exec_addr() {
+; CHECK-LABEL: test_initial_exec_addr:
+ ret i32* @initial_exec_var
+
+; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var
+; CHECK: ldr [[TP_OFFSET:x[0-9]+]], [x[[GOTADDR]], :gottprel_lo12:initial_exec_var]
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK: add x0, [[TP]], [[TP_OFFSET]]
+
+; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21
+; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC
+
+}
+
+@local_exec_var = thread_local(localexec) global i32 0
+
+define i32 @test_local_exec() {
+; CHECK-LABEL: test_local_exec:
+ %val = load i32* @local_exec_var
+
+; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var // encoding: [0bAAA{{[01]+}},A,0b101AAAAA,0x92]
+; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
+; CHECK: ldr w0, [x[[TP]], [[TP_OFFSET]]]
+
+; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
+; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
+
+ ret i32 %val
+}
+
+define i32* @test_local_exec_addr() {
+; CHECK-LABEL: test_local_exec_addr:
+ ret i32* @local_exec_var
+
+; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var
+; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
+; CHECK: add x0, [[TP]], [[TP_OFFSET]]
+
+; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
+; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
+}
diff --git a/test/CodeGen/ARM64/trap.ll b/test/CodeGen/ARM64/trap.ll
new file mode 100644
index 0000000..c9e0bea
--- /dev/null
+++ b/test/CodeGen/ARM64/trap.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+define void @foo() nounwind {
+; CHECK: foo
+; CHECK: brk #1
+ tail call void @llvm.trap()
+ ret void
+}
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM64/trn.ll b/test/CodeGen/ARM64/trn.ll
new file mode 100644
index 0000000..f467984
--- /dev/null
+++ b/test/CodeGen/ARM64/trn.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrni8:
+;CHECK: trn1.8b
+;CHECK: trn2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrni16:
+;CHECK: trn1.4h
+;CHECK: trn2.4h
+;CHECK-NEXT: add.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+; 2xi32 TRN is redundant with ZIP
+define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vtrni32:
+;CHECK: zip1.2s
+;CHECK: zip2.2s
+;CHECK-NEXT: add.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
+ %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: vtrnf:
+;CHECK: zip1.2s
+;CHECK: zip2.2s
+;CHECK-NEXT: fadd.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
+ %tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 1, i32 3>
+ %tmp5 = fadd <2 x float> %tmp3, %tmp4
+ ret <2 x float> %tmp5
+}
+
+define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrnQi8:
+;CHECK: trn1.16b
+;CHECK: trn2.16b
+;CHECK-NEXT: add.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrnQi16:
+;CHECK: trn1.8h
+;CHECK: trn2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vtrnQi32:
+;CHECK: trn1.4s
+;CHECK: trn2.4s
+;CHECK-NEXT: add.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vtrnQf:
+;CHECK: trn1.4s
+;CHECK: trn2.4s
+;CHECK-NEXT: fadd.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = fadd <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
+
+; Undef shuffle indices should not prevent matching to VTRN:
+
+define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrni8_undef:
+;CHECK: trn1.8b
+;CHECK: trn2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrnQi16_undef:
+;CHECK: trn1.8h
+;CHECK: trn2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
diff --git a/test/CodeGen/ARM64/trunc-store.ll b/test/CodeGen/ARM64/trunc-store.ll
new file mode 100644
index 0000000..e65f5b5
--- /dev/null
+++ b/test/CodeGen/ARM64/trunc-store.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+define void @bar(<8 x i16> %arg, <8 x i8>* %p) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: xtn.8b v[[REG:[0-9]+]], v0
+; CHECK-NEXT: str d[[REG]], [x0]
+; CHECK-NEXT: ret
+ %tmp = trunc <8 x i16> %arg to <8 x i8>
+ store <8 x i8> %tmp, <8 x i8>* %p, align 8
+ ret void
+}
+
+@zptr8 = common global i8* null, align 8
+@zptr16 = common global i16* null, align 8
+@zptr32 = common global i32* null, align 8
+
+define void @fct32(i32 %arg, i64 %var) {
+; CHECK: fct32
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr32@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr32@GOTPAGEOFF]
+; CHECK: ldr [[GLOBALADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: sub w[[OFFSETREGNUM:[0-9]+]], w0, #1
+; w1 is %var truncated
+; CHECK-NEXT: str w1, {{\[}}[[GLOBALADDR]], x[[OFFSETREGNUM]], sxtw #2]
+; CHECK-NEXT: ret
+bb:
+ %.pre37 = load i32** @zptr32, align 8
+ %dec = add nsw i32 %arg, -1
+ %idxprom8 = sext i32 %dec to i64
+ %arrayidx9 = getelementptr inbounds i32* %.pre37, i64 %idxprom8
+ %tmp = trunc i64 %var to i32
+ store i32 %tmp, i32* %arrayidx9, align 4
+ ret void
+}
+
+define void @fct16(i32 %arg, i64 %var) {
+; CHECK: fct16
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr16@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr16@GOTPAGEOFF]
+; CHECK: ldr [[GLOBALADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: sub w[[OFFSETREGNUM:[0-9]+]], w0, #1
+; w1 is %var truncated
+; CHECK-NEXT: strh w1, {{\[}}[[GLOBALADDR]], x[[OFFSETREGNUM]], sxtw #1]
+; CHECK-NEXT: ret
+bb:
+ %.pre37 = load i16** @zptr16, align 8
+ %dec = add nsw i32 %arg, -1
+ %idxprom8 = sext i32 %dec to i64
+ %arrayidx9 = getelementptr inbounds i16* %.pre37, i64 %idxprom8
+ %tmp = trunc i64 %var to i16
+ store i16 %tmp, i16* %arrayidx9, align 4
+ ret void
+}
+
+define void @fct8(i32 %arg, i64 %var) {
+; CHECK: fct8
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr8@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr8@GOTPAGEOFF]
+; CHECK: ldr [[BASEADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: add [[ADDR:x[0-9]+]], [[BASEADDR]], w0, sxtw
+; w1 is %var truncated
+; CHECK-NEXT: sturb w1, {{\[}}[[ADDR]], #-1]
+; CHECK-NEXT: ret
+bb:
+ %.pre37 = load i8** @zptr8, align 8
+ %dec = add nsw i32 %arg, -1
+ %idxprom8 = sext i32 %dec to i64
+ %arrayidx9 = getelementptr inbounds i8* %.pre37, i64 %idxprom8
+ %tmp = trunc i64 %var to i8
+ store i8 %tmp, i8* %arrayidx9, align 4
+ ret void
+}
diff --git a/test/CodeGen/ARM64/umaxv.ll b/test/CodeGen/ARM64/umaxv.ll
new file mode 100644
index 0000000..15277d3
--- /dev/null
+++ b/test/CodeGen/ARM64/umaxv.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u8x8:
+; CHECK: umaxv.8b b[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
+ %tmp = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @bar(...)
+
+define i32 @vmax_u4x16(<4 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u4x16:
+; CHECK: umaxv.4h h[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
+ %tmp = trunc i32 %vmaxv.i to i16
+ %tobool = icmp eq i16 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @vmax_u8x16(<8 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u8x16:
+; CHECK: umaxv.8h h[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
+ %tmp = trunc i32 %vmaxv.i to i16
+ %tobool = icmp eq i16 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @vmax_u16x8(<16 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u16x8:
+; CHECK: umaxv.16b b[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
+ %tmp = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone
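; Editorial sketch, not part of this commit: a .4s variant of the same
; reduction pattern, assuming the v4i32 overload of the umaxv intrinsic is
; available in this backend. The expected instruction would be umaxv.4s into
; an s register; both the overload and that lowering are assumptions.
define i32 @vmax_u4x32_sketch(<4 x i32> %a) nounwind {
entry:
  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i32(<4 x i32> %a) nounwind
  ret i32 %vmaxv.i
}

declare i32 @llvm.arm64.neon.umaxv.i32.v4i32(<4 x i32>) nounwind readnone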
diff --git a/test/CodeGen/ARM64/uminv.ll b/test/CodeGen/ARM64/uminv.ll
new file mode 100644
index 0000000..440522f
--- /dev/null
+++ b/test/CodeGen/ARM64/uminv.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u8x8:
+; CHECK: uminv.8b b[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
+ %tmp = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @bar(...)
+
+define i32 @vmin_u4x16(<4 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u4x16:
+; CHECK: uminv.4h h[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
+ %tmp = trunc i32 %vminv.i to i16
+ %tobool = icmp eq i16 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @vmin_u8x16(<8 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u8x16:
+; CHECK: uminv.8h h[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
+ %tmp = trunc i32 %vminv.i to i16
+ %tobool = icmp eq i16 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @vmin_u16x8(<16 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u16x8:
+; CHECK: uminv.16b b[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
+ %tmp = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM64/umov.ll b/test/CodeGen/ARM64/umov.ll
new file mode 100644
index 0000000..7701874
--- /dev/null
+++ b/test/CodeGen/ARM64/umov.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define zeroext i8 @f1(<16 x i8> %a) {
+; CHECK-LABEL: f1:
+; CHECK: umov.b w0, v0[3]
+; CHECK-NEXT: ret
+ %vecext = extractelement <16 x i8> %a, i32 3
+ ret i8 %vecext
+}
+
+define zeroext i16 @f2(<4 x i16> %a) {
+; CHECK-LABEL: f2:
+; CHECK: umov.h w0, v0[2]
+; CHECK-NEXT: ret
+ %vecext = extractelement <4 x i16> %a, i32 2
+ ret i16 %vecext
+}
+
+define i32 @f3(<2 x i32> %a) {
+; CHECK-LABEL: f3:
+; CHECK: umov.s w0, v0[1]
+; CHECK-NEXT: ret
+ %vecext = extractelement <2 x i32> %a, i32 1
+ ret i32 %vecext
+}
+
+define i64 @f4(<2 x i64> %a) {
+; CHECK-LABEL: f4:
+; CHECK: umov.d x0, v0[1]
+; CHECK-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 1
+ ret i64 %vecext
+}
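; Editorial sketch, not part of this commit: signed counterparts of the
; extractions above, assuming the sign-extending sub-word cases lower to smov
; rather than umov. The function names and that lowering are illustrative
; assumptions only.
define signext i8 @f1_s_sketch(<16 x i8> %a) {
  %vecext = extractelement <16 x i8> %a, i32 3
  ret i8 %vecext
}

define signext i16 @f2_s_sketch(<4 x i16> %a) {
  %vecext = extractelement <4 x i16> %a, i32 2
  ret i16 %vecext
}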
diff --git a/test/CodeGen/ARM64/unaligned_ldst.ll b/test/CodeGen/ARM64/unaligned_ldst.ll
new file mode 100644
index 0000000..20b80c0
--- /dev/null
+++ b/test/CodeGen/ARM64/unaligned_ldst.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://11231896
+
+define void @t1(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: orr
+; CHECK: ldr [[X0:x[0-9]+]], [x1]
+; CHECK: str [[X0]], [x0]
+ %tmp1 = bitcast i8* %b to i64*
+ %tmp2 = bitcast i8* %a to i64*
+ %tmp3 = load i64* %tmp1, align 1
+ store i64 %tmp3, i64* %tmp2, align 1
+ ret void
+}
+
+define void @t2(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK-NOT: orr
+; CHECK: ldr [[W0:w[0-9]+]], [x1]
+; CHECK: str [[W0]], [x0]
+ %tmp1 = bitcast i8* %b to i32*
+ %tmp2 = bitcast i8* %a to i32*
+ %tmp3 = load i32* %tmp1, align 1
+ store i32 %tmp3, i32* %tmp2, align 1
+ ret void
+}
+
+define void @t3(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK-NOT: orr
+; CHECK: ldrh [[W0:w[0-9]+]], [x1]
+; CHECK: strh [[W0]], [x0]
+ %tmp1 = bitcast i8* %b to i16*
+ %tmp2 = bitcast i8* %a to i16*
+ %tmp3 = load i16* %tmp1, align 1
+ store i16 %tmp3, i16* %tmp2, align 1
+ ret void
+}
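; Editorial sketch, not part of this commit: a 128-bit variant of the same
; unaligned copy pattern. By analogy with the one-byte-aligned cases in
; stur.ll above, a single q-register ldr/str pair would be expected rather
; than a split copy, but that expectation is an assumption.
define void @t4_sketch(<2 x i64>* nocapture %a, <2 x i64>* nocapture %b) nounwind {
entry:
  %tmp = load <2 x i64>* %b, align 1
  store <2 x i64> %tmp, <2 x i64>* %a, align 1
  ret void
}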
diff --git a/test/CodeGen/ARM64/uzp.ll b/test/CodeGen/ARM64/uzp.ll
new file mode 100644
index 0000000..60e16d0
--- /dev/null
+++ b/test/CodeGen/ARM64/uzp.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpi8:
+;CHECK: uzp1.8b
+;CHECK: uzp2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpi16:
+;CHECK: uzp1.4h
+;CHECK: uzp2.4h
+;CHECK-NEXT: add.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpQi8:
+;CHECK: uzp1.16b
+;CHECK: uzp2.16b
+;CHECK-NEXT: add.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpQi16:
+;CHECK: uzp1.8h
+;CHECK: uzp2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vuzpQi32:
+;CHECK: uzp1.4s
+;CHECK: uzp2.4s
+;CHECK-NEXT: add.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vuzpQf:
+;CHECK: uzp1.4s
+;CHECK: uzp2.4s
+;CHECK-NEXT: fadd.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = fadd <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
+
+; Undef shuffle indices should not prevent matching to VUZP:
+
+define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpi8_undef:
+;CHECK: uzp1.8b
+;CHECK: uzp2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpQi16_undef:
+;CHECK: uzp1.8h
+;CHECK: uzp2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
diff --git a/test/CodeGen/ARM64/vaargs.ll b/test/CodeGen/ARM64/vaargs.ll
new file mode 100644
index 0000000..ce07635
--- /dev/null
+++ b/test/CodeGen/ARM64/vaargs.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-darwin11.0.0"
+
+define float @t1(i8* nocapture %fmt, ...) nounwind ssp {
+entry:
+; CHECK: t1
+; CHECK: fcvt
+ %argp = alloca i8*, align 8
+ %argp1 = bitcast i8** %argp to i8*
+ call void @llvm.va_start(i8* %argp1)
+ %0 = va_arg i8** %argp, i32
+ %1 = va_arg i8** %argp, float
+ call void @llvm.va_end(i8* %argp1)
+ ret float %1
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @llvm.va_end(i8*) nounwind
diff --git a/test/CodeGen/ARM64/vabs.ll b/test/CodeGen/ARM64/vabs.ll
new file mode 100644
index 0000000..0d8aa24
--- /dev/null
+++ b/test/CodeGen/ARM64/vabs.ll
@@ -0,0 +1,804 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+
+define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sabdl8h:
+;CHECK: sabdl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sabdl4s:
+;CHECK: sabdl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sabdl2d:
+;CHECK: sabdl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sabdl2_8h:
+;CHECK: sabdl2.8h
+ %load1 = load <16 x i8>* %A
+ %load2 = load <16 x i8>* %B
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sabdl2_4s:
+;CHECK: sabdl2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sabdl2_2d:
+;CHECK: sabdl2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uabdl8h:
+;CHECK: uabdl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uabdl4s:
+;CHECK: uabdl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uabdl2d:
+;CHECK: uabdl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uabdl2_8h:
+;CHECK: uabdl2.8h
+ %load1 = load <16 x i8>* %A
+ %load2 = load <16 x i8>* %B
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uabdl2_4s:
+;CHECK: uabdl2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uabdl2_2d:
+;CHECK: uabdl2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fabd_2s:
+;CHECK: fabd.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fabd_4s:
+;CHECK: fabd.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fabd_2d:
+;CHECK: fabd.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sabd_8b:
+;CHECK: sabd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sabd_16b:
+;CHECK: sabd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sabd_4h:
+;CHECK: sabd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sabd_8h:
+;CHECK: sabd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sabd_2s:
+;CHECK: sabd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sabd_4s:
+;CHECK: sabd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uabd_8b:
+;CHECK: uabd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uabd_16b:
+;CHECK: uabd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uabd_4h:
+;CHECK: uabd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uabd_8h:
+;CHECK: uabd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uabd_2s:
+;CHECK: uabd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uabd_4s:
+;CHECK: uabd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqabs_8b:
+;CHECK: sqabs.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqabs_16b:
+;CHECK: sqabs.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqabs_4h:
+;CHECK: sqabs.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqabs_8h:
+;CHECK: sqabs.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqabs_2s:
+;CHECK: sqabs.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqabs_4s:
+;CHECK: sqabs.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
+
+define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqneg_8b:
+;CHECK: sqneg.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqneg_16b:
+;CHECK: sqneg.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqneg_4h:
+;CHECK: sqneg.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqneg_8h:
+;CHECK: sqneg.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqneg_2s:
+;CHECK: sqneg.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqneg_4s:
+;CHECK: sqneg.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
+
+define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: abs_8b:
+;CHECK: abs.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: abs_16b:
+;CHECK: abs.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: abs_4h:
+;CHECK: abs.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: abs_8h:
+;CHECK: abs.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: abs_2s:
+;CHECK: abs.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: abs_4s:
+;CHECK: abs.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
+; CHECK-LABEL: abs_1d:
+; CHECK: abs d0, d0
+ %abs = call <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64> %A)
+ ret <1 x i64> %abs
+}
+
+define i64 @abs_1d_honestly(i64 %A) nounwind {
+; CHECK-LABEL: abs_1d_honestly:
+; CHECK: abs d0, d0
+ %abs = call i64 @llvm.arm64.neon.abs.i64(i64 %A)
+ ret i64 %abs
+}
+
+declare <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64>) nounwind readnone
+declare i64 @llvm.arm64.neon.abs.i64(i64) nounwind readnone
+
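+; An [su]abd intrinsic whose result is widened and then accumulated should be
+; selected as a single sabal/uabal (and sabal2/uabal2 for the high halves).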
+define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: sabal8h:
+;CHECK: sabal.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sabal4s:
+;CHECK: sabal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sabal2d:
+;CHECK: sabal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: sabal2_8h:
+;CHECK: sabal2.8h
+ %load1 = load <16 x i8>* %A
+ %load2 = load <16 x i8>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sabal2_4s:
+;CHECK: sabal2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sabal2_2d:
+;CHECK: sabal2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: uabal8h:
+;CHECK: uabal.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: uabal4s:
+;CHECK: uabal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: uabal2d:
+;CHECK: uabal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: uabal2_8h:
+;CHECK: uabal2.8h
+ %load1 = load <16 x i8>* %A
+ %load2 = load <16 x i8>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: uabal2_4s:
+;CHECK: uabal2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: uabal2_2d:
+;CHECK: uabal2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+ ret <2 x i64> %tmp5
+}
+
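+; An [su]abd intrinsic followed by an accumulating add should be selected as a
+; single saba/uaba instruction.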
+define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK-LABEL: saba_8b:
+;CHECK: saba.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = load <8 x i8>* %C
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+;CHECK-LABEL: saba_16b:
+;CHECK: saba.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp4 = load <16 x i8>* %C
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK-LABEL: saba_4h:
+;CHECK: saba.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = load <4 x i16>* %C
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: saba_8h:
+;CHECK: saba.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp4 = load <8 x i16>* %C
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK-LABEL: saba_2s:
+;CHECK: saba.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = load <2 x i32>* %C
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: saba_4s:
+;CHECK: saba.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp4 = load <4 x i32>* %C
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK-LABEL: uaba_8b:
+;CHECK: uaba.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = load <8 x i8>* %C
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+;CHECK-LABEL: uaba_16b:
+;CHECK: uaba.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp4 = load <16 x i8>* %C
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK-LABEL: uaba_4h:
+;CHECK: uaba.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = load <4 x i16>* %C
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: uaba_8h:
+;CHECK: uaba.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp4 = load <8 x i16>* %C
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK-LABEL: uaba_2s:
+;CHECK: uaba.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = load <2 x i32>* %C
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: uaba_4s:
+;CHECK: uaba.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp4 = load <4 x i32>* %C
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+; Scalar FABD
+define float @fabds(float %a, float %b) nounwind {
+; CHECK-LABEL: fabds:
+; CHECK: fabd s0, s0, s1
+ %vabd.i = tail call float @llvm.arm64.sisd.fabd.f32(float %a, float %b) nounwind
+ ret float %vabd.i
+}
+
+define double @fabdd(double %a, double %b) nounwind {
+; CHECK-LABEL: fabdd:
+; CHECK: fabd d0, d0, d1
+ %vabd.i = tail call double @llvm.arm64.sisd.fabd.f64(double %a, double %b) nounwind
+ ret double %vabd.i
+}
+
+declare double @llvm.arm64.sisd.fabd.f64(double, double) nounwind readnone
+declare float @llvm.arm64.sisd.fabd.f32(float, float) nounwind readnone
+
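+; Taking the high half of the LHS together with a dup'd scalar RHS should still
+; be selected as uabdl2/sabdl2, with no separate ext.16b (see CHECK-NOT below).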
+define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uabdl_from_extract_dup:
+; CHECK-NOT: ext.16b
+; CHECK: uabdl2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res1 = zext <2 x i32> %res to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: sabdl_from_extract_dup:
+; CHECK-NOT: ext.16b
+; CHECK: sabdl2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res1 = zext <2 x i32> %res to <2 x i64>
+ ret <2 x i64> %res1
+}
diff --git a/test/CodeGen/ARM64/vadd.ll b/test/CodeGen/ARM64/vadd.ll
new file mode 100644
index 0000000..f674c6d
--- /dev/null
+++ b/test/CodeGen/ARM64/vadd.ll
@@ -0,0 +1,941 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addhn8b:
+;CHECK: addhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addhn4h:
+;CHECK: addhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addhn2s:
+;CHECK: addhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
+;CHECK-LABEL: addhn2_16b:
+;CHECK: addhn.8b
+;CHECK-NEXT: addhn2.16b
+ %vaddhn2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vaddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
+;CHECK-LABEL: addhn2_8h:
+;CHECK: addhn.4h
+;CHECK-NEXT: addhn2.8h
+ %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vaddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
+;CHECK-LABEL: addhn2_4s:
+;CHECK: addhn.2s
+;CHECK-NEXT: addhn2.4s
+ %vaddhn2.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vaddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: raddhn8b:
+;CHECK: raddhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: raddhn4h:
+;CHECK: raddhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: raddhn2s:
+;CHECK: raddhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
+;CHECK-LABEL: raddhn2_16b:
+;CHECK: raddhn.8b
+;CHECK-NEXT: raddhn2.16b
+ %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vraddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
+;CHECK-LABEL: raddhn2_8h:
+;CHECK: raddhn.4h
+;CHECK-NEXT: raddhn2.8h
+ %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vraddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
+;CHECK-LABEL: raddhn2_4s:
+;CHECK: raddhn.2s
+;CHECK-NEXT: raddhn2.4s
+ %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vraddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: saddl8h:
+;CHECK: saddl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: saddl4s:
+;CHECK: saddl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: saddl2d:
+;CHECK: saddl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: saddl2_8h:
+; CHECK-NEXT: saddl2.8h v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
+ %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: saddl2_4s:
+; CHECK-NEXT: saddl2.4s v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
+ %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: saddl2_2d:
+; CHECK-NEXT: saddl2.2d v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
+ %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uaddl8h:
+;CHECK: uaddl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uaddl4s:
+;CHECK: uaddl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uaddl2d:
+;CHECK: uaddl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+
+define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: uaddl2_8h:
+; CHECK-NEXT: uaddl2.8h v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
+ %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: uaddl2_4s:
+; CHECK-NEXT: uaddl2.4s v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
+ %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: uaddl2_2d:
+; CHECK-NEXT: uaddl2.2d v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
+ %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uaddw8h:
+;CHECK: uaddw.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uaddw4s:
+;CHECK: uaddw.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uaddw2d:
+;CHECK: uaddw.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uaddw2_8h:
+;CHECK: uaddw2.8h
+ %tmp1 = load <8 x i16>* %A
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = zext <8 x i8> %high2 to <8 x i16>
+
+ %res = add <8 x i16> %tmp1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uaddw2_4s:
+;CHECK: uaddw2.4s
+ %tmp1 = load <4 x i32>* %A
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = zext <4 x i16> %high2 to <4 x i32>
+
+ %res = add <4 x i32> %tmp1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uaddw2_2d:
+;CHECK: uaddw2.2d
+ %tmp1 = load <2 x i64>* %A
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = zext <2 x i32> %high2 to <2 x i64>
+
+ %res = add <2 x i64> %tmp1, %ext2
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: saddw8h:
+;CHECK: saddw.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: saddw4s:
+;CHECK: saddw.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: saddw2d:
+;CHECK: saddw.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: saddw2_8h:
+;CHECK: saddw2.8h
+ %tmp1 = load <8 x i16>* %A
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = sext <8 x i8> %high2 to <8 x i16>
+
+ %res = add <8 x i16> %tmp1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: saddw2_4s:
+;CHECK: saddw2.4s
+ %tmp1 = load <4 x i32>* %A
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = sext <4 x i16> %high2 to <4 x i32>
+
+ %res = add <4 x i32> %tmp1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: saddw2_2d:
+;CHECK: saddw2.2d
+ %tmp1 = load <2 x i64>* %A
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = sext <2 x i32> %high2 to <2 x i64>
+
+ %res = add <2 x i64> %tmp1, %ext2
+ ret <2 x i64> %res
+}
+
+define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: saddlp4h:
+;CHECK: saddlp.4h
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: saddlp2s:
+;CHECK: saddlp.2s
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: saddlp1d:
+;CHECK: saddlp.1d
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: saddlp8h:
+;CHECK: saddlp.8h
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: saddlp4s:
+;CHECK: saddlp.4s
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: saddlp2d:
+;CHECK: saddlp.2d
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ ret <2 x i64> %tmp3
+}
+
+declare <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uaddlp4h:
+;CHECK: uaddlp.4h
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: uaddlp2s:
+;CHECK: uaddlp.2s
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: uaddlp1d:
+;CHECK: uaddlp.1d
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: uaddlp8h:
+;CHECK: uaddlp.8h
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: uaddlp4s:
+;CHECK: uaddlp.4s
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: uaddlp2d:
+;CHECK: uaddlp.2d
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ ret <2 x i64> %tmp3
+}
+
+declare <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sadalp4h:
+;CHECK: sadalp.4h
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sadalp2s:
+;CHECK: sadalp.2s
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sadalp8h:
+;CHECK: sadalp.8h
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sadalp4s:
+;CHECK: sadalp.4s
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sadalp2d:
+;CHECK: sadalp.2d
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uadalp4h:
+;CHECK: uadalp.4h
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uadalp2s:
+;CHECK: uadalp.2s
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uadalp8h:
+;CHECK: uadalp.8h
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uadalp4s:
+;CHECK: uadalp.4s
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uadalp2d:
+;CHECK: uadalp.2d
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: addp_8b:
+;CHECK: addp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: addp_16b:
+;CHECK: addp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: addp_4h:
+;CHECK: addp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addp_8h:
+;CHECK: addp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: addp_2s:
+;CHECK: addp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addp_4s:
+;CHECK: addp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addp_2d:
+;CHECK: addp.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: faddp_2s:
+;CHECK: faddp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: faddp_4s:
+;CHECK: faddp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: faddp_2d:
+;CHECK: faddp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uaddl2_duprhs:
+; CHECK-NOT: ext.16b
+; CHECK: uaddl2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+ %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+ %res = add <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: saddl2_duplhs:
+; CHECK-NOT: ext.16b
+; CHECK: saddl2.2d
+ %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+ %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+ %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+ %res = add <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: usubl2_duprhs:
+; CHECK-NOT: ext.16b
+; CHECK: usubl2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+ %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+ %res = sub <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: ssubl2_duplhs:
+; CHECK-NOT: ext.16b
+; CHECK: ssubl2.2d
+ %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+ %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+ %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+ %res = sub <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
+define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addhn8b_natural:
+;CHECK: addhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %sum = add <8 x i16> %tmp1, %tmp2
+ %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+ ret <8 x i8> %narrowed
+}
+
+define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addhn4h_natural:
+;CHECK: addhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %sum = add <4 x i32> %tmp1, %tmp2
+ %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+ %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+ ret <4 x i16> %narrowed
+}
+
+define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addhn2s_natural:
+;CHECK: addhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %sum = add <2 x i64> %tmp1, %tmp2
+ %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
+ %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+ ret <2 x i32> %narrowed
+}
+
+define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addhn2_16b_natural:
+;CHECK: addhn2.16b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %sum = add <8 x i16> %tmp1, %tmp2
+ %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+ %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addhn2_8h_natural:
+;CHECK: addhn2.8h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %sum = add <4 x i32> %tmp1, %tmp2
+ %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+ %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+ %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addhn2_4s_natural:
+;CHECK: addhn2.4s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %sum = add <2 x i64> %tmp1, %tmp2
+ %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
+ %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+ %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: subhn8b_natural:
+;CHECK: subhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %diff = sub <8 x i16> %tmp1, %tmp2
+ %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+ ret <8 x i8> %narrowed
+}
+
+define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: subhn4h_natural:
+;CHECK: subhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %diff = sub <4 x i32> %tmp1, %tmp2
+ %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
+ %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+ ret <4 x i16> %narrowed
+}
+
+define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: subhn2s_natural:
+;CHECK: subhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %diff = sub <2 x i64> %tmp1, %tmp2
+ %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
+ %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+ ret <2 x i32> %narrowed
+}
+
+define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: subhn2_16b_natural:
+;CHECK: subhn2.16b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %diff = sub <8 x i16> %tmp1, %tmp2
+ %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+ %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: subhn2_8h_natural:
+;CHECK: subhn2.8h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %diff = sub <4 x i32> %tmp1, %tmp2
+ %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
+ %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+ %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: subhn2_4s_natural:
+;CHECK: subhn2.4s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %diff = sub <2 x i64> %tmp1, %tmp2
+ %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
+ %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+ %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
diff --git a/test/CodeGen/ARM64/vaddlv.ll b/test/CodeGen/ARM64/vaddlv.ll
new file mode 100644
index 0000000..d4d4608
--- /dev/null
+++ b/test/CodeGen/ARM64/vaddlv.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone {
+; CHECK-LABEL: test_vaddlv_s32:
+; CHECK: saddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
+; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddlv.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind
+ ret i64 %vaddlv.i
+}
+
+define i64 @test_vaddlv_u32(<2 x i32> %a1) nounwind readnone {
+; CHECK-LABEL: test_vaddlv_u32:
+; CHECK: uaddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
+; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddlv.i = tail call i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind
+ ret i64 %vaddlv.i
+}
+
+declare i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone
+
+declare i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone
+
diff --git a/test/CodeGen/ARM64/vaddv.ll b/test/CodeGen/ARM64/vaddv.ll
new file mode 100644
index 0000000..44bfa84
--- /dev/null
+++ b/test/CodeGen/ARM64/vaddv.ll
@@ -0,0 +1,233 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_s8:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a1)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vaddv_s16(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_s16:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a1)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddv_s32(<2 x i32> %a1) {
+; CHECK-LABEL: test_vaddv_s32:
+; 2 x i32 is not supported by the ISA, thus, this is a special case
+; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32> %a1)
+ ret i32 %vaddv.i
+}
+
+define i64 @test_vaddv_s64(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_s64:
+; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
+; CHECK-NEXT: fmov x0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64> %a1)
+ ret i64 %vaddv.i
+}
+
+define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_u8:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i32 @test_vaddv_u8_masked(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_u8_masked:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+ %0 = and i32 %vaddv.i, 511 ; 0x1ff
+ ret i32 %0
+}
+
+define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_u16:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddv_u16_masked(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_u16_masked:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+ %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
+ ret i32 %0
+}
+
+define i32 @test_vaddv_u32(<2 x i32> %a1) {
+; CHECK-LABEL: test_vaddv_u32:
+; 2 x i32 is not supported by the ISA, thus, this is a special case
+; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
+ ret i32 %vaddv.i
+}
+
+define float @test_vaddv_f32(<2 x float> %a1) {
+; CHECK-LABEL: test_vaddv_f32:
+; CHECK: faddp.2s s0, v0
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1)
+ ret float %vaddv.i
+}
+
+define float @test_vaddv_v4f32(<4 x float> %a1) {
+; CHECK-LABEL: test_vaddv_v4f32:
+; CHECK: faddp.4s [[REGNUM:v[0-9]+]], v0, v0
+; CHECK: faddp.2s s0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1)
+ ret float %vaddv.i
+}
+
+define double @test_vaddv_f64(<2 x double> %a1) {
+; CHECK-LABEL: test_vaddv_f64:
+; CHECK: faddp.2d d0, v0
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1)
+ ret double %vaddv.i
+}
+
+define i64 @test_vaddv_u64(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_u64:
+; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
+; CHECK-NEXT: fmov x0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+ ret i64 %vaddv.i
+}
+
+define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
+; CHECK-LABEL: test_vaddvq_s8:
+; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a1)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vaddvq_s16(<8 x i16> %a1) {
+; CHECK-LABEL: test_vaddvq_s16:
+; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a1)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_s32(<4 x i32> %a1) {
+; CHECK-LABEL: test_vaddvq_s32:
+; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a1)
+ ret i32 %vaddv.i
+}
+
+define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) {
+; CHECK-LABEL: test_vaddvq_u8:
+; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) {
+; CHECK-LABEL: test_vaddvq_u16:
+; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_u32(<4 x i32> %a1) {
+; CHECK-LABEL: test_vaddvq_u32:
+; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov [[FMOVRES:w[0-9]+]], [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
+ ret i32 %vaddv.i
+}
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8>)
+
+declare i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32>)
+
+declare i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8>)
+
+declare float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1)
+declare float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1)
+declare double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1)
diff --git a/test/CodeGen/ARM64/variadic-aapcs.ll b/test/CodeGen/ARM64/variadic-aapcs.ll
new file mode 100644
index 0000000..ac66902
--- /dev/null
+++ b/test/CodeGen/ARM64/variadic-aapcs.ll
@@ -0,0 +1,143 @@
+; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s
+
+%va_list = type {i8*, i8*, i8*, i32, i32}
+
+@var = global %va_list zeroinitializer, align 8
+
+declare void @llvm.va_start(i8*)
+
+define void @test_simple(i32 %n, ...) {
+; CHECK-LABEL: test_simple:
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]]
+
+; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
+
+; CHECK: stp x1, x2, [sp, #[[GR_BASE:[0-9]+]]]
+; ... omit middle ones ...
+; CHECK: str x7, [sp, #
+
+; CHECK: stp q0, q1, [sp]
+; ... omit middle ones ...
+; CHECK: stp q6, q7, [sp, #
+
+; CHECK: str [[STACK_TOP]], [x[[VA_LIST_HI]], :lo12:var]
+
+; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]]
+; CHECK: add [[GR_TOP:x[0-9]+]], [[GR_TOPTMP]], #56
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
+; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
+
+; CHECK: mov [[VR_TOPTMP:x[0-9]+]], sp
+; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #128
+; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
+
+; CHECK: movn [[GR_OFFS:w[0-9]+]], #55
+; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+
+; CHECK: orr [[VR_OFFS:w[0-9]+]], wzr, #0xffffff80
+; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+
+ ret void
+}
+
+define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
+; CHECK-LABEL: test_fewargs:
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]]
+
+; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
+
+; CHECK: stp x3, x4, [sp, #[[GR_BASE:[0-9]+]]]
+; ... omit middle ones ...
+; CHECK: str x7, [sp, #
+
+; CHECK: stp q1, q2, [sp]
+; ... omit middle ones ...
+; CHECK: str q7, [sp, #
+
+; CHECK: str [[STACK_TOP]], [x[[VA_LIST_HI]], :lo12:var]
+
+; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]]
+; CHECK: add [[GR_TOP:x[0-9]+]], [[GR_TOPTMP]], #40
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
+; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
+
+; CHECK: mov [[VR_TOPTMP:x[0-9]+]], sp
+; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #112
+; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
+
+; CHECK: movn [[GR_OFFS:w[0-9]+]], #39
+; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+
+; CHECK: movn [[VR_OFFS:w[0-9]+]], #111
+; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+
+ ret void
+}
+
+define void @test_nospare([8 x i64], [8 x float], ...) {
+; CHECK-LABEL: test_nospare:
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+; CHECK-NOT: sub sp, sp
+; CHECK: mov [[STACK:x[0-9]+]], sp
+; CHECK: str [[STACK]], [{{x[0-9]+}}, :lo12:var]
+
+ ret void
+}
+
+; If there are non-variadic arguments on the stack (here two i64s) then the
+; __stack field should point just past them.
+define void @test_offsetstack([10 x i64], [3 x float], ...) {
+; CHECK-LABEL: test_offsetstack:
+; CHECK: sub sp, sp, #80
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
+; CHECK: str [[STACK_TOP]], [{{x[0-9]+}}, :lo12:var]
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+ ret void
+}
+
+declare void @llvm.va_end(i8*)
+
+define void @test_va_end() nounwind {
+; CHECK-LABEL: test_va_end:
+; CHECK-NEXT: BB#0
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_end(i8* %addr)
+
+ ret void
+; CHECK-NEXT: ret
+}
+
+declare void @llvm.va_copy(i8* %dest, i8* %src)
+
+@second_list = global %va_list zeroinitializer
+
+define void @test_va_copy() {
+; CHECK-LABEL: test_va_copy:
+ %srcaddr = bitcast %va_list* @var to i8*
+ %dstaddr = bitcast %va_list* @second_list to i8*
+ call void @llvm.va_copy(i8* %dstaddr, i8* %srcaddr)
+
+; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var
+
+; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]]
+; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list
+; CHECK: str [[BLOCK]], [x[[DST]]]
+
+; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16]
+; CHECK: str [[BLOCK]], [x[[DST]], #16]
+ ret void
+; CHECK: ret
+}
diff --git a/test/CodeGen/ARM64/vbitwise.ll b/test/CodeGen/ARM64/vbitwise.ll
new file mode 100644
index 0000000..7d8378d
--- /dev/null
+++ b/test/CodeGen/ARM64/vbitwise.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @rbit_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: rbit_8b:
+;CHECK: rbit.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @rbit_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: rbit_16b:
+;CHECK: rbit.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
+
+define <8 x i16> @sxtl8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sxtl8h:
+;CHECK: sshll.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @uxtl8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uxtl8h:
+;CHECK: ushll.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @sxtl4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sxtl4s:
+;CHECK: sshll.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @uxtl4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: uxtl4s:
+;CHECK: ushll.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @sxtl2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sxtl2d:
+;CHECK: sshll.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
+ ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @uxtl2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: uxtl2d:
+;CHECK: ushll.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
+ ret <2 x i64> %tmp2
+}
+
+; Check for incorrect use of vector bic.
+; rdar://11553859
+define void @test_vsliq(i8* nocapture %src, i8* nocapture %dest) nounwind noinline ssp {
+entry:
+; CHECK-LABEL: test_vsliq:
+; CHECK-NOT: bic
+; CHECK: movi.2d [[REG1:v[0-9]+]], #0x0000ff000000ff
+; CHECK: and.16b v{{[0-9]+}}, v{{[0-9]+}}, [[REG1]]
+ %0 = bitcast i8* %src to <16 x i8>*
+ %1 = load <16 x i8>* %0, align 16
+ %and.i = and <16 x i8> %1, <i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0>
+ %2 = bitcast <16 x i8> %and.i to <8 x i16>
+ %vshl_n = shl <8 x i16> %2, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %3 = or <8 x i16> %2, %vshl_n
+ %4 = bitcast <8 x i16> %3 to <4 x i32>
+ %vshl_n8 = shl <4 x i32> %4, <i32 16, i32 16, i32 16, i32 16>
+ %5 = or <4 x i32> %4, %vshl_n8
+ %6 = bitcast <4 x i32> %5 to <16 x i8>
+ %7 = bitcast i8* %dest to <16 x i8>*
+ store <16 x i8> %6, <16 x i8>* %7, align 16
+ ret void
+}
diff --git a/test/CodeGen/ARM64/vclz.ll b/test/CodeGen/ARM64/vclz.ll
new file mode 100644
index 0000000..ddc09ed
--- /dev/null
+++ b/test/CodeGen/ARM64/vclz.ll
@@ -0,0 +1,109 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define <8 x i8> @test_vclz_u8(<8 x i8> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_u8:
+ ; CHECK: clz.8b v0, v0
+ ; CHECK-NEXT: ret
+ %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) nounwind
+ ret <8 x i8> %vclz.i
+}
+
+define <8 x i8> @test_vclz_s8(<8 x i8> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_s8:
+ ; CHECK: clz.8b v0, v0
+ ; CHECK-NEXT: ret
+ %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) nounwind
+ ret <8 x i8> %vclz.i
+}
+
+define <4 x i16> @test_vclz_u16(<4 x i16> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_u16:
+ ; CHECK: clz.4h v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) nounwind
+ ret <4 x i16> %vclz1.i
+}
+
+define <4 x i16> @test_vclz_s16(<4 x i16> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_s16:
+ ; CHECK: clz.4h v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) nounwind
+ ret <4 x i16> %vclz1.i
+}
+
+define <2 x i32> @test_vclz_u32(<2 x i32> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_u32:
+ ; CHECK: clz.2s v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind
+ ret <2 x i32> %vclz1.i
+}
+
+define <2 x i32> @test_vclz_s32(<2 x i32> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_s32:
+ ; CHECK: clz.2s v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind
+ ret <2 x i32> %vclz1.i
+}
+
+define <16 x i8> @test_vclzq_u8(<16 x i8> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_u8:
+ ; CHECK: clz.16b v0, v0
+ ; CHECK-NEXT: ret
+ %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) nounwind
+ ret <16 x i8> %vclz.i
+}
+
+define <16 x i8> @test_vclzq_s8(<16 x i8> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_s8:
+ ; CHECK: clz.16b v0, v0
+ ; CHECK-NEXT: ret
+ %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) nounwind
+ ret <16 x i8> %vclz.i
+}
+
+define <8 x i16> @test_vclzq_u16(<8 x i16> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_u16:
+ ; CHECK: clz.8h v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) nounwind
+ ret <8 x i16> %vclz1.i
+}
+
+define <8 x i16> @test_vclzq_s16(<8 x i16> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_s16:
+ ; CHECK: clz.8h v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) nounwind
+ ret <8 x i16> %vclz1.i
+}
+
+define <4 x i32> @test_vclzq_u32(<4 x i32> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_u32:
+ ; CHECK: clz.4s v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) nounwind
+ ret <4 x i32> %vclz1.i
+}
+
+define <4 x i32> @test_vclzq_s32(<4 x i32> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_s32:
+ ; CHECK: clz.4s v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) nounwind
+ ret <4 x i32> %vclz1.i
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
+
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
+
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
+
+declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcmp.ll b/test/CodeGen/ARM64/vcmp.ll
new file mode 100644
index 0000000..f9275b8
--- /dev/null
+++ b/test/CodeGen/ARM64/vcmp.ll
@@ -0,0 +1,227 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+
+define void @fcmltz_4s(<4 x float> %a, <4 x i16>* %p) nounwind {
+;CHECK-LABEL: fcmltz_4s:
+;CHECK: fcmlt.4s [[REG:v[0-9]+]], v0, #0
+;CHECK-NEXT: xtn.4h v[[REG_1:[0-9]+]], [[REG]]
+;CHECK-NEXT: str d[[REG_1]], [x0]
+;CHECK-NEXT: ret
+ %tmp = fcmp olt <4 x float> %a, zeroinitializer
+ %tmp2 = sext <4 x i1> %tmp to <4 x i16>
+ store <4 x i16> %tmp2, <4 x i16>* %p, align 8
+ ret void
+}
+
+define <2 x i32> @facge_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: facge_2s:
+;CHECK: facge.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @facge_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: facge_4s:
+;CHECK: facge.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @facge_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: facge_2d:
+;CHECK: facge.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x i32> @facgt_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: facgt_2s:
+;CHECK: facgt.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @facgt_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: facgt_4s:
+;CHECK: facgt.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @facgt_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: facgt_2d:
+;CHECK: facgt.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @facge_s(float %A, float %B) nounwind {
+; CHECK-LABEL: facge_s:
+; CHECK: facge {{s[0-9]+}}, s0, s1
+ %mask = call i32 @llvm.arm64.neon.facge.i32.f32(float %A, float %B)
+ ret i32 %mask
+}
+
+define i64 @facge_d(double %A, double %B) nounwind {
+; CHECK-LABEL: facge_d:
+; CHECK: facge {{d[0-9]+}}, d0, d1
+ %mask = call i64 @llvm.arm64.neon.facge.i64.f64(double %A, double %B)
+ ret i64 %mask
+}
+
+declare i64 @llvm.arm64.neon.facge.i64.f64(double, double)
+declare i32 @llvm.arm64.neon.facge.i32.f32(float, float)
+
+define i32 @facgt_s(float %A, float %B) nounwind {
+; CHECK-LABEL: facgt_s:
+; CHECK: facgt {{s[0-9]+}}, s0, s1
+ %mask = call i32 @llvm.arm64.neon.facgt.i32.f32(float %A, float %B)
+ ret i32 %mask
+}
+
+define i64 @facgt_d(double %A, double %B) nounwind {
+; CHECK-LABEL: facgt_d:
+; CHECK: facgt {{d[0-9]+}}, d0, d1
+ %mask = call i64 @llvm.arm64.neon.facgt.i64.f64(double %A, double %B)
+ ret i64 %mask
+}
+
+declare i64 @llvm.arm64.neon.facgt.i64.f64(double, double)
+declare i32 @llvm.arm64.neon.facgt.i32.f32(float, float)
+
+define <8 x i8> @cmtst_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: cmtst_8b:
+;CHECK: cmtst.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %commonbits = and <8 x i8> %tmp1, %tmp2
+ %mask = icmp ne <8 x i8> %commonbits, zeroinitializer
+ %res = sext <8 x i1> %mask to <8 x i8>
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @cmtst_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: cmtst_16b:
+;CHECK: cmtst.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %commonbits = and <16 x i8> %tmp1, %tmp2
+ %mask = icmp ne <16 x i8> %commonbits, zeroinitializer
+ %res = sext <16 x i1> %mask to <16 x i8>
+ ret <16 x i8> %res
+}
+
+define <4 x i16> @cmtst_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: cmtst_4h:
+;CHECK: cmtst.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %commonbits = and <4 x i16> %tmp1, %tmp2
+ %mask = icmp ne <4 x i16> %commonbits, zeroinitializer
+ %res = sext <4 x i1> %mask to <4 x i16>
+ ret <4 x i16> %res
+}
+
+define <8 x i16> @cmtst_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: cmtst_8h:
+;CHECK: cmtst.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %commonbits = and <8 x i16> %tmp1, %tmp2
+ %mask = icmp ne <8 x i16> %commonbits, zeroinitializer
+ %res = sext <8 x i1> %mask to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <2 x i32> @cmtst_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: cmtst_2s:
+;CHECK: cmtst.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %commonbits = and <2 x i32> %tmp1, %tmp2
+ %mask = icmp ne <2 x i32> %commonbits, zeroinitializer
+ %res = sext <2 x i1> %mask to <2 x i32>
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @cmtst_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: cmtst_4s:
+;CHECK: cmtst.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %commonbits = and <4 x i32> %tmp1, %tmp2
+ %mask = icmp ne <4 x i32> %commonbits, zeroinitializer
+ %res = sext <4 x i1> %mask to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @cmtst_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: cmtst_2d:
+;CHECK: cmtst.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %commonbits = and <2 x i64> %tmp1, %tmp2
+ %mask = icmp ne <2 x i64> %commonbits, zeroinitializer
+ %res = sext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <1 x i64> @fcmeq_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmeq_d:
+; CHECK: fcmeq {{d[0-9]+}}, d0, d1
+ %tst = fcmp oeq <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmge_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmge_d:
+; CHECK: fcmge {{d[0-9]+}}, d0, d1
+ %tst = fcmp oge <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmle_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmle_d:
+; CHECK: fcmge {{d[0-9]+}}, d1, d0
+ %tst = fcmp ole <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgt_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmgt_d:
+; CHECK: fcmgt {{d[0-9]+}}, d0, d1
+ %tst = fcmp ogt <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmlt_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmlt_d:
+; CHECK: fcmgt {{d[0-9]+}}, d1, d0
+ %tst = fcmp olt <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
diff --git a/test/CodeGen/ARM64/vcnt.ll b/test/CodeGen/ARM64/vcnt.ll
new file mode 100644
index 0000000..e00658a
--- /dev/null
+++ b/test/CodeGen/ARM64/vcnt.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_8b:
+;CHECK: cls.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_16b:
+;CHECK: cls.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_4h:
+;CHECK: cls.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_8h:
+;CHECK: cls.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_2s:
+;CHECK: cls.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_4s:
+;CHECK: cls.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcombine.ll b/test/CodeGen/ARM64/vcombine.ll
new file mode 100644
index 0000000..16f591e
--- /dev/null
+++ b/test/CodeGen/ARM64/vcombine.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+; LowerCONCAT_VECTORS() was reversing the order of two parts.
+; rdar://11558157
+; rdar://11559553
+define <16 x i8> @test(<16 x i8> %q0, <16 x i8> %q1, i8* nocapture %dest) nounwind {
+entry:
+; CHECK-LABEL: test:
+; CHECK: ins.d v0[1], v1[0]
+ %0 = bitcast <16 x i8> %q0 to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> zeroinitializer
+ %1 = bitcast <16 x i8> %q1 to <2 x i64>
+ %shuffle.i4 = shufflevector <2 x i64> %1, <2 x i64> undef, <1 x i32> zeroinitializer
+ %shuffle.i3 = shufflevector <1 x i64> %shuffle.i, <1 x i64> %shuffle.i4, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i3 to <16 x i8>
+ ret <16 x i8> %2
+}
diff --git a/test/CodeGen/ARM64/vcvt.ll b/test/CodeGen/ARM64/vcvt.ll
new file mode 100644
index 0000000..19bb8cb
--- /dev/null
+++ b/test/CodeGen/ARM64/vcvt.ll
@@ -0,0 +1,686 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtas_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtas_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtas_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtau_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtau_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtau_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtau_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtau_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtms_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtms_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtms_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtms_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtms_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtmu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtmu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtmu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtmu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtmu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtps_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtps_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtps_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtps_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtps_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtpu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtpu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtpu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtpu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtpu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtns_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtns_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtns_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtns_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtns_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtnu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtnu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtnu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtnu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtnu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzs_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptosi <2 x float> %A to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzs_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzs_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptosi <4 x float> %A to <4 x i32>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzs_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptosi <2 x double> %A to <2 x i64>
+ ret <2 x i64> %tmp3
+}
+
+
+define <2 x i32> @fcvtzu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptoui <2 x float> %A to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptoui <4 x float> %A to <4 x i32>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptoui <2 x double> %A to <2 x i64>
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @frinta_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frinta_2s:
+;CHECK-NOT: ld1
+;CHECK: frinta.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.round.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frinta_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frinta_4s:
+;CHECK-NOT: ld1
+;CHECK: frinta.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.round.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frinta_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frinta_2d:
+;CHECK-NOT: ld1
+;CHECK: frinta.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.round.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.round.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.round.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.round.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frinti_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frinti_2s:
+;CHECK-NOT: ld1
+;CHECK: frinti.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frinti_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frinti_4s:
+;CHECK-NOT: ld1
+;CHECK: frinti.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frinti_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frinti_2d:
+;CHECK-NOT: ld1
+;CHECK: frinti.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintm_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintm_2s:
+;CHECK-NOT: ld1
+;CHECK: frintm.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.floor.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintm_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintm_4s:
+;CHECK-NOT: ld1
+;CHECK: frintm.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.floor.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintm_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintm_2d:
+;CHECK-NOT: ld1
+;CHECK: frintm.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.floor.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintn_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintn_2s:
+;CHECK-NOT: ld1
+;CHECK: frintn.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintn_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintn_4s:
+;CHECK-NOT: ld1
+;CHECK: frintn.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintn_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintn_2d:
+;CHECK-NOT: ld1
+;CHECK: frintn.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintp_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintp_2s:
+;CHECK-NOT: ld1
+;CHECK: frintp.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.ceil.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintp_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintp_4s:
+;CHECK-NOT: ld1
+;CHECK: frintp.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintp_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintp_2d:
+;CHECK-NOT: ld1
+;CHECK: frintp.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintx_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintx_2s:
+;CHECK-NOT: ld1
+;CHECK: frintx.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.rint.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintx_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintx_4s:
+;CHECK-NOT: ld1
+;CHECK: frintx.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintx_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintx_2d:
+;CHECK-NOT: ld1
+;CHECK: frintx.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintz_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintz_2s:
+;CHECK-NOT: ld1
+;CHECK: frintz.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.trunc.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintz_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintz_4s:
+;CHECK-NOT: ld1
+;CHECK: frintz.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintz_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintz_2d:
+;CHECK-NOT: ld1
+;CHECK: frintz.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @fcvtxn_2s(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtxn_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtxn v0.2s, v0.2d
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtxn_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtxn2 v0.4s, v1.2d
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+ %res = shufflevector <2 x float> %ret, <2 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtzsc_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzsc_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.4s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzsc_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2d v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+
+define <2 x i32> @fcvtzuc_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzuc_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.4s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzuc_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2d v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+
+define <2 x float> @scvtf_2sc(<2 x i32> %A) nounwind {
+;CHECK-LABEL: scvtf_2sc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.2s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @scvtf_4sc(<4 x i32> %A) nounwind {
+;CHECK-LABEL: scvtf_4sc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.4s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @scvtf_2dc(<2 x i64> %A) nounwind {
+;CHECK-LABEL: scvtf_2dc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.2d v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+
+define <2 x float> @ucvtf_2sc(<2 x i32> %A) nounwind {
+;CHECK-LABEL: ucvtf_2sc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.2s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @ucvtf_4sc(<4 x i32> %A) nounwind {
+;CHECK-LABEL: ucvtf_4sc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.4s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind {
+;CHECK-LABEL: ucvtf_2dc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.2d v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+ ret <2 x double> %tmp3
+}
+
+
+;CHECK-LABEL: autogen_SD28458:
+;CHECK: fcvt
+;CHECK: ret
+define void @autogen_SD28458() {
+ %Tr53 = fptrunc <8 x double> undef to <8 x float>
+ store <8 x float> %Tr53, <8 x float>* undef
+ ret void
+}
+
+;CHECK-LABEL: autogen_SD19225:
+;CHECK: fcvt
+;CHECK: ret
+define void @autogen_SD19225() {
+ %A = load <8 x float>* undef
+ %Tr53 = fpext <8 x float> %A to <8 x double>
+ store <8 x double> %Tr53, <8 x double>* undef
+ ret void
+}
+
+declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcvt_f.ll b/test/CodeGen/ARM64/vcvt_f.ll
new file mode 100644
index 0000000..d67aa3b
--- /dev/null
+++ b/test/CodeGen/ARM64/vcvt_f.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -O0 -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_f64_f32:
+ %vcvt1.i = fpext <2 x float> %x to <2 x double>
+; CHECK: fcvtl v0.2d, v0.2s
+ ret <2 x double> %vcvt1.i
+; CHECK: ret
+}
+
+define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %x) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_high_f64_f32:
+ %cvt_in = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+ %vcvt1.i = fpext <2 x float> %cvt_in to <2 x double>
+; CHECK: fcvtl2 v0.2d, v0.4s
+ ret <2 x double> %vcvt1.i
+; CHECK: ret
+}
+
+define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_f32_f64:
+ %vcvt1.i = fptrunc <2 x double> %v to <2 x float>
+; CHECK: fcvtn
+ ret <2 x float> %vcvt1.i
+; CHECK: ret
+}
+
+define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_high_f32_f64:
+
+ %cvt = fptrunc <2 x double> %v to <2 x float>
+ %vcvt2.i = shufflevector <2 x float> %x, <2 x float> %cvt, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: fcvtn2
+ ret <4 x float> %vcvt2.i
+; CHECK: ret
+}
+
+define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvtx_f32_f64:
+ %vcvtx1.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+; CHECK: fcvtxn
+ ret <2 x float> %vcvtx1.i
+; CHECK: ret
+}
+
+define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvtx_high_f32_f64:
+ %vcvtx2.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+ %res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: fcvtxn2
+ ret <4 x float> %res
+; CHECK: ret
+}
+
+
+declare <2 x double> @llvm.arm64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfp2df(<2 x float>) nounwind readnone
+
+declare <2 x float> @llvm.arm64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
+
+declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+
+define i16 @to_half(float %in) {
+; CHECK-LABEL: to_half:
+; CHECK: fcvt h[[HALFVAL:[0-9]+]], s0
+; CHECK: fmov {{w[0-9]+}}, {{s[0-9]+}}
+ %res = call i16 @llvm.convert.to.fp16(float %in)
+ ret i16 %res
+}
+
+define float @from_half(i16 %in) {
+; CHECK-LABEL: from_half:
+; CHECK: fmov s[[HALFVAL:[0-9]+]], {{w[0-9]+}}
+; CHECK: fcvt s0, h[[HALFVAL]]
+ %res = call float @llvm.convert.from.fp16(i16 %in)
+ ret float %res
+}
+
+declare float @llvm.convert.from.fp16(i16) #1
+declare i16 @llvm.convert.to.fp16(float) #1
diff --git a/test/CodeGen/ARM64/vcvt_f32_su32.ll b/test/CodeGen/ARM64/vcvt_f32_su32.ll
new file mode 100644
index 0000000..51e053d
--- /dev/null
+++ b/test/CodeGen/ARM64/vcvt_f32_su32.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @ucvt(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: ucvt:
+; CHECK: ucvtf.2s v0, v0
+; CHECK: ret
+
+ %vcvt.i = uitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <2 x float> @scvt(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: scvt:
+; CHECK: scvtf.2s v0, v0
+; CHECK: ret
+ %vcvt.i = sitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <4 x float> @ucvtq(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: ucvtq:
+; CHECK: ucvtf.4s v0, v0
+; CHECK: ret
+ %vcvt.i = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @scvtq(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: scvtq:
+; CHECK: scvtf.4s v0, v0
+; CHECK: ret
+ %vcvt.i = sitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @cvtf16(<4 x i16> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16:
+; CHECK: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: ret
+ %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %a) nounwind
+ ret <4 x float> %vcvt1.i
+}
+
+define <4 x float> @cvtf16_high(<8 x i16> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16_high:
+; CHECK: fcvtl2 v0.4s, v0.8h
+; CHECK-NEXT: ret
+ %in = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %in) nounwind
+ ret <4 x float> %vcvt1.i
+}
+
+
+
+define <4 x i16> @cvtf16f32(<4 x float> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16f32:
+; CHECK: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %vcvt1.i = tail call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %a) nounwind
+ ret <4 x i16> %vcvt1.i
+}
+
+define <8 x i16> @cvtf16f32_high(<4 x i16> %low, <4 x float> %high_big) {
+; CHECK-LABEL: cvtf16f32_high:
+; CHECK: fcvtn2 v0.8h, v1.4s
+; CHECK-NEXT: ret
+ %high = call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %high_big)
+ %res = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+declare <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcvt_n.ll b/test/CodeGen/ARM64/vcvt_n.ll
new file mode 100644
index 0000000..46de557
--- /dev/null
+++ b/test/CodeGen/ARM64/vcvt_n.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxpu:
+; CHECK: ucvtf.2s v0, v0, #9
+; CHECK: ret
+ %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9)
+ ret <2 x float> %vcvt_n1
+}
+
+define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxps:
+; CHECK: scvtf.2s v0, v0, #12
+; CHECK: ret
+ %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12)
+ ret <2 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxpu:
+; CHECK: ucvtf.4s v0, v0, #18
+; CHECK: ret
+ %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18)
+ ret <4 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxps:
+; CHECK: scvtf.4s v0, v0, #30
+; CHECK: ret
+ %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30)
+ ret <4 x float> %vcvt_n1
+}
+define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp {
+ %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12)
+ ret <2 x double> %vcvt_n1
+}
+
+define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp {
+ %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9)
+ ret <2 x double> %vcvt_n1
+}
+
+declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcvt_su32_f32.ll b/test/CodeGen/ARM64/vcvt_su32_f32.ll
new file mode 100644
index 0000000..8c82fa0
--- /dev/null
+++ b/test/CodeGen/ARM64/vcvt_su32_f32.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x i32> @c1(<2 x float> %a) nounwind readnone ssp {
+; CHECK: c1
+; CHECK: fcvtzs.2s v0, v0
+; CHECK: ret
+ %vcvt.i = fptosi <2 x float> %a to <2 x i32>
+ ret <2 x i32> %vcvt.i
+}
+
+define <2 x i32> @c2(<2 x float> %a) nounwind readnone ssp {
+; CHECK: c2
+; CHECK: fcvtzu.2s v0, v0
+; CHECK: ret
+ %vcvt.i = fptoui <2 x float> %a to <2 x i32>
+ ret <2 x i32> %vcvt.i
+}
+
+define <4 x i32> @c3(<4 x float> %a) nounwind readnone ssp {
+; CHECK: c3
+; CHECK: fcvtzs.4s v0, v0
+; CHECK: ret
+ %vcvt.i = fptosi <4 x float> %a to <4 x i32>
+ ret <4 x i32> %vcvt.i
+}
+
+define <4 x i32> @c4(<4 x float> %a) nounwind readnone ssp {
+; CHECK: c4
+; CHECK: fcvtzu.4s v0, v0
+; CHECK: ret
+ %vcvt.i = fptoui <4 x float> %a to <4 x i32>
+ ret <4 x i32> %vcvt.i
+}
+
diff --git a/test/CodeGen/ARM64/vcvtxd_f32_f64.ll b/test/CodeGen/ARM64/vcvtxd_f32_f64.ll
new file mode 100644
index 0000000..bbe8f0b
--- /dev/null
+++ b/test/CodeGen/ARM64/vcvtxd_f32_f64.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define float @fcvtxn(double %a) {
+; CHECK-LABEL: fcvtxn:
+; CHECK: fcvtxn s0, d0
+; CHECK-NEXT: ret
+ %vcvtxd.i = tail call float @llvm.arm64.sisd.fcvtxn(double %a) nounwind
+ ret float %vcvtxd.i
+}
+
+declare float @llvm.arm64.sisd.fcvtxn(double) nounwind readnone
diff --git a/test/CodeGen/ARM64/vecCmpBr.ll b/test/CodeGen/ARM64/vecCmpBr.ll
new file mode 100644
index 0000000..e23ef25
--- /dev/null
+++ b/test/CodeGen/ARM64/vecCmpBr.ll
@@ -0,0 +1,207 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; ModuleID = 'arm64_vecCmpBr.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios3.0.0"
+
+
+define i32 @anyZero64(<4 x i16> %a) #0 {
+; CHECK: _anyZero64:
+; CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+ %0 = bitcast <4 x i16> %a to <8 x i8>
+ %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+ %1 = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %if.then, label %return
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @bar(...) #1
+
+define i32 @anyZero128(<8 x i16> %a) #0 {
+; CHECK: _anyZero128:
+; CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+ %1 = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %if.then, label %return
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @anyNonZero64(<4 x i16> %a) #0 {
+; CHECK: _anyNonZero64:
+; CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+
+entry:
+ %0 = bitcast <4 x i16> %a to <8 x i8>
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+ %1 = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @anyNonZero128(<8 x i16> %a) #0 {
+; CHECK: _anyNonZero128:
+; CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+ %1 = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @allZero64(<4 x i16> %a) #0 {
+; CHECK: _allZero64:
+; CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+ %0 = bitcast <4 x i16> %a to <8 x i8>
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+ %1 = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %if.then, label %return
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @allZero128(<8 x i16> %a) #0 {
+; CHECK: _allZero128:
+; CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+ %1 = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %if.then, label %return
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @allNonZero64(<4 x i16> %a) #0 {
+; CHECK: _allNonZero64:
+; CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+ %0 = bitcast <4 x i16> %a to <8 x i8>
+ %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+ %1 = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @allNonZero128(<8 x i16> %a) #0 {
+; CHECK: _allNonZero128:
+; CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+ %1 = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) #2
+
+declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) #2
+
+declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) #2
+
+declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) #2
+
+attributes #0 = { nounwind ssp "target-cpu"="cyclone" }
+attributes #1 = { "target-cpu"="cyclone" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+attributes #4 = { nobuiltin nounwind }
diff --git a/test/CodeGen/ARM64/vecFold.ll b/test/CodeGen/ARM64/vecFold.ll
new file mode 100644
index 0000000..6888932
--- /dev/null
+++ b/test/CodeGen/ARM64/vecFold.ll
@@ -0,0 +1,145 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s| FileCheck %s
+
+define <16 x i8> @foov16i8(<8 x i16> %a0, <8 x i16> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov16i8:
+ %vshrn_low_shift = lshr <8 x i16> %a0, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ %vshrn_low = trunc <8 x i16> %vshrn_low_shift to <8 x i8>
+ %vshrn_high_shift = lshr <8 x i16> %b0, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ %vshrn_high = trunc <8 x i16> %vshrn_high_shift to <8 x i8>
+; CHECK: shrn.8b v0, v0, #5
+; CHECK-NEXT: shrn2.16b v0, v1, #5
+; CHECK-NEXT: ret
+ %1 = bitcast <8 x i8> %vshrn_low to <1 x i64>
+ %2 = bitcast <8 x i8> %vshrn_high to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @foov8i16(<4 x i32> %a0, <4 x i32> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov8i16:
+ %vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5>
+ %vshrn_low = trunc <4 x i32> %vshrn_low_shift to <4 x i16>
+ %vshrn_high_shift = lshr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
+ %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
+; CHECK: shrn.4h v0, v0, #5
+; CHECK-NEXT: shrn2.8h v0, v1, #5
+; CHECK-NEXT: ret
+ %1 = bitcast <4 x i16> %vshrn_low to <1 x i64>
+ %2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @foov4i32(<2 x i64> %a0, <2 x i64> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov4i32:
+ %vshrn_low_shift = lshr <2 x i64> %a0, <i64 5, i64 5>
+ %vshrn_low = trunc <2 x i64> %vshrn_low_shift to <2 x i32>
+ %vshrn_high_shift = lshr <2 x i64> %b0, <i64 5, i64 5>
+ %vshrn_high = trunc <2 x i64> %vshrn_high_shift to <2 x i32>
+; CHECK: shrn.2s v0, v0, #5
+; CHECK-NEXT: shrn2.4s v0, v1, #5
+; CHECK-NEXT: ret
+ %1 = bitcast <2 x i32> %vshrn_low to <1 x i64>
+ %2 = bitcast <2 x i32> %vshrn_high to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <8 x i16> @bar(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: bar:
+ %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vaddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+; CHECK: addhn.4h v0, v0, v1
+; CHECK-NEXT: addhn2.8h v0, v2, v3
+; CHECK-NEXT: ret
+ %1 = bitcast <4 x i16> %vaddhn2.i to <1 x i64>
+ %2 = bitcast <4 x i16> %vaddhn2.i10 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @baz(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: baz:
+ %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vshrn_high_shift = ashr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
+ %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
+; CHECK: addhn.4h v0, v0, v1
+; CHECK-NEXT: shrn2.8h v0, v2, #5
+; CHECK-NEXT: ret
+ %1 = bitcast <4 x i16> %vaddhn2.i to <1 x i64>
+ %2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @raddhn(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: raddhn:
+entry:
+; CHECK: raddhn.4h v0, v0, v1
+; CHECK-NEXT: raddhn2.8h v0, v2, v3
+; CHECK-NEXT: ret
+ %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vraddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+ %0 = bitcast <4 x i16> %vraddhn2.i to <1 x i64>
+ %1 = bitcast <4 x i16> %vraddhn2.i10 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @vrshrn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> %b1) nounwind readnone ssp {
+; CHECK-LABEL: vrshrn:
+; CHECK: rshrn.8b v0, v0, #5
+; CHECK-NEXT: rshrn2.16b v0, v2, #6
+; CHECK-NEXT: ret
+ %vrshrn_n1 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5)
+ %vrshrn_n4 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6)
+ %1 = bitcast <8 x i8> %vrshrn_n1 to <1 x i64>
+ %2 = bitcast <8 x i8> %vrshrn_n4 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @vrsubhn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> %b1) nounwind readnone ssp {
+; CHECK-LABEL: vrsubhn:
+; CHECK: rsubhn.8b v0, v0, v1
+; CHECK: rsubhn2.16b v0, v2, v3
+; CHECK-NEXT: ret
+ %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind
+ %vrsubhn2.i10 = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind
+ %1 = bitcast <8 x i8> %vrsubhn2.i to <1 x i64>
+ %2 = bitcast <8 x i8> %vrsubhn2.i10 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @noOpt1(<2 x i32> %a0, <2 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: noOpt1:
+ %vqsub2.i = tail call <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32> %a0, <2 x i32> %a1) nounwind
+ %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+; CHECK: sqsub.2s v0, v0, v1
+; CHECK-NEXT: addhn2.8h v0, v2, v3
+ %1 = bitcast <2 x i32> %vqsub2.i to <1 x i64>
+ %2 = bitcast <4 x i16> %vaddhn2.i to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+declare <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
diff --git a/test/CodeGen/ARM64/vector-ext.ll b/test/CodeGen/ARM64/vector-ext.ll
new file mode 100644
index 0000000..88889fd
--- /dev/null
+++ b/test/CodeGen/ARM64/vector-ext.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+;CHECK: @func30
+;CHECK: ushll.4s v0, v0, #0
+;CHECK: movi.4s v1, #1
+;CHECK: and.16b v0, v0, v1
+;CHECK: str q0, [x0]
+;CHECK: ret
+
+%T0_30 = type <4 x i1>
+%T1_30 = type <4 x i32>
+define void @func30(%T0_30 %v0, %T1_30* %p1) {
+ %r = zext %T0_30 %v0 to %T1_30
+ store %T1_30 %r, %T1_30* %p1
+ ret void
+}
diff --git a/test/CodeGen/ARM64/vector-imm.ll b/test/CodeGen/ARM64/vector-imm.ll
new file mode 100644
index 0000000..f1fc3cc
--- /dev/null
+++ b/test/CodeGen/ARM64/vector-imm.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @v_orrimm(<8 x i8>* %A) nounwind {
+; CHECK-LABEL: v_orrimm:
+; CHECK-NOT: mov
+; CHECK-NOT: mvn
+; CHECK: orr
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = or <8 x i8> %tmp1, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @v_orrimmQ(<16 x i8>* %A) nounwind {
+; CHECK-LABEL: v_orrimmQ:
+; CHECK-NOT: mov
+; CHECK-NOT: mvn
+; CHECK: orr
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = or <16 x i8> %tmp1, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @v_bicimm(<8 x i8>* %A) nounwind {
+; CHECK-LABEL: v_bicimm:
+; CHECK-NOT: mov
+; CHECK-NOT: mvn
+; CHECK: bic
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 >
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @v_bicimmQ(<16 x i8>* %A) nounwind {
+; CHECK-LABEL: v_bicimmQ:
+; CHECK-NOT: mov
+; CHECK-NOT: mvn
+; CHECK: bic
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 >
+ ret <16 x i8> %tmp3
+}
+
+define <2 x double> @foo(<2 x double> %bar) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: fmov.2d v1, #1.000000e+00
+ %add = fadd <2 x double> %bar, <double 1.0, double 1.0>
+ ret <2 x double> %add
+}
+
+define <4 x i32> @movi_4s_imm_t1() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t1:
+; CHECK: movi.4s v0, #75
+ ret <4 x i32> <i32 75, i32 75, i32 75, i32 75>
+}
+
+define <4 x i32> @movi_4s_imm_t2() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t2:
+; CHECK: movi.4s v0, #75, lsl #8
+ ret <4 x i32> <i32 19200, i32 19200, i32 19200, i32 19200>
+}
+
+define <4 x i32> @movi_4s_imm_t3() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t3:
+; CHECK: movi.4s v0, #75, lsl #16
+ ret <4 x i32> <i32 4915200, i32 4915200, i32 4915200, i32 4915200>
+}
+
+define <4 x i32> @movi_4s_imm_t4() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t4:
+; CHECK: movi.4s v0, #75, lsl #24
+ ret <4 x i32> <i32 1258291200, i32 1258291200, i32 1258291200, i32 1258291200>
+}
+
+define <8 x i16> @movi_8h_imm_t5() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_8h_imm_t5:
+; CHECK: movi.8h v0, #75
+ ret <8 x i16> <i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75>
+}
+
+; rdar://11989841
+define <8 x i16> @movi_8h_imm_t6() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_8h_imm_t6:
+; CHECK: movi.8h v0, #75, lsl #8
+ ret <8 x i16> <i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200>
+}
+
+define <4 x i32> @movi_4s_imm_t7() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t7:
+; CHECK: movi.4s v0, #75, msl #8
+ ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455>
+}
+
+define <4 x i32> @movi_4s_imm_t8() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t8:
+; CHECK: movi.4s v0, #75, msl #16
+ ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735>
+}
+
+define <16 x i8> @movi_16b_imm_t9() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_16b_imm_t9:
+; CHECK: movi.16b v0, #75
+ ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75,
+                i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75>
+}
+
+define <2 x i64> @movi_2d_imm_t10() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_2d_imm_t10:
+; CHECK: movi.2d v0, #0xff00ff00ff00ff
+ ret <2 x i64> <i64 71777214294589695, i64 71777214294589695>
+}
+
+define <4 x i32> @movi_4s_imm_t11() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t11:
+; CHECK: fmov.4s v0, #-3.281250e-01
+ ret <4 x i32> <i32 3198681088, i32 3198681088, i32 3198681088, i32 3198681088>
+}
+
+define <2 x i64> @movi_2d_imm_t12() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_2d_imm_t12:
+; CHECK: fmov.2d v0, #-1.718750e-01
+ ret <2 x i64> <i64 13818732506632945664, i64 13818732506632945664>
+}
diff --git a/test/CodeGen/ARM64/vector-ldst.ll b/test/CodeGen/ARM64/vector-ldst.ll
new file mode 100644
index 0000000..154160e
--- /dev/null
+++ b/test/CodeGen/ARM64/vector-ldst.ll
@@ -0,0 +1,601 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+; rdar://9428579
+
+%type1 = type { <16 x i8> }
+%type2 = type { <8 x i8> }
+%type3 = type { <4 x i16> }
+
+
+define hidden fastcc void @t1(%type1** %argtable) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldr x[[REG:[0-9]+]], [x0]
+; CHECK: str q0, [x[[REG]]]
+ %tmp1 = load %type1** %argtable, align 8
+ %tmp2 = getelementptr inbounds %type1* %tmp1, i64 0, i32 0
+ store <16 x i8> zeroinitializer, <16 x i8>* %tmp2, align 16
+ ret void
+}
+
+define hidden fastcc void @t2(%type2** %argtable) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldr x[[REG:[0-9]+]], [x0]
+; CHECK: str d0, [x[[REG]]]
+ %tmp1 = load %type2** %argtable, align 8
+ %tmp2 = getelementptr inbounds %type2* %tmp1, i64 0, i32 0
+ store <8 x i8> zeroinitializer, <8 x i8>* %tmp2, align 8
+ ret void
+}
+
+; add a bunch of tests for rdar://11246289
+
+@globalArray64x2 = common global <2 x i64>* null, align 8
+@globalArray32x4 = common global <4 x i32>* null, align 8
+@globalArray16x8 = common global <8 x i16>* null, align 8
+@globalArray8x16 = common global <16 x i8>* null, align 8
+@globalArray64x1 = common global <1 x i64>* null, align 8
+@globalArray32x2 = common global <2 x i32>* null, align 8
+@globalArray16x4 = common global <4 x i16>* null, align 8
+@globalArray8x8 = common global <8 x i8>* null, align 8
+@floatglobalArray64x2 = common global <2 x double>* null, align 8
+@floatglobalArray32x4 = common global <4 x float>* null, align 8
+@floatglobalArray64x1 = common global <1 x double>* null, align 8
+@floatglobalArray32x2 = common global <2 x float>* null, align 8
+
+define void @fct1_64x2(<2 x i64>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_64x2:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <2 x i64>* %array, i64 %offset
+ %tmp = load <2 x i64>* %arrayidx, align 16
+ %tmp1 = load <2 x i64>** @globalArray64x2, align 8
+ %arrayidx1 = getelementptr inbounds <2 x i64>* %tmp1, i64 %offset
+ store <2 x i64> %tmp, <2 x i64>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct2_64x2(<2 x i64>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_64x2:
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #80]
+ %arrayidx = getelementptr inbounds <2 x i64>* %array, i64 3
+ %tmp = load <2 x i64>* %arrayidx, align 16
+ %tmp1 = load <2 x i64>** @globalArray64x2, align 8
+ %arrayidx1 = getelementptr inbounds <2 x i64>* %tmp1, i64 5
+ store <2 x i64> %tmp, <2 x i64>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct1_32x4(<4 x i32>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_32x4:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <4 x i32>* %array, i64 %offset
+ %tmp = load <4 x i32>* %arrayidx, align 16
+ %tmp1 = load <4 x i32>** @globalArray32x4, align 8
+ %arrayidx1 = getelementptr inbounds <4 x i32>* %tmp1, i64 %offset
+ store <4 x i32> %tmp, <4 x i32>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct2_32x4(<4 x i32>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_32x4:
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #80]
+ %arrayidx = getelementptr inbounds <4 x i32>* %array, i64 3
+ %tmp = load <4 x i32>* %arrayidx, align 16
+ %tmp1 = load <4 x i32>** @globalArray32x4, align 8
+ %arrayidx1 = getelementptr inbounds <4 x i32>* %tmp1, i64 5
+ store <4 x i32> %tmp, <4 x i32>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct1_16x8(<8 x i16>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_16x8:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <8 x i16>* %array, i64 %offset
+ %tmp = load <8 x i16>* %arrayidx, align 16
+ %tmp1 = load <8 x i16>** @globalArray16x8, align 8
+ %arrayidx1 = getelementptr inbounds <8 x i16>* %tmp1, i64 %offset
+ store <8 x i16> %tmp, <8 x i16>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct2_16x8(<8 x i16>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_16x8:
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #80]
+ %arrayidx = getelementptr inbounds <8 x i16>* %array, i64 3
+ %tmp = load <8 x i16>* %arrayidx, align 16
+ %tmp1 = load <8 x i16>** @globalArray16x8, align 8
+ %arrayidx1 = getelementptr inbounds <8 x i16>* %tmp1, i64 5
+ store <8 x i16> %tmp, <8 x i16>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct1_8x16(<16 x i8>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_8x16:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <16 x i8>* %array, i64 %offset
+ %tmp = load <16 x i8>* %arrayidx, align 16
+ %tmp1 = load <16 x i8>** @globalArray8x16, align 8
+ %arrayidx1 = getelementptr inbounds <16 x i8>* %tmp1, i64 %offset
+ store <16 x i8> %tmp, <16 x i8>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct2_8x16(<16 x i8>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_8x16:
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #80]
+ %arrayidx = getelementptr inbounds <16 x i8>* %array, i64 3
+ %tmp = load <16 x i8>* %arrayidx, align 16
+ %tmp1 = load <16 x i8>** @globalArray8x16, align 8
+ %arrayidx1 = getelementptr inbounds <16 x i8>* %tmp1, i64 5
+ store <16 x i8> %tmp, <16 x i8>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct1_64x1(<1 x i64>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_64x1:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <1 x i64>* %array, i64 %offset
+ %tmp = load <1 x i64>* %arrayidx, align 8
+ %tmp1 = load <1 x i64>** @globalArray64x1, align 8
+ %arrayidx1 = getelementptr inbounds <1 x i64>* %tmp1, i64 %offset
+ store <1 x i64> %tmp, <1 x i64>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct2_64x1(<1 x i64>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_64x1:
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #40]
+ %arrayidx = getelementptr inbounds <1 x i64>* %array, i64 3
+ %tmp = load <1 x i64>* %arrayidx, align 8
+ %tmp1 = load <1 x i64>** @globalArray64x1, align 8
+ %arrayidx1 = getelementptr inbounds <1 x i64>* %tmp1, i64 5
+ store <1 x i64> %tmp, <1 x i64>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct1_32x2(<2 x i32>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_32x2:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <2 x i32>* %array, i64 %offset
+ %tmp = load <2 x i32>* %arrayidx, align 8
+ %tmp1 = load <2 x i32>** @globalArray32x2, align 8
+ %arrayidx1 = getelementptr inbounds <2 x i32>* %tmp1, i64 %offset
+ store <2 x i32> %tmp, <2 x i32>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct2_32x2(<2 x i32>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_32x2:
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #40]
+ %arrayidx = getelementptr inbounds <2 x i32>* %array, i64 3
+ %tmp = load <2 x i32>* %arrayidx, align 8
+ %tmp1 = load <2 x i32>** @globalArray32x2, align 8
+ %arrayidx1 = getelementptr inbounds <2 x i32>* %tmp1, i64 5
+ store <2 x i32> %tmp, <2 x i32>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct1_16x4(<4 x i16>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_16x4:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <4 x i16>* %array, i64 %offset
+ %tmp = load <4 x i16>* %arrayidx, align 8
+ %tmp1 = load <4 x i16>** @globalArray16x4, align 8
+ %arrayidx1 = getelementptr inbounds <4 x i16>* %tmp1, i64 %offset
+ store <4 x i16> %tmp, <4 x i16>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct2_16x4(<4 x i16>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_16x4:
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #40]
+ %arrayidx = getelementptr inbounds <4 x i16>* %array, i64 3
+ %tmp = load <4 x i16>* %arrayidx, align 8
+ %tmp1 = load <4 x i16>** @globalArray16x4, align 8
+ %arrayidx1 = getelementptr inbounds <4 x i16>* %tmp1, i64 5
+ store <4 x i16> %tmp, <4 x i16>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct1_8x8(<8 x i8>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_8x8:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <8 x i8>* %array, i64 %offset
+ %tmp = load <8 x i8>* %arrayidx, align 8
+ %tmp1 = load <8 x i8>** @globalArray8x8, align 8
+ %arrayidx1 = getelementptr inbounds <8 x i8>* %tmp1, i64 %offset
+ store <8 x i8> %tmp, <8 x i8>* %arrayidx1, align 8
+ ret void
+}
+
+; Add a bunch of tests for rdar://13258794: Match LDUR/STUR for D and Q
+; registers for unscaled vector accesses
+@str = global [63 x i8] c"Test case for rdar://13258794: LDUR/STUR for D and Q registers\00", align 1
+
+define <1 x i64> @fct0() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct0:
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <1 x i64>*), align 8
+ ret <1 x i64> %0
+}
+
+define <2 x i32> @fct1() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct1:
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <2 x i32>*), align 8
+ ret <2 x i32> %0
+}
+
+define <4 x i16> @fct2() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct2:
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <4 x i16>*), align 8
+ ret <4 x i16> %0
+}
+
+define <8 x i8> @fct3() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct3:
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <8 x i8>*), align 8
+ ret <8 x i8> %0
+}
+
+define <2 x i64> @fct4() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct4:
+; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <2 x i64>*), align 16
+ ret <2 x i64> %0
+}
+
+define <4 x i32> @fct5() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct5:
+; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <4 x i32>*), align 16
+ ret <4 x i32> %0
+}
+
+define <8 x i16> @fct6() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct6:
+; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <8 x i16>*), align 16
+ ret <8 x i16> %0
+}
+
+define <16 x i8> @fct7() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct7:
+; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <16 x i8>*), align 16
+ ret <16 x i8> %0
+}
+
+define void @fct8() nounwind ssp {
+entry:
+; CHECK-LABEL: fct8:
+; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <1 x i64>*), align 8
+ store <1 x i64> %0, <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <1 x i64>*), align 8
+ ret void
+}
+
+define void @fct9() nounwind ssp {
+entry:
+; CHECK-LABEL: fct9:
+; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <2 x i32>*), align 8
+ store <2 x i32> %0, <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <2 x i32>*), align 8
+ ret void
+}
+
+define void @fct10() nounwind ssp {
+entry:
+; CHECK-LABEL: fct10:
+; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <4 x i16>*), align 8
+ store <4 x i16> %0, <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <4 x i16>*), align 8
+ ret void
+}
+
+define void @fct11() nounwind ssp {
+entry:
+; CHECK-LABEL: fct11:
+; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <8 x i8>*), align 8
+ store <8 x i8> %0, <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <8 x i8>*), align 8
+ ret void
+}
+
+define void @fct12() nounwind ssp {
+entry:
+; CHECK-LABEL: fct12:
+; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <2 x i64>*), align 16
+ store <2 x i64> %0, <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <2 x i64>*), align 16
+ ret void
+}
+
+define void @fct13() nounwind ssp {
+entry:
+; CHECK-LABEL: fct13:
+; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <4 x i32>*), align 16
+ store <4 x i32> %0, <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <4 x i32>*), align 16
+ ret void
+}
+
+define void @fct14() nounwind ssp {
+entry:
+; CHECK-LABEL: fct14:
+; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <8 x i16>*), align 16
+ store <8 x i16> %0, <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <8 x i16>*), align 16
+ ret void
+}
+
+define void @fct15() nounwind ssp {
+entry:
+; CHECK-LABEL: fct15:
+; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <16 x i8>*), align 16
+ store <16 x i8> %0, <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <16 x i8>*), align 16
+ ret void
+}
+
+; Check the building of a vector from a single loaded value.
+; Part of <rdar://problem/14170854>
+;
+; Single loads with immediate offset.
+define <8 x i8> @fct16(i8* nocapture %sp0) {
+; CHECK-LABEL: fct16:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: mul.8b v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %vec = insertelement <8 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <8 x i8> %vec, %vec
+ ret <8 x i8> %vmull.i
+}
+
+define <16 x i8> @fct17(i8* nocapture %sp0) {
+; CHECK-LABEL: fct17:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: mul.16b v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %vec = insertelement <16 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <16 x i8> %vec, %vec
+ ret <16 x i8> %vmull.i
+}
+
+define <4 x i16> @fct18(i16* nocapture %sp0) {
+; CHECK-LABEL: fct18:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: mul.4h v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %vec = insertelement <4 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <4 x i16> %vec, %vec
+ ret <4 x i16> %vmull.i
+}
+
+define <8 x i16> @fct19(i16* nocapture %sp0) {
+; CHECK-LABEL: fct19:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: mul.8h v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %vec = insertelement <8 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <8 x i16> %vec, %vec
+ ret <8 x i16> %vmull.i
+}
+
+define <2 x i32> @fct20(i32* nocapture %sp0) {
+; CHECK-LABEL: fct20:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: mul.2s v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %vec = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <2 x i32> %vec, %vec
+ ret <2 x i32> %vmull.i
+}
+
+define <4 x i32> @fct21(i32* nocapture %sp0) {
+; CHECK-LABEL: fct21:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: mul.4s v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %vec = insertelement <4 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <4 x i32> %vec, %vec
+ ret <4 x i32> %vmull.i
+}
+
+define <1 x i64> @fct22(i64* nocapture %sp0) {
+; CHECK-LABEL: fct22:
+; CHECK: ldr d0, [x0, #8]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %vec = insertelement <1 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
+ ret <1 x i64> %vec
+}
+
+define <2 x i64> @fct23(i64* nocapture %sp0) {
+; CHECK-LABEL: fct23:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, #8]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %vec = insertelement <2 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
+ ret <2 x i64> %vec
+}
+
+;
+; Single loads with register offset.
+define <8 x i8> @fct24(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct24:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: mul.8b v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %vec = insertelement <8 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <8 x i8> %vec, %vec
+ ret <8 x i8> %vmull.i
+}
+
+define <16 x i8> @fct25(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct25:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: mul.16b v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %vec = insertelement <16 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <16 x i8> %vec, %vec
+ ret <16 x i8> %vmull.i
+}
+
+define <4 x i16> @fct26(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct26:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: mul.4h v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %vec = insertelement <4 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <4 x i16> %vec, %vec
+ ret <4 x i16> %vmull.i
+}
+
+define <8 x i16> @fct27(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct27:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: mul.8h v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %vec = insertelement <8 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <8 x i16> %vec, %vec
+ ret <8 x i16> %vmull.i
+}
+
+define <2 x i32> @fct28(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct28:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: mul.2s v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %vec = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <2 x i32> %vec, %vec
+ ret <2 x i32> %vmull.i
+}
+
+define <4 x i32> @fct29(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct29:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: mul.4s v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %vec = insertelement <4 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <4 x i32> %vec, %vec
+ ret <4 x i32> %vmull.i
+}
+
+define <1 x i64> @fct30(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct30:
+; CHECK: ldr d0, [x0, x1, lsl #3]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %vec = insertelement <1 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
+ ret <1 x i64> %vec
+}
+
+define <2 x i64> @fct31(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct31:
+; CHECK: ldr d0, [x0, x1, lsl #3]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %vec = insertelement <2 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
+ ret <2 x i64> %vec
+}
diff --git a/test/CodeGen/ARM64/vext.ll b/test/CodeGen/ARM64/vext.ll
new file mode 100644
index 0000000..c820439
--- /dev/null
+++ b/test/CodeGen/ARM64/vext.ll
@@ -0,0 +1,464 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define void @test_vext_s8() nounwind ssp {
+ ; CHECK-LABEL: test_vext_s8:
+ ; CHECK: {{ext.8.*#1}}
+ %xS8x8 = alloca <8 x i8>, align 8
+ %__a = alloca <8 x i8>, align 8
+ %__b = alloca <8 x i8>, align 8
+ %tmp = load <8 x i8>* %xS8x8, align 8
+ store <8 x i8> %tmp, <8 x i8>* %__a, align 8
+ %tmp1 = load <8 x i8>* %xS8x8, align 8
+ store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
+ %tmp2 = load <8 x i8>* %__a, align 8
+ %tmp3 = load <8 x i8>* %__b, align 8
+ %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ store <8 x i8> %vext, <8 x i8>* %xS8x8, align 8
+ ret void
+}
+
+define void @test_vext_u8() nounwind ssp {
+ ; CHECK-LABEL: test_vext_u8:
+ ; CHECK: {{ext.8.*#2}}
+ %xU8x8 = alloca <8 x i8>, align 8
+ %__a = alloca <8 x i8>, align 8
+ %__b = alloca <8 x i8>, align 8
+ %tmp = load <8 x i8>* %xU8x8, align 8
+ store <8 x i8> %tmp, <8 x i8>* %__a, align 8
+ %tmp1 = load <8 x i8>* %xU8x8, align 8
+ store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
+ %tmp2 = load <8 x i8>* %__a, align 8
+ %tmp3 = load <8 x i8>* %__b, align 8
+ %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+ store <8 x i8> %vext, <8 x i8>* %xU8x8, align 8
+ ret void
+}
+
+define void @test_vext_p8() nounwind ssp {
+ ; CHECK-LABEL: test_vext_p8:
+ ; CHECK: {{ext.8.*#3}}
+ %xP8x8 = alloca <8 x i8>, align 8
+ %__a = alloca <8 x i8>, align 8
+ %__b = alloca <8 x i8>, align 8
+ %tmp = load <8 x i8>* %xP8x8, align 8
+ store <8 x i8> %tmp, <8 x i8>* %__a, align 8
+ %tmp1 = load <8 x i8>* %xP8x8, align 8
+ store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
+ %tmp2 = load <8 x i8>* %__a, align 8
+ %tmp3 = load <8 x i8>* %__b, align 8
+ %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ store <8 x i8> %vext, <8 x i8>* %xP8x8, align 8
+ ret void
+}
+
+define void @test_vext_s16() nounwind ssp {
+ ; CHECK-LABEL: test_vext_s16:
+ ; CHECK: {{ext.8.*#2}}
+ %xS16x4 = alloca <4 x i16>, align 8
+ %__a = alloca <4 x i16>, align 8
+ %__b = alloca <4 x i16>, align 8
+ %tmp = load <4 x i16>* %xS16x4, align 8
+ store <4 x i16> %tmp, <4 x i16>* %__a, align 8
+ %tmp1 = load <4 x i16>* %xS16x4, align 8
+ store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
+ %tmp2 = load <4 x i16>* %__a, align 8
+ %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
+ %tmp4 = load <4 x i16>* %__b, align 8
+ %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
+ %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ store <4 x i16> %vext, <4 x i16>* %xS16x4, align 8
+ ret void
+}
+
+define void @test_vext_u16() nounwind ssp {
+ ; CHECK-LABEL: test_vext_u16:
+ ; CHECK: {{ext.8.*#4}}
+ %xU16x4 = alloca <4 x i16>, align 8
+ %__a = alloca <4 x i16>, align 8
+ %__b = alloca <4 x i16>, align 8
+ %tmp = load <4 x i16>* %xU16x4, align 8
+ store <4 x i16> %tmp, <4 x i16>* %__a, align 8
+ %tmp1 = load <4 x i16>* %xU16x4, align 8
+ store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
+ %tmp2 = load <4 x i16>* %__a, align 8
+ %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
+ %tmp4 = load <4 x i16>* %__b, align 8
+ %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
+ %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ store <4 x i16> %vext, <4 x i16>* %xU16x4, align 8
+ ret void
+}
+
+define void @test_vext_p16() nounwind ssp {
+ ; CHECK-LABEL: test_vext_p16:
+ ; CHECK: {{ext.8.*#6}}
+ %xP16x4 = alloca <4 x i16>, align 8
+ %__a = alloca <4 x i16>, align 8
+ %__b = alloca <4 x i16>, align 8
+ %tmp = load <4 x i16>* %xP16x4, align 8
+ store <4 x i16> %tmp, <4 x i16>* %__a, align 8
+ %tmp1 = load <4 x i16>* %xP16x4, align 8
+ store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
+ %tmp2 = load <4 x i16>* %__a, align 8
+ %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
+ %tmp4 = load <4 x i16>* %__b, align 8
+ %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
+ %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ store <4 x i16> %vext, <4 x i16>* %xP16x4, align 8
+ ret void
+}
+
+define void @test_vext_s32() nounwind ssp {
+ ; CHECK-LABEL: test_vext_s32:
+ ; CHECK: {{ext.8.*#4}}
+ %xS32x2 = alloca <2 x i32>, align 8
+ %__a = alloca <2 x i32>, align 8
+ %__b = alloca <2 x i32>, align 8
+ %tmp = load <2 x i32>* %xS32x2, align 8
+ store <2 x i32> %tmp, <2 x i32>* %__a, align 8
+ %tmp1 = load <2 x i32>* %xS32x2, align 8
+ store <2 x i32> %tmp1, <2 x i32>* %__b, align 8
+ %tmp2 = load <2 x i32>* %__a, align 8
+ %tmp3 = bitcast <2 x i32> %tmp2 to <8 x i8>
+ %tmp4 = load <2 x i32>* %__b, align 8
+ %tmp5 = bitcast <2 x i32> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <2 x i32>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <2 x i32>
+ %vext = shufflevector <2 x i32> %tmp6, <2 x i32> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x i32> %vext, <2 x i32>* %xS32x2, align 8
+ ret void
+}
+
+define void @test_vext_u32() nounwind ssp {
+ ; CHECK-LABEL: test_vext_u32:
+ ; CHECK: {{ext.8.*#4}}
+ %xU32x2 = alloca <2 x i32>, align 8
+ %__a = alloca <2 x i32>, align 8
+ %__b = alloca <2 x i32>, align 8
+ %tmp = load <2 x i32>* %xU32x2, align 8
+ store <2 x i32> %tmp, <2 x i32>* %__a, align 8
+ %tmp1 = load <2 x i32>* %xU32x2, align 8
+ store <2 x i32> %tmp1, <2 x i32>* %__b, align 8
+ %tmp2 = load <2 x i32>* %__a, align 8
+ %tmp3 = bitcast <2 x i32> %tmp2 to <8 x i8>
+ %tmp4 = load <2 x i32>* %__b, align 8
+ %tmp5 = bitcast <2 x i32> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <2 x i32>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <2 x i32>
+ %vext = shufflevector <2 x i32> %tmp6, <2 x i32> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x i32> %vext, <2 x i32>* %xU32x2, align 8
+ ret void
+}
+
+define void @test_vext_f32() nounwind ssp {
+ ; CHECK-LABEL: test_vext_f32:
+ ; CHECK: {{ext.8.*#4}}
+ %xF32x2 = alloca <2 x float>, align 8
+ %__a = alloca <2 x float>, align 8
+ %__b = alloca <2 x float>, align 8
+ %tmp = load <2 x float>* %xF32x2, align 8
+ store <2 x float> %tmp, <2 x float>* %__a, align 8
+ %tmp1 = load <2 x float>* %xF32x2, align 8
+ store <2 x float> %tmp1, <2 x float>* %__b, align 8
+ %tmp2 = load <2 x float>* %__a, align 8
+ %tmp3 = bitcast <2 x float> %tmp2 to <8 x i8>
+ %tmp4 = load <2 x float>* %__b, align 8
+ %tmp5 = bitcast <2 x float> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <2 x float>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <2 x float>
+ %vext = shufflevector <2 x float> %tmp6, <2 x float> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x float> %vext, <2 x float>* %xF32x2, align 8
+ ret void
+}
+
+define void @test_vext_s64() nounwind ssp {
+ ; CHECK-LABEL: test_vext_s64:
+ ; CHECK_FIXME: {{ext.8.*#1}}
+ ; this just turns into a load of the second element
+ %xS64x1 = alloca <1 x i64>, align 8
+ %__a = alloca <1 x i64>, align 8
+ %__b = alloca <1 x i64>, align 8
+ %tmp = load <1 x i64>* %xS64x1, align 8
+ store <1 x i64> %tmp, <1 x i64>* %__a, align 8
+ %tmp1 = load <1 x i64>* %xS64x1, align 8
+ store <1 x i64> %tmp1, <1 x i64>* %__b, align 8
+ %tmp2 = load <1 x i64>* %__a, align 8
+ %tmp3 = bitcast <1 x i64> %tmp2 to <8 x i8>
+ %tmp4 = load <1 x i64>* %__b, align 8
+ %tmp5 = bitcast <1 x i64> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <1 x i64>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <1 x i64>
+ %vext = shufflevector <1 x i64> %tmp6, <1 x i64> %tmp7, <1 x i32> <i32 1>
+ store <1 x i64> %vext, <1 x i64>* %xS64x1, align 8
+ ret void
+}
+
+define void @test_vext_u64() nounwind ssp {
+ ; CHECK-LABEL: test_vext_u64:
+ ; CHECK_FIXME: {{ext.8.*#1}}
+ ; this is turned into a simple load of the 2nd element
+ %xU64x1 = alloca <1 x i64>, align 8
+ %__a = alloca <1 x i64>, align 8
+ %__b = alloca <1 x i64>, align 8
+ %tmp = load <1 x i64>* %xU64x1, align 8
+ store <1 x i64> %tmp, <1 x i64>* %__a, align 8
+ %tmp1 = load <1 x i64>* %xU64x1, align 8
+ store <1 x i64> %tmp1, <1 x i64>* %__b, align 8
+ %tmp2 = load <1 x i64>* %__a, align 8
+ %tmp3 = bitcast <1 x i64> %tmp2 to <8 x i8>
+ %tmp4 = load <1 x i64>* %__b, align 8
+ %tmp5 = bitcast <1 x i64> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <1 x i64>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <1 x i64>
+ %vext = shufflevector <1 x i64> %tmp6, <1 x i64> %tmp7, <1 x i32> <i32 1>
+ store <1 x i64> %vext, <1 x i64>* %xU64x1, align 8
+ ret void
+}
+
+define void @test_vextq_s8() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_s8:
+ ; CHECK: {{ext.16.*#4}}
+ %xS8x16 = alloca <16 x i8>, align 16
+ %__a = alloca <16 x i8>, align 16
+ %__b = alloca <16 x i8>, align 16
+ %tmp = load <16 x i8>* %xS8x16, align 16
+ store <16 x i8> %tmp, <16 x i8>* %__a, align 16
+ %tmp1 = load <16 x i8>* %xS8x16, align 16
+ store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
+ %tmp2 = load <16 x i8>* %__a, align 16
+ %tmp3 = load <16 x i8>* %__b, align 16
+ %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+ store <16 x i8> %vext, <16 x i8>* %xS8x16, align 16
+ ret void
+}
+
+define void @test_vextq_u8() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_u8:
+ ; CHECK: {{ext.16.*#5}}
+ %xU8x16 = alloca <16 x i8>, align 16
+ %__a = alloca <16 x i8>, align 16
+ %__b = alloca <16 x i8>, align 16
+ %tmp = load <16 x i8>* %xU8x16, align 16
+ store <16 x i8> %tmp, <16 x i8>* %__a, align 16
+ %tmp1 = load <16 x i8>* %xU8x16, align 16
+ store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
+ %tmp2 = load <16 x i8>* %__a, align 16
+ %tmp3 = load <16 x i8>* %__b, align 16
+ %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+ store <16 x i8> %vext, <16 x i8>* %xU8x16, align 16
+ ret void
+}
+
+define void @test_vextq_p8() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_p8:
+ ; CHECK: {{ext.16.*#6}}
+ %xP8x16 = alloca <16 x i8>, align 16
+ %__a = alloca <16 x i8>, align 16
+ %__b = alloca <16 x i8>, align 16
+ %tmp = load <16 x i8>* %xP8x16, align 16
+ store <16 x i8> %tmp, <16 x i8>* %__a, align 16
+ %tmp1 = load <16 x i8>* %xP8x16, align 16
+ store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
+ %tmp2 = load <16 x i8>* %__a, align 16
+ %tmp3 = load <16 x i8>* %__b, align 16
+ %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+ store <16 x i8> %vext, <16 x i8>* %xP8x16, align 16
+ ret void
+}
+
+define void @test_vextq_s16() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_s16:
+ ; CHECK: {{ext.16.*#14}}
+ %xS16x8 = alloca <8 x i16>, align 16
+ %__a = alloca <8 x i16>, align 16
+ %__b = alloca <8 x i16>, align 16
+ %tmp = load <8 x i16>* %xS16x8, align 16
+ store <8 x i16> %tmp, <8 x i16>* %__a, align 16
+ %tmp1 = load <8 x i16>* %xS16x8, align 16
+ store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
+ %tmp2 = load <8 x i16>* %__a, align 16
+ %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
+ %tmp4 = load <8 x i16>* %__b, align 16
+ %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
+ %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ store <8 x i16> %vext, <8 x i16>* %xS16x8, align 16
+ ret void
+}
+
+define void @test_vextq_u16() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_u16:
+ ; CHECK: {{ext.16.*#8}}
+ %xU16x8 = alloca <8 x i16>, align 16
+ %__a = alloca <8 x i16>, align 16
+ %__b = alloca <8 x i16>, align 16
+ %tmp = load <8 x i16>* %xU16x8, align 16
+ store <8 x i16> %tmp, <8 x i16>* %__a, align 16
+ %tmp1 = load <8 x i16>* %xU16x8, align 16
+ store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
+ %tmp2 = load <8 x i16>* %__a, align 16
+ %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
+ %tmp4 = load <8 x i16>* %__b, align 16
+ %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
+ %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ store <8 x i16> %vext, <8 x i16>* %xU16x8, align 16
+ ret void
+}
+
+define void @test_vextq_p16() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_p16:
+ ; CHECK: {{ext.16.*#10}}
+ %xP16x8 = alloca <8 x i16>, align 16
+ %__a = alloca <8 x i16>, align 16
+ %__b = alloca <8 x i16>, align 16
+ %tmp = load <8 x i16>* %xP16x8, align 16
+ store <8 x i16> %tmp, <8 x i16>* %__a, align 16
+ %tmp1 = load <8 x i16>* %xP16x8, align 16
+ store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
+ %tmp2 = load <8 x i16>* %__a, align 16
+ %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
+ %tmp4 = load <8 x i16>* %__b, align 16
+ %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
+ %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+ store <8 x i16> %vext, <8 x i16>* %xP16x8, align 16
+ ret void
+}
+
+define void @test_vextq_s32() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_s32:
+ ; CHECK: {{ext.16.*#4}}
+ %xS32x4 = alloca <4 x i32>, align 16
+ %__a = alloca <4 x i32>, align 16
+ %__b = alloca <4 x i32>, align 16
+ %tmp = load <4 x i32>* %xS32x4, align 16
+ store <4 x i32> %tmp, <4 x i32>* %__a, align 16
+ %tmp1 = load <4 x i32>* %xS32x4, align 16
+ store <4 x i32> %tmp1, <4 x i32>* %__b, align 16
+ %tmp2 = load <4 x i32>* %__a, align 16
+ %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
+ %tmp4 = load <4 x i32>* %__b, align 16
+ %tmp5 = bitcast <4 x i32> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <4 x i32>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <4 x i32>
+ %vext = shufflevector <4 x i32> %tmp6, <4 x i32> %tmp7, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ store <4 x i32> %vext, <4 x i32>* %xS32x4, align 16
+ ret void
+}
+
+define void @test_vextq_u32() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_u32:
+ ; CHECK: {{ext.16.*#8}}
+ %xU32x4 = alloca <4 x i32>, align 16
+ %__a = alloca <4 x i32>, align 16
+ %__b = alloca <4 x i32>, align 16
+ %tmp = load <4 x i32>* %xU32x4, align 16
+ store <4 x i32> %tmp, <4 x i32>* %__a, align 16
+ %tmp1 = load <4 x i32>* %xU32x4, align 16
+ store <4 x i32> %tmp1, <4 x i32>* %__b, align 16
+ %tmp2 = load <4 x i32>* %__a, align 16
+ %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
+ %tmp4 = load <4 x i32>* %__b, align 16
+ %tmp5 = bitcast <4 x i32> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <4 x i32>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <4 x i32>
+ %vext = shufflevector <4 x i32> %tmp6, <4 x i32> %tmp7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ store <4 x i32> %vext, <4 x i32>* %xU32x4, align 16
+ ret void
+}
+
+define void @test_vextq_f32() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_f32:
+ ; CHECK: {{ext.16.*#12}}
+ %xF32x4 = alloca <4 x float>, align 16
+ %__a = alloca <4 x float>, align 16
+ %__b = alloca <4 x float>, align 16
+ %tmp = load <4 x float>* %xF32x4, align 16
+ store <4 x float> %tmp, <4 x float>* %__a, align 16
+ %tmp1 = load <4 x float>* %xF32x4, align 16
+ store <4 x float> %tmp1, <4 x float>* %__b, align 16
+ %tmp2 = load <4 x float>* %__a, align 16
+ %tmp3 = bitcast <4 x float> %tmp2 to <16 x i8>
+ %tmp4 = load <4 x float>* %__b, align 16
+ %tmp5 = bitcast <4 x float> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <4 x float>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <4 x float>
+ %vext = shufflevector <4 x float> %tmp6, <4 x float> %tmp7, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ store <4 x float> %vext, <4 x float>* %xF32x4, align 16
+ ret void
+}
+
+define void @test_vextq_s64() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_s64:
+ ; CHECK: {{ext.16.*#8}}
+ %xS64x2 = alloca <2 x i64>, align 16
+ %__a = alloca <2 x i64>, align 16
+ %__b = alloca <2 x i64>, align 16
+ %tmp = load <2 x i64>* %xS64x2, align 16
+ store <2 x i64> %tmp, <2 x i64>* %__a, align 16
+ %tmp1 = load <2 x i64>* %xS64x2, align 16
+ store <2 x i64> %tmp1, <2 x i64>* %__b, align 16
+ %tmp2 = load <2 x i64>* %__a, align 16
+ %tmp3 = bitcast <2 x i64> %tmp2 to <16 x i8>
+ %tmp4 = load <2 x i64>* %__b, align 16
+ %tmp5 = bitcast <2 x i64> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <2 x i64>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <2 x i64>
+ %vext = shufflevector <2 x i64> %tmp6, <2 x i64> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x i64> %vext, <2 x i64>* %xS64x2, align 16
+ ret void
+}
+
+define void @test_vextq_u64() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_u64:
+ ; CHECK: {{ext.16.*#8}}
+ %xU64x2 = alloca <2 x i64>, align 16
+ %__a = alloca <2 x i64>, align 16
+ %__b = alloca <2 x i64>, align 16
+ %tmp = load <2 x i64>* %xU64x2, align 16
+ store <2 x i64> %tmp, <2 x i64>* %__a, align 16
+ %tmp1 = load <2 x i64>* %xU64x2, align 16
+ store <2 x i64> %tmp1, <2 x i64>* %__b, align 16
+ %tmp2 = load <2 x i64>* %__a, align 16
+ %tmp3 = bitcast <2 x i64> %tmp2 to <16 x i8>
+ %tmp4 = load <2 x i64>* %__b, align 16
+ %tmp5 = bitcast <2 x i64> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <2 x i64>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <2 x i64>
+ %vext = shufflevector <2 x i64> %tmp6, <2 x i64> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x i64> %vext, <2 x i64>* %xU64x2, align 16
+ ret void
+}
+
+; Shuffles with an undef second operand can also use an EXT, so long as the
+; indices wrap around and stay sequential.
+; rdar://12051674
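+; For instance, in @vext1 below the mask <8..15, 0..7> is sequential modulo 16,
+; so the only defined lanes form a byte rotation of %_a by 8; a single
+; "ext.16b v0, v0, v0, #8" (EXT with both sources equal) produces exactly that,
+; and the lanes taken from the undef operand are free to hold anything.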
+define <16 x i8> @vext1(<16 x i8> %_a) nounwind {
+; CHECK-LABEL: vext1:
+; CHECK: ext.16b v0, v0, v0, #8
+ %vext = shufflevector <16 x i8> %_a, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i8> %vext
+}
+
+; <rdar://problem/12212062>
+define <2 x i64> @vext2(<2 x i64> %p0, <2 x i64> %p1) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vext2:
+; CHECK: ext.16b v1, v1, v1, #8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: add.2d v0, v0, v1
+ %t0 = shufflevector <2 x i64> %p1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+ %t1 = shufflevector <2 x i64> %p0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+ %t2 = add <2 x i64> %t1, %t0
+ ret <2 x i64> %t2
+}
diff --git a/test/CodeGen/ARM64/vfloatintrinsics.ll b/test/CodeGen/ARM64/vfloatintrinsics.ll
new file mode 100644
index 0000000..a8c882b
--- /dev/null
+++ b/test/CodeGen/ARM64/vfloatintrinsics.ll
@@ -0,0 +1,375 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+;;; Float vectors
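+; Only sqrt and the rounding intrinsics (floor, ceil, trunc, rint, nearbyint)
+; are matched against their NEON encodings (fsqrt/frintm/frintp/frintz/frintx/
+; frinti); the remaining intrinsics (powi, sin, cos, pow, exp, exp2, log,
+; log10, log2, fma, fabs) are only matched loosely by name.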
+
+%v2f32 = type <2 x float>
+; CHECK: test_v2f32.sqrt:
+define %v2f32 @test_v2f32.sqrt(%v2f32 %a) {
+ ; CHECK: fsqrt.2s
+ %1 = call %v2f32 @llvm.sqrt.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.powi:
+define %v2f32 @test_v2f32.powi(%v2f32 %a, i32 %b) {
+ ; CHECK: pow
+ %1 = call %v2f32 @llvm.powi.v2f32(%v2f32 %a, i32 %b)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.sin:
+define %v2f32 @test_v2f32.sin(%v2f32 %a) {
+ ; CHECK: sin
+ %1 = call %v2f32 @llvm.sin.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.cos:
+define %v2f32 @test_v2f32.cos(%v2f32 %a) {
+ ; CHECK: cos
+ %1 = call %v2f32 @llvm.cos.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.pow:
+define %v2f32 @test_v2f32.pow(%v2f32 %a, %v2f32 %b) {
+ ; CHECK: pow
+ %1 = call %v2f32 @llvm.pow.v2f32(%v2f32 %a, %v2f32 %b)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.exp:
+define %v2f32 @test_v2f32.exp(%v2f32 %a) {
+ ; CHECK: exp
+ %1 = call %v2f32 @llvm.exp.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.exp2:
+define %v2f32 @test_v2f32.exp2(%v2f32 %a) {
+ ; CHECK: exp
+ %1 = call %v2f32 @llvm.exp2.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.log:
+define %v2f32 @test_v2f32.log(%v2f32 %a) {
+ ; CHECK: log
+ %1 = call %v2f32 @llvm.log.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.log10:
+define %v2f32 @test_v2f32.log10(%v2f32 %a) {
+ ; CHECK: log
+ %1 = call %v2f32 @llvm.log10.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.log2:
+define %v2f32 @test_v2f32.log2(%v2f32 %a) {
+ ; CHECK: log
+ %1 = call %v2f32 @llvm.log2.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.fma:
+define %v2f32 @test_v2f32.fma(%v2f32 %a, %v2f32 %b, %v2f32 %c) {
+ ; CHECK: fma
+ %1 = call %v2f32 @llvm.fma.v2f32(%v2f32 %a, %v2f32 %b, %v2f32 %c)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.fabs:
+define %v2f32 @test_v2f32.fabs(%v2f32 %a) {
+ ; CHECK: fabs
+ %1 = call %v2f32 @llvm.fabs.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.floor:
+define %v2f32 @test_v2f32.floor(%v2f32 %a) {
+ ; CHECK: frintm.2s
+ %1 = call %v2f32 @llvm.floor.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.ceil:
+define %v2f32 @test_v2f32.ceil(%v2f32 %a) {
+ ; CHECK: frintp.2s
+ %1 = call %v2f32 @llvm.ceil.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.trunc:
+define %v2f32 @test_v2f32.trunc(%v2f32 %a) {
+ ; CHECK: frintz.2s
+ %1 = call %v2f32 @llvm.trunc.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.rint:
+define %v2f32 @test_v2f32.rint(%v2f32 %a) {
+ ; CHECK: frintx.2s
+ %1 = call %v2f32 @llvm.rint.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.nearbyint:
+define %v2f32 @test_v2f32.nearbyint(%v2f32 %a) {
+ ; CHECK: frinti.2s
+ %1 = call %v2f32 @llvm.nearbyint.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+
+declare %v2f32 @llvm.sqrt.v2f32(%v2f32) #0
+declare %v2f32 @llvm.powi.v2f32(%v2f32, i32) #0
+declare %v2f32 @llvm.sin.v2f32(%v2f32) #0
+declare %v2f32 @llvm.cos.v2f32(%v2f32) #0
+declare %v2f32 @llvm.pow.v2f32(%v2f32, %v2f32) #0
+declare %v2f32 @llvm.exp.v2f32(%v2f32) #0
+declare %v2f32 @llvm.exp2.v2f32(%v2f32) #0
+declare %v2f32 @llvm.log.v2f32(%v2f32) #0
+declare %v2f32 @llvm.log10.v2f32(%v2f32) #0
+declare %v2f32 @llvm.log2.v2f32(%v2f32) #0
+declare %v2f32 @llvm.fma.v2f32(%v2f32, %v2f32, %v2f32) #0
+declare %v2f32 @llvm.fabs.v2f32(%v2f32) #0
+declare %v2f32 @llvm.floor.v2f32(%v2f32) #0
+declare %v2f32 @llvm.ceil.v2f32(%v2f32) #0
+declare %v2f32 @llvm.trunc.v2f32(%v2f32) #0
+declare %v2f32 @llvm.rint.v2f32(%v2f32) #0
+declare %v2f32 @llvm.nearbyint.v2f32(%v2f32) #0
+
+;;;
+
+%v4f32 = type <4 x float>
+; CHECK: test_v4f32.sqrt:
+define %v4f32 @test_v4f32.sqrt(%v4f32 %a) {
+ ; CHECK: fsqrt.4s
+ %1 = call %v4f32 @llvm.sqrt.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.powi:
+define %v4f32 @test_v4f32.powi(%v4f32 %a, i32 %b) {
+ ; CHECK: pow
+ %1 = call %v4f32 @llvm.powi.v4f32(%v4f32 %a, i32 %b)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.sin:
+define %v4f32 @test_v4f32.sin(%v4f32 %a) {
+ ; CHECK: sin
+ %1 = call %v4f32 @llvm.sin.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.cos:
+define %v4f32 @test_v4f32.cos(%v4f32 %a) {
+ ; CHECK: cos
+ %1 = call %v4f32 @llvm.cos.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.pow:
+define %v4f32 @test_v4f32.pow(%v4f32 %a, %v4f32 %b) {
+ ; CHECK: pow
+ %1 = call %v4f32 @llvm.pow.v4f32(%v4f32 %a, %v4f32 %b)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.exp:
+define %v4f32 @test_v4f32.exp(%v4f32 %a) {
+ ; CHECK: exp
+ %1 = call %v4f32 @llvm.exp.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.exp2:
+define %v4f32 @test_v4f32.exp2(%v4f32 %a) {
+ ; CHECK: exp
+ %1 = call %v4f32 @llvm.exp2.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.log:
+define %v4f32 @test_v4f32.log(%v4f32 %a) {
+ ; CHECK: log
+ %1 = call %v4f32 @llvm.log.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.log10:
+define %v4f32 @test_v4f32.log10(%v4f32 %a) {
+ ; CHECK: log
+ %1 = call %v4f32 @llvm.log10.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.log2:
+define %v4f32 @test_v4f32.log2(%v4f32 %a) {
+ ; CHECK: log
+ %1 = call %v4f32 @llvm.log2.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.fma:
+define %v4f32 @test_v4f32.fma(%v4f32 %a, %v4f32 %b, %v4f32 %c) {
+ ; CHECK: fma
+ %1 = call %v4f32 @llvm.fma.v4f32(%v4f32 %a, %v4f32 %b, %v4f32 %c)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.fabs:
+define %v4f32 @test_v4f32.fabs(%v4f32 %a) {
+ ; CHECK: fabs
+ %1 = call %v4f32 @llvm.fabs.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.floor:
+define %v4f32 @test_v4f32.floor(%v4f32 %a) {
+ ; CHECK: frintm.4s
+ %1 = call %v4f32 @llvm.floor.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.ceil:
+define %v4f32 @test_v4f32.ceil(%v4f32 %a) {
+ ; CHECK: frintp.4s
+ %1 = call %v4f32 @llvm.ceil.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.trunc:
+define %v4f32 @test_v4f32.trunc(%v4f32 %a) {
+ ; CHECK: frintz.4s
+ %1 = call %v4f32 @llvm.trunc.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.rint:
+define %v4f32 @test_v4f32.rint(%v4f32 %a) {
+ ; CHECK: frintx.4s
+ %1 = call %v4f32 @llvm.rint.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.nearbyint:
+define %v4f32 @test_v4f32.nearbyint(%v4f32 %a) {
+ ; CHECK: frinti.4s
+ %1 = call %v4f32 @llvm.nearbyint.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+
+declare %v4f32 @llvm.sqrt.v4f32(%v4f32) #0
+declare %v4f32 @llvm.powi.v4f32(%v4f32, i32) #0
+declare %v4f32 @llvm.sin.v4f32(%v4f32) #0
+declare %v4f32 @llvm.cos.v4f32(%v4f32) #0
+declare %v4f32 @llvm.pow.v4f32(%v4f32, %v4f32) #0
+declare %v4f32 @llvm.exp.v4f32(%v4f32) #0
+declare %v4f32 @llvm.exp2.v4f32(%v4f32) #0
+declare %v4f32 @llvm.log.v4f32(%v4f32) #0
+declare %v4f32 @llvm.log10.v4f32(%v4f32) #0
+declare %v4f32 @llvm.log2.v4f32(%v4f32) #0
+declare %v4f32 @llvm.fma.v4f32(%v4f32, %v4f32, %v4f32) #0
+declare %v4f32 @llvm.fabs.v4f32(%v4f32) #0
+declare %v4f32 @llvm.floor.v4f32(%v4f32) #0
+declare %v4f32 @llvm.ceil.v4f32(%v4f32) #0
+declare %v4f32 @llvm.trunc.v4f32(%v4f32) #0
+declare %v4f32 @llvm.rint.v4f32(%v4f32) #0
+declare %v4f32 @llvm.nearbyint.v4f32(%v4f32) #0
+
+;;; Double vector
+
+%v2f64 = type <2 x double>
+; CHECK: test_v2f64.sqrt:
+define %v2f64 @test_v2f64.sqrt(%v2f64 %a) {
+ ; CHECK: fsqrt.2d
+ %1 = call %v2f64 @llvm.sqrt.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.powi:
+define %v2f64 @test_v2f64.powi(%v2f64 %a, i32 %b) {
+ ; CHECK: pow
+ %1 = call %v2f64 @llvm.powi.v2f64(%v2f64 %a, i32 %b)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.sin:
+define %v2f64 @test_v2f64.sin(%v2f64 %a) {
+ ; CHECK: sin
+ %1 = call %v2f64 @llvm.sin.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.cos:
+define %v2f64 @test_v2f64.cos(%v2f64 %a) {
+ ; CHECK: cos
+ %1 = call %v2f64 @llvm.cos.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.pow:
+define %v2f64 @test_v2f64.pow(%v2f64 %a, %v2f64 %b) {
+ ; CHECK: pow
+ %1 = call %v2f64 @llvm.pow.v2f64(%v2f64 %a, %v2f64 %b)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.exp:
+define %v2f64 @test_v2f64.exp(%v2f64 %a) {
+ ; CHECK: exp
+ %1 = call %v2f64 @llvm.exp.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.exp2:
+define %v2f64 @test_v2f64.exp2(%v2f64 %a) {
+ ; CHECK: exp
+ %1 = call %v2f64 @llvm.exp2.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.log:
+define %v2f64 @test_v2f64.log(%v2f64 %a) {
+ ; CHECK: log
+ %1 = call %v2f64 @llvm.log.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.log10:
+define %v2f64 @test_v2f64.log10(%v2f64 %a) {
+ ; CHECK: log
+ %1 = call %v2f64 @llvm.log10.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.log2:
+define %v2f64 @test_v2f64.log2(%v2f64 %a) {
+ ; CHECK: log
+ %1 = call %v2f64 @llvm.log2.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.fma:
+define %v2f64 @test_v2f64.fma(%v2f64 %a, %v2f64 %b, %v2f64 %c) {
+ ; CHECK: fma
+ %1 = call %v2f64 @llvm.fma.v2f64(%v2f64 %a, %v2f64 %b, %v2f64 %c)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.fabs:
+define %v2f64 @test_v2f64.fabs(%v2f64 %a) {
+ ; CHECK: fabs
+ %1 = call %v2f64 @llvm.fabs.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.floor:
+define %v2f64 @test_v2f64.floor(%v2f64 %a) {
+ ; CHECK: frintm.2d
+ %1 = call %v2f64 @llvm.floor.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.ceil:
+define %v2f64 @test_v2f64.ceil(%v2f64 %a) {
+ ; CHECK: frintp.2d
+ %1 = call %v2f64 @llvm.ceil.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.trunc:
+define %v2f64 @test_v2f64.trunc(%v2f64 %a) {
+ ; CHECK: frintz.2d
+ %1 = call %v2f64 @llvm.trunc.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.rint:
+define %v2f64 @test_v2f64.rint(%v2f64 %a) {
+ ; CHECK: frintx.2d
+ %1 = call %v2f64 @llvm.rint.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.nearbyint:
+define %v2f64 @test_v2f64.nearbyint(%v2f64 %a) {
+ ; CHECK: frinti.2d
+ %1 = call %v2f64 @llvm.nearbyint.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+
+declare %v2f64 @llvm.sqrt.v2f64(%v2f64) #0
+declare %v2f64 @llvm.powi.v2f64(%v2f64, i32) #0
+declare %v2f64 @llvm.sin.v2f64(%v2f64) #0
+declare %v2f64 @llvm.cos.v2f64(%v2f64) #0
+declare %v2f64 @llvm.pow.v2f64(%v2f64, %v2f64) #0
+declare %v2f64 @llvm.exp.v2f64(%v2f64) #0
+declare %v2f64 @llvm.exp2.v2f64(%v2f64) #0
+declare %v2f64 @llvm.log.v2f64(%v2f64) #0
+declare %v2f64 @llvm.log10.v2f64(%v2f64) #0
+declare %v2f64 @llvm.log2.v2f64(%v2f64) #0
+declare %v2f64 @llvm.fma.v2f64(%v2f64, %v2f64, %v2f64) #0
+declare %v2f64 @llvm.fabs.v2f64(%v2f64) #0
+declare %v2f64 @llvm.floor.v2f64(%v2f64) #0
+declare %v2f64 @llvm.ceil.v2f64(%v2f64) #0
+declare %v2f64 @llvm.trunc.v2f64(%v2f64) #0
+declare %v2f64 @llvm.rint.v2f64(%v2f64) #0
+declare %v2f64 @llvm.nearbyint.v2f64(%v2f64) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/ARM64/vhadd.ll b/test/CodeGen/ARM64/vhadd.ll
new file mode 100644
index 0000000..aed7681
--- /dev/null
+++ b/test/CodeGen/ARM64/vhadd.ll
@@ -0,0 +1,249 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
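+; shadd/uhadd compute the signed/unsigned halving add, (a + b) >> 1 evaluated
+; in double-width arithmetic so the intermediate sum cannot overflow; the
+; srhadd/urhadd tests further down use the rounding form, (a + b + 1) >> 1.
+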
+define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shadd8b:
+;CHECK: shadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shadd16b:
+;CHECK: shadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shadd4h:
+;CHECK: shadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shadd8h:
+;CHECK: shadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shadd2s:
+;CHECK: shadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shadd4s:
+;CHECK: shadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uhadd8b:
+;CHECK: uhadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uhadd16b:
+;CHECK: uhadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uhadd4h:
+;CHECK: uhadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uhadd8h:
+;CHECK: uhadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uhadd2s:
+;CHECK: uhadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uhadd4s:
+;CHECK: uhadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: srhadd8b:
+;CHECK: srhadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: srhadd16b:
+;CHECK: srhadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: srhadd4h:
+;CHECK: srhadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: srhadd8h:
+;CHECK: srhadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: srhadd2s:
+;CHECK: srhadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: srhadd4s:
+;CHECK: srhadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: urhadd8b:
+;CHECK: urhadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: urhadd16b:
+;CHECK: urhadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: urhadd4h:
+;CHECK: urhadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: urhadd8h:
+;CHECK: urhadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: urhadd2s:
+;CHECK: urhadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: urhadd4s:
+;CHECK: urhadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vhsub.ll b/test/CodeGen/ARM64/vhsub.ll
new file mode 100644
index 0000000..85df4d4
--- /dev/null
+++ b/test/CodeGen/ARM64/vhsub.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @shsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shsub8b:
+;CHECK: shsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.shsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @shsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shsub16b:
+;CHECK: shsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.shsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @shsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shsub4h:
+;CHECK: shsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.shsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @shsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shsub8h:
+;CHECK: shsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.shsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @shsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shsub2s:
+;CHECK: shsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.shsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @shsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shsub4s:
+;CHECK: shsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.shsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @uhsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uhsub8b:
+;CHECK: uhsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uhsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uhsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uhsub16b:
+;CHECK: uhsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uhsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uhsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uhsub4h:
+;CHECK: uhsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uhsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uhsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uhsub8h:
+;CHECK: uhsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uhsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uhsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uhsub2s:
+;CHECK: uhsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uhsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uhsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uhsub4s:
+;CHECK: uhsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uhsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.shsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.shsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.shsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.uhsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uhsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uhsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.shsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.shsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.shsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.uhsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uhsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uhsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/virtual_base.ll b/test/CodeGen/ARM64/virtual_base.ll
new file mode 100644
index 0000000..cb95954
--- /dev/null
+++ b/test/CodeGen/ARM64/virtual_base.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -O3 -march arm64 | FileCheck %s
+; <rdar://13463602>
+
+%struct.Counter_Struct = type { i64, i64 }
+%struct.Bicubic_Patch_Struct = type { %struct.Method_Struct*, i32, %struct.Object_Struct*, %struct.Texture_Struct*, %struct.Interior_Struct*, %struct.Object_Struct*, %struct.Object_Struct*, %struct.Bounding_Box_Struct, i64, i32, i32, i32, [4 x [4 x [3 x double]]], [3 x double], double, double, %struct.Bezier_Node_Struct* }
+%struct.Method_Struct = type { i32 (%struct.Object_Struct*, %struct.Ray_Struct*, %struct.istack_struct*)*, i32 (double*, %struct.Object_Struct*)*, void (double*, %struct.Object_Struct*, %struct.istk_entry*)*, i8* (%struct.Object_Struct*)*, void (%struct.Object_Struct*, double*, %struct.Transform_Struct*)*, void (%struct.Object_Struct*, double*, %struct.Transform_Struct*)*, void (%struct.Object_Struct*, double*, %struct.Transform_Struct*)*, void (%struct.Object_Struct*, %struct.Transform_Struct*)*, void (%struct.Object_Struct*)*, void (%struct.Object_Struct*)* }
+%struct.Object_Struct = type { %struct.Method_Struct*, i32, %struct.Object_Struct*, %struct.Texture_Struct*, %struct.Interior_Struct*, %struct.Object_Struct*, %struct.Object_Struct*, %struct.Bounding_Box_Struct, i64 }
+%struct.Texture_Struct = type { i16, i16, i16, i32, float, float, float, %struct.Warps_Struct*, %struct.Pattern_Struct*, %struct.Blend_Map_Struct*, %union.anon.9, %struct.Texture_Struct*, %struct.Pigment_Struct*, %struct.Tnormal_Struct*, %struct.Finish_Struct*, %struct.Texture_Struct*, i32 }
+%struct.Warps_Struct = type { i16, %struct.Warps_Struct* }
+%struct.Pattern_Struct = type { i16, i16, i16, i32, float, float, float, %struct.Warps_Struct*, %struct.Pattern_Struct*, %struct.Blend_Map_Struct*, %union.anon.6 }
+%struct.Blend_Map_Struct = type { i16, i16, i16, i64, %struct.Blend_Map_Entry* }
+%struct.Blend_Map_Entry = type { float, i8, %union.anon }
+%union.anon = type { [2 x double], [8 x i8] }
+%union.anon.6 = type { %struct.anon.7 }
+%struct.anon.7 = type { float, [3 x double] }
+%union.anon.9 = type { %struct.anon.10 }
+%struct.anon.10 = type { float, [3 x double] }
+%struct.Pigment_Struct = type { i16, i16, i16, i32, float, float, float, %struct.Warps_Struct*, %struct.Pattern_Struct*, %struct.Blend_Map_Struct*, %union.anon.0, [5 x float] }
+%union.anon.0 = type { %struct.anon }
+%struct.anon = type { float, [3 x double] }
+%struct.Tnormal_Struct = type { i16, i16, i16, i32, float, float, float, %struct.Warps_Struct*, %struct.Pattern_Struct*, %struct.Blend_Map_Struct*, %union.anon.3, float }
+%union.anon.3 = type { %struct.anon.4 }
+%struct.anon.4 = type { float, [3 x double] }
+%struct.Finish_Struct = type { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, [3 x float], [3 x float] }
+%struct.Interior_Struct = type { i64, i32, float, float, float, float, float, %struct.Media_Struct* }
+%struct.Media_Struct = type { i32, i32, i32, i32, i32, double, double, i32, i32, i32, i32, [5 x float], [5 x float], [5 x float], [5 x float], double, double, double, double*, %struct.Pigment_Struct*, %struct.Media_Struct* }
+%struct.Bounding_Box_Struct = type { [3 x float], [3 x float] }
+%struct.Ray_Struct = type { [3 x double], [3 x double], i32, [100 x %struct.Interior_Struct*] }
+%struct.istack_struct = type { %struct.istack_struct*, %struct.istk_entry*, i32 }
+%struct.istk_entry = type { double, [3 x double], [3 x double], %struct.Object_Struct*, i32, i32, double, double, i8* }
+%struct.Transform_Struct = type { [4 x [4 x double]], [4 x [4 x double]] }
+%struct.Bezier_Node_Struct = type { i32, [3 x double], double, i32, i8* }
+
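+; The CHECK lines verify that the 24-byte memcpy out of %Shape below is
+; expanded into an 8-byte ldr/str pair (source offset #288) followed by a
+; 16-byte ldr/stur pair (source offset #272).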
+define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) {
+; CHECK: Precompute_Patch_Values
+; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288]
+; CHECK-NEXT: str [[VAL]], [sp, #232]
+; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272]
+; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216]
+entry:
+ %Control_Points = alloca [16 x [3 x double]], align 8
+ %arraydecay5.3.1 = getelementptr inbounds [16 x [3 x double]]* %Control_Points, i64 0, i64 9, i64 0
+ %tmp14 = bitcast double* %arraydecay5.3.1 to i8*
+ %arraydecay11.3.1 = getelementptr inbounds %struct.Bicubic_Patch_Struct* %Shape, i64 0, i32 12, i64 1, i64 3, i64 0
+ %tmp15 = bitcast double* %arraydecay11.3.1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp14, i8* %tmp15, i64 24, i32 1, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
diff --git a/test/CodeGen/ARM64/vmax.ll b/test/CodeGen/ARM64/vmax.ll
new file mode 100644
index 0000000..b2426f3
--- /dev/null
+++ b/test/CodeGen/ARM64/vmax.ll
@@ -0,0 +1,679 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @smax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smax_8b:
+;CHECK: smax.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.smax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smax_16b:
+;CHECK: smax.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.smax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smax_4h:
+;CHECK: smax.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.smax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smax_8h:
+;CHECK: smax.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.smax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smax_2s:
+;CHECK: smax.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.smax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smax_4s:
+;CHECK: smax.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.smax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.smax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.smax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.smax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umax_8b:
+;CHECK: umax.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.umax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umax_16b:
+;CHECK: umax.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.umax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umax_4h:
+;CHECK: umax.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.umax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umax_8h:
+;CHECK: umax.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.umax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umax_2s:
+;CHECK: umax.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.umax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umax_4s:
+;CHECK: umax.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.umax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.umax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.umax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.umax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @smin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smin_8b:
+;CHECK: smin.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.smin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smin_16b:
+;CHECK: smin.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.smin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smin_4h:
+;CHECK: smin.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.smin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smin_8h:
+;CHECK: smin.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.smin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smin_2s:
+;CHECK: smin.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.smin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smin_4s:
+;CHECK: smin.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.smin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.smin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.smin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.smin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umin_8b:
+;CHECK: umin.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.umin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umin_16b:
+;CHECK: umin.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.umin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umin_4h:
+;CHECK: umin.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.umin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umin_8h:
+;CHECK: umin.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.umin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umin_2s:
+;CHECK: umin.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.umin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umin_4s:
+;CHECK: umin.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.umin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.umin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.umin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.umin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @smaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smaxp_8b:
+;CHECK: smaxp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.smaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smaxp_16b:
+;CHECK: smaxp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.smaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smaxp_4h:
+;CHECK: smaxp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.smaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smaxp_8h:
+;CHECK: smaxp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.smaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smaxp_2s:
+;CHECK: smaxp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.smaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smaxp_4s:
+;CHECK: smaxp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.smaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.smaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.smaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.smaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.smaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.smaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.smaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umaxp_8b:
+;CHECK: umaxp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.umaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umaxp_16b:
+;CHECK: umaxp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.umaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umaxp_4h:
+;CHECK: umaxp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.umaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umaxp_8h:
+;CHECK: umaxp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.umaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umaxp_2s:
+;CHECK: umaxp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.umaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umaxp_4s:
+;CHECK: umaxp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.umaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.umaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.umaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.umaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.umaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.umaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.umaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @sminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sminp_8b:
+;CHECK: sminp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sminp_16b:
+;CHECK: sminp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sminp_4h:
+;CHECK: sminp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sminp_8h:
+;CHECK: sminp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sminp_2s:
+;CHECK: sminp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sminp_4s:
+;CHECK: sminp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.sminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @uminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uminp_8b:
+;CHECK: uminp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uminp_16b:
+;CHECK: uminp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uminp_4h:
+;CHECK: uminp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uminp_8h:
+;CHECK: uminp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uminp_2s:
+;CHECK: uminp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uminp_4s:
+;CHECK: uminp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.uminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.arm64.neon.uminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x float> @fmax_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmax_2s:
+;CHECK: fmax.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fmax.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmax_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmax_4s:
+;CHECK: fmax.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.fmax.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmax_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmax_2d:
+;CHECK: fmax.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.fmax.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fmax.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmaxp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxp_2s:
+;CHECK: fmaxp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fmaxp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmaxp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxp_4s:
+;CHECK: fmaxp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.fmaxp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmaxp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmaxp_2d:
+;CHECK: fmaxp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.fmaxp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fmaxp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fmaxp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fmaxp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmin_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmin_2s:
+;CHECK: fmin.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fmin.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmin_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmin_4s:
+;CHECK: fmin.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.fmin.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmin_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmin_2d:
+;CHECK: fmin.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.fmin.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fmin.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fminp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fminp_2s:
+;CHECK: fminp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fminp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fminp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fminp_4s:
+;CHECK: fminp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.fminp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fminp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fminp_2d:
+;CHECK: fminp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.fminp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fminp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fminp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fminp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fminnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fminnmp_2s:
+;CHECK: fminnmp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fminnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fminnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fminnmp_4s:
+;CHECK: fminnmp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.fminnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fminnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fminnmp_2d:
+;CHECK: fminnmp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.fminnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fminnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fminnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fminnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmaxnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_2s:
+;CHECK: fmaxnmp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fmaxnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmaxnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_4s:
+;CHECK: fmaxnmp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.fmaxnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmaxnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_2d:
+;CHECK: fmaxnmp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.fmaxnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fmaxnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vminmaxnm.ll b/test/CodeGen/ARM64/vminmaxnm.ll
new file mode 100644
index 0000000..6286407
--- /dev/null
+++ b/test/CodeGen/ARM64/vminmaxnm.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.2s v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <2 x float> @llvm.arm64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
+ ret <2 x float> %vmaxnm2.i
+}
+
+define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.4s v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <4 x float> @llvm.arm64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
+ ret <4 x float> %vmaxnm2.i
+}
+
+define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.2d v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <2 x double> @llvm.arm64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
+ ret <2 x double> %vmaxnm2.i
+}
+
+define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
+; CHECK: fminnm.2s v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <2 x float> @llvm.arm64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
+ ret <2 x float> %vminnm2.i
+}
+
+define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
+; CHECK: fminnm.4s v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <4 x float> @llvm.arm64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
+ ret <4 x float> %vminnm2.i
+}
+
+define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
+; CHECK: fminnm.2d v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <2 x double> @llvm.arm64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
+ ret <2 x double> %vminnm2.i
+}
+
+declare <2 x double> @llvm.arm64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.arm64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.arm64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+
+define double @test_fmaxnmv(<2 x double> %in) {
+; CHECK-LABEL: test_fmaxnmv:
+; CHECK: fmaxnmp.2d d0, v0
+ %max = call double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+ ret double %max
+}
+
+define double @test_fminnmv(<2 x double> %in) {
+; CHECK-LABEL: test_fminnmv:
+; CHECK: fminnmp.2d d0, v0
+ %min = call double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+ ret double %min
+}
+
+declare double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double>)
+declare double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/ARM64/vmovn.ll b/test/CodeGen/ARM64/vmovn.ll
new file mode 100644
index 0000000..675633b
--- /dev/null
+++ b/test/CodeGen/ARM64/vmovn.ll
@@ -0,0 +1,242 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @xtn8b(<8 x i16> %A) nounwind {
+;CHECK-LABEL: xtn8b:
+;CHECK-NOT: ld1
+;CHECK: xtn.8b v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = trunc <8 x i16> %A to <8 x i8>
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @xtn4h(<4 x i32> %A) nounwind {
+;CHECK-LABEL: xtn4h:
+;CHECK-NOT: ld1
+;CHECK: xtn.4h v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = trunc <4 x i32> %A to <4 x i16>
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @xtn2s(<2 x i64> %A) nounwind {
+;CHECK-LABEL: xtn2s:
+;CHECK-NOT: ld1
+;CHECK: xtn.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = trunc <2 x i64> %A to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @xtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
+;CHECK-LABEL: xtn2_16b:
+;CHECK-NOT: ld1
+;CHECK: xtn2.16b v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = trunc <8 x i16> %A to <8 x i8>
+ %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @xtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
+;CHECK-LABEL: xtn2_8h:
+;CHECK-NOT: ld1
+;CHECK: xtn2.8h v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = trunc <4 x i32> %A to <4 x i16>
+ %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @xtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
+;CHECK-LABEL: xtn2_4s:
+;CHECK-NOT: ld1
+;CHECK: xtn2.4s v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = trunc <2 x i64> %A to <2 x i32>
+ %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+define <8 x i8> @sqxtn8b(<8 x i16> %A) nounwind {
+;CHECK-LABEL: sqxtn8b:
+;CHECK-NOT: ld1
+;CHECK: sqxtn.8b v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16> %A)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqxtn4h(<4 x i32> %A) nounwind {
+;CHECK-LABEL: sqxtn4h:
+;CHECK-NOT: ld1
+;CHECK: sqxtn.4h v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32> %A)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqxtn2s(<2 x i64> %A) nounwind {
+;CHECK-LABEL: sqxtn2s:
+;CHECK-NOT: ld1
+;CHECK: sqxtn.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
+;CHECK-LABEL: sqxtn2_16b:
+;CHECK-NOT: ld1
+;CHECK: sqxtn2.16b v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16> %A)
+ %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @sqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
+;CHECK-LABEL: sqxtn2_8h:
+;CHECK-NOT: ld1
+;CHECK: sqxtn2.8h v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32> %A)
+ %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @sqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
+;CHECK-LABEL: sqxtn2_4s:
+;CHECK-NOT: ld1
+;CHECK: sqxtn2.4s v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64> %A)
+ %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64>) nounwind readnone
+
+define <8 x i8> @uqxtn8b(<8 x i16> %A) nounwind {
+;CHECK-LABEL: uqxtn8b:
+;CHECK-NOT: ld1
+;CHECK: uqxtn.8b v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16> %A)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqxtn4h(<4 x i32> %A) nounwind {
+;CHECK-LABEL: uqxtn4h:
+;CHECK-NOT: ld1
+;CHECK: uqxtn.4h v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32> %A)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqxtn2s(<2 x i64> %A) nounwind {
+;CHECK-LABEL: uqxtn2s:
+;CHECK-NOT: ld1
+;CHECK: uqxtn.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @uqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
+;CHECK-LABEL: uqxtn2_16b:
+;CHECK-NOT: ld1
+;CHECK: uqxtn2.16b v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16> %A)
+ %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @uqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
+;CHECK-LABEL: uqxtn2_8h:
+;CHECK-NOT: ld1
+;CHECK: uqxtn2.8h v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32> %A)
+ %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @uqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
+;CHECK-LABEL: uqxtn2_4s:
+;CHECK-NOT: ld1
+;CHECK: uqxtn2.4s v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64> %A)
+ %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64>) nounwind readnone
+
+define <8 x i8> @sqxtun8b(<8 x i16> %A) nounwind {
+;CHECK-LABEL: sqxtun8b:
+;CHECK-NOT: ld1
+;CHECK: sqxtun.8b v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16> %A)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqxtun4h(<4 x i32> %A) nounwind {
+;CHECK-LABEL: sqxtun4h:
+;CHECK-NOT: ld1
+;CHECK: sqxtun.4h v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32> %A)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqxtun2s(<2 x i64> %A) nounwind {
+;CHECK-LABEL: sqxtun2s:
+;CHECK-NOT: ld1
+;CHECK: sqxtun.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqxtun2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
+;CHECK-LABEL: sqxtun2_16b:
+;CHECK-NOT: ld1
+;CHECK: sqxtun2.16b v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16> %A)
+ %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @sqxtun2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
+;CHECK-LABEL: sqxtun2_8h:
+;CHECK-NOT: ld1
+;CHECK: sqxtun2.8h v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32> %A)
+ %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @sqxtun2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
+;CHECK-LABEL: sqxtun2_4s:
+;CHECK-NOT: ld1
+;CHECK: sqxtun2.4s v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64> %A)
+ %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone
+
diff --git a/test/CodeGen/ARM64/vmul.ll b/test/CodeGen/ARM64/vmul.ll
new file mode 100644
index 0000000..3ef0a76
--- /dev/null
+++ b/test/CodeGen/ARM64/vmul.ll
@@ -0,0 +1,2003 @@
+; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+
+define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smull8h:
+;CHECK: smull.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smull4s:
+;CHECK: smull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smull2d:
+;CHECK: smull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umull8h:
+;CHECK: umull.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umull4s:
+;CHECK: umull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umull2d:
+;CHECK: umull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmull4s:
+;CHECK: sqdmull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmull2d:
+;CHECK: sqdmull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmull2_4s:
+;CHECK: sqdmull2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmull2_2d:
+;CHECK: sqdmull2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+
+declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: pmull8h:
+;CHECK: pmull.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+declare <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+
+define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_4h:
+;CHECK: sqdmulh.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_8h:
+;CHECK: sqdmulh.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_2s:
+;CHECK: sqdmulh.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_4s:
+;CHECK: sqdmulh.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
+;CHECK-LABEL: sqdmulh_1s:
+;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
+ %tmp1 = load i32* %A
+ %tmp2 = load i32* %B
+ %tmp3 = call i32 @llvm.arm64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
+ ret i32 %tmp3
+}
+
+declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.arm64.neon.sqdmulh.i32(i32, i32) nounwind readnone
+
+define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_4h:
+;CHECK: sqrdmulh.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_8h:
+;CHECK: sqrdmulh.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_2s:
+;CHECK: sqrdmulh.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_4s:
+;CHECK: sqrdmulh.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_1s:
+;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
+ %tmp1 = load i32* %A
+ %tmp2 = load i32* %B
+ %tmp3 = call i32 @llvm.arm64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
+ ret i32 %tmp3
+}
+
+declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.arm64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
+
+define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmulx_2s:
+;CHECK: fmulx.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmulx_4s:
+;CHECK: fmulx.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmulx_2d:
+;CHECK: fmulx.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: smlal4s:
+;CHECK: smlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: smlal2d:
+;CHECK: smlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: smlsl4s:
+;CHECK: smlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = sub <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: smlsl2d:
+;CHECK: smlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = sub <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
+
+define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlal4s:
+;CHECK: sqdmlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2d:
+;CHECK: sqdmlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2_4s:
+;CHECK: sqdmlal2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2_2d:
+;CHECK: sqdmlal2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl4s:
+;CHECK: sqdmlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2d:
+;CHECK: sqdmlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2_4s:
+;CHECK: sqdmlsl2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2_2d:
+;CHECK: sqdmlsl2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: umlal4s:
+;CHECK: umlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: umlal2d:
+;CHECK: umlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: umlsl4s:
+;CHECK: umlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = sub <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: umlsl2d:
+;CHECK: umlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = sub <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: fmla_2s:
+;CHECK: fmla.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
+ ret <2 x float> %tmp4
+}
+
+define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: fmla_4s:
+;CHECK: fmla.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
+ ret <4 x float> %tmp4
+}
+
+define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: fmla_2d:
+;CHECK: fmla.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = load <2 x double>* %C
+ %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
+ ret <2 x double> %tmp4
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: fmls_2s:
+;CHECK: fmls.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
+ %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3)
+ ret <2 x float> %tmp5
+}
+
+define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: fmls_4s:
+;CHECK: fmls.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
+ %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3)
+ ret <4 x float> %tmp5
+}
+
+define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: fmls_2d:
+;CHECK: fmls.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = load <2 x double>* %C
+ %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
+ %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3)
+ ret <2 x double> %tmp5
+}
+
+define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: fmls_commuted_neg_2s:
+;CHECK: fmls.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
+ %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3)
+ ret <2 x float> %tmp5
+}
+
+define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: fmls_commuted_neg_4s:
+;CHECK: fmls.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
+ %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
+ ret <4 x float> %tmp5
+}
+
+define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: fmls_commuted_neg_2d:
+;CHECK: fmls.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = load <2 x double>* %C
+ %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
+ %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
+ ret <2 x double> %tmp5
+}
+
+define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+;CHECK-LABEL: fmls_indexed_2s:
+;CHECK: fmls.2s
+entry:
+ %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
+ %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
+ %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
+ ret <2 x float> %fmls1
+}
+
+define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
+;CHECK-LABEL: fmls_indexed_4s:
+;CHECK: fmls.4s
+entry:
+ %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
+ ret <4 x float> %fmls1
+}
+
+define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
+;CHECK-LABEL: fmls_indexed_2d:
+;CHECK: fmls.2d
+entry:
+ %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
+ %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+ %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
+ ret <2 x double> %fmls1
+}
+
+define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fmla_indexed_scalar_2s:
+; CHECK-NEXT: fmla.2s
+; CHECK-NEXT: ret
+ %v1 = insertelement <2 x float> undef, float %c, i32 0
+ %v2 = insertelement <2 x float> %v1, float %c, i32 1
+ %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind
+ ret <2 x float> %fmla1
+}
+
+define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fmla_indexed_scalar_4s:
+; CHECK-NEXT: fmla.4s
+; CHECK-NEXT: ret
+ %v1 = insertelement <4 x float> undef, float %c, i32 0
+ %v2 = insertelement <4 x float> %v1, float %c, i32 1
+ %v3 = insertelement <4 x float> %v2, float %c, i32 2
+ %v4 = insertelement <4 x float> %v3, float %c, i32 3
+ %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %fmla1
+}
+
+define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fmla_indexed_scalar_2d:
+; CHECK-NEXT: fmla.2d
+; CHECK-NEXT: ret
+entry:
+ %v1 = insertelement <2 x double> undef, double %c, i32 0
+ %v2 = insertelement <2 x double> %v1, double %c, i32 1
+ %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %fmla1
+}
+
+define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: mul_4h:
+;CHECK-NOT: dup
+;CHECK: mul.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = mul <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: mul_8h:
+;CHECK-NOT: dup
+;CHECK: mul.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = mul <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: mul_2s:
+;CHECK-NOT: dup
+;CHECK: mul.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = mul <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: mul_4s:
+;CHECK-NOT: dup
+;CHECK: mul.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = mul <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK-LABEL: mul_2d:
+; CHECK: mul
+; CHECK: mul
+ %tmp1 = mul <2 x i64> %A, %B
+ ret <2 x i64> %tmp1
+}
+
+define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmul_lane_2s:
+;CHECK-NOT: dup
+;CHECK: fmul.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = fmul <2 x float> %tmp1, %tmp3
+ ret <2 x float> %tmp4
+}
+
+define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmul_lane_4s:
+;CHECK-NOT: dup
+;CHECK: fmul.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = fmul <4 x float> %tmp1, %tmp3
+ ret <4 x float> %tmp4
+}
+
+define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmul_lane_2d:
+;CHECK-NOT: dup
+;CHECK: fmul.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = fmul <2 x double> %tmp1, %tmp3
+ ret <2 x double> %tmp4
+}
+
+define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
+;CHECK-LABEL: fmul_lane_s:
+;CHECK-NOT: dup
+;CHECK: fmul.s s0, s0, v1[3]
+ %B = extractelement <4 x float> %vec, i32 3
+ %res = fmul float %A, %B
+ ret float %res
+}
+
+define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
+;CHECK-LABEL: fmul_lane_d:
+;CHECK-NOT: dup
+;CHECK: fmul.d d0, d0, v1[1]
+ %B = extractelement <2 x double> %vec, i32 1
+ %res = fmul double %A, %B
+ ret double %res
+}
+
+
+
+define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmulx_lane_2s:
+;CHECK-NOT: dup
+;CHECK: fmulx.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
+ ret <2 x float> %tmp4
+}
+
+define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmulx_lane_4s:
+;CHECK-NOT: dup
+;CHECK: fmulx.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
+ ret <4 x float> %tmp4
+}
+
+define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmulx_lane_2d:
+;CHECK-NOT: dup
+;CHECK: fmulx.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
+ ret <2 x double> %tmp4
+}
+
+define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_4h:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_8h:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_2s:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_1s:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
+ %tmp1 = extractelement <4 x i32> %B, i32 1
+ %tmp2 = call i32 @llvm.arm64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
+ ret i32 %tmp2
+}
+
+define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_4h:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_8h:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_2s:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_1s:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
+ %tmp1 = extractelement <4 x i32> %B, i32 1
+ %tmp2 = call i32 @llvm.arm64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
+ ret i32 %tmp2
+}
+
+define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmull_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmull_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmull2_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmull2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmull2_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmull2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp4
+}
+
+define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umull_lane_4s:
+;CHECK-NOT: dup
+;CHECK: umull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umull_lane_2d:
+;CHECK-NOT: dup
+;CHECK: umull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smull_lane_4s:
+;CHECK-NOT: dup
+;CHECK: smull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smull_lane_2d:
+;CHECK-NOT: dup
+;CHECK: smull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: smlal_lane_4s:
+;CHECK-NOT: dup
+;CHECK: smlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = add <4 x i32> %tmp3, %tmp5
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: smlal_lane_2d:
+;CHECK-NOT: dup
+;CHECK: smlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = add <2 x i64> %tmp3, %tmp5
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlal_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlal_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmlal2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp6 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmlal2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp6 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ ret <2 x i64> %tmp6
+}
+
+define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: sqdmlal_lane_1s:
+;CHECK: sqdmlal.4s
+ %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
+ %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %prod.vec = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+ %prod = extractelement <4 x i32> %prod.vec, i32 0
+ %res = call i32 @llvm.arm64.neon.sqadd.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32)
+
+define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: sqdmlsl_lane_1s:
+;CHECK: sqdmlsl.4s
+ %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
+ %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %prod.vec = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+ %prod = extractelement <4 x i32> %prod.vec, i32 0
+ %res = call i32 @llvm.arm64.neon.sqsub.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32)
+
+define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: sqdmlal_lane_1d:
+;CHECK: sqdmlal.s
+ %rhs = extractelement <2 x i32> %C, i32 1
+ %prod = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+ %res = call i64 @llvm.arm64.neon.sqadd.i64(i64 %A, i64 %prod)
+ ret i64 %res
+}
+declare i64 @llvm.arm64.neon.sqdmulls.scalar(i32, i32)
+declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64)
+
+define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: sqdmlsl_lane_1d:
+;CHECK: sqdmlsl.s
+ %rhs = extractelement <2 x i32> %C, i32 1
+ %prod = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+ %res = call i64 @llvm.arm64.neon.sqsub.i64(i64 %A, i64 %prod)
+ ret i64 %res
+}
+declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64)
+
+
+define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: umlal_lane_4s:
+;CHECK-NOT: dup
+;CHECK: umlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = add <4 x i32> %tmp3, %tmp5
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: umlal_lane_2d:
+;CHECK-NOT: dup
+;CHECK: umlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = add <2 x i64> %tmp3, %tmp5
+ ret <2 x i64> %tmp6
+}
+
+
+define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: smlsl_lane_4s:
+;CHECK-NOT: dup
+;CHECK: smlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = sub <4 x i32> %tmp3, %tmp5
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: smlsl_lane_2d:
+;CHECK-NOT: dup
+;CHECK: smlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = sub <2 x i64> %tmp3, %tmp5
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmlsl2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp6 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmlsl2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp6 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: umlsl_lane_4s:
+;CHECK-NOT: dup
+;CHECK: umlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = sub <4 x i32> %tmp3, %tmp5
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: umlsl_lane_2d:
+;CHECK-NOT: dup
+;CHECK: umlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = sub <2 x i64> %tmp3, %tmp5
+ ret <2 x i64> %tmp6
+}
+
+; Scalar FMULX
+define float @fmulxs(float %a, float %b) nounwind {
+; CHECK-LABEL: fmulxs:
+; CHECK: fmulx s0, s0, s1
+  %fmulx.i = tail call float @llvm.arm64.neon.fmulx.f32(float %a, float %b) nounwind
+; CHECK: ret
+ ret float %fmulx.i
+}
+
+define double @fmulxd(double %a, double %b) nounwind {
+; CHECK-LABEL: fmulxd:
+; CHECK: fmulx d0, d0, d1
+  %fmulx.i = tail call double @llvm.arm64.neon.fmulx.f64(double %a, double %b) nounwind
+; CHECK: ret
+ ret double %fmulx.i
+}
+
+define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
+; CHECK-LABEL: fmulxs_lane:
+; CHECK: fmulx.s s0, s0, v1[3]
+  %b = extractelement <4 x float> %vec, i32 3
+  %fmulx.i = tail call float @llvm.arm64.neon.fmulx.f32(float %a, float %b) nounwind
+; CHECK: ret
+ ret float %fmulx.i
+}
+
+define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
+; CHECK-LABEL: fmulxd_lane:
+; CHECK: fmulx d0, d0, v1[1]
+  %b = extractelement <2 x double> %vec, i32 1
+  %fmulx.i = tail call double @llvm.arm64.neon.fmulx.f64(double %a, double %b) nounwind
+; CHECK: ret
+ ret double %fmulx.i
+}
+
+declare double @llvm.arm64.neon.fmulx.f64(double, double) nounwind readnone
+declare float @llvm.arm64.neon.fmulx.f32(float, float) nounwind readnone
+
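+; The tests below extract the high halves via bitcast/shufflevector; the
+; widening multiply should select smull2/umull2 without an explicit ext.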
+define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: smull2_8h_simple:
+; CHECK-NEXT: smull2.8h v0, v0, v1
+; CHECK-NEXT: ret
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %3 = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: foo0:
+; CHECK: smull2.8h v0, v0, v1
+ %tmp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
+ %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
+ %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: smull2.4s v0, v0, v1
+ %tmp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK: smull2.2d v0, v0, v1
+ %tmp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK: umull2.8h v0, v0, v1
+ %tmp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
+ %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
+ %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK: umull2.4s v0, v0, v1
+ %tmp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK: umull2.2d v0, v0, v1
+ %tmp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo6:
+; CHECK-NEXT: smull2.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo7:
+; CHECK-NEXT: smull2.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo8:
+; CHECK-NEXT: umull2.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo9:
+; CHECK-NEXT: umull2.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
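+; Widening multiply-accumulate from the high halves should likewise fold the
+; extracts into smlal2/umlal2.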
+define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
+; CHECK-LABEL: bar0:
+; CHECK: smlal2.8h v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %tmp2 = bitcast <16 x i8> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ %add.i = add <8 x i16> %vmull.i.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
+; CHECK-LABEL: bar1:
+; CHECK: smlal2.4s v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %add.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
+; CHECK-LABEL: bar2:
+; CHECK: smlal2.2d v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %add.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
+; CHECK-LABEL: bar3:
+; CHECK: umlal2.8h v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %tmp2 = bitcast <16 x i8> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ %add.i = add <8 x i16> %vmull.i.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
+; CHECK-LABEL: bar4:
+; CHECK: umlal2.4s v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %add.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
+; CHECK-LABEL: bar5:
+; CHECK: umlal2.2d v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %add.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i
+}
+
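+; Lane-indexed variants: the splatted lane should become the [lane] operand of
+; smlal2/umlal2 rather than being materialized with a dup.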
+define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
+; CHECK-LABEL: mlal2_1:
+; CHECK: smlal2.4s v0, v1, v2[3]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %tmp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %add = add <4 x i32> %vmull2.i.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
+; CHECK-LABEL: mlal2_2:
+; CHECK: smlal2.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %add = add <2 x i64> %vmull2.i.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
+; CHECK-LABEL: mlal2_4:
+; CHECK: umlal2.4s v0, v1, v2[2]
+; CHECK-NEXT: ret
+
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %tmp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %add = add <4 x i32> %vmull2.i.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
+; CHECK-LABEL: mlal2_5:
+; CHECK: umlal2.2d v0, v1, v2[0]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
+ %tmp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %add = add <2 x i64> %vmull2.i.i, %a
+ ret <2 x i64> %add
+}
+
+; rdar://12328502
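+; Splatting a scalar with insertelement should fold into the by-element fmul
+; instead of emitting a dup.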
+define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmulq_n_f64:
+; CHECK-NOT: dup.2d
+; CHECK: fmul.2d v0, v0, v1[0]
+ %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
+ %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
+ %mul.i = fmul <2 x double> %vecinit1.i, %x
+ ret <2 x double> %mul.i
+}
+
+define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmulq_n_f32:
+; CHECK-NOT: dup.4s
+; CHECK: fmul.4s v0, v0, v1[0]
+ %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
+ %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
+ %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
+ %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
+ %mul.i = fmul <4 x float> %vecinit3.i, %x
+ ret <4 x float> %mul.i
+}
+
+define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmul_n_f32:
+; CHECK-NOT: dup.2s
+; CHECK: fmul.2s v0, v0, v1[0]
+ %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
+ %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
+ %mul.i = fmul <2 x float> %vecinit1.i, %x
+ ret <2 x float> %mul.i
+}
+
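+; laneq variants: the lane is taken from a 128-bit vector, so no ext should be
+; needed before the by-element mla/smull/umull.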
+define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
+entry:
+; CHECK: vmla_laneq_s16_test
+; CHECK-NOT: ext
+; CHECK: mla.4h v0, v1, v2[6]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %mul = mul <4 x i16> %shuffle, %b
+ %add = add <4 x i16> %mul, %a
+ ret <4 x i16> %add
+}
+
+define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
+entry:
+; CHECK: vmla_laneq_s32_test
+; CHECK-NOT: ext
+; CHECK: mla.2s v0, v1, v2[3]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %mul = mul <2 x i32> %shuffle, %b
+ %add = add <2 x i32> %mul, %a
+ ret <2 x i32> %add
+}
+
+define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
+entry:
+; CHECK: vmull_laneq_s16_test
+; CHECK-NOT: ext
+; CHECK: smull.4s v0, v0, v1[6]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
+entry:
+; CHECK: vmull_laneq_s32_test
+; CHECK-NOT: ext
+; CHECK: smull.2d v0, v0, v1[2]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
+ %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
+entry:
+; CHECK: vmull_laneq_u16_test
+; CHECK-NOT: ext
+; CHECK: umull.4s v0, v0, v1[6]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
+entry:
+; CHECK: vmull_laneq_u32_test
+; CHECK-NOT: ext
+; CHECK: umull.2d v0, v0, v1[2]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
+ %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+ ret <2 x i64> %vmull2.i
+}
+
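+; *mull_high_n: splatting the scalar operand and taking the high half of the
+; vector operand should still select smull2/umull2 without an ext.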
+define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK: vmull_high_n_s16_test
+; CHECK-NOT: ext
+; CHECK: smull2.4s
+; CHECK-NEXT: ret
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK: vmull_high_n_s32_test
+; CHECK-NOT: ext
+; CHECK: smull2.2d
+; CHECK-NEXT: ret
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK: vmull_high_n_u16_test
+; CHECK-NOT: ext
+; CHECK: umull2.4s
+; CHECK-NEXT: ret
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK: vmull_high_n_u32_test
+; CHECK-NOT: ext
+; CHECK: umull2.2d
+; CHECK-NEXT: ret
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
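+; A splat built from an extractelement/insertelement chain should fold into the
+; by-element mul with neither an ins nor a dup.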
+define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmul_built_dup_test:
+; CHECK-NOT: ins
+; CHECK-NOT: dup
+; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1]
+ %vget_lane = extractelement <4 x i32> %b, i32 1
+ %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
+ %prod = mul <4 x i32> %a, %vecinit3.i
+ ret <4 x i32> %prod
+}
+
+define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmul_built_dup_fromsmall_test:
+; CHECK-NOT: ins
+; CHECK-NOT: dup
+; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3]
+ %vget_lane = extractelement <4 x i16> %b, i32 3
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %prod = mul <4 x i16> %a, %vecinit3.i
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
+; CHECK-NOT: ins
+; CHECK-NOT: dup
+; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+ %vget_lane = extractelement <4 x i16> %b, i32 0
+ %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
+ %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
+ %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
+ %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
+ %prod = mul <8 x i16> %a, %vecinit7.i
+ ret <8 x i16> %prod
+}
+
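+; High-half extracts feeding sqdmull, sqdmlal, pmull and umlal (including dup
+; and duplane operands) should use the *2 forms directly, with no ext.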
+define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: mull_from_two_extracts:
+; CHECK-NOT: ext
+; CHECK: sqdmull2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: mlal_from_two_extracts:
+; CHECK-NOT: ext
+; CHECK: sqdmlal2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup:
+; CHECK-NOT: ext
+; CHECK: sqdmull2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup:
+; CHECK-NOT: ext
+; CHECK: pmull2.8h
+ %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
+ %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %res = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane:
+; CHECK-NOT: ext
+; CHECK: pmull2.8h
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %res = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+ ret <8 x i16> %res
+}
+
+define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane:
+; CHECK-NOT: ext
+; CHECK: sqdmull2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane:
+; CHECK-NOT: ext
+; CHECK: sqdmlal2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane:
+; CHECK-NOT: ext
+; CHECK: umlal2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = add <2 x i64> %accum, %res
+ ret <2 x i64> %sum
+}
+
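+; Scalar FMLA/FMLS: an extractelement feeding llvm.fma should select the
+; by-element form.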
+define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
+; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
+; CHECK: fmla.s s0, s1, v2[3]
+ %rhs = extractelement <4 x float> %rvec, i32 3
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
+; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
+; CHECK: fmla.s s0, s1, v2[1]
+ %rhs = extractelement <2 x float> %rvec, i32 1
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
+; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
+; CHECK: fmls.s s0, s1, v2[3]
+ %rhs.scal = extractelement <4 x float> %rvec, i32 3
+ %rhs = fsub float -0.0, %rhs.scal
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
+; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
+; CHECK: fmls.s s0, s1, v2[1]
+ %rhs.scal = extractelement <2 x float> %rvec, i32 1
+ %rhs = fsub float -0.0, %rhs.scal
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
+; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
+; CHECK: fmla.d d0, d1, v2[1]
+ %rhs = extractelement <2 x double> %rvec, i32 1
+ %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
+ ret double %res
+}
+
+define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
+; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
+; CHECK: fmls.d d0, d1, v2[1]
+ %rhs.scal = extractelement <2 x double> %rvec, i32 1
+ %rhs = fsub double -0.0, %rhs.scal
+ %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
+ ret double %res
+}
+
+declare double @llvm.fma.f64(double, double, double)
+
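+; The fneg may happen before the extract/splat and still fold into the
+; by-element fmls.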
+define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
+; CHECK: fmls.2s v0, v1, v2[3]
+ %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+ %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
+ ret <2 x float> %res
+}
+
+define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
+; CHECK: fmls.2s v0, v1, v2[1]
+ %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+ %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
+ ret <2 x float> %res
+}
+
+define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
+; CHECK: fmls.4s v0, v1, v2[3]
+ %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
+ ret <4 x float> %res
+}
+
+define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
+; CHECK: fmls.4s v0, v1, v2[1]
+ %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
+ ret <4 x float> %res
+}
+
+define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
+; CHECK: fmls.2d v0, v1, v2[1]
+ %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
+ %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
+ ret <2 x double> %res
+}
+
+define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
+; CHECK-LABEL: test_fmul_v1f64:
+; CHECK: fmul
+ %prod = fmul <1 x double> %L, %R
+ ret <1 x double> %prod
+}
+
+define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
+; CHECK-LABEL: test_fdiv_v1f64:
+; CHECK: fdiv
+ %prod = fdiv <1 x double> %L, %R
+ ret <1 x double> %prod
+}
+
+define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
+;CHECK-LABEL: sqdmlal_d:
+;CHECK: sqdmlal
+ %tmp4 = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+ %tmp5 = call i64 @llvm.arm64.neon.sqadd.i64(i64 %C, i64 %tmp4)
+ ret i64 %tmp5
+}
+
+define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
+;CHECK-LABEL: sqdmlsl_d:
+;CHECK: sqdmlsl
+ %tmp4 = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+ %tmp5 = call i64 @llvm.arm64.neon.sqsub.i64(i64 %C, i64 %tmp4)
+ ret i64 %tmp5
+}
+
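+; Polynomial multiply of 64-bit elements: the i64 operands should select
+; pmull/pmull2 with the 1q arrangement.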
+define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: test_pmull_64:
+; CHECK: pmull.1q
+ %val = call <16 x i8> @llvm.arm64.neon.pmull64(i64 %l, i64 %r)
+ ret <16 x i8> %val
+}
+
+define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
+; CHECK-LABEL: test_pmull_high_64:
+; CHECK: pmull2.1q
+ %l_hi = extractelement <2 x i64> %l, i32 1
+ %r_hi = extractelement <2 x i64> %r, i32 1
+ %val = call <16 x i8> @llvm.arm64.neon.pmull64(i64 %l_hi, i64 %r_hi)
+ ret <16 x i8> %val
+}
+
+declare <16 x i8> @llvm.arm64.neon.pmull64(i64, i64)
diff --git a/test/CodeGen/ARM64/volatile.ll b/test/CodeGen/ARM64/volatile.ll
new file mode 100644
index 0000000..e00ac5a
--- /dev/null
+++ b/test/CodeGen/ARM64/volatile.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+define i64 @normal_load(i64* nocapture %bar) nounwind readonly {
+; CHECK: normal_load
+; CHECK: ldp
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+ %add.ptr = getelementptr inbounds i64* %bar, i64 1
+ %tmp = load i64* %add.ptr, align 8
+ %add.ptr1 = getelementptr inbounds i64* %bar, i64 2
+ %tmp1 = load i64* %add.ptr1, align 8
+ %add = add nsw i64 %tmp1, %tmp
+ ret i64 %add
+}
+
+define i64 @volatile_load(i64* nocapture %bar) nounwind {
+; CHECK: volatile_load
+; CHECK: ldr
+; CHECK-NEXT: ldr
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+ %add.ptr = getelementptr inbounds i64* %bar, i64 1
+ %tmp = load volatile i64* %add.ptr, align 8
+ %add.ptr1 = getelementptr inbounds i64* %bar, i64 2
+ %tmp1 = load volatile i64* %add.ptr1, align 8
+ %add = add nsw i64 %tmp1, %tmp
+ ret i64 %add
+}
diff --git a/test/CodeGen/ARM64/vqadd.ll b/test/CodeGen/ARM64/vqadd.ll
new file mode 100644
index 0000000..0b7f7e5
--- /dev/null
+++ b/test/CodeGen/ARM64/vqadd.ll
@@ -0,0 +1,332 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
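+; Each saturating-add intrinsic should lower to the matching sqadd/uqadd
+; instruction for the given vector arrangement.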
+define <8 x i8> @sqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqadd8b:
+;CHECK: sqadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqadd4h:
+;CHECK: sqadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqadd2s:
+;CHECK: sqadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqadd8b:
+;CHECK: uqadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqadd4h:
+;CHECK: uqadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqadd2s:
+;CHECK: uqadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqadd16b:
+;CHECK: sqadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqadd8h:
+;CHECK: sqadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqadd4s:
+;CHECK: sqadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqadd2d:
+;CHECK: sqadd.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqadd16b:
+;CHECK: uqadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqadd8h:
+;CHECK: uqadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqadd4s:
+;CHECK: uqadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqadd2d:
+;CHECK: uqadd.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.uqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.sqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.uqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.uqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.sqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.uqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.uqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
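+; usqadd/suqadd accumulate into their first operand; check both the vector
+; arrangements and the scalar (d/s register) forms.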
+define <8 x i8> @usqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usqadd8b:
+;CHECK: usqadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.usqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @usqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usqadd4h:
+;CHECK: usqadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.usqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @usqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usqadd2s:
+;CHECK: usqadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.usqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @usqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usqadd16b:
+;CHECK: usqadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.usqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @usqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usqadd8h:
+;CHECK: usqadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.usqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @usqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usqadd4s:
+;CHECK: usqadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.usqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @usqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: usqadd2d:
+;CHECK: usqadd.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.usqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define i64 @usqadd_d(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: usqadd_d:
+; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
+ %sum = call i64 @llvm.arm64.neon.usqadd.i64(i64 %l, i64 %r)
+ ret i64 %sum
+}
+
+define i32 @usqadd_s(i32 %l, i32 %r) nounwind {
+; CHECK-LABEL: usqadd_s:
+; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
+ %sum = call i32 @llvm.arm64.neon.usqadd.i32(i32 %l, i32 %r)
+ ret i32 %sum
+}
+
+declare <8 x i8> @llvm.arm64.neon.usqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.usqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.usqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare i64 @llvm.arm64.neon.usqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.usqadd.i32(i32, i32) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.usqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.usqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.usqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.usqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @suqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: suqadd8b:
+;CHECK: suqadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.suqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @suqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: suqadd4h:
+;CHECK: suqadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.suqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @suqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: suqadd2s:
+;CHECK: suqadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.suqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @suqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: suqadd16b:
+;CHECK: suqadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.suqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @suqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: suqadd8h:
+;CHECK: suqadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.suqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @suqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: suqadd4s:
+;CHECK: suqadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.suqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @suqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: suqadd2d:
+;CHECK: suqadd.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.suqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i64> @suqadd_1d(<1 x i64> %l, <1 x i64> %r) nounwind {
+; CHECK-LABEL: suqadd_1d:
+; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
+ %sum = call <1 x i64> @llvm.arm64.neon.suqadd.v1i64(<1 x i64> %l, <1 x i64> %r)
+ ret <1 x i64> %sum
+}
+
+define i64 @suqadd_d(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: suqadd_d:
+; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
+ %sum = call i64 @llvm.arm64.neon.suqadd.i64(i64 %l, i64 %r)
+ ret i64 %sum
+}
+
+define i32 @suqadd_s(i32 %l, i32 %r) nounwind {
+; CHECK-LABEL: suqadd_s:
+; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
+ %sum = call i32 @llvm.arm64.neon.suqadd.i32(i32 %l, i32 %r)
+ ret i32 %sum
+}
+
+declare <8 x i8> @llvm.arm64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare i64 @llvm.arm64.neon.suqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.arm64.neon.suqadd.i32(i32, i32) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vqsub.ll b/test/CodeGen/ARM64/vqsub.ll
new file mode 100644
index 0000000..0afeb68
--- /dev/null
+++ b/test/CodeGen/ARM64/vqsub.ll
@@ -0,0 +1,147 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @sqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub8b:
+;CHECK: sqsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub4h:
+;CHECK: sqsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub2s:
+;CHECK: sqsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub8b:
+;CHECK: uqsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub4h:
+;CHECK: uqsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub2s:
+;CHECK: uqsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub16b:
+;CHECK: sqsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub8h:
+;CHECK: sqsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub4s:
+;CHECK: sqsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqsub2d:
+;CHECK: sqsub.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub16b:
+;CHECK: uqsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub8h:
+;CHECK: uqsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub4s:
+;CHECK: uqsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqsub2d:
+;CHECK: uqsub.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.uqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.sqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.uqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.sqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.uqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.uqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vselect.ll b/test/CodeGen/ARM64/vselect.ll
new file mode 100644
index 0000000..07274a0
--- /dev/null
+++ b/test/CodeGen/ARM64/vselect.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+;CHECK: @func63
+;CHECK: cmeq.4h v0, v0, v1
+;CHECK: sshll.4s v0, v0, #0
+;CHECK: bsl.16b v0, v2, v3
+;CHECK: str q0, [x0]
+;CHECK: ret
+
+%T0_63 = type <4 x i16>
+%T1_63 = type <4 x i32>
+%T2_63 = type <4 x i1>
+define void @func63(%T1_63* %out, %T0_63 %v0, %T0_63 %v1, %T1_63 %v2, %T1_63 %v3) {
+ %cond = icmp eq %T0_63 %v0, %v1
+ %r = select %T2_63 %cond, %T1_63 %v2, %T1_63 %v3
+ store %T1_63 %r, %T1_63* %out
+ ret void
+}
diff --git a/test/CodeGen/ARM64/vsetcc_fp.ll b/test/CodeGen/ARM64/vsetcc_fp.ll
new file mode 100644
index 0000000..c93aad5
--- /dev/null
+++ b/test/CodeGen/ARM64/vsetcc_fp.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+define <2 x i32> @fcmp_one(<2 x float> %x, <2 x float> %y) nounwind optsize readnone {
+; CHECK-LABEL: fcmp_one:
+; CHECK-NEXT: fcmgt.2s [[REG:v[0-9]+]], v0, v1
+; CHECK-NEXT: fcmgt.2s [[REG2:v[0-9]+]], v1, v0
+; CHECK-NEXT: orr.8b v0, [[REG2]], [[REG]]
+; CHECK-NEXT: ret
+ %tmp = fcmp one <2 x float> %x, %y
+ %or = sext <2 x i1> %tmp to <2 x i32>
+ ret <2 x i32> %or
+}
diff --git a/test/CodeGen/ARM64/vshift.ll b/test/CodeGen/ARM64/vshift.ll
new file mode 100644
index 0000000..ae5da38
--- /dev/null
+++ b/test/CodeGen/ARM64/vshift.ll
@@ -0,0 +1,1909 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -enable-misched=false | FileCheck %s
+
+define <8 x i8> @sqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqshl8b:
+;CHECK: sqshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqshl4h:
+;CHECK: sqshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqshl2s:
+;CHECK: sqshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqshl8b:
+;CHECK: uqshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqshl4h:
+;CHECK: uqshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqshl2s:
+;CHECK: uqshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqshl16b:
+;CHECK: sqshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqshl8h:
+;CHECK: sqshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqshl4s:
+;CHECK: sqshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqshl2d:
+;CHECK: sqshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqshl16b:
+;CHECK: uqshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqshl8h:
+;CHECK: uqshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqshl4s:
+;CHECK: uqshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqshl2d:
+;CHECK: uqshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
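+; Rounding shifts by a register amount: srshl/urshl behave like plain shifts
+; but round the result when a negative amount shifts bits out to the right.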
+define <8 x i8> @srshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: srshl8b:
+;CHECK: srshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @srshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: srshl4h:
+;CHECK: srshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @srshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: srshl2s:
+;CHECK: srshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @urshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: urshl8b:
+;CHECK: urshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @urshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: urshl4h:
+;CHECK: urshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @urshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: urshl2s:
+;CHECK: urshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @srshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: srshl16b:
+;CHECK: srshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @srshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: srshl8h:
+;CHECK: srshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @srshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: srshl4s:
+;CHECK: srshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @srshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: srshl2d:
+;CHECK: srshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @urshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: urshl16b:
+;CHECK: urshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @urshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: urshl8h:
+;CHECK: urshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @urshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: urshl4s:
+;CHECK: urshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @urshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: urshl2d:
+;CHECK: urshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.urshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
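+; sqrshl/uqrshl combine both behaviours above: rounding on right shifts and
+; saturation of the shifted result.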
+define <8 x i8> @sqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqrshl8b:
+;CHECK: sqrshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrshl4h:
+;CHECK: sqrshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrshl2s:
+;CHECK: sqrshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqrshl8b:
+;CHECK: uqrshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqrshl4h:
+;CHECK: uqrshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqrshl2s:
+;CHECK: uqrshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqrshl16b:
+;CHECK: sqrshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrshl8h:
+;CHECK: sqrshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrshl4s:
+;CHECK: sqrshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqrshl2d:
+;CHECK: sqrshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqrshl16b:
+;CHECK: uqrshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqrshl8h:
+;CHECK: uqrshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqrshl4s:
+;CHECK: uqrshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqrshl2d:
+;CHECK: uqrshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
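+; A rounding shift left by a splat of -1 is a rounding shift right by 1, so
+; these calls should be selected as the urshr/srshr immediate forms.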
+define <8 x i8> @urshr8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: urshr8b:
+;CHECK: urshr.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @urshr4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: urshr4h:
+;CHECK: urshr.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @urshr2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: urshr2s:
+;CHECK: urshr.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @urshr16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: urshr16b:
+;CHECK: urshr.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @urshr8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: urshr8h:
+;CHECK: urshr.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @urshr4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: urshr4s:
+;CHECK: urshr.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @urshr2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: urshr2d:
+;CHECK: urshr.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @srshr8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: srshr8b:
+;CHECK: srshr.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @srshr4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: srshr4h:
+;CHECK: srshr.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @srshr2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: srshr2s:
+;CHECK: srshr.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @srshr16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: srshr16b:
+;CHECK: srshr.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @srshr8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: srshr8h:
+;CHECK: srshr.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @srshr4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: srshr4s:
+;CHECK: srshr.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @srshr2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: srshr2d:
+;CHECK: srshr.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ ret <2 x i64> %tmp3
+}
+
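+; sqshlu: signed saturating shift left by an immediate, with the result
+; clamped to the unsigned range of the lane.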
+define <8 x i8> @sqshlu8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqshlu8b:
+;CHECK: sqshlu.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshlu4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshlu4h:
+;CHECK: sqshlu.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshlu2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshlu2s:
+;CHECK: sqshlu.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshlu16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqshlu16b:
+;CHECK: sqshlu.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqshlu8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshlu8h:
+;CHECK: sqshlu.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqshlu4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshlu4s:
+;CHECK: sqshlu.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqshlu2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshlu2d:
+;CHECK: sqshlu.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
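+; rshrn: rounding shift right and narrow to half-width lanes. The rshrn2 forms
+; keep the existing low half (via shufflevector) and write the narrowed result
+; into the high half of the destination.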
+define <8 x i8> @rshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: rshrn8b:
+;CHECK: rshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @rshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: rshrn4h:
+;CHECK: rshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @rshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: rshrn2s:
+;CHECK: rshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @rshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: rshrn16b:
+;CHECK: rshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @rshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: rshrn8h:
+;CHECK: rshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @rshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: rshrn4s:
+;CHECK: rshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
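+; Plain narrowing right shifts are expressed as lshr followed by trunc and
+; should be matched to the shrn/shrn2 forms.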
+define <8 x i8> @shrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: shrn8b:
+;CHECK: shrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @shrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: shrn4h:
+;CHECK: shrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @shrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: shrn2s:
+;CHECK: shrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @shrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: shrn16b:
+;CHECK: shrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @shrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: shrn8h:
+;CHECK: shrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @shrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: shrn4s:
+;CHECK: shrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare <8 x i8> @llvm.arm64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
+
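+; The blocks below cover the saturating narrowing right shifts (sqshrn,
+; sqshrun, sqrshrn, sqrshrun, uqrshrn, uqshrn): each has a scalar i64->i32
+; form, plain vector forms, and *2 forms that fill the high half of the result.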
+define i32 @sqshrn1s(i64 %A) nounwind {
+; CHECK-LABEL: sqshrn1s:
+; CHECK: sqshrn {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.arm64.neon.sqshrn.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @sqshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshrn8b:
+;CHECK: sqshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshrn4h:
+;CHECK: sqshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshrn2s:
+;CHECK: sqshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshrn16b:
+;CHECK: sqshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @sqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshrn8h:
+;CHECK: sqshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshrn4s:
+;CHECK: sqshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.arm64.neon.sqshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @sqshrun1s(i64 %A) nounwind {
+; CHECK-LABEL: sqshrun1s:
+; CHECK: sqshrun {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.arm64.neon.sqshrun.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @sqshrun8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshrun8b:
+;CHECK: sqshrun.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshrun4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshrun4h:
+;CHECK: sqshrun.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshrun2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshrun2s:
+;CHECK: sqshrun.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshrun16b:
+;CHECK: sqshrun2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @sqshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshrun8h:
+;CHECK: sqshrun2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sqshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshrun4s:
+;CHECK: sqshrun2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.arm64.neon.sqshrun.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @sqrshrn1s(i64 %A) nounwind {
+; CHECK-LABEL: sqrshrn1s:
+; CHECK: sqrshrn {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.arm64.neon.sqrshrn.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @sqrshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqrshrn8b:
+;CHECK: sqrshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqrshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqrshrn4h:
+;CHECK: sqrshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqrshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqrshrn2s:
+;CHECK: sqrshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqrshrn16b:
+;CHECK: sqrshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @sqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqrshrn8h:
+;CHECK: sqrshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqrshrn4s:
+;CHECK: sqrshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.arm64.neon.sqrshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @sqrshrun1s(i64 %A) nounwind {
+; CHECK-LABEL: sqrshrun1s:
+; CHECK: sqrshrun {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.arm64.neon.sqrshrun.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @sqrshrun8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqrshrun8b:
+;CHECK: sqrshrun.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqrshrun4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqrshrun4h:
+;CHECK: sqrshrun.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqrshrun2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqrshrun2s:
+;CHECK: sqrshrun.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqrshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqrshrun16b:
+;CHECK: sqrshrun2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @sqrshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqrshrun8h:
+;CHECK: sqrshrun2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sqrshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqrshrun4s:
+;CHECK: sqrshrun2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.arm64.neon.sqrshrun.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @uqrshrn1s(i64 %A) nounwind {
+; CHECK-LABEL: uqrshrn1s:
+; CHECK: uqrshrn {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.arm64.neon.uqrshrn.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @uqrshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqrshrn8b:
+;CHECK: uqrshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqrshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqrshrn4h:
+;CHECK: uqrshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqrshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqrshrn2s:
+;CHECK: uqrshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @uqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqrshrn16b:
+;CHECK: uqrshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @uqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqrshrn8h:
+;CHECK: uqrshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqrshrn4s:
+;CHECK: uqrshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.arm64.neon.uqrshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @uqshrn1s(i64 %A) nounwind {
+; CHECK-LABEL: uqshrn1s:
+; CHECK: uqshrn {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.arm64.neon.uqshrn.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @uqshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqshrn8b:
+;CHECK: uqshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqshrn4h:
+;CHECK: uqshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqshrn2s:
+;CHECK: uqshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @uqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqshrn16b:
+;CHECK: uqshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @uqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqshrn8h:
+;CHECK: uqshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqshrn4s:
+;CHECK: uqshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.arm64.neon.uqshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
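+; ushll/sshll: a zext/sext followed by a shl of the widened value should be
+; selected as a single widening shift; the *2 forms read the high half of the
+; 128-bit source.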
+define <8 x i16> @ushll8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: ushll8h:
+;CHECK: ushll.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @ushll4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: ushll4s:
+;CHECK: ushll.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @ushll2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: ushll2d:
+;CHECK: ushll.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: ushll2_8h:
+;CHECK: ushll2.8h v0, {{v[0-9]+}}, #1
+ %load1 = load <16 x i8>* %A
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: ushll2_4s:
+;CHECK: ushll2.4s v0, {{v[0-9]+}}, #1
+ %load1 = load <8 x i16>* %A
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: ushll2_2d:
+;CHECK: ushll2.2d v0, {{v[0-9]+}}, #1
+ %load1 = load <4 x i32>* %A
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @sshll8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sshll8h:
+;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sshll4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sshll4s:
+;CHECK: sshll.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sshll2d:
+;CHECK: sshll.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sshll2_8h:
+;CHECK: sshll2.8h v0, {{v[0-9]+}}, #1
+ %load1 = load <16 x i8>* %A
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sshll2_4s:
+;CHECK: sshll2.4s v0, {{v[0-9]+}}, #1
+ %load1 = load <8 x i16>* %A
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sshll2_2d(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sshll2_2d:
+;CHECK: sshll2.2d v0, {{v[0-9]+}}, #1
+ %load1 = load <4 x i32>* %A
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+ ret <2 x i64> %tmp3
+}
+
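+; Saturating shift left by a constant splat amount should use the immediate
+; encodings of sqshl/uqshl rather than the register form.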
+define <8 x i8> @sqshli8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqshli8b:
+;CHECK: sqshl.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshli4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshli4h:
+;CHECK: sqshl.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshli2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshli2s:
+;CHECK: sqshl.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshli16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqshli16b:
+;CHECK: sqshl.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqshli8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshli8h:
+;CHECK: sqshl.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqshli4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshli4s:
+;CHECK: sqshl.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqshli2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshli2d:
+;CHECK: sqshl.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @uqshli8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uqshli8b:
+;CHECK: uqshl.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqshli4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: uqshli4h:
+;CHECK: uqshl.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqshli2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: uqshli2s:
+;CHECK: uqshl.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @uqshli16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: uqshli16b:
+;CHECK: uqshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqshli8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqshli8h:
+;CHECK: uqshl.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqshli4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqshli4s:
+;CHECK: uqshl.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqshli2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqshli2d:
+;CHECK: uqshl.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ ret <2 x i64> %tmp3
+}
+
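+; ursra/srsra: a rounding shift right (rshl by a splat of -1) whose result is
+; added to another vector should fold into the rounding shift-right-accumulate.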
+define <8 x i8> @ursra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: ursra8b:
+;CHECK: ursra.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp4 = load <8 x i8>* %B
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @ursra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: ursra4h:
+;CHECK: ursra.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @ursra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: ursra2s:
+;CHECK: ursra.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @ursra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: ursra16b:
+;CHECK: ursra.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp4 = load <16 x i8>* %B
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @ursra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: ursra8h:
+;CHECK: ursra.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @ursra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: ursra4s:
+;CHECK: ursra.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ursra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: ursra2d:
+;CHECK: ursra.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @srsra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: srsra8b:
+;CHECK: srsra.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp4 = load <8 x i8>* %B
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @srsra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: srsra4h:
+;CHECK: srsra.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @srsra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: srsra2s:
+;CHECK: srsra.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @srsra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: srsra16b:
+;CHECK: srsra.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp4 = load <16 x i8>* %B
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @srsra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: srsra8h:
+;CHECK: srsra.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @srsra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: srsra4s:
+;CHECK: srsra.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @srsra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: srsra2d:
+;CHECK: srsra.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
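+; usra/ssra: an lshr/ashr by an immediate followed by an add folds into the
+; shift-right-accumulate forms.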
+define <8 x i8> @usra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usra8b:
+;CHECK: usra.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = lshr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp4 = load <8 x i8>* %B
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @usra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usra4h:
+;CHECK: usra.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = lshr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @usra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usra2s:
+;CHECK: usra.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = lshr <2 x i32> %tmp1, <i32 1, i32 1>
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @usra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usra16b:
+;CHECK: usra.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = lshr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp4 = load <16 x i8>* %B
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @usra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usra8h:
+;CHECK: usra.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @usra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usra4s:
+;CHECK: usra.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @usra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: usra2d:
+;CHECK: usra.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @ssra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: ssra8b:
+;CHECK: ssra.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = ashr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp4 = load <8 x i8>* %B
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @ssra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: ssra4h:
+;CHECK: ssra.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = ashr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @ssra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: ssra2s:
+;CHECK: ssra.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = ashr <2 x i32> %tmp1, <i32 1, i32 1>
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @ssra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: ssra16b:
+;CHECK: ssra.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = ashr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp4 = load <16 x i8>* %B
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @ssra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: ssra8h:
+;CHECK: ssra.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = ashr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @ssra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: ssra4s:
+;CHECK: ssra.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = ashr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ssra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: ssra2d:
+;CHECK: ssra.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = ashr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @shr_orr8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shr_orr8b:
+;CHECK: shr.8b v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <8 x i8>* %A
+ %tmp4 = load <8 x i8>* %B
+ %tmp3 = lshr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp5 = or <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @shr_orr4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shr_orr4h:
+;CHECK: shr.4h v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <4 x i16>* %A
+ %tmp4 = load <4 x i16>* %B
+ %tmp3 = lshr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
+ %tmp5 = or <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @shr_orr2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shr_orr2s:
+;CHECK: shr.2s v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <2 x i32>* %A
+ %tmp4 = load <2 x i32>* %B
+ %tmp3 = lshr <2 x i32> %tmp1, <i32 1, i32 1>
+ %tmp5 = or <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @shr_orr16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shr_orr16b:
+;CHECK: shr.16b v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <16 x i8>* %A
+ %tmp4 = load <16 x i8>* %B
+ %tmp3 = lshr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp5 = or <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @shr_orr8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shr_orr8h:
+;CHECK: shr.8h v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp4 = load <8 x i16>* %B
+ %tmp3 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp5 = or <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @shr_orr4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shr_orr4s:
+;CHECK: shr.4s v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp4 = load <4 x i32>* %B
+ %tmp3 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = or <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @shr_orr2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: shr_orr2d:
+;CHECK: shr.2d v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp4 = load <2 x i64>* %B
+ %tmp3 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp5 = or <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @shl_orr8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shl_orr8b:
+;CHECK: shl.8b v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <8 x i8>* %A
+ %tmp4 = load <8 x i8>* %B
+ %tmp3 = shl <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp5 = or <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @shl_orr4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shl_orr4h:
+;CHECK: shl.4h v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <4 x i16>* %A
+ %tmp4 = load <4 x i16>* %B
+ %tmp3 = shl <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
+ %tmp5 = or <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @shl_orr2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shl_orr2s:
+;CHECK: shl.2s v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <2 x i32>* %A
+ %tmp4 = load <2 x i32>* %B
+ %tmp3 = shl <2 x i32> %tmp1, <i32 1, i32 1>
+ %tmp5 = or <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @shl_orr16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shl_orr16b:
+;CHECK: shl.16b v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <16 x i8>* %A
+ %tmp4 = load <16 x i8>* %B
+ %tmp3 = shl <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp5 = or <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @shl_orr8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shl_orr8h:
+;CHECK: shl.8h v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp4 = load <8 x i16>* %B
+ %tmp3 = shl <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp5 = or <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @shl_orr4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shl_orr4s:
+;CHECK: shl.4s v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp4 = load <4 x i32>* %B
+ %tmp3 = shl <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = or <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @shl_orr2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: shl_orr2d:
+;CHECK: shl.2d v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp4 = load <2 x i64>* %B
+ %tmp3 = shl <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp5 = or <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @shll(<8 x i8> %in) {
+; CHECK-LABEL: shll:
+; CHECK: shll.8h v0, {{v[0-9]+}}, #8
+ %ext = zext <8 x i8> %in to <8 x i16>
+ %res = shl <8 x i16> %ext, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @shll_high(<8 x i16> %in) {
+; CHECK-LABEL: shll_high:
+; CHECK: shll2.4s v0, {{v[0-9]+}}, #16
+ %extract = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext = zext <4 x i16> %extract to <4 x i32>
+ %res = shl <4 x i32> %ext, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %res
+}
+
+define <8 x i8> @sli8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sli8b:
+;CHECK: sli.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sli4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sli4h:
+;CHECK: sli.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sli2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sli2s:
+;CHECK: sli.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @sli1d(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: sli1d:
+;CHECK: sli d0, {{d[0-9]+}}, #1
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @sli16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sli16b:
+;CHECK: sli.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sli8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sli8h:
+;CHECK: sli.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sli4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sli4s:
+;CHECK: sli.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sli2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sli2d:
+;CHECK: sli.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone
+declare <1 x i64> @llvm.arm64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone
+
+declare <16 x i8> @llvm.arm64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.arm64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone
+declare <2 x i64> @llvm.arm64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/vshr.ll b/test/CodeGen/ARM64/vshr.ll
new file mode 100644
index 0000000..6adb81c
--- /dev/null
+++ b/test/CodeGen/ARM64/vshr.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+define <8 x i16> @testShiftRightArith_v8i16(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK-LABEL: testShiftRightArith_v8i16:
+; CHECK: neg.8h [[REG1:v[0-9]+]], [[REG1]]
+; CHECK-NEXT: sshl.8h [[REG2:v[0-9]+]], [[REG2]], [[REG1]]
+
+entry:
+ %a.addr = alloca <8 x i16>, align 16
+ %b.addr = alloca <8 x i16>, align 16
+ store <8 x i16> %a, <8 x i16>* %a.addr, align 16
+ store <8 x i16> %b, <8 x i16>* %b.addr, align 16
+ %0 = load <8 x i16>* %a.addr, align 16
+ %1 = load <8 x i16>* %b.addr, align 16
+ %shr = ashr <8 x i16> %0, %1
+ ret <8 x i16> %shr
+}
+
+define <4 x i32> @testShiftRightArith_v4i32(<4 x i32> %a, <4 x i32> %b) #0 {
+; CHECK-LABEL: testShiftRightArith_v4i32:
+; CHECK: neg.4s [[REG3:v[0-9]+]], [[REG3]]
+; CHECK-NEXT: sshl.4s [[REG4:v[0-9]+]], [[REG4]], [[REG3]]
+entry:
+ %a.addr = alloca <4 x i32>, align 32
+ %b.addr = alloca <4 x i32>, align 32
+ store <4 x i32> %a, <4 x i32>* %a.addr, align 32
+ store <4 x i32> %b, <4 x i32>* %b.addr, align 32
+ %0 = load <4 x i32>* %a.addr, align 32
+ %1 = load <4 x i32>* %b.addr, align 32
+ %shr = ashr <4 x i32> %0, %1
+ ret <4 x i32> %shr
+}
+
+define <8 x i16> @testShiftRightLogical(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK-LABEL: testShiftRightLogical:
+; CHECK: neg.8h [[REG5:v[0-9]+]], [[REG5]]
+; CHECK-NEXT: ushl.8h [[REG6:v[0-9]+]], [[REG6]], [[REG5]]
+entry:
+ %a.addr = alloca <8 x i16>, align 16
+ %b.addr = alloca <8 x i16>, align 16
+ store <8 x i16> %a, <8 x i16>* %a.addr, align 16
+ store <8 x i16> %b, <8 x i16>* %b.addr, align 16
+ %0 = load <8 x i16>* %a.addr, align 16
+ %1 = load <8 x i16>* %b.addr, align 16
+ %shr = lshr <8 x i16> %0, %1
+ ret <8 x i16> %shr
+}
+
+define <1 x i64> @sshr_v1i64(<1 x i64> %A) nounwind {
+; CHECK-LABEL: sshr_v1i64:
+; CHECK: sshr d0, d0, #63
+ %tmp3 = ashr <1 x i64> %A, < i64 63 >
+ ret <1 x i64> %tmp3
+}
+
+define <1 x i64> @ushr_v1i64(<1 x i64> %A) nounwind {
+; CHECK-LABEL: ushr_v1i64:
+; CHECK: ushr d0, d0, #63
+ %tmp3 = lshr <1 x i64> %A, < i64 63 >
+ ret <1 x i64> %tmp3
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/ARM64/vshuffle.ll b/test/CodeGen/ARM64/vshuffle.ll
new file mode 100644
index 0000000..f90200c
--- /dev/null
+++ b/test/CodeGen/ARM64/vshuffle.ll
@@ -0,0 +1,115 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+
+; The mask:
+; CHECK: lCPI0_0:
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 255 ; 0xff
+; The second vector is legalized to undef and the elements of the first vector
+; are used instead.
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 4 ; 0x4
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 0 ; 0x0
+; CHECK: test1
+; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0
+; CHECK: movi.8h v[[REG1:[0-9]+]], #1, lsl #8
+; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+define <8 x i1> @test1() {
+entry:
+ %Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
+ i1 7>,
+ <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
+ i1 7>,
+ <8 x i32> <i32 2, i32 undef, i32 6, i32 undef, i32 10,
+ i32 12, i32 14, i32 0>
+ ret <8 x i1> %Shuff
+}
+
+; CHECK: lCPI1_0:
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: test2
+; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF]
+; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_1@PAGE
+; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG2]], lCPI1_1@PAGEOFF]
+; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+define <8 x i1>@test2() {
+bb:
+ %Shuff = shufflevector <8 x i1> zeroinitializer,
+ <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>,
+ <8 x i32> <i32 2, i32 undef, i32 6, i32 undef, i32 10, i32 12, i32 14,
+ i32 0>
+ ret <8 x i1> %Shuff
+}
+
+; CHECK: lCPI2_0:
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: test3
+; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE
+; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF]
+; CHECK: movi.2d v[[REG1:[0-9]+]], #0000000000000000
+; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+define <16 x i1> @test3(i1* %ptr, i32 %v) {
+bb:
+ %Shuff = shufflevector <16 x i1> zeroinitializer, <16 x i1> undef,
+ <16 x i32> <i32 2, i32 undef, i32 6, i32 undef, i32 10, i32 12, i32 14,
+ i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 10, i32 12,
+ i32 14, i32 0>
+ ret <16 x i1> %Shuff
+}
+; CHECK: lCPI3_1:
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 18 ; 0x12
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 31 ; 0x1f
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 30 ; 0x1e
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: _test4:
+; CHECK: ldr q[[REG1:[0-9]+]]
+; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000
+; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_1@PAGE
+; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF]
+; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]]
+define <16 x i1> @test4(i1* %ptr, i32 %v) {
+bb:
+ %Shuff = shufflevector <16 x i1> zeroinitializer,
+ <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1,
+ i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>,
+ <16 x i32> <i32 2, i32 1, i32 6, i32 18, i32 10, i32 12, i32 14, i32 0,
+ i32 2, i32 31, i32 6, i32 30, i32 10, i32 12, i32 14, i32 0>
+ ret <16 x i1> %Shuff
+}
diff --git a/test/CodeGen/ARM64/vsqrt.ll b/test/CodeGen/ARM64/vsqrt.ll
new file mode 100644
index 0000000..094d704
--- /dev/null
+++ b/test/CodeGen/ARM64/vsqrt.ll
@@ -0,0 +1,232 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @frecps_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: frecps_2s:
+;CHECK: frecps.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.frecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frecps_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: frecps_4s:
+;CHECK: frecps.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.frecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frecps_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: frecps_2d:
+;CHECK: frecps.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.frecps.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.frecps.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x float> @frsqrts_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: frsqrts_2s:
+;CHECK: frsqrts.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frsqrts_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: frsqrts_4s:
+;CHECK: frsqrts.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frsqrts_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: frsqrts_2d:
+;CHECK: frsqrts.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.arm64.neon.frsqrts.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.frsqrts.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @frecpe_2s(<2 x float>* %A) nounwind {
+;CHECK-LABEL: frecpe_2s:
+;CHECK: frecpe.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp3 = call <2 x float> @llvm.arm64.neon.frecpe.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frecpe_4s(<4 x float>* %A) nounwind {
+;CHECK-LABEL: frecpe_4s:
+;CHECK: frecpe.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp3 = call <4 x float> @llvm.arm64.neon.frecpe.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frecpe_2d(<2 x double>* %A) nounwind {
+;CHECK-LABEL: frecpe_2d:
+;CHECK: frecpe.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp3 = call <2 x double> @llvm.arm64.neon.frecpe.v2f64(<2 x double> %tmp1)
+ ret <2 x double> %tmp3
+}
+
+define float @frecpe_s(float* %A) nounwind {
+;CHECK-LABEL: frecpe_s:
+;CHECK: frecpe s0, {{s[0-9]+}}
+ %tmp1 = load float* %A
+ %tmp3 = call float @llvm.arm64.neon.frecpe.f32(float %tmp1)
+ ret float %tmp3
+}
+
+define double @frecpe_d(double* %A) nounwind {
+;CHECK-LABEL: frecpe_d:
+;CHECK: frecpe d0, {{d[0-9]+}}
+ %tmp1 = load double* %A
+ %tmp3 = call double @llvm.arm64.neon.frecpe.f64(double %tmp1)
+ ret double %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.frecpe.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.frecpe.v2f64(<2 x double>) nounwind readnone
+declare float @llvm.arm64.neon.frecpe.f32(float) nounwind readnone
+declare double @llvm.arm64.neon.frecpe.f64(double) nounwind readnone
+
+define float @frecpx_s(float* %A) nounwind {
+;CHECK-LABEL: frecpx_s:
+;CHECK: frecpx s0, {{s[0-9]+}}
+ %tmp1 = load float* %A
+ %tmp3 = call float @llvm.arm64.neon.frecpx.f32(float %tmp1)
+ ret float %tmp3
+}
+
+define double @frecpx_d(double* %A) nounwind {
+;CHECK-LABEL: frecpx_d:
+;CHECK: frecpx d0, {{d[0-9]+}}
+ %tmp1 = load double* %A
+ %tmp3 = call double @llvm.arm64.neon.frecpx.f64(double %tmp1)
+ ret double %tmp3
+}
+
+declare float @llvm.arm64.neon.frecpx.f32(float) nounwind readnone
+declare double @llvm.arm64.neon.frecpx.f64(double) nounwind readnone
+
+define <2 x float> @frsqrte_2s(<2 x float>* %A) nounwind {
+;CHECK-LABEL: frsqrte_2s:
+;CHECK: frsqrte.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp3 = call <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frsqrte_4s(<4 x float>* %A) nounwind {
+;CHECK-LABEL: frsqrte_4s:
+;CHECK: frsqrte.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp3 = call <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frsqrte_2d(<2 x double>* %A) nounwind {
+;CHECK-LABEL: frsqrte_2d:
+;CHECK: frsqrte.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp3 = call <2 x double> @llvm.arm64.neon.frsqrte.v2f64(<2 x double> %tmp1)
+ ret <2 x double> %tmp3
+}
+
+define float @frsqrte_s(float* %A) nounwind {
+;CHECK-LABEL: frsqrte_s:
+;CHECK: frsqrte s0, {{s[0-9]+}}
+ %tmp1 = load float* %A
+ %tmp3 = call float @llvm.arm64.neon.frsqrte.f32(float %tmp1)
+ ret float %tmp3
+}
+
+define double @frsqrte_d(double* %A) nounwind {
+;CHECK-LABEL: frsqrte_d:
+;CHECK: frsqrte d0, {{d[0-9]+}}
+ %tmp1 = load double* %A
+ %tmp3 = call double @llvm.arm64.neon.frsqrte.f64(double %tmp1)
+ ret double %tmp3
+}
+
+declare <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.arm64.neon.frsqrte.v2f64(<2 x double>) nounwind readnone
+declare float @llvm.arm64.neon.frsqrte.f32(float) nounwind readnone
+declare double @llvm.arm64.neon.frsqrte.f64(double) nounwind readnone
+
+define <2 x i32> @urecpe_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: urecpe_2s:
+;CHECK: urecpe.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.urecpe.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urecpe_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: urecpe_4s:
+;CHECK: urecpe.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.urecpe.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.urecpe.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.urecpe.v4i32(<4 x i32>) nounwind readnone
+
+define <2 x i32> @ursqrte_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: ursqrte_2s:
+;CHECK: ursqrte.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @ursqrte_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: ursqrte_4s:
+;CHECK: ursqrte.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32>) nounwind readnone
+
+define float @f1(float %a, float %b) nounwind readnone optsize ssp {
+; CHECK-LABEL: f1:
+; CHECK: frsqrts s0, s0, s1
+; CHECK-NEXT: ret
+ %vrsqrtss.i = tail call float @llvm.arm64.neon.frsqrts.f32(float %a, float %b) nounwind
+ ret float %vrsqrtss.i
+}
+
+define double @f2(double %a, double %b) nounwind readnone optsize ssp {
+; CHECK-LABEL: f2:
+; CHECK: frsqrts d0, d0, d1
+; CHECK-NEXT: ret
+ %vrsqrtsd.i = tail call double @llvm.arm64.neon.frsqrts.f64(double %a, double %b) nounwind
+ ret double %vrsqrtsd.i
+}
+
+declare double @llvm.arm64.neon.frsqrts.f64(double, double) nounwind readnone
+declare float @llvm.arm64.neon.frsqrts.f32(float, float) nounwind readnone
diff --git a/test/CodeGen/ARM64/vsra.ll b/test/CodeGen/ARM64/vsra.ll
new file mode 100644
index 0000000..3611eb3
--- /dev/null
+++ b/test/CodeGen/ARM64/vsra.ll
@@ -0,0 +1,150 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vsras8:
+;CHECK: ssra.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = ashr <8 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vsras16:
+;CHECK: ssra.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = ashr <4 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15 >
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vsras32:
+;CHECK: ssra.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = ashr <2 x i32> %tmp2, < i32 31, i32 31 >
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vsraQs8:
+;CHECK: ssra.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = ashr <16 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vsraQs16:
+;CHECK: ssra.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = ashr <8 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vsraQs32:
+;CHECK: ssra.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = ashr <4 x i32> %tmp2, < i32 31, i32 31, i32 31, i32 31 >
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vsraQs64:
+;CHECK: ssra.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = ashr <2 x i64> %tmp2, < i64 63, i64 63 >
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vsrau8:
+;CHECK: usra.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = lshr <8 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vsrau16:
+;CHECK: usra.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = lshr <4 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15 >
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vsrau32:
+;CHECK: usra.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = lshr <2 x i32> %tmp2, < i32 31, i32 31 >
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+
+define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vsraQu8:
+;CHECK: usra.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = lshr <16 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vsraQu16:
+;CHECK: usra.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = lshr <8 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vsraQu32:
+;CHECK: usra.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = lshr <4 x i32> %tmp2, < i32 31, i32 31, i32 31, i32 31 >
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vsraQu64:
+;CHECK: usra.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = lshr <2 x i64> %tmp2, < i64 63, i64 63 >
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <1 x i64> @vsra_v1i64(<1 x i64> %A, <1 x i64> %B) nounwind {
+; CHECK-LABEL: vsra_v1i64:
+; CHECK: ssra d0, d1, #63
+ %tmp3 = ashr <1 x i64> %B, < i64 63 >
+ %tmp4 = add <1 x i64> %A, %tmp3
+ ret <1 x i64> %tmp4
+}
diff --git a/test/CodeGen/ARM64/vsub.ll b/test/CodeGen/ARM64/vsub.ll
new file mode 100644
index 0000000..5c7e84f
--- /dev/null
+++ b/test/CodeGen/ARM64/vsub.ll
@@ -0,0 +1,417 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: subhn8b:
+;CHECK: subhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @subhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: subhn4h:
+;CHECK: subhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @subhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: subhn2s:
+;CHECK: subhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @subhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
+;CHECK-LABEL: subhn2_16b:
+;CHECK: subhn.8b
+;CHECK-NEXT: subhn2.16b
+ %vsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vsubhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @subhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
+;CHECK-LABEL: subhn2_8h:
+;CHECK: subhn.4h
+;CHECK-NEXT: subhn2.8h
+ %vsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vsubhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @subhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
+;CHECK-LABEL: subhn2_4s:
+;CHECK: subhn.2s
+;CHECK-NEXT: subhn2.4s
+ %vsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vsubhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: rsubhn8b:
+;CHECK: rsubhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @rsubhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: rsubhn4h:
+;CHECK: rsubhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @rsubhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: rsubhn2s:
+;CHECK: rsubhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @rsubhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
+;CHECK-LABEL: rsubhn2_16b:
+;CHECK: rsubhn.8b
+;CHECK-NEXT: rsubhn2.16b
+ %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vrsubhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @rsubhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
+;CHECK-LABEL: rsubhn2_8h:
+;CHECK: rsubhn.4h
+;CHECK-NEXT: rsubhn2.8h
+ %vrsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vrsubhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @rsubhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
+;CHECK-LABEL: rsubhn2_4s:
+;CHECK: rsubhn.2s
+;CHECK-NEXT: rsubhn2.4s
+ %vrsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vrsubhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: ssubl8h:
+;CHECK: ssubl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = sub <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @ssubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: ssubl4s:
+;CHECK: ssubl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = sub <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: ssubl2d:
+;CHECK: ssubl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = sub <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: ssubl2_8h:
+;CHECK: ssubl2.8h
+ %tmp1 = load <16 x i8>* %A
+ %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext1 = sext <8 x i8> %high1 to <8 x i16>
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = sext <8 x i8> %high2 to <8 x i16>
+
+ %res = sub <8 x i16> %ext1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: ssubl2_4s:
+;CHECK: ssubl2.4s
+ %tmp1 = load <8 x i16>* %A
+ %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext1 = sext <4 x i16> %high1 to <4 x i32>
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = sext <4 x i16> %high2 to <4 x i32>
+
+ %res = sub <4 x i32> %ext1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: ssubl2_2d:
+;CHECK: ssubl2.2d
+ %tmp1 = load <4 x i32>* %A
+ %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext1 = sext <2 x i32> %high1 to <2 x i64>
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = sext <2 x i32> %high2 to <2 x i64>
+
+ %res = sub <2 x i64> %ext1, %ext2
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @usubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usubl8h:
+;CHECK: usubl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = sub <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @usubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usubl4s:
+;CHECK: usubl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = sub <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usubl2d:
+;CHECK: usubl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = sub <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usubl2_8h:
+;CHECK: usubl2.8h
+ %tmp1 = load <16 x i8>* %A
+ %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext1 = zext <8 x i8> %high1 to <8 x i16>
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = zext <8 x i8> %high2 to <8 x i16>
+
+ %res = sub <8 x i16> %ext1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usubl2_4s:
+;CHECK: usubl2.4s
+ %tmp1 = load <8 x i16>* %A
+ %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext1 = zext <4 x i16> %high1 to <4 x i32>
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = zext <4 x i16> %high2 to <4 x i32>
+
+ %res = sub <4 x i32> %ext1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usubl2_2d:
+;CHECK: usubl2.2d
+ %tmp1 = load <4 x i32>* %A
+ %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext1 = zext <2 x i32> %high1 to <2 x i64>
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = zext <2 x i32> %high2 to <2 x i64>
+
+ %res = sub <2 x i64> %ext1, %ext2
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @ssubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: ssubw8h:
+;CHECK: ssubw.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp4 = sub <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @ssubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: ssubw4s:
+;CHECK: ssubw.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp4 = sub <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: ssubw2d:
+;CHECK: ssubw.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp4 = sub <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: ssubw2_8h:
+;CHECK: ssubw2.8h
+ %tmp1 = load <8 x i16>* %A
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = sext <8 x i8> %high2 to <8 x i16>
+
+ %res = sub <8 x i16> %tmp1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: ssubw2_4s:
+;CHECK: ssubw2.4s
+ %tmp1 = load <4 x i32>* %A
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = sext <4 x i16> %high2 to <4 x i32>
+
+ %res = sub <4 x i32> %tmp1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: ssubw2_2d:
+;CHECK: ssubw2.2d
+ %tmp1 = load <2 x i64>* %A
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = sext <2 x i32> %high2 to <2 x i64>
+
+ %res = sub <2 x i64> %tmp1, %ext2
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @usubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usubw8h:
+;CHECK: usubw.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp4 = sub <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @usubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usubw4s:
+;CHECK: usubw.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp4 = sub <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usubw2d:
+;CHECK: usubw.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp4 = sub <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usubw2_8h:
+;CHECK: usubw2.8h
+ %tmp1 = load <8 x i16>* %A
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = zext <8 x i8> %high2 to <8 x i16>
+
+ %res = sub <8 x i16> %tmp1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usubw2_4s:
+;CHECK: usubw2.4s
+ %tmp1 = load <4 x i32>* %A
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = zext <4 x i16> %high2 to <4 x i32>
+
+ %res = sub <4 x i32> %tmp1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usubw2_2d:
+;CHECK: usubw2.2d
+ %tmp1 = load <2 x i64>* %A
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = zext <2 x i32> %high2 to <2 x i64>
+
+ %res = sub <2 x i64> %tmp1, %ext2
+ ret <2 x i64> %res
+}
diff --git a/test/CodeGen/ARM64/weak-reference.ll b/test/CodeGen/ARM64/weak-reference.ll
new file mode 100644
index 0000000..b2135e0
--- /dev/null
+++ b/test/CodeGen/ARM64/weak-reference.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+
+@x = extern_weak global i32
+
+define i32 @fn() nounwind ssp {
+; CHECK-LABEL: fn:
+; CHECK: .weak_reference
+ %val = load i32* @x, align 4
+ ret i32 %val
+}
diff --git a/test/CodeGen/ARM64/xaluo.ll b/test/CodeGen/ARM64/xaluo.ll
new file mode 100644
index 0000000..6a8520d
--- /dev/null
+++ b/test/CodeGen/ARM64/xaluo.ll
@@ -0,0 +1,524 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+;
+; Get the actual value of the overflow bit.
+;
+define i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: saddo.i32
+; CHECK: adds w8, w0, w1
+; CHECK-NEXT: csinc w0, wzr, wzr, vc
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: saddo.i64
+; CHECK: adds x8, x0, x1
+; CHECK-NEXT: csinc w0, wzr, wzr, vc
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: uaddo.i32
+; CHECK: adds w8, w0, w1
+; CHECK-NEXT: csinc w0, wzr, wzr, cc
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: uaddo.i64
+; CHECK: adds x8, x0, x1
+; CHECK-NEXT: csinc w0, wzr, wzr, cc
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: ssubo.i32
+; CHECK: subs w8, w0, w1
+; CHECK-NEXT: csinc w0, wzr, wzr, vc
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: ssubo.i64
+; CHECK: subs x8, x0, x1
+; CHECK-NEXT: csinc w0, wzr, wzr, vc
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: usubo.i32
+; CHECK: subs w8, w0, w1
+; CHECK-NEXT: csinc w0, wzr, wzr, cs
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: usubo.i64
+; CHECK: subs x8, x0, x1
+; CHECK-NEXT: csinc w0, wzr, wzr, cs
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: smulo.i32
+; CHECK: smull x8, w0, w1
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: cmp w9, w8, asr #31
+; CHECK-NEXT: csinc w0, wzr, wzr, eq
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: smulo.i64
+; CHECK: mul x8, x0, x1
+; CHECK-NEXT: smulh x9, x0, x1
+; CHECK-NEXT: cmp x9, x8, asr #63
+; CHECK-NEXT: csinc w0, wzr, wzr, eq
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: umulo.i32
+; CHECK: umull x8, w0, w1
+; CHECK-NEXT: cmp xzr, x8, lsr #32
+; CHECK-NEXT: csinc w0, wzr, wzr, eq
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: umulo.i64
+; CHECK: umulh x8, x0, x1
+; CHECK-NEXT: cmp xzr, x8
+; CHECK-NEXT: csinc w8, wzr, wzr, eq
+; CHECK-NEXT: mul x9, x0, x1
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+
+;
+; Check the use of the overflow bit in combination with a select instruction.
+;
+define i32 @saddo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: saddo.select.i32
+; CHECK: cmn w0, w1
+; CHECK-NEXT: csel w0, w0, w1, vs
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: saddo.select.i64
+; CHECK: cmn x0, x1
+; CHECK-NEXT: csel x0, x0, x1, vs
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: uaddo.select.i32
+; CHECK: cmn w0, w1
+; CHECK-NEXT: csel w0, w0, w1, cs
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: uaddo.select.i64
+; CHECK: cmn x0, x1
+; CHECK-NEXT: csel x0, x0, x1, cs
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @ssubo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: ssubo.select.i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: csel w0, w0, w1, vs
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: ssubo.select.i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csel x0, x0, x1, vs
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @usubo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: usubo.select.i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: csel w0, w0, w1, cc
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: usubo.select.i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csel x0, x0, x1, cc
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: smulo.select.i32
+; CHECK: smull x8, w0, w1
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: cmp w9, w8, asr #31
+; CHECK-NEXT: csel w0, w0, w1, ne
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: smulo.select.i64
+; CHECK: mul x8, x0, x1
+; CHECK-NEXT: smulh x9, x0, x1
+; CHECK-NEXT: cmp x9, x8, asr #63
+; CHECK-NEXT: csel x0, x0, x1, ne
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: umulo.select.i32
+; CHECK: umull x8, w0, w1
+; CHECK-NEXT: cmp xzr, x8, lsr #32
+; CHECK-NEXT: csel w0, w0, w1, ne
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: umulo.select.i64
+; CHECK: umulh x8, x0, x1
+; CHECK-NEXT: cmp xzr, x8
+; CHECK-NEXT: csel x0, x0, x1, ne
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+
+;
+; Check the use of the overflow bit in combination with a branch instruction.
+;
+define i1 @saddo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: saddo.br.i32
+; CHECK: cmn w0, w1
+; CHECK-NEXT: b.vc
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @saddo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: saddo.br.i64
+; CHECK: cmn x0, x1
+; CHECK-NEXT: b.vc
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: uaddo.br.i32
+; CHECK: cmn w0, w1
+; CHECK-NEXT: b.cc
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: uaddo.br.i64
+; CHECK: cmn x0, x1
+; CHECK-NEXT: b.cc
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: ssubo.br.i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.vc
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: ssubo.br.i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: b.vc
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @usubo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: usubo.br.i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.cs
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @usubo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: usubo.br.i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: b.cs
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @smulo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: smulo.br.i32
+; CHECK: smull x8, w0, w1
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: cmp w9, w8, asr #31
+; CHECK-NEXT: b.eq
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @smulo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: smulo.br.i64
+; CHECK: mul x8, x0, x1
+; CHECK-NEXT: smulh x9, x0, x1
+; CHECK-NEXT: cmp x9, x8, asr #63
+; CHECK-NEXT: b.eq
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @umulo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: umulo.br.i32
+; CHECK: umull x8, w0, w1
+; CHECK-NEXT: cmp xzr, x8, lsr #32
+; CHECK-NEXT: b.eq
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @umulo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: umulo.br.i64
+; CHECK: umulh x8, x0, x1
+; CHECK-NEXT: cbz
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+
diff --git a/test/CodeGen/ARM64/zero-cycle-regmov.ll b/test/CodeGen/ARM64/zero-cycle-regmov.ll
new file mode 100644
index 0000000..c56d607
--- /dev/null
+++ b/test/CodeGen/ARM64/zero-cycle-regmov.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; rdar://12254953
+
+define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind ssp {
+entry:
+; CHECK-LABEL: t:
+; CHECK: mov x0, [[REG1:x[0-9]+]]
+; CHECK: mov x1, [[REG2:x[0-9]+]]
+; CHECK: bl _foo
+; CHECK: mov x0, [[REG1]]
+; CHECK: mov x1, [[REG2]]
+ %call = call i32 @foo(i32 %c, i32 %d) nounwind
+ %call1 = call i32 @foo(i32 %c, i32 %d) nounwind
+ unreachable
+}
+
+declare i32 @foo(i32, i32)
diff --git a/test/CodeGen/ARM64/zero-cycle-zeroing.ll b/test/CodeGen/ARM64/zero-cycle-zeroing.ll
new file mode 100644
index 0000000..349bb6f
--- /dev/null
+++ b/test/CodeGen/ARM64/zero-cycle-zeroing.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; rdar://11481771
+; rdar://13713797
+
+define void @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: fmov
+; CHECK: movi.2d v0, #0000000000000000
+; CHECK: movi.2d v1, #0000000000000000
+; CHECK: movi.2d v2, #0000000000000000
+; CHECK: movi.2d v3, #0000000000000000
+ tail call void @bar(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00) nounwind
+ ret void
+}
+
+define void @t2() nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK-NOT: mov w0, wzr
+; CHECK: movz w0, #0
+; CHECK: movz w1, #0
+ tail call void @bari(i32 0, i32 0) nounwind
+ ret void
+}
+
+define void @t3() nounwind ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK-NOT: mov x0, xzr
+; CHECK: movz x0, #0
+; CHECK: movz x1, #0
+ tail call void @barl(i64 0, i64 0) nounwind
+ ret void
+}
+
+define void @t4() nounwind ssp {
+; CHECK-LABEL: t4:
+; CHECK-NOT: fmov
+; CHECK: movi.2d v0, #0000000000000000
+; CHECK: movi.2d v1, #0000000000000000
+ tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
+ ret void
+}
+
+declare void @bar(double, double, double, double)
+declare void @bari(i32, i32)
+declare void @barl(i64, i64)
+declare void @barf(float, float)
diff --git a/test/CodeGen/ARM64/zext.ll b/test/CodeGen/ARM64/zext.ll
new file mode 100644
index 0000000..8d9e5ea
--- /dev/null
+++ b/test/CodeGen/ARM64/zext.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i64 @foo(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: add w0, w1, w0
+; CHECK: ret
+ %add = add i32 %b, %a
+ %conv = zext i32 %add to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/ARM64/zextload-unscaled.ll b/test/CodeGen/ARM64/zextload-unscaled.ll
new file mode 100644
index 0000000..c475dbd
--- /dev/null
+++ b/test/CodeGen/ARM64/zextload-unscaled.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+
+@var32 = global i32 0
+
+define void @test_zextloadi1_unscaled(i1* %base) {
+; CHECK-LABEL: test_zextloadi1_unscaled:
+; CHECK: ldurb {{w[0-9]+}}, [{{x[0-9]+}}, #-7]
+
+ %addr = getelementptr i1* %base, i32 -7
+ %val = load i1* %addr, align 1
+
+ %extended = zext i1 %val to i32
+ store i32 %extended, i32* @var32, align 4
+ ret void
+}
+
+define void @test_zextloadi8_unscaled(i8* %base) {
+; CHECK-LABEL: test_zextloadi8_unscaled:
+; CHECK: ldurb {{w[0-9]+}}, [{{x[0-9]+}}, #-7]
+
+ %addr = getelementptr i8* %base, i32 -7
+ %val = load i8* %addr, align 1
+
+ %extended = zext i8 %val to i32
+ store i32 %extended, i32* @var32, align 4
+ ret void
+}
+
+define void @test_zextloadi16_unscaled(i16* %base) {
+; CHECK-LABEL: test_zextloadi16_unscaled:
+; CHECK: ldurh {{w[0-9]+}}, [{{x[0-9]+}}, #-14]
+
+ %addr = getelementptr i16* %base, i32 -7
+ %val = load i16* %addr, align 2
+
+ %extended = zext i16 %val to i32
+ store i32 %extended, i32* @var32, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/ARM64/zip.ll b/test/CodeGen/ARM64/zip.ll
new file mode 100644
index 0000000..d06a9f8
--- /dev/null
+++ b/test/CodeGen/ARM64/zip.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipi8:
+;CHECK: zip1.8b
+;CHECK: zip2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vzipi16:
+;CHECK: zip1.4h
+;CHECK: zip2.4h
+;CHECK-NEXT: add.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipQi8:
+;CHECK: zip1.16b
+;CHECK: zip2.16b
+;CHECK-NEXT: add.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vzipQi16:
+;CHECK: zip1.8h
+;CHECK: zip2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vzipQi32:
+;CHECK: zip1.4s
+;CHECK: zip2.4s
+;CHECK-NEXT: add.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vzipQf:
+;CHECK: zip1.4s
+;CHECK: zip2.4s
+;CHECK-NEXT: fadd.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = fadd <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
+
+; Undef shuffle indices should not prevent matching to VZIP:
+
+define <8 x i8> @vzipi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipi8_undef:
+;CHECK: zip1.8b
+;CHECK: zip2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 1, i32 9, i32 undef, i32 10, i32 3, i32 11>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 undef, i32 undef, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @vzipQi8_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipQi8_undef:
+;CHECK: zip1.16b
+;CHECK: zip2.16b
+;CHECK-NEXT: add.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 undef, i32 undef, i32 undef, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 undef, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 undef, i32 14, i32 30, i32 undef, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
diff --git a/test/CodeGen/CPP/attributes.ll b/test/CodeGen/CPP/attributes.ll
new file mode 100644
index 0000000..3dab617
--- /dev/null
+++ b/test/CodeGen/CPP/attributes.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=cpp | FileCheck %s
+
+define void @f1(i8* byval, i8* inalloca) {
+; CHECK: ByVal
+; CHECK: InAlloca
+ ret void
+}
diff --git a/test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll b/test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll
index 339f0f7..21c05f1 100644
--- a/test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll
+++ b/test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -no-integrated-as < %s
; XFAIL: sparc-sun-solaris2
; PR1308
; PR1557
diff --git a/test/CodeGen/Generic/2007-04-27-InlineAsm-X-Dest.ll b/test/CodeGen/Generic/2007-04-27-InlineAsm-X-Dest.ll
index af522dc..0f82ba6 100644
--- a/test/CodeGen/Generic/2007-04-27-InlineAsm-X-Dest.ll
+++ b/test/CodeGen/Generic/2007-04-27-InlineAsm-X-Dest.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -no-integrated-as < %s
; Test that we can have an "X" output constraint.
diff --git a/test/CodeGen/Generic/2007-04-27-LargeMemObject.ll b/test/CodeGen/Generic/2007-04-27-LargeMemObject.ll
index f2c9b7f..05989a0 100644
--- a/test/CodeGen/Generic/2007-04-27-LargeMemObject.ll
+++ b/test/CodeGen/Generic/2007-04-27-LargeMemObject.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -no-integrated-as < %s
%struct..0anon = type { [100 x i32] }
diff --git a/test/CodeGen/Generic/2007-12-17-InvokeAsm.ll b/test/CodeGen/Generic/2007-12-17-InvokeAsm.ll
index 27c7162..03ccbdf 100644
--- a/test/CodeGen/Generic/2007-12-17-InvokeAsm.ll
+++ b/test/CodeGen/Generic/2007-12-17-InvokeAsm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -no-integrated-as < %s
define fastcc void @bc__support__high_resolution_time__initialize_clock_rate() {
entry:
diff --git a/test/CodeGen/Generic/2008-02-20-MatchingMem.ll b/test/CodeGen/Generic/2008-02-20-MatchingMem.ll
index 7ffb734..5ddb515 100644
--- a/test/CodeGen/Generic/2008-02-20-MatchingMem.ll
+++ b/test/CodeGen/Generic/2008-02-20-MatchingMem.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -no-integrated-as < %s
; PR1133
; XFAIL: hexagon
define void @test(i32* %X) nounwind {
diff --git a/test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll b/test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll
new file mode 100644
index 0000000..5c1cd05
--- /dev/null
+++ b/test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll
@@ -0,0 +1,19 @@
+; Test that opaque constants are not creating an infinite DAGCombine loop
+; RUN: llc < %s
+; XFAIL: r600, xcore
+
+@a = common global i32* null, align 8
+@c = common global i32 0, align 4
+@b = common global i32* null, align 8
+
+; Function Attrs: nounwind ssp uwtable
+define void @fn() {
+ store i32* inttoptr (i64 68719476735 to i32*), i32** @a, align 8
+ %1 = load i32* @c, align 4
+ %2 = sext i32 %1 to i64
+ %3 = lshr i64 %2, 12
+ %4 = and i64 %3, 68719476735
+ %5 = getelementptr inbounds i32* null, i64 %4
+ store i32* %5, i32** @b, align 8
+ ret void
+}
diff --git a/test/CodeGen/Generic/asm-large-immediate.ll b/test/CodeGen/Generic/asm-large-immediate.ll
index 891bbc9..67a7a1e 100644
--- a/test/CodeGen/Generic/asm-large-immediate.ll
+++ b/test/CodeGen/Generic/asm-large-immediate.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -no-integrated-as < %s | FileCheck %s
define void @test() {
entry:
diff --git a/test/CodeGen/Generic/inline-asm-mem-clobber.ll b/test/CodeGen/Generic/inline-asm-mem-clobber.ll
index e523d03..5aa827a 100644
--- a/test/CodeGen/Generic/inline-asm-mem-clobber.ll
+++ b/test/CodeGen/Generic/inline-asm-mem-clobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O2 < %s | FileCheck %s
+; RUN: llc -O2 -no-integrated-as < %s | FileCheck %s
@G = common global i32 0, align 4
diff --git a/test/CodeGen/Generic/inline-asm-special-strings.ll b/test/CodeGen/Generic/inline-asm-special-strings.ll
index d18221e..5ef5688 100644
--- a/test/CodeGen/Generic/inline-asm-special-strings.ll
+++ b/test/CodeGen/Generic/inline-asm-special-strings.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep "foo 0 0"
+; RUN: llc -no-integrated-as < %s | grep "foo 0 0"
define void @bar() nounwind {
tail call void asm sideeffect "foo ${:uid} ${:uid}", ""() nounwind
diff --git a/test/CodeGen/Generic/no-target.ll b/test/CodeGen/Generic/no-target.ll
new file mode 100644
index 0000000..4a4724f
--- /dev/null
+++ b/test/CodeGen/Generic/no-target.ll
@@ -0,0 +1,3 @@
+; RUN: not llc -mtriple le32-unknown-nacl %s -o - 2>&1 | FileCheck %s
+
+; CHECK: error: unable to get target for 'le32-unknown-nacl'
diff --git a/test/CodeGen/Generic/print-after.ll b/test/CodeGen/Generic/print-after.ll
index 7505907..1b7ce84 100644
--- a/test/CodeGen/Generic/print-after.ll
+++ b/test/CodeGen/Generic/print-after.ll
@@ -1,4 +1,4 @@
-; RUN: not llc --help-hidden 2>&1 | FileCheck %s
+; RUN: llc --help-hidden 2>&1 | FileCheck %s
; CHECK: -print-after
; CHECK-NOT: -print-after-all
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index bfdd813..4e858f7 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -7,7 +7,7 @@ define void @foo(i32* nocapture %a, i32* nocapture %b) nounwind {
entry:
tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13), !dbg !17
tail call void @llvm.dbg.value(metadata !{i32* %b}, i64 0, metadata !14), !dbg !18
- tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !15), !dbg !19
+ tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !15), !dbg !19
br label %for.body, !dbg !19
for.body: ; preds = %for.body, %entry
@@ -38,7 +38,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.module.flags = !{!29}
!0 = metadata !{i32 786449, metadata !28, i32 12, metadata !"QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
-!2 = metadata !{i32 0}
+!2 = metadata !{}
!3 = metadata !{metadata !5}
!5 = metadata !{i32 786478, metadata !28, null, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*)* @foo, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
!6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
@@ -62,3 +62,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!27 = metadata !{i32 6, i32 1, metadata !16, null}
!28 = metadata !{metadata !"hwloop-dbg.c", metadata !"/usr2/kparzysz/s.hex/t"}
!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!30 = metadata !{i32 0}
diff --git a/test/CodeGen/Hexagon/packetize_cond_inst.ll b/test/CodeGen/Hexagon/packetize_cond_inst.ll
index a48a9f6..1fc6e82 100644
--- a/test/CodeGen/Hexagon/packetize_cond_inst.ll
+++ b/test/CodeGen/Hexagon/packetize_cond_inst.ll
@@ -12,7 +12,7 @@ target triple = "hexagon-unknown--elf"
; }
; CHECK: cmp
; CHECK-NEXT: add
-; CHECH-NEXT: add
+; CHECK-NEXT: add
define i32 @ifcnv_add(i32, i32, i32) nounwind readnone {
%4 = icmp sgt i32 %2, %1
br i1 %4, label %5, label %7
diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll
index 0180905..b6ba22e 100644
--- a/test/CodeGen/MSP430/fp.ll
+++ b/test/CodeGen/MSP430/fp.ll
@@ -15,3 +15,15 @@ entry:
; CHECK: pop.w r4
ret void
}
+
+; Due to FPB not being marked as reserved, the register allocator used to select
+; r4 as the register for the "r" constraint below. This test verifies that this
+; does not happen anymore. Note that the only reason an ISR is used here is that
+; the register allocator selects r4 first in an ISR, whereas in a normal function
+; it would only be selected fifth.
+define msp430_intrcc void @fpb_alloced() #0 {
+; CHECK-LABEL: fpb_alloced:
+; CHECK-NOT: mov.b #0, r4
+; CHECK: nop
+ call void asm sideeffect "nop", "r"(i8 0)
+ ret void
+}
diff --git a/test/CodeGen/MSP430/misched-msp430.ll b/test/CodeGen/MSP430/misched-msp430.ll
new file mode 100644
index 0000000..c8541ef
--- /dev/null
+++ b/test/CodeGen/MSP430/misched-msp430.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=msp430-unknown-unknown -enable-misched | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
+
+@y = common global i16 0, align 2
+@x = common global i16 0, align 2
+
+; Test that the MI Scheduler's initPolicy does not crash when i32 is
+; unsupported. The content of the asm check below is unimportant. It
+; only verifies that the code generator ran successfully.
+;
+; CHECK-LABEL: @f
+; CHECK: mov.w &y, &x
+; CHECK: ret
+define void @f() {
+entry:
+ %0 = load i16* @y, align 2
+ store i16 %0, i16* @x, align 2
+ ret void
+}
diff --git a/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll b/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
index 3381143..8807d75 100644
--- a/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
+++ b/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float < %s | FileCheck %s
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 < %s | FileCheck %s
define signext i8 @A(i8 %e.0, i8 signext %sum) nounwind {
entry:
diff --git a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
index 2b2ee0f..c3791df 100644
--- a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
+++ b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
@@ -1,9 +1,9 @@
; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-O32
; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-O32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
define float @h() nounwind readnone {
entry:
diff --git a/test/CodeGen/Mips/abicalls.ll b/test/CodeGen/Mips/abicalls.ll
new file mode 100644
index 0000000..7b98b02
--- /dev/null
+++ b/test/CodeGen/Mips/abicalls.ll
@@ -0,0 +1,15 @@
+;
+; When the assembler is ready, a .s file for it will
+; be created.
+
+; Note that EF_MIPS_CPIC is set by -mabicalls which is the default on Linux
+; TODO need to support -mno-abicalls
+
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-STATIC %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck -check-prefix=CHECK-PIC %s
+; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-PIC %s
+
+; CHECK-STATIC: .abicalls
+; CHECK-STATIC-NEXT: pic0
+; CHECK-PIC: .abicalls
+; CHECK-PIC-NOT: pic0
diff --git a/test/CodeGen/Mips/addi.ll b/test/CodeGen/Mips/addi.ll
index 8f70a46..01d409e 100644
--- a/test/CodeGen/Mips/addi.ll
+++ b/test/CodeGen/Mips/addi.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s -check-prefix=16
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=16
@i = global i32 6, align 4
@j = global i32 12, align 4
diff --git a/test/CodeGen/Mips/align16.ll b/test/CodeGen/Mips/align16.ll
index 267cff5..689ae83 100644
--- a/test/CodeGen/Mips/align16.ll
+++ b/test/CodeGen/Mips/align16.ll
@@ -25,7 +25,7 @@ entry:
call void @p(i32* %arrayidx1)
ret void
}
-; 16: save $ra, $s0, $s1, $s2, 2040
-; 16: addiu $sp, -56 # 16 bit inst
-; 16: addiu $sp, 56 # 16 bit inst
-; 16: restore $ra, $s0, $s1, $s2, 2040
+; 16: save $ra, 2040
+; 16: addiu $sp, -40 # 16 bit inst
+; 16: addiu $sp, 40 # 16 bit inst
+; 16: restore $ra, 2040
diff --git a/test/CodeGen/Mips/alloca16.ll b/test/CodeGen/Mips/alloca16.ll
index 017665f..4f60598 100644
--- a/test/CodeGen/Mips/alloca16.ll
+++ b/test/CodeGen/Mips/alloca16.ll
@@ -19,8 +19,8 @@ entry:
define void @test() nounwind {
entry:
-; 16: .frame $sp,24,$ra
-; 16: save $ra, $s0, $s1, $s2, 24
+; 16: .frame $sp,8,$ra
+; 16: save 8 # 16 bit inst
; 16: move $16, $sp
; 16: move ${{[0-9]+}}, $sp
; 16: subu $[[REGISTER:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 0e60fe1..77d7bf3 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -77,7 +77,7 @@ entry:
%newval.addr = alloca i32, align 4
store i32 %newval, i32* %newval.addr, align 4
%tmp = load i32* %newval.addr, align 4
- %0 = cmpxchg i32* @x, i32 %oldval, i32 %tmp monotonic
+ %0 = cmpxchg i32* @x, i32 %oldval, i32 %tmp monotonic monotonic
ret i32 %0
; CHECK-EL-LABEL: AtomicCmpSwap32:
@@ -333,7 +333,7 @@ entry:
define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind {
entry:
- %0 = cmpxchg i8* @y, i8 %oldval, i8 %newval monotonic
+ %0 = cmpxchg i8* @y, i8 %oldval, i8 %newval monotonic monotonic
ret i8 %0
; CHECK-EL-LABEL: AtomicCmpSwap8:
@@ -429,7 +429,7 @@ entry:
define i32 @zeroreg() nounwind {
entry:
- %0 = cmpxchg i32* @a, i32 1, i32 0 seq_cst
+ %0 = cmpxchg i32* @a, i32 1, i32 0 seq_cst seq_cst
%1 = icmp eq i32 %0, 1
%conv = zext i1 %1 to i32
ret i32 %conv
diff --git a/test/CodeGen/Mips/atomicops.ll b/test/CodeGen/Mips/atomicops.ll
index 0f0f01a..dc07c63 100644
--- a/test/CodeGen/Mips/atomicops.ll
+++ b/test/CodeGen/Mips/atomicops.ll
@@ -20,7 +20,7 @@ entry:
%add.i = add nsw i32 %0, 2
%1 = load volatile i32* %x, align 4
%call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind
- %2 = cmpxchg i32* %x, i32 1, i32 2 seq_cst
+ %2 = cmpxchg i32* %x, i32 1, i32 2 seq_cst seq_cst
%3 = load volatile i32* %x, align 4
%call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind
%4 = atomicrmw xchg i32* %x, i32 1 seq_cst
diff --git a/test/CodeGen/Mips/blez_bgez.ll b/test/CodeGen/Mips/blez_bgez.ll
index f6a5e4f..dcda047 100644
--- a/test/CodeGen/Mips/blez_bgez.ll
+++ b/test/CodeGen/Mips/blez_bgez.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=mipsel < %s | FileCheck %s
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
+; RUN: llc -march=mips64el < %s | FileCheck %s
; CHECK-LABEL: test_blez:
; CHECK: blez ${{[0-9]+}}, $BB
diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll
index beab65f..d6dc7e7 100644
--- a/test/CodeGen/Mips/blockaddr.ll
+++ b/test/CodeGen/Mips/blockaddr.ll
@@ -1,11 +1,11 @@
; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-O32
; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-O32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-MIPS16-1
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-MIPS16-2
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-MIPS16-1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-MIPS16-2
@reg = common global i8* null, align 4
@@ -43,8 +43,8 @@ entry:
; STATIC-MIPS16-1: li $[[R1_16:[0-9]+]], %hi($tmp[[TI_16:[0-9]+]])
; STATIC-MIPS16-1: sll ${{[0-9]+}}, $[[R1_16]], 16
; STATIC-MIPS16-2: li ${{[0-9]+}}, %lo($tmp{{[0-9]+}})
-; STATIC-MIPS16-1 jal dummy
-; STATIC-MIPS16-2 jal dummy
+; STATIC-MIPS16-1: jal dummy
+; STATIC-MIPS16-2: jal dummy
define void @f() nounwind {
entry:
diff --git a/test/CodeGen/Mips/bswap.ll b/test/CodeGen/Mips/bswap.ll
index f17b91a..812eef1 100644
--- a/test/CodeGen/Mips/bswap.ll
+++ b/test/CodeGen/Mips/bswap.ll
@@ -1,28 +1,105 @@
; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=MIPS32
; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=MIPS64
-; RUN: llc < %s -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float | FileCheck %s -check-prefix=mips16
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 | FileCheck %s -check-prefix=MIPS16
define i32 @bswap32(i32 %x) nounwind readnone {
entry:
; MIPS32-LABEL: bswap32:
; MIPS32: wsbh $[[R0:[0-9]+]]
; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
-; mips16: .ent bswap32
+
+; MIPS64-LABEL: bswap32:
+; MIPS64: wsbh $[[R0:[0-9]+]]
+; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
+
+; MIPS16-LABEL: bswap32:
+; MIPS16-DAG: srl $[[R0:[0-9]+]], $4, 8
+; MIPS16-DAG: srl $[[R1:[0-9]+]], $4, 24
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $4, 8
+; MIPS16-DAG: sll $[[R3:[0-9]+]], $4, 24
+; MIPS16-DAG: li $[[R4:[0-9]+]], 65280
+; MIPS16-DAG: and $[[R4]], $[[R0]]
+; MIPS16-DAG: or $[[R1]], $[[R4]]
+; MIPS16-DAG: lw $[[R7:[0-9]+]], $CPI
+; MIPS16-DAG: and $[[R7]], $[[R2]]
+; MIPS16-DAG: or $[[R3]], $[[R7]]
+; MIPS16-DAG: or $[[R3]], $[[R1]]
+
%or.3 = call i32 @llvm.bswap.i32(i32 %x)
ret i32 %or.3
}
define i64 @bswap64(i64 %x) nounwind readnone {
entry:
+; MIPS32-LABEL: bswap64:
+; MIPS32: wsbh $[[R0:[0-9]+]]
+; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32: wsbh $[[R0:[0-9]+]]
+; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+
; MIPS64-LABEL: bswap64:
; MIPS64: dsbh $[[R0:[0-9]+]]
; MIPS64: dshd ${{[0-9]+}}, $[[R0]]
-; mips16: .ent bswap64
+
+; MIPS16-LABEL: bswap64:
+; MIPS16-DAG: srl $[[R0:[0-9]+]], $5, 8
+; MIPS16-DAG: srl $[[R1:[0-9]+]], $5, 24
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $5, 8
+; MIPS16-DAG: sll $[[R3:[0-9]+]], $5, 24
+; MIPS16-DAG: li $[[R4:[0-9]+]], 65280
+; MIPS16-DAG: and $[[R0]], $[[R4]]
+; MIPS16-DAG: or $[[R1]], $[[R0]]
+; MIPS16-DAG: lw $[[R7:[0-9]+]], 1f
+; MIPS16-DAG: and $[[R2]], $[[R7]]
+; MIPS16-DAG: or $[[R3]], $[[R2]]
+; MIPS16-DAG: or $[[R3]], $[[R1]]
+; MIPS16-DAG: srl $[[R0:[0-9]+]], $4, 8
+; MIPS16-DAG: srl $[[R1:[0-9]+]], $4, 24
+; MIPS16-DAG: sll $[[R2:[0-9]+]], $4, 8
+; MIPS16-DAG: sll $[[R3:[0-9]+]], $4, 24
+; MIPS16-DAG: li $[[R4:[0-9]+]], 65280
+; MIPS16-DAG: and $[[R0]], $[[R4]]
+; MIPS16-DAG: or $[[R1]], $[[R0]]
+; MIPS16-DAG: lw $[[R7:[0-9]+]], 1f
+; MIPS16-DAG: and $[[R2]], $[[R7]]
+; MIPS16-DAG: or $[[R3]], $[[R2]]
+; MIPS16-DAG: or $[[R3]], $[[R1]]
+
%or.7 = call i64 @llvm.bswap.i64(i64 %x)
ret i64 %or.7
}
+define <4 x i32> @bswapv4i32(<4 x i32> %x) nounwind readnone {
+entry:
+; MIPS32-LABEL: bswapv4i32:
+; MIPS32: wsbh $[[R0:[0-9]+]]
+; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32: wsbh $[[R0:[0-9]+]]
+; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32: wsbh $[[R0:[0-9]+]]
+; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32: wsbh $[[R0:[0-9]+]]
+; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+
+; MIPS64-LABEL: bswapv4i32:
+; MIPS64: wsbh $[[R0:[0-9]+]]
+; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64: wsbh $[[R0:[0-9]+]]
+; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64: wsbh $[[R0:[0-9]+]]
+; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64: wsbh $[[R0:[0-9]+]]
+; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
+
+; Don't bother with a MIPS16 version. It's just bswap32 repeated four times and
+; would be very long.
+
+ %ret = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x)
+ ret <4 x i32> %ret
+}
+
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll
index 490d427..b9bf2b6 100644
--- a/test/CodeGen/Mips/buildpairextractelementf64.ll
+++ b/test/CodeGen/Mips/buildpairextractelementf64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32
-; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=FP32
-; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64
-; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK
+; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
+; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
@a = external global i32
diff --git a/test/CodeGen/Mips/cache-intrinsic.ll b/test/CodeGen/Mips/cache-intrinsic.ll
new file mode 100644
index 0000000..2fa4115
--- /dev/null
+++ b/test/CodeGen/Mips/cache-intrinsic.ll
@@ -0,0 +1,26 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@buffer = global [32 x i8] c"This is a largely unused buffer\00", align 1
+@.str = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
+@.str1 = private unnamed_addr constant [25 x i8] c"Still, largely unused...\00", align 1
+
+define i32 @main() {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0))
+ %call1 = call i8* @strcpy(i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8]* @.str1, i32 0, i32 0)) #3
+ call void @llvm.clear_cache(i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds (i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i32 32)) #3
+ %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0))
+ ret i32 0
+}
+
+; CHECK: __clear_cache
+
+declare i32 @printf(i8*, ...)
+
+declare i8* @strcpy(i8*, i8*)
+
+declare void @llvm.clear_cache(i8*, i8*)
diff --git a/test/CodeGen/Mips/call-optimization.ll b/test/CodeGen/Mips/call-optimization.ll
new file mode 100644
index 0000000..bfa09ea
--- /dev/null
+++ b/test/CodeGen/Mips/call-optimization.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \
+; RUN: FileCheck %s -check-prefix=O32
+; RUN: llc -march=mipsel -mips-load-target-from-got=false \
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=O32-LOADTGT
+
+@gd1 = common global double 0.000000e+00, align 8
+@gd2 = common global double 0.000000e+00, align 8
+
+; O32-LABEL: caller3:
+; O32-DAG: lw $25, %call16(callee3)
+; O32-DAG: move $gp
+; O32: jalr $25
+; O32-NOT: move $gp
+; O32: lw $25, %call16(callee3)
+; O32-NOT: move $gp
+; O32: jalr $25
+; O32-NOT: move $gp
+; O32: lw $25, %call16(callee3)
+; O32-NOT: move $gp
+; O32: jalr $25
+
+; O32-LOADTGT-LABEL: caller3:
+; O32-LOADTGT-DAG: lw $25, %call16(callee3)
+; O32-LOADTGT-DAG: move $gp
+; O32-LOADTGT: jalr $25
+; O32-LOADTGT-NOT: move $gp
+; O32-LOADTGT: move $25
+; O32-LOADTGT-NOT: move $gp
+; O32-LOADTGT: jalr $25
+; O32-LOADTGT-NOT: move $gp
+; O32-LOADTGT: move $25
+; O32-LOADTGT-NOT: move $gp
+; O32-LOADTGT: jalr $25
+
+define void @caller3(i32 %n) {
+entry:
+ tail call void @callee3()
+ tail call void @callee3()
+ %tobool1 = icmp eq i32 %n, 0
+ br i1 %tobool1, label %while.end, label %while.body
+
+while.body:
+ %n.addr.02 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
+ %dec = add nsw i32 %n.addr.02, -1
+ tail call void @callee3()
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
+
+declare void @callee3()
+
+; O32-LABEL: caller4:
+; O32-DAG: lw $25, %call16(ceil)
+; O32-DAG: move $gp
+; O32: jalr $25
+; O32-NOT: move $gp
+; O32: lw $25, %call16(ceil)
+; O32-NOT: move $gp
+; O32: jalr $25
+; O32-NOT: move $gp
+; O32: lw $25, %call16(ceil)
+; O32-NOT: move $gp
+; O32: jalr $25
+
+; O32-LOADTGT-LABEL: caller4:
+; O32-LOADTGT-DAG: lw $25, %call16(ceil)
+; O32-LOADTGT-DAG: move $gp
+; O32-LOADTGT: jalr $25
+; O32-LOADTGT-NOT: move $gp
+; O32-LOADTGT: move $25
+; O32-LOADTGT-NOT: move $gp
+; O32-LOADTGT: jalr $25
+; O32-LOADTGT-NOT: move $gp
+; O32-LOADTGT: move $25
+; O32-LOADTGT-NOT: move $gp
+; O32-LOADTGT: jalr $25
+
+define void @caller4(double %d) {
+entry:
+ %call = tail call double @ceil(double %d)
+ %call1 = tail call double @ceil(double %call)
+ store double %call1, double* @gd2, align 8
+ %call2 = tail call double @ceil(double %call1)
+ store double %call2, double* @gd1, align 8
+ ret void
+}
+
+declare double @ceil(double)
diff --git a/test/CodeGen/Mips/ci2.ll b/test/CodeGen/Mips/ci2.ll
new file mode 100644
index 0000000..7187f0c
--- /dev/null
+++ b/test/CodeGen/Mips/ci2.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips16-constant-islands < %s | FileCheck %s -check-prefix=constisle
+
+@i = common global i32 0, align 4
+@b = common global i32 0, align 4
+@l = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+ store i32 305419896, i32* @i, align 4
+ %0 = load i32* @b, align 4
+ %tobool = icmp ne i32 %0, 0
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 10, i32* @b, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 20, i32* @b, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ call void asm sideeffect ".space 100000", ""() #1, !srcloc !1
+ store i32 305419896, i32* @l, align 4
+ ret void
+; constisle: $CPI0_1:
+; constisle: .4byte 305419896 # 0x12345678
+; constisle: #APP
+; constisle: .space 100000
+; constisle: #NO_APP
+; constisle: $CPI0_0:
+; constisle: .4byte 305419896 # 0x12345678
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!1 = metadata !{i32 103}
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index c24c5ac..f2009fa 100644
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -74,7 +74,7 @@ entry:
define i32 @slti0(i32 %a) {
entry:
%cmp = icmp sgt i32 %a, 32766
- %cond = select i1 %cmp, i32 3, i32 4
+ %cond = select i1 %cmp, i32 3, i32 5
ret i32 %cond
}
@@ -84,7 +84,7 @@ entry:
define i32 @slti1(i32 %a) {
entry:
%cmp = icmp sgt i32 %a, 32767
- %cond = select i1 %cmp, i32 3, i32 4
+ %cond = select i1 %cmp, i32 3, i32 5
ret i32 %cond
}
@@ -95,7 +95,7 @@ entry:
define i32 @slti2(i32 %a) {
entry:
%cmp = icmp sgt i32 %a, -32769
- %cond = select i1 %cmp, i32 3, i32 4
+ %cond = select i1 %cmp, i32 3, i32 5
ret i32 %cond
}
@@ -105,7 +105,7 @@ entry:
define i32 @slti3(i32 %a) {
entry:
%cmp = icmp sgt i32 %a, -32770
- %cond = select i1 %cmp, i32 3, i32 4
+ %cond = select i1 %cmp, i32 3, i32 5
ret i32 %cond
}
@@ -162,7 +162,7 @@ entry:
define i32 @sltiu0(i32 %a) {
entry:
%cmp = icmp ugt i32 %a, 32766
- %cond = select i1 %cmp, i32 3, i32 4
+ %cond = select i1 %cmp, i32 3, i32 5
ret i32 %cond
}
@@ -172,7 +172,7 @@ entry:
define i32 @sltiu1(i32 %a) {
entry:
%cmp = icmp ugt i32 %a, 32767
- %cond = select i1 %cmp, i32 3, i32 4
+ %cond = select i1 %cmp, i32 3, i32 5
ret i32 %cond
}
@@ -183,7 +183,7 @@ entry:
define i32 @sltiu2(i32 %a) {
entry:
%cmp = icmp ugt i32 %a, -32769
- %cond = select i1 %cmp, i32 3, i32 4
+ %cond = select i1 %cmp, i32 3, i32 5
ret i32 %cond
}
@@ -193,6 +193,49 @@ entry:
define i32 @sltiu3(i32 %a) {
entry:
%cmp = icmp ugt i32 %a, -32770
- %cond = select i1 %cmp, i32 3, i32 4
+ %cond = select i1 %cmp, i32 3, i32 5
ret i32 %cond
}
+
+; Check that
+; (select (setxx a, N), x, x-1) or
+; (select (setxx a, N), x-1, x)
+; does not generate conditional moves
+; for constant operands whose difference is |1|.
+
+define i32 @slti4(i32 %a) nounwind readnone {
+ %1 = icmp slt i32 %a, 7
+ %2 = select i1 %1, i32 4, i32 3
+ ret i32 %2
+}
+
+; O32-LABEL: slti4:
+; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; O32-NOT: movn
+; O32:.size slti4
+
+define i32 @slti5(i32 %a) nounwind readnone {
+ %1 = icmp slt i32 %a, 7
+ %2 = select i1 %1, i32 -3, i32 -4
+ ret i32 %2
+}
+
+; O32-LABEL: slti5:
+; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; O32-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4
+; O32-NOT: movn
+; O32:.size slti5
+
+define i32 @slti6(i32 %a) nounwind readnone {
+ %1 = icmp slt i32 %a, 7
+ %2 = select i1 %1, i32 3, i32 4
+ ret i32 %2
+}
+
+; O32-LABEL: slti6:
+; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; O32-DAG: xori [[R1]], [[R1]], 1
+; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; O32-NOT: movn
+; O32:.size slti6
\ No newline at end of file
diff --git a/test/CodeGen/Mips/const-mult.ll b/test/CodeGen/Mips/const-mult.ll
index 8c0cbe3..1862021 100644
--- a/test/CodeGen/Mips/const-mult.ll
+++ b/test/CodeGen/Mips/const-mult.ll
@@ -1,6 +1,5 @@
; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=CHECK
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=CHECK
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=CHECK64
+; RUN: llc -march=mips64el < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK64
; CHECK-LABEL: mul5_32:
; CHECK: sll $[[R0:[0-9]+]], $4, 2
diff --git a/test/CodeGen/Mips/const4a.ll b/test/CodeGen/Mips/const4a.ll
index 0332327..b4c509f 100644
--- a/test/CodeGen/Mips/const4a.ll
+++ b/test/CodeGen/Mips/const4a.ll
@@ -15,14 +15,14 @@ define void @t() #0 {
entry:
store i32 -559023410, i32* @i, align 4
%0 = load i32* @b, align 4
-; no-load-relax lw ${{[0-9]+}}, $CPI0_1 # 16 bit inst
+; no-load-relax: lw ${{[0-9]+}}, $CPI0_1 # 16 bit inst
%tobool = icmp ne i32 %0, 0
br i1 %tobool, label %if.then, label %if.else
; no-load-relax: beqz ${{[0-9]+}}, $BB0_3
; no-load-relax: lw ${{[0-9]+}}, %call16(foo)(${{[0-9]+}})
; no-load-relax: b $BB0_4
; no-load-relax: .align 2
-; no-load-relax: $CPI0_0:
+; no-load-relax: $CPI0_1:
; no-load-relax: .4byte 3735943886
; no-load-relax: $BB0_3:
; no-load-relax: lw ${{[0-9]+}}, %call16(goo)(${{[0-9]+}})
diff --git a/test/CodeGen/Mips/const6.ll b/test/CodeGen/Mips/const6.ll
index 20cdc09..3f02ab9 100644
--- a/test/CodeGen/Mips/const6.ll
+++ b/test/CodeGen/Mips/const6.ll
@@ -27,7 +27,7 @@ entry:
; no-load-relax: jalrc ${{[0-9]+}}
; no-load-relax: b $BB0_2
; no-load-relax: .align 2
-; no-load-relax: $CPI0_0:
+; no-load-relax: $CPI0_1:
; no-load-relax: .4byte 3735943886
; no-load-relax: $BB0_2:
diff --git a/test/CodeGen/Mips/const6a.ll b/test/CodeGen/Mips/const6a.ll
index 8b402ac..d342390 100644
--- a/test/CodeGen/Mips/const6a.ll
+++ b/test/CodeGen/Mips/const6a.ll
@@ -19,7 +19,7 @@ entry:
; load-relax: $CPI0_0:
; load-relax: .4byte 3735943886
; load-relax: .end t
- call void asm sideeffect ".space 40000", ""() #1, !srcloc !1
+ call void asm sideeffect ".space 10000", ""() #1, !srcloc !1
ret void
}
diff --git a/test/CodeGen/Mips/ctlz.ll b/test/CodeGen/Mips/ctlz.ll
index 2ddb727..1f87166 100644
--- a/test/CodeGen/Mips/ctlz.ll
+++ b/test/CodeGen/Mips/ctlz.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=static
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=static
@x = global i32 28912, align 4
@y = common global i32 0, align 4
diff --git a/test/CodeGen/Mips/elf_eflags.ll b/test/CodeGen/Mips/elf_eflags.ll
new file mode 100644
index 0000000..336ed7b
--- /dev/null
+++ b/test/CodeGen/Mips/elf_eflags.ll
@@ -0,0 +1,83 @@
+; This tests ELF EFLAGS setting with direct object.
+; When the assembler is ready, a .s file for it will
+; be created.
+
+; Non-shared (static) is the absence of pic and or cpic.
+
+; EF_MIPS_NOREORDER (0x00000001) is always on by default currently
+; EF_MIPS_PIC (0x00000002)
+; EF_MIPS_CPIC (0x00000004) - See note below
+; EF_MIPS_ABI2 (0x00000020) - n32 not tested yet
+; EF_MIPS_ARCH_32 (0x50000000)
+; EF_MIPS_ARCH_64 (0x60000000)
+; EF_MIPS_ARCH_32R2 (0x70000000)
+; EF_MIPS_ARCH_64R2 (0x80000000)
+
+; Note that EF_MIPS_CPIC is set by -mabicalls which is the default on Linux
+; TODO need to support -mno-abicalls
+
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE32 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck -check-prefix=CHECK-LE32_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE32R2 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 %s -o - | FileCheck -check-prefix=CHECK-LE32R2_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MICROMIPS %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MICROMIPS_PIC %s
+
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 %s -o - | FileCheck -check-prefix=CHECK-LE64_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64R2 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 %s -o - | FileCheck -check-prefix=CHECK-LE64R2_PIC %s
+
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+mips16 -relocation-model=pic %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MIPS16 %s
+
+; 32(R1) bit with NO_REORDER and static
+; CHECK-LE32: .abicalls
+; CHECK-LE32: .option pic0
+; CHECK-LE32: .set noreorder
+;
+; 32(R1) bit with NO_REORDER and PIC
+; CHECK-LE32_PIC: .abicalls
+; CHECK-LE32_PIC: .set noreorder
+;
+; 32R2 bit with NO_REORDER and static
+; CHECK-LE32R2: .abicalls
+; CHECK-LE32R2: .option pic0
+; CHECK-LE32R2: .set noreorder
+;
+; 32R2 bit with NO_REORDER and PIC
+; CHECK-LE32R2_PIC: .abicalls
+; CHECK-LE32R2_PIC: .set noreorder
+;
+; 32R2 bit MICROMIPS with NO_REORDER and static
+; CHECK-LE32R2-MICROMIPS: .abicalls
+; CHECK-LE32R2-MICROMIPS: .option pic0
+; CHECK-LE32R2-MICROMIPS: .set micromips
+;
+; 32R2 bit MICROMIPS with NO_REORDER and PIC
+; CHECK-LE32R2-MICROMIPS_PIC: .abicalls
+; CHECK-LE32R2-MICROMIPS_PIC: .set micromips
+;
+; 64(R1) bit with NO_REORDER and static
+; CHECK-LE64: .abicalls
+; CHECK-LE64: .set noreorder
+;
+; 64(R1) bit with NO_REORDER and PIC
+; CHECK-LE64_PIC: .abicalls
+; CHECK-LE64_PIC: .set noreorder
+;
+; 64R2 bit with NO_REORDER and static
+; CHECK-LE64R2: .abicalls
+; CHECK-LE64R2: .set noreorder
+;
+; 64R2 bit with NO_REORDER and PIC
+; CHECK-LE64R2_PIC: .abicalls
+; CHECK-LE64R2_PIC: .set noreorder
+;
+; 32R2 bit MIPS16 with PIC
+; CHECK-LE32R2-MIPS16: .abicalls
+; CHECK-LE32R2-MIPS16: .set mips16
+
+define i32 @main() nounwind {
+entry:
+ ret i32 0
+}
diff --git a/test/CodeGen/Mips/elf_st_other.ll b/test/CodeGen/Mips/elf_st_other.ll
new file mode 100644
index 0000000..8a5f20d
--- /dev/null
+++ b/test/CodeGen/Mips/elf_st_other.ll
@@ -0,0 +1,12 @@
+; This tests value of ELF st_other field for function symbol table entries.
+; For microMIPS value should be equal to STO_MIPS_MICROMIPS.
+
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | FileCheck %s
+
+define i32 @main() nounwind {
+entry:
+ ret i32 0
+}
+
+; CHECK: .set micromips
+; CHECK: main:
diff --git a/test/CodeGen/Mips/ex2.ll b/test/CodeGen/Mips/ex2.ll
index c5535e7..6d024c2 100644
--- a/test/CodeGen/Mips/ex2.ll
+++ b/test/CodeGen/Mips/ex2.ll
@@ -6,12 +6,11 @@
define i32 @main() {
; 16-LABEL: main:
; 16: .cfi_startproc
-; 16: save $ra, $s0, $s1, $s2, 40
-; 16: .cfi_def_cfa_offset 40
-; 16: .cfi_offset 18, -8
-; 16: .cfi_offset 17, -12
-; 16: .cfi_offset 16, -16
+; 16: save $16, $17, $ra, 32 # 16 bit inst
+; 16: .cfi_def_cfa_offset 32
; 16: .cfi_offset 31, -4
+; 16: .cfi_offset 17, -8
+; 16: .cfi_offset 16, -12
; 16: .cfi_endproc
entry:
%retval = alloca i32, align 4
diff --git a/test/CodeGen/Mips/f16abs.ll b/test/CodeGen/Mips/f16abs.ll
index 928914f..0fba9c4 100644
--- a/test/CodeGen/Mips/f16abs.ll
+++ b/test/CodeGen/Mips/f16abs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=static
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=static
@y = global double -1.450000e+00, align 8
@x = common global double 0.000000e+00, align 8
diff --git a/test/CodeGen/Mips/fastcc.ll b/test/CodeGen/Mips/fastcc.ll
index 82919e7..8ee7af8 100644
--- a/test/CodeGen/Mips/fastcc.ll
+++ b/test/CodeGen/Mips/fastcc.ll
@@ -1,4 +1,7 @@
; RUN: llc < %s -march=mipsel | FileCheck %s
+; RUN: llc < %s -mtriple=mipsel-none-nacl-gnu \
+; RUN: | FileCheck %s -check-prefix=CHECK-NACL
+
@gi0 = external global i32
@gi1 = external global i32
@@ -95,6 +98,11 @@ entry:
; CHECK: lw $5
; CHECK: lw $4
+; t6, t7 and t8 are reserved in NaCl and cannot be used for fastcc.
+; CHECK-NACL-NOT: lw $14
+; CHECK-NACL-NOT: lw $15
+; CHECK-NACL-NOT: lw $24
+
%0 = load i32* @gi0, align 4
%1 = load i32* @gi1, align 4
%2 = load i32* @gi2, align 4
@@ -134,6 +142,11 @@ entry:
; CHECK: sw $24
; CHECK: sw $3
+; t6, t7 and t8 are reserved in NaCl and cannot be used for fastcc.
+; CHECK-NACL-NOT: sw $14
+; CHECK-NACL-NOT: sw $15
+; CHECK-NACL-NOT: sw $24
+
store i32 %a0, i32* @g0, align 4
store i32 %a1, i32* @g1, align 4
store i32 %a2, i32* @g2, align 4
diff --git a/test/CodeGen/Mips/fixdfsf.ll b/test/CodeGen/Mips/fixdfsf.ll
index b08eefd..4271ac2 100644
--- a/test/CodeGen/Mips/fixdfsf.ll
+++ b/test/CodeGen/Mips/fixdfsf.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic1
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic2
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=pic1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=pic2
@x = common global double 0.000000e+00, align 8
@y = common global i32 0, align 4
diff --git a/test/CodeGen/Mips/fmadd1.ll b/test/CodeGen/Mips/fmadd1.ll
index 435b419..6768ed6 100644
--- a/test/CodeGen/Mips/fmadd1.ll
+++ b/test/CodeGen/Mips/fmadd1.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefix=32R2
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=64R2
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2NAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2NAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefix=32R2 -check-prefix=CHECK
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=64R2 -check-prefix=CHECK
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2NAN -check-prefix=CHECK
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2NAN -check-prefix=CHECK
define float @FOO0float(float %a, float %b, float %c) nounwind readnone {
entry:
diff --git a/test/CodeGen/Mips/fp-indexed-ls.ll b/test/CodeGen/Mips/fp-indexed-ls.ll
index 1c4a3fd..d8c37e7 100644
--- a/test/CodeGen/Mips/fp-indexed-ls.ll
+++ b/test/CodeGen/Mips/fp-indexed-ls.ll
@@ -1,4 +1,6 @@
; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-none-nacl-gnu -mcpu=mips32r2 < %s \
+; RUN: | FileCheck %s -check-prefix=CHECK-NACL
%struct.S = type <{ [4 x float] }>
%struct.S2 = type <{ [4 x double] }>
@@ -13,6 +15,7 @@
define float @foo0(float* nocapture %b, i32 %o) nounwind readonly {
entry:
; CHECK: lwxc1
+; CHECK-NACL-NOT: lwxc1
%arrayidx = getelementptr inbounds float* %b, i32 %o
%0 = load float* %arrayidx, align 4
ret float %0
@@ -21,6 +24,7 @@ entry:
define double @foo1(double* nocapture %b, i32 %o) nounwind readonly {
entry:
; CHECK: ldxc1
+; CHECK-NACL-NOT: ldxc1
%arrayidx = getelementptr inbounds double* %b, i32 %o
%0 = load double* %arrayidx, align 8
ret double %0
@@ -37,6 +41,7 @@ entry:
define void @foo3(float* nocapture %b, i32 %o) nounwind {
entry:
; CHECK: swxc1
+; CHECK-NACL-NOT: swxc1
%0 = load float* @gf, align 4
%arrayidx = getelementptr inbounds float* %b, i32 %o
store float %0, float* %arrayidx, align 4
@@ -46,6 +51,7 @@ entry:
define void @foo4(double* nocapture %b, i32 %o) nounwind {
entry:
; CHECK: sdxc1
+; CHECK-NACL-NOT: sdxc1
%0 = load double* @gd, align 8
%arrayidx = getelementptr inbounds double* %b, i32 %o
store double %0, double* %arrayidx, align 8
diff --git a/test/CodeGen/Mips/fp16instrinsmc.ll b/test/CodeGen/Mips/fp16instrinsmc.ll
index bb43d27..7ced36c 100644
--- a/test/CodeGen/Mips/fp16instrinsmc.ll
+++ b/test/CodeGen/Mips/fp16instrinsmc.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=1010111 -mips-os16 < %s | FileCheck %s -check-prefix=fmask
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=pic
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static -mips32-function-mask=1010111 -mips-os16 < %s | FileCheck %s -check-prefix=fmask
@x = global float 1.500000e+00, align 4
@xn = global float -1.900000e+01, align 4
diff --git a/test/CodeGen/Mips/fp16mix.ll b/test/CodeGen/Mips/fp16mix.ll
index 8d85099..a94f838 100644
--- a/test/CodeGen/Mips/fp16mix.ll
+++ b/test/CodeGen/Mips/fp16mix.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=10 -mips-os16 < %s | FileCheck %s -check-prefix=fmask1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static -mips32-function-mask=10 -mips-os16 < %s | FileCheck %s -check-prefix=fmask1
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=01 -mips-os16 < %s | FileCheck %s -check-prefix=fmask2
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static -mips32-function-mask=01 -mips-os16 < %s | FileCheck %s -check-prefix=fmask2
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=10. -mips-os16 < %s | FileCheck %s -check-prefix=fmask1nr
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static -mips32-function-mask=10. -mips-os16 < %s | FileCheck %s -check-prefix=fmask1nr
; Function Attrs: nounwind optsize readnone
define void @foo1() {
@@ -17,7 +17,7 @@ entry:
; fmask1: .set reorder
; fmask1: .end foo1
; fmask2: .ent foo1
-; fmask2: save {{.*}}
+; fmask2: jrc $ra
; fmask2: .end foo1
; fmask1nr: .ent foo1
; fmask1nr: .set noreorder
@@ -42,10 +42,10 @@ entry:
; fmask2: .set reorder
; fmask2: .end foo2
; fmask1: .ent foo2
-; fmask1: save {{.*}}
+; fmask1: jrc $ra
; fmask1: .end foo2
; fmask1nr: .ent foo2
-; fmask1nr: save {{.*}}
+; fmask1nr: jrc $ra
; fmask1nr: .end foo2
}
@@ -62,10 +62,10 @@ entry:
; fmask1: .set reorder
; fmask1: .end foo3
; fmask2: .ent foo3
-; fmask2: save {{.*}}
+; fmask2: jrc $ra
; fmask2: .end foo3
; fmask1r: .ent foo3
-; fmask1r: save {{.*}}
+; fmask1r: jrc $ra
; fmask1r: .end foo3
}
@@ -82,10 +82,10 @@ entry:
; fmask2: .set reorder
; fmask2: .end foo4
; fmask1: .ent foo4
-; fmask1: save {{.*}}
+; fmask1: jrc $ra
; fmask1: .end foo4
; fmask1nr: .ent foo4
-; fmask1nr: save {{.*}}
+; fmask1nr: jrc $ra
; fmask1nr: .end foo4
}
diff --git a/test/CodeGen/Mips/fp16static.ll b/test/CodeGen/Mips/fp16static.ll
index 240ec75..beb063d 100644
--- a/test/CodeGen/Mips/fp16static.ll
+++ b/test/CodeGen/Mips/fp16static.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC16
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC16
@x = common global float 0.000000e+00, align 4
diff --git a/test/CodeGen/Mips/fpneeded.ll b/test/CodeGen/Mips/fpneeded.ll
index dcdebb9..fdd8e8f 100644
--- a/test/CodeGen/Mips/fpneeded.ll
+++ b/test/CodeGen/Mips/fpneeded.ll
@@ -10,7 +10,7 @@ entry:
ret float 1.000000e+00
}
-; 32: .set nomips16 # @fv
+; 32: .set nomips16
; 32: .ent fv
; 32: .set noreorder
; 32: .set nomacro
@@ -26,7 +26,7 @@ entry:
ret double 2.000000e+00
}
-; 32: .set nomips16 # @dv
+; 32: .set nomips16
; 32: .ent dv
; 32: .set noreorder
; 32: .set nomacro
@@ -44,7 +44,7 @@ entry:
ret void
}
-; 32: .set nomips16 # @vf
+; 32: .set nomips16
; 32: .ent vf
; 32: .set noreorder
; 32: .set nomacro
@@ -62,7 +62,7 @@ entry:
ret void
}
-; 32: .set nomips16 # @vd
+; 32: .set nomips16
; 32: .ent vd
; 32: .set noreorder
; 32: .set nomacro
@@ -83,7 +83,7 @@ entry:
ret void
}
-; 32: .set nomips16 # @foo1
+; 32: .set nomips16
; 32: .ent foo1
; 32: .set noreorder
; 32: .set nomacro
@@ -102,7 +102,7 @@ entry:
}
-; 32: .set nomips16 # @foo2
+; 32: .set nomips16
; 32: .ent foo2
; 32: .set noreorder
; 32: .set nomacro
@@ -120,7 +120,7 @@ entry:
ret void
}
-; 32: .set nomips16 # @foo3
+; 32: .set nomips16
; 32: .ent foo3
; 32: .set noreorder
; 32: .set nomacro
@@ -138,7 +138,7 @@ entry:
ret void
}
-; 32: .set mips16 # @vv
+; 32: .set mips16
; 32: .ent vv
; 32: save {{.+}}
diff --git a/test/CodeGen/Mips/fpnotneeded.ll b/test/CodeGen/Mips/fpnotneeded.ll
index b4fab64..e12d7ba 100644
--- a/test/CodeGen/Mips/fpnotneeded.ll
+++ b/test/CodeGen/Mips/fpnotneeded.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=static -O3 < %s -mips-os16 | FileCheck %s -check-prefix=32
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -relocation-model=static -O3 < %s -mips-os16 | FileCheck %s -check-prefix=32
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -relocation-model=static -O3 -mips16-constant-islands < %s -mips-os16 | FileCheck %s -check-prefix=cisle
@i = global i32 1, align 4
@f = global float 1.000000e+00, align 4
@@ -8,7 +10,7 @@ entry:
ret void
}
-; 32: .set mips16 # @vv
+; 32: .set mips16
; 32: .ent vv
; 32: save {{.+}}
@@ -21,7 +23,7 @@ entry:
ret i32 %0
}
-; 32: .set mips16 # @iv
+; 32: .set mips16
; 32: .ent iv
; 32: save {{.+}}
@@ -37,7 +39,7 @@ entry:
ret void
}
-; 32: .set mips16 # @vif
+; 32: .set mips16
; 32: .ent vif
; 32: save {{.+}}
@@ -50,13 +52,15 @@ entry:
ret void
}
-; 32: .set mips16 # @foo
+; 32: .set mips16
; 32: .ent foo
; 32: save {{.+}}
; 32: restore {{.+}}
; 32: .end foo
+; cisle: .end foo
+
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -65,7 +69,7 @@ entry:
ret float 1.000000e+00
}
-; 32: .set nomips16 # @fv
+; 32: .set nomips16
; 32: .ent fv
; 32: .set noreorder
; 32: .set nomacro
diff --git a/test/CodeGen/Mips/fptr2.ll b/test/CodeGen/Mips/fptr2.ll
index 77028db..c8b5e0d 100644
--- a/test/CodeGen/Mips/fptr2.ll
+++ b/test/CodeGen/Mips/fptr2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=static16
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=static16
; Function Attrs: nounwind
define double @my_mul(double %a, double %b) #0 {
diff --git a/test/CodeGen/Mips/global-address.ll b/test/CodeGen/Mips/global-address.ll
index 0d49a74..0785cfc 100644
--- a/test/CodeGen/Mips/global-address.ll
+++ b/test/CodeGen/Mips/global-address.ll
@@ -1,9 +1,9 @@
; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-O32
; RUN: llc -march=mipsel -relocation-model=static -mtriple=mipsel-linux-gnu < %s | FileCheck %s -check-prefix=STATIC-O32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n32 -relocation-model=static -mtriple=mipsel-linux-gnu < %s | FileCheck %s -check-prefix=STATIC-N32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n32 -relocation-model=static -mtriple=mipsel-linux-gnu < %s | FileCheck %s -check-prefix=STATIC-N32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=-n64,n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
@s1 = internal unnamed_addr global i32 8, align 4
@g1 = external global i32
diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll
index 058a041..36f4ad6 100644
--- a/test/CodeGen/Mips/helloworld.ll
+++ b/test/CodeGen/Mips/helloworld.ll
@@ -15,7 +15,7 @@ entry:
%call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0))
ret i32 0
-; SR: .set mips16 # @main
+; SR: .set mips16
; SR32: .set nomips16
; SR32: .ent main
@@ -25,10 +25,9 @@ entry:
; SR32: .set noreorder
; SR32: .set nomacro
; SR32: .set noat
-; SR: save $ra, $s0, $s1, $s2, [[FS:[0-9]+]]
+; SR: save $ra, 24 # 16 bit inst
; PE: .ent main
-; PE: .align 2
-; PE-NEXT: li $[[T1:[0-9]+]], %hi(_gp_disp)
+; PE: li $[[T1:[0-9]+]], %hi(_gp_disp)
; PE-NEXT: addiu $[[T2:[0-9]+]], $pc, %lo(_gp_disp)
; PE: sll $[[T3:[0-9]+]], $[[T1]], 16
; C1: lw ${{[0-9]+}}, %got($.str)(${{[0-9]+}})
@@ -37,7 +36,7 @@ entry:
; C2: move $25, ${{[0-9]+}}
; C1: move $gp, ${{[0-9]+}}
; C1: jalrc ${{[0-9]+}}
-; SR: restore $ra, $s0, $s1, $s2, [[FS]]
+; SR: restore $ra, 24 # 16 bit inst
; PE: li $2, 0
; PE: jrc $ra
diff --git a/test/CodeGen/Mips/hf16_1.ll b/test/CodeGen/Mips/hf16_1.ll
index c7454ee..9879cd5 100644
--- a/test/CodeGen/Mips/hf16_1.ll
+++ b/test/CodeGen/Mips/hf16_1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -soft-float -mips16-hard-float -O3 < %s | FileCheck %s -check-prefix=1
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -soft-float -mips16-hard-float -O3 < %s | FileCheck %s -check-prefix=2
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=1
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=2
@x = common global float 0.000000e+00, align 4
diff --git a/test/CodeGen/Mips/hf16call32.ll b/test/CodeGen/Mips/hf16call32.ll
index 461438e..aec9c71 100644
--- a/test/CodeGen/Mips/hf16call32.ll
+++ b/test/CodeGen/Mips/hf16call32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=stel
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=stel
@x = common global float 0.000000e+00, align 4
@y = common global float 0.000000e+00, align 4
diff --git a/test/CodeGen/Mips/hf16call32_body.ll b/test/CodeGen/Mips/hf16call32_body.ll
index 34bae26..adac314 100644
--- a/test/CodeGen/Mips/hf16call32_body.ll
+++ b/test/CodeGen/Mips/hf16call32_body.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=stel
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=stel
@x = external global float
@xd = external global double
diff --git a/test/CodeGen/Mips/hf1_body.ll b/test/CodeGen/Mips/hf1_body.ll
index b2cce92..5acfe86 100644
--- a/test/CodeGen/Mips/hf1_body.ll
+++ b/test/CodeGen/Mips/hf1_body.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -soft-float -mips16-hard-float < %s | FileCheck %s -check-prefix=picfp16
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=picfp16
@x = external global float
diff --git a/test/CodeGen/Mips/hfptrcall.ll b/test/CodeGen/Mips/hfptrcall.ll
index 25639da..9df8d90 100644
--- a/test/CodeGen/Mips/hfptrcall.ll
+++ b/test/CodeGen/Mips/hfptrcall.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=picel
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=picel
@ptrsv = global float ()* @sv, align 4
@ptrdv = global double ()* @dv, align 4
diff --git a/test/CodeGen/Mips/i32k.ll b/test/CodeGen/Mips/i32k.ll
index f4dd1eb..73f1302 100644
--- a/test/CodeGen/Mips/i32k.ll
+++ b/test/CodeGen/Mips/i32k.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -mips16-constant-islands=false -O3 < %s | FileCheck %s -check-prefix=16
@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
diff --git a/test/CodeGen/Mips/l3mc.ll b/test/CodeGen/Mips/l3mc.ll
new file mode 100644
index 0000000..3bfb389
--- /dev/null
+++ b/test/CodeGen/Mips/l3mc.ll
@@ -0,0 +1,114 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___fixunsdfsi
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___floatdidf
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___floatdisf
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___floatundidf
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___fixsfdi
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___fixunsdfdi
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___fixdfdi
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___fixunssfsi
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___fixunssfdi
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=__call_stub_fp___floatundisf
+
+@ll1 = global i64 0, align 8
+@ll2 = global i64 0, align 8
+@ll3 = global i64 0, align 8
+@l1 = global i32 0, align 4
+@l2 = global i32 0, align 4
+@l3 = global i32 0, align 4
+@ull1 = global i64 0, align 8
+@ull2 = global i64 0, align 8
+@ull3 = global i64 0, align 8
+@ul1 = global i32 0, align 4
+@ul2 = global i32 0, align 4
+@ul3 = global i32 0, align 4
+@d1 = global double 0.000000e+00, align 8
+@d2 = global double 0.000000e+00, align 8
+@d3 = global double 0.000000e+00, align 8
+@d4 = global double 0.000000e+00, align 8
+@f1 = global float 0.000000e+00, align 4
+@f2 = global float 0.000000e+00, align 4
+@f3 = global float 0.000000e+00, align 4
+@f4 = global float 0.000000e+00, align 4
+
+; Function Attrs: nounwind
+define void @_Z3foov() #0 {
+entry:
+ %0 = load double* @d1, align 8
+ %conv = fptosi double %0 to i64
+ store i64 %conv, i64* @ll1, align 8
+ %1 = load double* @d2, align 8
+ %conv1 = fptoui double %1 to i64
+ store i64 %conv1, i64* @ull1, align 8
+ %2 = load float* @f1, align 4
+ %conv2 = fptosi float %2 to i64
+ store i64 %conv2, i64* @ll2, align 8
+ %3 = load float* @f2, align 4
+ %conv3 = fptoui float %3 to i64
+ store i64 %conv3, i64* @ull2, align 8
+ %4 = load double* @d3, align 8
+ %conv4 = fptosi double %4 to i32
+ store i32 %conv4, i32* @l1, align 4
+ %5 = load double* @d4, align 8
+ %conv5 = fptoui double %5 to i32
+ store i32 %conv5, i32* @ul1, align 4
+ %6 = load float* @f3, align 4
+ %conv6 = fptosi float %6 to i32
+ store i32 %conv6, i32* @l2, align 4
+ %7 = load float* @f4, align 4
+ %conv7 = fptoui float %7 to i32
+ store i32 %conv7, i32* @ul2, align 4
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @_Z3goov() #0 {
+entry:
+ %0 = load i64* @ll1, align 8
+ %conv = sitofp i64 %0 to double
+ store double %conv, double* @d1, align 8
+ %1 = load i64* @ull1, align 8
+ %conv1 = uitofp i64 %1 to double
+ store double %conv1, double* @d2, align 8
+ %2 = load i64* @ll2, align 8
+ %conv2 = sitofp i64 %2 to float
+ store float %conv2, float* @f1, align 4
+ %3 = load i64* @ull2, align 8
+ %conv3 = uitofp i64 %3 to float
+ store float %conv3, float* @f2, align 4
+ %4 = load i32* @l1, align 4
+ %conv4 = sitofp i32 %4 to double
+ store double %conv4, double* @d3, align 8
+ %5 = load i32* @ul1, align 4
+ %conv5 = uitofp i32 %5 to double
+ store double %conv5, double* @d4, align 8
+ %6 = load i32* @l2, align 4
+ %conv6 = sitofp i32 %6 to float
+ store float %conv6, float* @f3, align 4
+ %7 = load i32* @ul2, align 4
+ %conv7 = uitofp i32 %7 to float
+ store float %conv7, float* @f4, align 4
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; __call_stub_fp___fixunsdfsi: __call_stub_fp___fixunsdfsi:
+; __call_stub_fp___floatdidf: __call_stub_fp___floatdidf:
+; __call_stub_fp___floatdisf: __call_stub_fp___floatdisf:
+; __call_stub_fp___floatundidf: __call_stub_fp___floatundidf:
+; __call_stub_fp___fixsfdi: __call_stub_fp___fixsfdi:
+; __call_stub_fp___fixunsdfdi: __call_stub_fp___fixunsdfdi:
+; __call_stub_fp___fixdfdi: __call_stub_fp___fixdfdi:
+; __call_stub_fp___fixunssfsi: __call_stub_fp___fixunssfsi:
+; __call_stub_fp___fixunssfdi: __call_stub_fp___fixunssfdi:
+; __call_stub_fp___floatundisf: __call_stub_fp___floatundisf:
+
diff --git a/test/CodeGen/Mips/largefr1.ll b/test/CodeGen/Mips/largefr1.ll
deleted file mode 100644
index 9a5fd08..0000000
--- a/test/CodeGen/Mips/largefr1.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; RUN: llc -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s -check-prefix=1
-
-
-@i = common global i32 0, align 4
-@j = common global i32 0, align 4
-@.str = private unnamed_addr constant [8 x i8] c"%i %i \0A\00", align 1
-
-define void @foo(i32* %p, i32 %i, i32 %j) nounwind {
-entry:
- %p.addr = alloca i32*, align 4
- %i.addr = alloca i32, align 4
- %j.addr = alloca i32, align 4
- store i32* %p, i32** %p.addr, align 4
- store i32 %i, i32* %i.addr, align 4
- store i32 %j, i32* %j.addr, align 4
- %0 = load i32* %j.addr, align 4
- %1 = load i32** %p.addr, align 4
- %2 = load i32* %i.addr, align 4
- %add.ptr = getelementptr inbounds i32* %1, i32 %2
- store i32 %0, i32* %add.ptr, align 4
- ret void
-}
-
-define i32 @main() nounwind {
-entry:
-; 1-LABEL: main:
-; 1: 1: .word -798000
-; 1: lw ${{[0-9]+}}, 1f
-; 1: b 2f
-; 1: .align 2
-; 1: .word 800020
-
-; 1: b 2f
-; 1: .align 2
-; 1: .word 400020
-
-; 1: move ${{[0-9]+}}, $sp
-; 1: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
-; 1: addiu ${{[0-9]+}}, ${{[0-9]+}}, 0
-
-
-
-; 1: b 2f
-; 1: .align 2
-; 1: .word 400220
-
-; 1: move ${{[0-9]+}}, $sp
-; 1: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
-; 1: lw ${{[0-9]+}}, 0(${{[0-9]+}})
-
-
-
-
- %retval = alloca i32, align 4
- %one = alloca [100000 x i32], align 4
- %two = alloca [100000 x i32], align 4
- store i32 0, i32* %retval
- %arrayidx = getelementptr inbounds [100000 x i32]* %one, i32 0, i32 0
- call void @foo(i32* %arrayidx, i32 50, i32 9999)
- %arrayidx1 = getelementptr inbounds [100000 x i32]* %two, i32 0, i32 0
- call void @foo(i32* %arrayidx1, i32 99999, i32 5555)
- %arrayidx2 = getelementptr inbounds [100000 x i32]* %one, i32 0, i32 50
- %0 = load i32* %arrayidx2, align 4
- store i32 %0, i32* @i, align 4
- %arrayidx3 = getelementptr inbounds [100000 x i32]* %two, i32 0, i32 99999
- %1 = load i32* %arrayidx3, align 4
- store i32 %1, i32* @j, align 4
- %2 = load i32* @i, align 4
- %3 = load i32* @j, align 4
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3)
- ret i32 0
-}
-
-declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/lcb2.ll b/test/CodeGen/Mips/lcb2.ll
new file mode 100644
index 0000000..715584b
--- /dev/null
+++ b/test/CodeGen/Mips/lcb2.ll
@@ -0,0 +1,133 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips16-constant-islands=true < %s | FileCheck %s -check-prefix=lcb
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips16-constant-islands=true < %s | FileCheck %s -check-prefix=lcbn
+
+@i = global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define i32 @bnez() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void asm sideeffect ".space 10000", ""() #1, !srcloc !5
+ store i32 0, i32* @i, align 4, !tbaa !1
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret i32 0
+}
+; lcb: .ent bnez
+; lcbn: .ent bnez
+; lcb: bnez ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}}
+; lcbn-NOT: bnez ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+; lcb: .end bnez
+; lcbn: .end bnez
+
+; Function Attrs: nounwind optsize
+define i32 @beqz() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 10, i32* @j, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 10000", ""() #1, !srcloc !6
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 55, i32* @j, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 10000", ""() #1, !srcloc !7
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret i32 0
+}
+
+; lcb: .ent beqz
+; lcbn: .ent beqz
+; lcb: beqz ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}}
+; lcbn-NOT: beqz ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+; lcb: .end beqz
+; lcbn: .end beqz
+
+
+; Function Attrs: nounwind optsize
+define void @bteqz() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %1 = load i32* @j, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32* @k, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 1000", ""() #1, !srcloc !8
+ br label %if.end
+
+if.else: ; preds = %entry
+ tail call void asm sideeffect ".space 1000", ""() #1, !srcloc !9
+ store i32 2, i32* @k, align 4, !tbaa !1
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+; lcb: .ent bteqz
+; lcbn: .ent bteqz
+; lcb: btnez $BB{{[0-9]+}}_{{[0-9]+}}
+; lcbn-NOT: btnez $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+; lcb: .end bteqz
+; lcbn: .end bteqz
+
+
+; Function Attrs: nounwind optsize
+define void @btz() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %1 = load i32* @j, align 4, !tbaa !1
+ %cmp1 = icmp sgt i32 %0, %1
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %entry, %if.then
+ tail call void asm sideeffect ".space 60000", ""() #1, !srcloc !10
+ %2 = load i32* @i, align 4, !tbaa !1
+ %3 = load i32* @j, align 4, !tbaa !1
+ %cmp = icmp sgt i32 %2, %3
+ br i1 %cmp, label %if.then, label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; lcb: .ent btz
+; lcbn: .ent btz
+; lcb: bteqz $BB{{[0-9]+}}_{{[0-9]+}}
+; lcbn-NOT: bteqz $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+; lcb: btnez $BB{{[0-9]+}}_{{[0-9]+}}
+; lcbn-NOT: btnez $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+; lcb: .end btz
+; lcbn: .end btz
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5 (gitosis@dmz-portal.mips.com:clang.git ed197d08c90d82e1119774e10920e6f7a841c8ec) (gitosis@dmz-portal.mips.com:llvm.git b9235a363fa2dddb26ac01cbaed58efbc9eff392)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{i32 59}
+!6 = metadata !{i32 156}
+!7 = metadata !{i32 210}
+!8 = metadata !{i32 299}
+!9 = metadata !{i32 340}
+!10 = metadata !{i32 412}
diff --git a/test/CodeGen/Mips/lcb3c.ll b/test/CodeGen/Mips/lcb3c.ll
new file mode 100644
index 0000000..72a0b8c
--- /dev/null
+++ b/test/CodeGen/Mips/lcb3c.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -O0 < %s | FileCheck %s -check-prefix=lcb
+
+@i = global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @s() #0 {
+entry:
+ %0 = load i32* @i, align 4
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 0, i32* @i, align 4
+ call void asm sideeffect ".space 1000", ""() #1, !srcloc !1
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 1, i32* @i, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret i32 0
+; lcb: bnez $2, $BB0_2
+; lcb: b $BB0_1 # 16 bit inst
+; lcb: $BB0_1: # %if.then
+}
+
+; Function Attrs: nounwind
+define i32 @b() #0 {
+entry:
+ %0 = load i32* @i, align 4
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 0, i32* @i, align 4
+ call void asm sideeffect ".space 1000000", ""() #1, !srcloc !2
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 1, i32* @i, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret i32 0
+}
+
+; lcb: beqz $2, $BB1_1 # 16 bit inst
+; lcb: jal $BB1_2 # branch
+; lcb: $BB1_1: # %if.then
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+
+!1 = metadata !{i32 65}
+!2 = metadata !{i32 167}
diff --git a/test/CodeGen/Mips/lcb4a.ll b/test/CodeGen/Mips/lcb4a.ll
new file mode 100644
index 0000000..e37feca
--- /dev/null
+++ b/test/CodeGen/Mips/lcb4a.ll
@@ -0,0 +1,69 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=ci
+
+@i = global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define i32 @foo() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ tail call void asm sideeffect ".space 1000", ""() #1, !srcloc !5
+ br label %if.end
+
+if.else: ; preds = %entry
+ tail call void asm sideeffect ".space 1004", ""() #1, !srcloc !6
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %storemerge = phi i32 [ 1, %if.else ], [ 0, %if.then ]
+ store i32 %storemerge, i32* @i, align 4, !tbaa !1
+ ret i32 0
+}
+
+; ci: beqz $3, $BB0_2
+; ci: # BB#1: # %if.else
+
+
+; Function Attrs: nounwind optsize
+define i32 @goo() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ tail call void asm sideeffect ".space 1000000", ""() #1, !srcloc !7
+ br label %if.end
+
+if.else: ; preds = %entry
+ tail call void asm sideeffect ".space 1000004", ""() #1, !srcloc !8
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %storemerge = phi i32 [ 1, %if.else ], [ 0, %if.then ]
+ store i32 %storemerge, i32* @i, align 4, !tbaa !1
+ ret i32 0
+}
+
+; ci: bnez $3, $BB1_1 # 16 bit inst
+; ci: jal $BB1_2 # branch
+; ci: nop
+; ci: $BB1_1: # %if.else
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{i32 58}
+!6 = metadata !{i32 108}
+!7 = metadata !{i32 190}
+!8 = metadata !{i32 243}
diff --git a/test/CodeGen/Mips/lcb5.ll b/test/CodeGen/Mips/lcb5.ll
new file mode 100644
index 0000000..0a89c80
--- /dev/null
+++ b/test/CodeGen/Mips/lcb5.ll
@@ -0,0 +1,240 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=ci
+
+@i = global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define i32 @x0() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ tail call void asm sideeffect ".space 1000", ""() #1, !srcloc !5
+ br label %if.end
+
+if.else: ; preds = %entry
+ tail call void asm sideeffect ".space 1004", ""() #1, !srcloc !6
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %storemerge = phi i32 [ 1, %if.else ], [ 0, %if.then ]
+ store i32 %storemerge, i32* @i, align 4, !tbaa !1
+ ret i32 0
+}
+
+; ci: .ent x0
+; ci: beqz $3, $BB0_2
+; ci: $BB0_2:
+; ci: .end x0
+
+; Function Attrs: nounwind optsize
+define i32 @x1() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ tail call void asm sideeffect ".space 1000000", ""() #1, !srcloc !7
+ br label %if.end
+
+if.else: ; preds = %entry
+ tail call void asm sideeffect ".space 1000004", ""() #1, !srcloc !8
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %storemerge = phi i32 [ 1, %if.else ], [ 0, %if.then ]
+ store i32 %storemerge, i32* @i, align 4, !tbaa !1
+ ret i32 0
+}
+
+; ci: .ent x1
+; ci: bnez $3, $BB1_1 # 16 bit inst
+; ci: jal $BB1_2 # branch
+; ci: nop
+; ci: $BB1_1:
+; ci: .end x1
+
+; Function Attrs: nounwind optsize
+define i32 @y0() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 10, i32* @j, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 1000", ""() #1, !srcloc !9
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 55, i32* @j, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 1004", ""() #1, !srcloc !10
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret i32 0
+}
+
+; ci: .ent y0
+; ci: beqz $2, $BB2_2
+; ci: .end y0
+
+; Function Attrs: nounwind optsize
+define i32 @y1() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 10, i32* @j, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 1000000", ""() #1, !srcloc !11
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 55, i32* @j, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 1000004", ""() #1, !srcloc !12
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret i32 0
+}
+
+; ci: .ent y1
+; ci: bnez $2, $BB3_1 # 16 bit inst
+; ci: jal $BB3_2 # branch
+; ci: nop
+; ci: $BB3_1:
+; ci: .end y1
+
+; Function Attrs: nounwind optsize
+define void @z0() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %1 = load i32* @j, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32* @k, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 10000", ""() #1, !srcloc !13
+ br label %if.end
+
+if.else: ; preds = %entry
+ tail call void asm sideeffect ".space 10004", ""() #1, !srcloc !14
+ store i32 2, i32* @k, align 4, !tbaa !1
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+; ci: .ent z0
+; ci: btnez $BB4_2
+; ci: .end z0
+
+; Function Attrs: nounwind optsize
+define void @z1() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %1 = load i32* @j, align 4, !tbaa !1
+ %cmp = icmp eq i32 %0, %1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32* @k, align 4, !tbaa !1
+ tail call void asm sideeffect ".space 10000000", ""() #1, !srcloc !15
+ br label %if.end
+
+if.else: ; preds = %entry
+ tail call void asm sideeffect ".space 10000004", ""() #1, !srcloc !16
+ store i32 2, i32* @k, align 4, !tbaa !1
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+; ci: .ent z1
+; ci: bteqz $BB5_1 # 16 bit inst
+; ci: jal $BB5_2 # branch
+; ci: nop
+; ci: $BB5_1:
+; ci: .end z1
+
+; Function Attrs: nounwind optsize
+define void @z3() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %1 = load i32* @j, align 4, !tbaa !1
+ %cmp1 = icmp sgt i32 %0, %1
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %entry, %if.then
+ tail call void asm sideeffect ".space 10000", ""() #1, !srcloc !17
+ %2 = load i32* @i, align 4, !tbaa !1
+ %3 = load i32* @j, align 4, !tbaa !1
+ %cmp = icmp sgt i32 %2, %3
+ br i1 %cmp, label %if.then, label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; ci: .ent z3
+; ci: bteqz $BB6_2
+; ci: .end z3
+
+; Function Attrs: nounwind optsize
+define void @z4() #0 {
+entry:
+ %0 = load i32* @i, align 4, !tbaa !1
+ %1 = load i32* @j, align 4, !tbaa !1
+ %cmp1 = icmp sgt i32 %0, %1
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %entry, %if.then
+ tail call void asm sideeffect ".space 10000000", ""() #1, !srcloc !18
+ %2 = load i32* @i, align 4, !tbaa !1
+ %3 = load i32* @j, align 4, !tbaa !1
+ %cmp = icmp sgt i32 %2, %3
+ br i1 %cmp, label %if.then, label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; ci: .ent z4
+; ci: btnez $BB7_1 # 16 bit inst
+; ci: jal $BB7_2 # branch
+; ci: nop
+; ci: .align 2
+; ci: $BB7_1:
+; ci: .end z4
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{i32 57}
+!6 = metadata !{i32 107}
+!7 = metadata !{i32 188}
+!8 = metadata !{i32 241}
+!9 = metadata !{i32 338}
+!10 = metadata !{i32 391}
+!11 = metadata !{i32 477}
+!12 = metadata !{i32 533}
+!13 = metadata !{i32 621}
+!14 = metadata !{i32 663}
+!15 = metadata !{i32 747}
+!16 = metadata !{i32 792}
+!17 = metadata !{i32 867}
+!18 = metadata !{i32 953}
diff --git a/test/CodeGen/Mips/mature-mc-support.ll b/test/CodeGen/Mips/mature-mc-support.ll
new file mode 100644
index 0000000..6e5998d
--- /dev/null
+++ b/test/CodeGen/Mips/mature-mc-support.ll
@@ -0,0 +1,32 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+; FIXME: Mips doesn't use the integrated assembler by default so we only test
+; that -filetype=obj tries to parse the assembly.
+
+; SKIP: not llc -march=mips < %s > /dev/null 2> %t1
+; SKIP: FileCheck %s < %t1
+
+; RUN: not llc -march=mips -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+; SKIP: not llc -march=mipsel < %s > /dev/null 2> %t3
+; SKIP: FileCheck %s < %t3
+
+; RUN: not llc -march=mipsel -filetype=obj < %s > /dev/null 2> %t4
+; RUN: FileCheck %s < %t4
+
+; SKIP: not llc -march=mips64 < %s > /dev/null 2> %t5
+; SKIP: FileCheck %s < %t5
+
+; RUN: not llc -march=mips64 -filetype=obj < %s > /dev/null 2> %t6
+; RUN: FileCheck %s < %t6
+
+; SKIP: not llc -march=mips64el < %s > /dev/null 2> %t7
+; SKIP: FileCheck %s < %t7
+
+; RUN: not llc -march=mips64el -filetype=obj < %s > /dev/null 2> %t8
+; RUN: FileCheck %s < %t8
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/Mips/mbrsize4a.ll b/test/CodeGen/Mips/mbrsize4a.ll
new file mode 100644
index 0000000..c802991
--- /dev/null
+++ b/test/CodeGen/Mips/mbrsize4a.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips16-constant-islands < %s | FileCheck %s -check-prefix=jal16
+
+@j = global i32 10, align 4
+@.str = private unnamed_addr constant [11 x i8] c"at bottom\0A\00", align 1
+@i = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ br label %z
+
+z: ; preds = %y, %entry
+ %call = call i32 bitcast (i32 (...)* @foo to i32 ()*)()
+ call void asm sideeffect ".space 10000000", ""() #2, !srcloc !1
+ br label %y
+
+y: ; preds = %z
+ %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0))
+ br label %z
+
+return: ; No predecessors!
+ %0 = load i32* %retval
+ ret i32 %0
+; jal16: jal $BB{{[0-9]+}}_{{[0-9]+}}
+}
+
+declare i32 @foo(...) #1
+
+declare i32 @printf(i8*, ...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!1 = metadata !{i32 68}
diff --git a/test/CodeGen/Mips/micromips-atomic.ll b/test/CodeGen/Mips/micromips-atomic.ll
new file mode 100644
index 0000000..a50e0b7
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-atomic.ll
@@ -0,0 +1,18 @@
+; RUN: llc %s -march=mipsel -mcpu=mips32r2 -mattr=micromips -filetype=asm \
+; RUN: -relocation-model=pic -o - | FileCheck %s
+
+@x = common global i32 0, align 4
+
+define i32 @AtomicLoadAdd32(i32 %incr) nounwind {
+entry:
+ %0 = atomicrmw add i32* @x, i32 %incr monotonic
+ ret i32 %0
+
+; CHECK-LABEL: AtomicLoadAdd32:
+; CHECK: lw $[[R0:[0-9]+]], %got(x)
+; CHECK: $[[BB0:[A-Z_0-9]+]]:
+; CHECK: ll $[[R1:[0-9]+]], 0($[[R0]])
+; CHECK: addu $[[R2:[0-9]+]], $[[R1]], $4
+; CHECK: sc $[[R2]], 0($[[R0]])
+; CHECK: beqz $[[R2]], $[[BB0]]
+}
diff --git a/test/CodeGen/Mips/micromips-jal.ll b/test/CodeGen/Mips/micromips-jal.ll
new file mode 100644
index 0000000..fccc229
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-jal.ll
@@ -0,0 +1,48 @@
+; RUN: llc %s -march=mipsel -mcpu=mips32r2 -mattr=micromips -filetype=asm \
+; RUN: -relocation-model=static -o - | FileCheck %s
+
+define i32 @sum(i32 %a, i32 %b) nounwind uwtable {
+entry:
+ %a.addr = alloca i32, align 4
+ %b.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 %b, i32* %b.addr, align 4
+ %0 = load i32* %a.addr, align 4
+ %1 = load i32* %b.addr, align 4
+ %add = add nsw i32 %0, %1
+ ret i32 %add
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = load i32* %y, align 4
+ %1 = load i32* %z, align 4
+ %call = call i32 @sum(i32 %0, i32 %1)
+ store i32 %call, i32* %x, align 4
+ %2 = load i32* %x, align 4
+ ret i32 %2
+}
+
+; CHECK: .text
+
+; CHECK: .globl sum
+; CHECK: .type sum,@function
+; CHECK: .set micromips
+; CHECK: .ent sum
+; CHECK-LABEL: sum:
+; CHECK: .end sum
+
+; CHECK: .globl main
+; CHECK: .type main,@function
+; CHECK: .set micromips
+; CHECK: .ent main
+; CHECK-LABEL: main:
+
+; CHECK: jal sum
+
+; CHECK: .end main
diff --git a/test/CodeGen/Mips/micromips-load-effective-address.ll b/test/CodeGen/Mips/micromips-load-effective-address.ll
new file mode 100644
index 0000000..afba760
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-load-effective-address.ll
@@ -0,0 +1,29 @@
+; RUN: llc %s -march=mipsel -mattr=micromips -filetype=asm \
+; RUN: -relocation-model=pic -O3 -o - | FileCheck %s
+
+define i32 @sum(i32* %x, i32* %y) nounwind uwtable {
+entry:
+ %x.addr = alloca i32*, align 8
+ %y.addr = alloca i32*, align 8
+ store i32* %x, i32** %x.addr, align 8
+ store i32* %y, i32** %y.addr, align 8
+ %0 = load i32** %x.addr, align 8
+ %1 = load i32* %0, align 4
+ %2 = load i32** %y.addr, align 8
+ %3 = load i32* %2, align 4
+ %add = add nsw i32 %1, %3
+ ret i32 %add
+}
+
+define i32 @main() nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ store i32 0, i32* %retval
+ %call = call i32 @sum(i32* %x, i32* %y)
+ ret i32 %call
+}
+
+; CHECK: addiu ${{[0-9]+}}, $sp, {{[0-9]+}}
+; CHECK: addiu ${{[0-9]+}}, $sp, {{[0-9]+}}
diff --git a/test/CodeGen/Mips/micromips-long-branch.ll b/test/CodeGen/Mips/micromips-long-branch.ll
new file mode 100644
index 0000000..3267f4a
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-long-branch.ll
@@ -0,0 +1,16437 @@
+; RUN: llc %s -march=mipsel -mcpu=mips32r2 -mattr=micromips -filetype=asm \
+; RUN: -relocation-model=pic -O3 -o - | FileCheck %s
+
+@a = common global [10 x i32] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+; CHECK: addiu $sp, $sp, -8
+; CHECK: sw $ra, 0($sp)
+; CHECK: lui $[[REG1:[0-9]+]], 2
+; CHECK: addiu $[[REG1]], $[[REG1]], 0
+; CHECK: addu $[[REG1]], $ra, $[[REG1]]
+; CHECK: lw $ra, 0($sp)
+; CHECK: jr $[[REG1]]
+; CHECK: addiu $sp, $sp, 8
+
+for.body:
+ %1 = load i32* %i, align 4
+ %2 = load i32* %i, align 4
+ %idxprom = sext i32 %2 to i64
+ %arrayidx = getelementptr inbounds [10 x i32]* @a, i32 0, i64 %idxprom
+  store i32 %1, i32* %arrayidx, align 4
+  %nop0 = alloca i1, i1 0
+ %nop1 = alloca i1, i1 0
+ %nop2 = alloca i1, i1 0
+ %nop3 = alloca i1, i1 0
+ %nop4 = alloca i1, i1 0
+ %nop5 = alloca i1, i1 0
+ %nop6 = alloca i1, i1 0
+ %nop7 = alloca i1, i1 0
+ %nop8 = alloca i1, i1 0
+ %nop9 = alloca i1, i1 0
+ %nop10 = alloca i1, i1 0
+ %nop11 = alloca i1, i1 0
+ %nop12 = alloca i1, i1 0
+ %nop13 = alloca i1, i1 0
+ %nop14 = alloca i1, i1 0
+ %nop15 = alloca i1, i1 0
+ %nop16 = alloca i1, i1 0
+ %nop17 = alloca i1, i1 0
+ %nop18 = alloca i1, i1 0
+ %nop19 = alloca i1, i1 0
+ %nop20 = alloca i1, i1 0
+ %nop21 = alloca i1, i1 0
+ %nop22 = alloca i1, i1 0
+ %nop23 = alloca i1, i1 0
+ %nop24 = alloca i1, i1 0
+ %nop25 = alloca i1, i1 0
+ %nop26 = alloca i1, i1 0
+ %nop27 = alloca i1, i1 0
+ %nop28 = alloca i1, i1 0
+ %nop29 = alloca i1, i1 0
+ %nop30 = alloca i1, i1 0
+ %nop31 = alloca i1, i1 0
+ %nop32 = alloca i1, i1 0
+ %nop33 = alloca i1, i1 0
+ %nop34 = alloca i1, i1 0
+ %nop35 = alloca i1, i1 0
+ %nop36 = alloca i1, i1 0
+ %nop37 = alloca i1, i1 0
+ %nop38 = alloca i1, i1 0
+ %nop39 = alloca i1, i1 0
+ %nop40 = alloca i1, i1 0
+ %nop41 = alloca i1, i1 0
+ %nop42 = alloca i1, i1 0
+ %nop43 = alloca i1, i1 0
+ %nop44 = alloca i1, i1 0
+ %nop45 = alloca i1, i1 0
+ %nop46 = alloca i1, i1 0
+ %nop47 = alloca i1, i1 0
+ %nop48 = alloca i1, i1 0
+ %nop49 = alloca i1, i1 0
+ %nop50 = alloca i1, i1 0
+ %nop51 = alloca i1, i1 0
+ %nop52 = alloca i1, i1 0
+ %nop53 = alloca i1, i1 0
+ %nop54 = alloca i1, i1 0
+ %nop55 = alloca i1, i1 0
+ %nop56 = alloca i1, i1 0
+ %nop57 = alloca i1, i1 0
+ %nop58 = alloca i1, i1 0
+ %nop59 = alloca i1, i1 0
+ %nop60 = alloca i1, i1 0
+ %nop61 = alloca i1, i1 0
+ %nop62 = alloca i1, i1 0
+ %nop63 = alloca i1, i1 0
+ %nop64 = alloca i1, i1 0
+ %nop65 = alloca i1, i1 0
+ %nop66 = alloca i1, i1 0
+ %nop67 = alloca i1, i1 0
+ %nop68 = alloca i1, i1 0
+ %nop69 = alloca i1, i1 0
+ %nop70 = alloca i1, i1 0
+ %nop71 = alloca i1, i1 0
+ %nop72 = alloca i1, i1 0
+ %nop73 = alloca i1, i1 0
+ %nop74 = alloca i1, i1 0
+ %nop75 = alloca i1, i1 0
+ %nop76 = alloca i1, i1 0
+ %nop77 = alloca i1, i1 0
+ %nop78 = alloca i1, i1 0
+ %nop79 = alloca i1, i1 0
+ %nop80 = alloca i1, i1 0
+ %nop81 = alloca i1, i1 0
+ %nop82 = alloca i1, i1 0
+ %nop83 = alloca i1, i1 0
+ %nop84 = alloca i1, i1 0
+ %nop85 = alloca i1, i1 0
+ %nop86 = alloca i1, i1 0
+ %nop87 = alloca i1, i1 0
+ %nop88 = alloca i1, i1 0
+ %nop89 = alloca i1, i1 0
+ %nop90 = alloca i1, i1 0
+ %nop91 = alloca i1, i1 0
+ %nop92 = alloca i1, i1 0
+ %nop93 = alloca i1, i1 0
+ %nop94 = alloca i1, i1 0
+ %nop95 = alloca i1, i1 0
+ %nop96 = alloca i1, i1 0
+ %nop97 = alloca i1, i1 0
+ %nop98 = alloca i1, i1 0
+ %nop99 = alloca i1, i1 0
+ %nop100 = alloca i1, i1 0
+ %nop101 = alloca i1, i1 0
+ %nop102 = alloca i1, i1 0
+ %nop103 = alloca i1, i1 0
+ %nop104 = alloca i1, i1 0
+ %nop105 = alloca i1, i1 0
+ %nop106 = alloca i1, i1 0
+ %nop107 = alloca i1, i1 0
+ %nop108 = alloca i1, i1 0
+ %nop109 = alloca i1, i1 0
+ %nop110 = alloca i1, i1 0
+ %nop111 = alloca i1, i1 0
+ %nop112 = alloca i1, i1 0
+ %nop113 = alloca i1, i1 0
+ %nop114 = alloca i1, i1 0
+ %nop115 = alloca i1, i1 0
+ %nop116 = alloca i1, i1 0
+ %nop117 = alloca i1, i1 0
+ %nop118 = alloca i1, i1 0
+ %nop119 = alloca i1, i1 0
+ %nop120 = alloca i1, i1 0
+ %nop121 = alloca i1, i1 0
+ %nop122 = alloca i1, i1 0
+ %nop123 = alloca i1, i1 0
+ %nop124 = alloca i1, i1 0
+ %nop125 = alloca i1, i1 0
+ %nop126 = alloca i1, i1 0
+ %nop127 = alloca i1, i1 0
+ %nop128 = alloca i1, i1 0
+ %nop129 = alloca i1, i1 0
+ %nop130 = alloca i1, i1 0
+ %nop131 = alloca i1, i1 0
+ %nop132 = alloca i1, i1 0
+ %nop133 = alloca i1, i1 0
+ %nop134 = alloca i1, i1 0
+ %nop135 = alloca i1, i1 0
+ %nop136 = alloca i1, i1 0
+ %nop137 = alloca i1, i1 0
+ %nop138 = alloca i1, i1 0
+ %nop139 = alloca i1, i1 0
+ %nop140 = alloca i1, i1 0
+ %nop141 = alloca i1, i1 0
+ %nop142 = alloca i1, i1 0
+ %nop143 = alloca i1, i1 0
+ %nop144 = alloca i1, i1 0
+ %nop145 = alloca i1, i1 0
+ %nop146 = alloca i1, i1 0
+ %nop147 = alloca i1, i1 0
+ %nop148 = alloca i1, i1 0
+ %nop149 = alloca i1, i1 0
+ %nop150 = alloca i1, i1 0
+ %nop151 = alloca i1, i1 0
+ %nop152 = alloca i1, i1 0
+ %nop153 = alloca i1, i1 0
+ %nop154 = alloca i1, i1 0
+ %nop155 = alloca i1, i1 0
+ %nop156 = alloca i1, i1 0
+ %nop157 = alloca i1, i1 0
+ %nop158 = alloca i1, i1 0
+ %nop159 = alloca i1, i1 0
+ %nop160 = alloca i1, i1 0
+ %nop161 = alloca i1, i1 0
+ %nop162 = alloca i1, i1 0
+ %nop163 = alloca i1, i1 0
+ %nop164 = alloca i1, i1 0
+ %nop165 = alloca i1, i1 0
+ %nop166 = alloca i1, i1 0
+ %nop167 = alloca i1, i1 0
+ %nop168 = alloca i1, i1 0
+ %nop169 = alloca i1, i1 0
+ %nop170 = alloca i1, i1 0
+ %nop171 = alloca i1, i1 0
+ %nop172 = alloca i1, i1 0
+ %nop173 = alloca i1, i1 0
+ %nop174 = alloca i1, i1 0
+ %nop175 = alloca i1, i1 0
+ %nop176 = alloca i1, i1 0
+ %nop177 = alloca i1, i1 0
+ %nop178 = alloca i1, i1 0
+ %nop179 = alloca i1, i1 0
+ %nop180 = alloca i1, i1 0
+ %nop181 = alloca i1, i1 0
+ %nop182 = alloca i1, i1 0
+ %nop183 = alloca i1, i1 0
+ %nop184 = alloca i1, i1 0
+ %nop185 = alloca i1, i1 0
+ %nop186 = alloca i1, i1 0
+ %nop187 = alloca i1, i1 0
+ %nop188 = alloca i1, i1 0
+ %nop189 = alloca i1, i1 0
+ %nop190 = alloca i1, i1 0
+ %nop191 = alloca i1, i1 0
+ %nop192 = alloca i1, i1 0
+ %nop193 = alloca i1, i1 0
+ %nop194 = alloca i1, i1 0
+ %nop195 = alloca i1, i1 0
+ %nop196 = alloca i1, i1 0
+ %nop197 = alloca i1, i1 0
+ %nop198 = alloca i1, i1 0
+ %nop199 = alloca i1, i1 0
+ %nop200 = alloca i1, i1 0
+ %nop201 = alloca i1, i1 0
+ %nop202 = alloca i1, i1 0
+ %nop203 = alloca i1, i1 0
+ %nop204 = alloca i1, i1 0
+ %nop205 = alloca i1, i1 0
+ %nop206 = alloca i1, i1 0
+ %nop207 = alloca i1, i1 0
+ %nop208 = alloca i1, i1 0
+ %nop209 = alloca i1, i1 0
+ %nop210 = alloca i1, i1 0
+ %nop211 = alloca i1, i1 0
+ %nop212 = alloca i1, i1 0
+ %nop213 = alloca i1, i1 0
+ %nop214 = alloca i1, i1 0
+ %nop215 = alloca i1, i1 0
+ %nop216 = alloca i1, i1 0
+ %nop217 = alloca i1, i1 0
+ %nop218 = alloca i1, i1 0
+ %nop219 = alloca i1, i1 0
+ %nop220 = alloca i1, i1 0
+ %nop221 = alloca i1, i1 0
+ %nop222 = alloca i1, i1 0
+ %nop223 = alloca i1, i1 0
+ %nop224 = alloca i1, i1 0
+ %nop225 = alloca i1, i1 0
+ %nop226 = alloca i1, i1 0
+ %nop227 = alloca i1, i1 0
+ %nop228 = alloca i1, i1 0
+ %nop229 = alloca i1, i1 0
+ %nop230 = alloca i1, i1 0
+ %nop231 = alloca i1, i1 0
+ %nop232 = alloca i1, i1 0
+ %nop233 = alloca i1, i1 0
+ %nop234 = alloca i1, i1 0
+ %nop235 = alloca i1, i1 0
+ %nop236 = alloca i1, i1 0
+ %nop237 = alloca i1, i1 0
+ %nop238 = alloca i1, i1 0
+ %nop239 = alloca i1, i1 0
+ %nop240 = alloca i1, i1 0
+ %nop241 = alloca i1, i1 0
+ %nop242 = alloca i1, i1 0
+ %nop243 = alloca i1, i1 0
+ %nop244 = alloca i1, i1 0
+ %nop245 = alloca i1, i1 0
+ %nop246 = alloca i1, i1 0
+ %nop247 = alloca i1, i1 0
+ %nop248 = alloca i1, i1 0
+ %nop249 = alloca i1, i1 0
+ %nop250 = alloca i1, i1 0
+ %nop251 = alloca i1, i1 0
+ %nop252 = alloca i1, i1 0
+ %nop253 = alloca i1, i1 0
+ %nop254 = alloca i1, i1 0
+ %nop255 = alloca i1, i1 0
+ %nop256 = alloca i1, i1 0
+ %nop257 = alloca i1, i1 0
+ %nop258 = alloca i1, i1 0
+ %nop259 = alloca i1, i1 0
+ %nop260 = alloca i1, i1 0
+ %nop261 = alloca i1, i1 0
+ %nop262 = alloca i1, i1 0
+ %nop263 = alloca i1, i1 0
+ %nop264 = alloca i1, i1 0
+ %nop265 = alloca i1, i1 0
+ %nop266 = alloca i1, i1 0
+ %nop267 = alloca i1, i1 0
+ %nop268 = alloca i1, i1 0
+ %nop269 = alloca i1, i1 0
+ %nop270 = alloca i1, i1 0
+ %nop271 = alloca i1, i1 0
+ %nop272 = alloca i1, i1 0
+ %nop273 = alloca i1, i1 0
+ %nop274 = alloca i1, i1 0
+ %nop275 = alloca i1, i1 0
+ %nop276 = alloca i1, i1 0
+ %nop277 = alloca i1, i1 0
+ %nop278 = alloca i1, i1 0
+ %nop279 = alloca i1, i1 0
+ %nop280 = alloca i1, i1 0
+ %nop281 = alloca i1, i1 0
+ %nop282 = alloca i1, i1 0
+ %nop283 = alloca i1, i1 0
+ %nop284 = alloca i1, i1 0
+ %nop285 = alloca i1, i1 0
+ %nop286 = alloca i1, i1 0
+ %nop287 = alloca i1, i1 0
+ %nop288 = alloca i1, i1 0
+ %nop289 = alloca i1, i1 0
+ %nop290 = alloca i1, i1 0
+ %nop291 = alloca i1, i1 0
+ %nop292 = alloca i1, i1 0
+ %nop293 = alloca i1, i1 0
+ %nop294 = alloca i1, i1 0
+ %nop295 = alloca i1, i1 0
+ %nop296 = alloca i1, i1 0
+ %nop297 = alloca i1, i1 0
+ %nop298 = alloca i1, i1 0
+ %nop299 = alloca i1, i1 0
+ %nop300 = alloca i1, i1 0
+ %nop301 = alloca i1, i1 0
+ %nop302 = alloca i1, i1 0
+ %nop303 = alloca i1, i1 0
+ %nop304 = alloca i1, i1 0
+ %nop305 = alloca i1, i1 0
+ %nop306 = alloca i1, i1 0
+ %nop307 = alloca i1, i1 0
+ %nop308 = alloca i1, i1 0
+ %nop309 = alloca i1, i1 0
+ %nop310 = alloca i1, i1 0
+ %nop311 = alloca i1, i1 0
+ %nop312 = alloca i1, i1 0
+ %nop313 = alloca i1, i1 0
+ %nop314 = alloca i1, i1 0
+ %nop315 = alloca i1, i1 0
+ %nop316 = alloca i1, i1 0
+ %nop317 = alloca i1, i1 0
+ %nop318 = alloca i1, i1 0
+ %nop319 = alloca i1, i1 0
+ %nop320 = alloca i1, i1 0
+ %nop321 = alloca i1, i1 0
+ %nop322 = alloca i1, i1 0
+ %nop323 = alloca i1, i1 0
+ %nop324 = alloca i1, i1 0
+ %nop325 = alloca i1, i1 0
+ %nop326 = alloca i1, i1 0
+ %nop327 = alloca i1, i1 0
+ %nop328 = alloca i1, i1 0
+ %nop329 = alloca i1, i1 0
+ %nop330 = alloca i1, i1 0
+ %nop331 = alloca i1, i1 0
+ %nop332 = alloca i1, i1 0
+ %nop333 = alloca i1, i1 0
+ %nop334 = alloca i1, i1 0
+ %nop335 = alloca i1, i1 0
+ %nop336 = alloca i1, i1 0
+ %nop337 = alloca i1, i1 0
+ %nop338 = alloca i1, i1 0
+ %nop339 = alloca i1, i1 0
+ %nop340 = alloca i1, i1 0
+ %nop341 = alloca i1, i1 0
+ %nop342 = alloca i1, i1 0
+ %nop343 = alloca i1, i1 0
+ %nop344 = alloca i1, i1 0
+ %nop345 = alloca i1, i1 0
+ %nop346 = alloca i1, i1 0
+ %nop347 = alloca i1, i1 0
+ %nop348 = alloca i1, i1 0
+ %nop349 = alloca i1, i1 0
+ %nop350 = alloca i1, i1 0
+ %nop351 = alloca i1, i1 0
+ %nop352 = alloca i1, i1 0
+ %nop353 = alloca i1, i1 0
+ %nop354 = alloca i1, i1 0
+ %nop355 = alloca i1, i1 0
+ %nop356 = alloca i1, i1 0
+ %nop357 = alloca i1, i1 0
+ %nop358 = alloca i1, i1 0
+ %nop359 = alloca i1, i1 0
+ %nop360 = alloca i1, i1 0
+ %nop361 = alloca i1, i1 0
+ %nop362 = alloca i1, i1 0
+ %nop363 = alloca i1, i1 0
+ %nop364 = alloca i1, i1 0
+ %nop365 = alloca i1, i1 0
+ %nop366 = alloca i1, i1 0
+ %nop367 = alloca i1, i1 0
+ %nop368 = alloca i1, i1 0
+ %nop369 = alloca i1, i1 0
+ %nop370 = alloca i1, i1 0
+ %nop371 = alloca i1, i1 0
+ %nop372 = alloca i1, i1 0
+ %nop373 = alloca i1, i1 0
+ %nop374 = alloca i1, i1 0
+ %nop375 = alloca i1, i1 0
+ %nop376 = alloca i1, i1 0
+ %nop377 = alloca i1, i1 0
+ %nop378 = alloca i1, i1 0
+ %nop379 = alloca i1, i1 0
+ %nop380 = alloca i1, i1 0
+ %nop381 = alloca i1, i1 0
+ %nop382 = alloca i1, i1 0
+ %nop383 = alloca i1, i1 0
+ %nop384 = alloca i1, i1 0
+ %nop385 = alloca i1, i1 0
+ %nop386 = alloca i1, i1 0
+ %nop387 = alloca i1, i1 0
+ %nop388 = alloca i1, i1 0
+ %nop389 = alloca i1, i1 0
+ %nop390 = alloca i1, i1 0
+ %nop391 = alloca i1, i1 0
+ %nop392 = alloca i1, i1 0
+ %nop393 = alloca i1, i1 0
+ %nop394 = alloca i1, i1 0
+ %nop395 = alloca i1, i1 0
+ %nop396 = alloca i1, i1 0
+ %nop397 = alloca i1, i1 0
+ %nop398 = alloca i1, i1 0
+ %nop399 = alloca i1, i1 0
+ %nop400 = alloca i1, i1 0
+ %nop401 = alloca i1, i1 0
+ %nop402 = alloca i1, i1 0
+ %nop403 = alloca i1, i1 0
+ %nop404 = alloca i1, i1 0
+ %nop405 = alloca i1, i1 0
+ %nop406 = alloca i1, i1 0
+ %nop407 = alloca i1, i1 0
+ %nop408 = alloca i1, i1 0
+ %nop409 = alloca i1, i1 0
+ %nop410 = alloca i1, i1 0
+ %nop411 = alloca i1, i1 0
+ %nop412 = alloca i1, i1 0
+ %nop413 = alloca i1, i1 0
+ %nop414 = alloca i1, i1 0
+ %nop415 = alloca i1, i1 0
+ %nop416 = alloca i1, i1 0
+ %nop417 = alloca i1, i1 0
+ %nop418 = alloca i1, i1 0
+ %nop419 = alloca i1, i1 0
+ %nop420 = alloca i1, i1 0
+ %nop421 = alloca i1, i1 0
+ %nop422 = alloca i1, i1 0
+ %nop423 = alloca i1, i1 0
+ %nop424 = alloca i1, i1 0
+ %nop425 = alloca i1, i1 0
+ %nop426 = alloca i1, i1 0
+ %nop427 = alloca i1, i1 0
+ %nop428 = alloca i1, i1 0
+ %nop429 = alloca i1, i1 0
+ %nop430 = alloca i1, i1 0
+ %nop431 = alloca i1, i1 0
+ %nop432 = alloca i1, i1 0
+ %nop433 = alloca i1, i1 0
+ %nop434 = alloca i1, i1 0
+ %nop435 = alloca i1, i1 0
+ %nop436 = alloca i1, i1 0
+ %nop437 = alloca i1, i1 0
+ %nop438 = alloca i1, i1 0
+ %nop439 = alloca i1, i1 0
+ %nop440 = alloca i1, i1 0
+ %nop441 = alloca i1, i1 0
+ %nop442 = alloca i1, i1 0
+ %nop443 = alloca i1, i1 0
+ %nop444 = alloca i1, i1 0
+ %nop445 = alloca i1, i1 0
+ %nop446 = alloca i1, i1 0
+ %nop447 = alloca i1, i1 0
+ %nop448 = alloca i1, i1 0
+ %nop449 = alloca i1, i1 0
+ %nop450 = alloca i1, i1 0
+ %nop451 = alloca i1, i1 0
+ %nop452 = alloca i1, i1 0
+ %nop453 = alloca i1, i1 0
+ %nop454 = alloca i1, i1 0
+ %nop455 = alloca i1, i1 0
+ %nop456 = alloca i1, i1 0
+ %nop457 = alloca i1, i1 0
+ %nop458 = alloca i1, i1 0
+ %nop459 = alloca i1, i1 0
+ %nop460 = alloca i1, i1 0
+ %nop461 = alloca i1, i1 0
+ %nop462 = alloca i1, i1 0
+ %nop463 = alloca i1, i1 0
+ %nop464 = alloca i1, i1 0
+ %nop465 = alloca i1, i1 0
+ %nop466 = alloca i1, i1 0
+ %nop467 = alloca i1, i1 0
+ %nop468 = alloca i1, i1 0
+ %nop469 = alloca i1, i1 0
+ %nop470 = alloca i1, i1 0
+ %nop471 = alloca i1, i1 0
+ %nop472 = alloca i1, i1 0
+ %nop473 = alloca i1, i1 0
+ %nop474 = alloca i1, i1 0
+ %nop475 = alloca i1, i1 0
+ %nop476 = alloca i1, i1 0
+ %nop477 = alloca i1, i1 0
+ %nop478 = alloca i1, i1 0
+ %nop479 = alloca i1, i1 0
+ %nop480 = alloca i1, i1 0
+ %nop481 = alloca i1, i1 0
+ %nop482 = alloca i1, i1 0
+ %nop483 = alloca i1, i1 0
+ %nop484 = alloca i1, i1 0
+ %nop485 = alloca i1, i1 0
+ %nop486 = alloca i1, i1 0
+ %nop487 = alloca i1, i1 0
+ %nop488 = alloca i1, i1 0
+ %nop489 = alloca i1, i1 0
+ %nop490 = alloca i1, i1 0
+ %nop491 = alloca i1, i1 0
+ %nop492 = alloca i1, i1 0
+ %nop493 = alloca i1, i1 0
+ %nop494 = alloca i1, i1 0
+ %nop495 = alloca i1, i1 0
+ %nop496 = alloca i1, i1 0
+ %nop497 = alloca i1, i1 0
+ %nop498 = alloca i1, i1 0
+ %nop499 = alloca i1, i1 0
+ %nop500 = alloca i1, i1 0
+ %nop501 = alloca i1, i1 0
+ %nop502 = alloca i1, i1 0
+ %nop503 = alloca i1, i1 0
+ %nop504 = alloca i1, i1 0
+ %nop505 = alloca i1, i1 0
+ %nop506 = alloca i1, i1 0
+ %nop507 = alloca i1, i1 0
+ %nop508 = alloca i1, i1 0
+ %nop509 = alloca i1, i1 0
+ %nop510 = alloca i1, i1 0
+ %nop511 = alloca i1, i1 0
+ %nop512 = alloca i1, i1 0
+ %nop513 = alloca i1, i1 0
+ %nop514 = alloca i1, i1 0
+ %nop515 = alloca i1, i1 0
+ %nop516 = alloca i1, i1 0
+ %nop517 = alloca i1, i1 0
+ %nop518 = alloca i1, i1 0
+ %nop519 = alloca i1, i1 0
+ %nop520 = alloca i1, i1 0
+ %nop521 = alloca i1, i1 0
+ %nop522 = alloca i1, i1 0
+ %nop523 = alloca i1, i1 0
+ %nop524 = alloca i1, i1 0
+ %nop525 = alloca i1, i1 0
+ %nop526 = alloca i1, i1 0
+ %nop527 = alloca i1, i1 0
+ %nop528 = alloca i1, i1 0
+ %nop529 = alloca i1, i1 0
+ %nop530 = alloca i1, i1 0
+ %nop531 = alloca i1, i1 0
+ %nop532 = alloca i1, i1 0
+ %nop533 = alloca i1, i1 0
+ %nop534 = alloca i1, i1 0
+ %nop535 = alloca i1, i1 0
+ %nop536 = alloca i1, i1 0
+ %nop537 = alloca i1, i1 0
+ %nop538 = alloca i1, i1 0
+ %nop539 = alloca i1, i1 0
+ %nop540 = alloca i1, i1 0
+ %nop541 = alloca i1, i1 0
+ %nop542 = alloca i1, i1 0
+ %nop543 = alloca i1, i1 0
+ %nop544 = alloca i1, i1 0
+ %nop545 = alloca i1, i1 0
+ %nop546 = alloca i1, i1 0
+ %nop547 = alloca i1, i1 0
+ %nop548 = alloca i1, i1 0
+ %nop549 = alloca i1, i1 0
+ %nop550 = alloca i1, i1 0
+ %nop551 = alloca i1, i1 0
+ %nop552 = alloca i1, i1 0
+ %nop553 = alloca i1, i1 0
+ %nop554 = alloca i1, i1 0
+ %nop555 = alloca i1, i1 0
+ %nop556 = alloca i1, i1 0
+ %nop557 = alloca i1, i1 0
+ %nop558 = alloca i1, i1 0
+ %nop559 = alloca i1, i1 0
+ %nop560 = alloca i1, i1 0
+ %nop561 = alloca i1, i1 0
+ %nop562 = alloca i1, i1 0
+ %nop563 = alloca i1, i1 0
+ %nop564 = alloca i1, i1 0
+ %nop565 = alloca i1, i1 0
+ %nop566 = alloca i1, i1 0
+ %nop567 = alloca i1, i1 0
+ %nop568 = alloca i1, i1 0
+ %nop569 = alloca i1, i1 0
+ %nop570 = alloca i1, i1 0
+ %nop571 = alloca i1, i1 0
+ %nop572 = alloca i1, i1 0
+ %nop573 = alloca i1, i1 0
+ %nop574 = alloca i1, i1 0
+ %nop575 = alloca i1, i1 0
+ %nop576 = alloca i1, i1 0
+ %nop577 = alloca i1, i1 0
+ %nop578 = alloca i1, i1 0
+ %nop579 = alloca i1, i1 0
+ %nop580 = alloca i1, i1 0
+ %nop581 = alloca i1, i1 0
+ %nop582 = alloca i1, i1 0
+ %nop583 = alloca i1, i1 0
+ %nop584 = alloca i1, i1 0
+ %nop585 = alloca i1, i1 0
+ %nop586 = alloca i1, i1 0
+ %nop587 = alloca i1, i1 0
+ %nop588 = alloca i1, i1 0
+ %nop589 = alloca i1, i1 0
+ %nop590 = alloca i1, i1 0
+ %nop591 = alloca i1, i1 0
+ %nop592 = alloca i1, i1 0
+ %nop593 = alloca i1, i1 0
+ %nop594 = alloca i1, i1 0
+ %nop595 = alloca i1, i1 0
+ %nop596 = alloca i1, i1 0
+ %nop597 = alloca i1, i1 0
+ %nop598 = alloca i1, i1 0
+ %nop599 = alloca i1, i1 0
+ %nop600 = alloca i1, i1 0
+ %nop601 = alloca i1, i1 0
+ %nop602 = alloca i1, i1 0
+ %nop603 = alloca i1, i1 0
+ %nop604 = alloca i1, i1 0
+ %nop605 = alloca i1, i1 0
+ %nop606 = alloca i1, i1 0
+ %nop607 = alloca i1, i1 0
+ %nop608 = alloca i1, i1 0
+ %nop609 = alloca i1, i1 0
+ %nop610 = alloca i1, i1 0
+ %nop611 = alloca i1, i1 0
+ %nop612 = alloca i1, i1 0
+ %nop613 = alloca i1, i1 0
+ %nop614 = alloca i1, i1 0
+ %nop615 = alloca i1, i1 0
+ %nop616 = alloca i1, i1 0
+ %nop617 = alloca i1, i1 0
+ %nop618 = alloca i1, i1 0
+ %nop619 = alloca i1, i1 0
+ %nop620 = alloca i1, i1 0
+ %nop621 = alloca i1, i1 0
+ %nop622 = alloca i1, i1 0
+ %nop623 = alloca i1, i1 0
+ %nop624 = alloca i1, i1 0
+ %nop625 = alloca i1, i1 0
+ %nop626 = alloca i1, i1 0
+ %nop627 = alloca i1, i1 0
+ %nop628 = alloca i1, i1 0
+ %nop629 = alloca i1, i1 0
+ %nop630 = alloca i1, i1 0
+ %nop631 = alloca i1, i1 0
+ %nop632 = alloca i1, i1 0
+ %nop633 = alloca i1, i1 0
+ %nop634 = alloca i1, i1 0
+ %nop635 = alloca i1, i1 0
+ %nop636 = alloca i1, i1 0
+ %nop637 = alloca i1, i1 0
+ %nop638 = alloca i1, i1 0
+ %nop639 = alloca i1, i1 0
+ %nop640 = alloca i1, i1 0
+ %nop641 = alloca i1, i1 0
+ %nop642 = alloca i1, i1 0
+ %nop643 = alloca i1, i1 0
+ %nop644 = alloca i1, i1 0
+ %nop645 = alloca i1, i1 0
+ %nop646 = alloca i1, i1 0
+ %nop647 = alloca i1, i1 0
+ %nop648 = alloca i1, i1 0
+ %nop649 = alloca i1, i1 0
+ %nop650 = alloca i1, i1 0
+ %nop651 = alloca i1, i1 0
+ %nop652 = alloca i1, i1 0
+ %nop653 = alloca i1, i1 0
+ %nop654 = alloca i1, i1 0
+ %nop655 = alloca i1, i1 0
+ %nop656 = alloca i1, i1 0
+ %nop657 = alloca i1, i1 0
+ %nop658 = alloca i1, i1 0
+ %nop659 = alloca i1, i1 0
+ %nop660 = alloca i1, i1 0
+ %nop661 = alloca i1, i1 0
+ %nop662 = alloca i1, i1 0
+ %nop663 = alloca i1, i1 0
+ %nop664 = alloca i1, i1 0
+ %nop665 = alloca i1, i1 0
+ %nop666 = alloca i1, i1 0
+ %nop667 = alloca i1, i1 0
+ %nop668 = alloca i1, i1 0
+ %nop669 = alloca i1, i1 0
+ %nop670 = alloca i1, i1 0
+ %nop671 = alloca i1, i1 0
+ %nop672 = alloca i1, i1 0
+ %nop673 = alloca i1, i1 0
+ %nop674 = alloca i1, i1 0
+ %nop675 = alloca i1, i1 0
+ %nop676 = alloca i1, i1 0
+ %nop677 = alloca i1, i1 0
+ %nop678 = alloca i1, i1 0
+ %nop679 = alloca i1, i1 0
+ %nop680 = alloca i1, i1 0
+ %nop681 = alloca i1, i1 0
+ %nop682 = alloca i1, i1 0
+ %nop683 = alloca i1, i1 0
+ %nop684 = alloca i1, i1 0
+ %nop685 = alloca i1, i1 0
+ %nop686 = alloca i1, i1 0
+ %nop687 = alloca i1, i1 0
+ %nop688 = alloca i1, i1 0
+ %nop689 = alloca i1, i1 0
+ %nop690 = alloca i1, i1 0
+ %nop691 = alloca i1, i1 0
+ %nop692 = alloca i1, i1 0
+ %nop693 = alloca i1, i1 0
+ %nop694 = alloca i1, i1 0
+ %nop695 = alloca i1, i1 0
+ %nop696 = alloca i1, i1 0
+ %nop697 = alloca i1, i1 0
+ %nop698 = alloca i1, i1 0
+ %nop699 = alloca i1, i1 0
+ %nop700 = alloca i1, i1 0
+ %nop701 = alloca i1, i1 0
+ %nop702 = alloca i1, i1 0
+ %nop703 = alloca i1, i1 0
+ %nop704 = alloca i1, i1 0
+ %nop705 = alloca i1, i1 0
+ %nop706 = alloca i1, i1 0
+ %nop707 = alloca i1, i1 0
+ %nop708 = alloca i1, i1 0
+ %nop709 = alloca i1, i1 0
+ %nop710 = alloca i1, i1 0
+ %nop711 = alloca i1, i1 0
+ %nop712 = alloca i1, i1 0
+ %nop713 = alloca i1, i1 0
+ %nop714 = alloca i1, i1 0
+ %nop715 = alloca i1, i1 0
+ %nop716 = alloca i1, i1 0
+ %nop717 = alloca i1, i1 0
+ %nop718 = alloca i1, i1 0
+ %nop719 = alloca i1, i1 0
+ %nop720 = alloca i1, i1 0
+ %nop721 = alloca i1, i1 0
+ %nop722 = alloca i1, i1 0
+ %nop723 = alloca i1, i1 0
+ %nop724 = alloca i1, i1 0
+ %nop725 = alloca i1, i1 0
+ %nop726 = alloca i1, i1 0
+ %nop727 = alloca i1, i1 0
+ %nop728 = alloca i1, i1 0
+ %nop729 = alloca i1, i1 0
+ %nop730 = alloca i1, i1 0
+ %nop731 = alloca i1, i1 0
+ %nop732 = alloca i1, i1 0
+ %nop733 = alloca i1, i1 0
+ %nop734 = alloca i1, i1 0
+ %nop735 = alloca i1, i1 0
+ %nop736 = alloca i1, i1 0
+ %nop737 = alloca i1, i1 0
+ %nop738 = alloca i1, i1 0
+ %nop739 = alloca i1, i1 0
+ %nop740 = alloca i1, i1 0
+ %nop741 = alloca i1, i1 0
+ %nop742 = alloca i1, i1 0
+ %nop743 = alloca i1, i1 0
+ %nop744 = alloca i1, i1 0
+ %nop745 = alloca i1, i1 0
+ %nop746 = alloca i1, i1 0
+ %nop747 = alloca i1, i1 0
+ %nop748 = alloca i1, i1 0
+ %nop749 = alloca i1, i1 0
+ %nop750 = alloca i1, i1 0
+ %nop751 = alloca i1, i1 0
+ %nop752 = alloca i1, i1 0
+ %nop753 = alloca i1, i1 0
+ %nop754 = alloca i1, i1 0
+ %nop755 = alloca i1, i1 0
+ %nop756 = alloca i1, i1 0
+ %nop757 = alloca i1, i1 0
+ %nop758 = alloca i1, i1 0
+ %nop759 = alloca i1, i1 0
+ %nop760 = alloca i1, i1 0
+ %nop761 = alloca i1, i1 0
+ %nop762 = alloca i1, i1 0
+ %nop763 = alloca i1, i1 0
+ %nop764 = alloca i1, i1 0
+ %nop765 = alloca i1, i1 0
+ %nop766 = alloca i1, i1 0
+ %nop767 = alloca i1, i1 0
+ %nop768 = alloca i1, i1 0
+ %nop769 = alloca i1, i1 0
+ %nop770 = alloca i1, i1 0
+ %nop771 = alloca i1, i1 0
+ %nop772 = alloca i1, i1 0
+ %nop773 = alloca i1, i1 0
+ %nop774 = alloca i1, i1 0
+ %nop775 = alloca i1, i1 0
+ %nop776 = alloca i1, i1 0
+ %nop777 = alloca i1, i1 0
+ %nop778 = alloca i1, i1 0
+ %nop779 = alloca i1, i1 0
+ %nop780 = alloca i1, i1 0
+ %nop781 = alloca i1, i1 0
+ %nop782 = alloca i1, i1 0
+ %nop783 = alloca i1, i1 0
+ %nop784 = alloca i1, i1 0
+ %nop785 = alloca i1, i1 0
+ %nop786 = alloca i1, i1 0
+ %nop787 = alloca i1, i1 0
+ %nop788 = alloca i1, i1 0
+ %nop789 = alloca i1, i1 0
+ %nop790 = alloca i1, i1 0
+ %nop791 = alloca i1, i1 0
+ %nop792 = alloca i1, i1 0
+ %nop793 = alloca i1, i1 0
+ %nop794 = alloca i1, i1 0
+ %nop795 = alloca i1, i1 0
+ %nop796 = alloca i1, i1 0
+ %nop797 = alloca i1, i1 0
+ %nop798 = alloca i1, i1 0
+ %nop799 = alloca i1, i1 0
+ %nop800 = alloca i1, i1 0
+ %nop801 = alloca i1, i1 0
+ %nop802 = alloca i1, i1 0
+ %nop803 = alloca i1, i1 0
+ %nop804 = alloca i1, i1 0
+ %nop805 = alloca i1, i1 0
+ %nop806 = alloca i1, i1 0
+ %nop807 = alloca i1, i1 0
+ %nop808 = alloca i1, i1 0
+ %nop809 = alloca i1, i1 0
+ %nop810 = alloca i1, i1 0
+ %nop811 = alloca i1, i1 0
+ %nop812 = alloca i1, i1 0
+ %nop813 = alloca i1, i1 0
+ %nop814 = alloca i1, i1 0
+ %nop815 = alloca i1, i1 0
+ %nop816 = alloca i1, i1 0
+ %nop817 = alloca i1, i1 0
+ %nop818 = alloca i1, i1 0
+ %nop819 = alloca i1, i1 0
+ %nop820 = alloca i1, i1 0
+ %nop821 = alloca i1, i1 0
+ %nop822 = alloca i1, i1 0
+ %nop823 = alloca i1, i1 0
+ %nop824 = alloca i1, i1 0
+ %nop825 = alloca i1, i1 0
+ %nop826 = alloca i1, i1 0
+ %nop827 = alloca i1, i1 0
+ %nop828 = alloca i1, i1 0
+ %nop829 = alloca i1, i1 0
+ %nop830 = alloca i1, i1 0
+ %nop831 = alloca i1, i1 0
+ %nop832 = alloca i1, i1 0
+ %nop833 = alloca i1, i1 0
+ %nop834 = alloca i1, i1 0
+ %nop835 = alloca i1, i1 0
+ %nop836 = alloca i1, i1 0
+ %nop837 = alloca i1, i1 0
+ %nop838 = alloca i1, i1 0
+ %nop839 = alloca i1, i1 0
+ %nop840 = alloca i1, i1 0
+ %nop841 = alloca i1, i1 0
+ %nop842 = alloca i1, i1 0
+ %nop843 = alloca i1, i1 0
+ %nop844 = alloca i1, i1 0
+ %nop845 = alloca i1, i1 0
+ %nop846 = alloca i1, i1 0
+ %nop847 = alloca i1, i1 0
+ %nop848 = alloca i1, i1 0
+ %nop849 = alloca i1, i1 0
+ %nop850 = alloca i1, i1 0
+ %nop851 = alloca i1, i1 0
+ %nop852 = alloca i1, i1 0
+ %nop853 = alloca i1, i1 0
+ %nop854 = alloca i1, i1 0
+ %nop855 = alloca i1, i1 0
+ %nop856 = alloca i1, i1 0
+ %nop857 = alloca i1, i1 0
+ %nop858 = alloca i1, i1 0
+ %nop859 = alloca i1, i1 0
+ %nop860 = alloca i1, i1 0
+ %nop861 = alloca i1, i1 0
+ %nop862 = alloca i1, i1 0
+ %nop863 = alloca i1, i1 0
+ %nop864 = alloca i1, i1 0
+ %nop865 = alloca i1, i1 0
+ %nop866 = alloca i1, i1 0
+ %nop867 = alloca i1, i1 0
+ %nop868 = alloca i1, i1 0
+ %nop869 = alloca i1, i1 0
+ %nop870 = alloca i1, i1 0
+ %nop871 = alloca i1, i1 0
+ %nop872 = alloca i1, i1 0
+ %nop873 = alloca i1, i1 0
+ %nop874 = alloca i1, i1 0
+ %nop875 = alloca i1, i1 0
+ %nop876 = alloca i1, i1 0
+ %nop877 = alloca i1, i1 0
+ %nop878 = alloca i1, i1 0
+ %nop879 = alloca i1, i1 0
+ %nop880 = alloca i1, i1 0
+ %nop881 = alloca i1, i1 0
+ %nop882 = alloca i1, i1 0
+ %nop883 = alloca i1, i1 0
+ %nop884 = alloca i1, i1 0
+ %nop885 = alloca i1, i1 0
+ %nop886 = alloca i1, i1 0
+ %nop887 = alloca i1, i1 0
+ %nop888 = alloca i1, i1 0
+ %nop889 = alloca i1, i1 0
+ %nop890 = alloca i1, i1 0
+ %nop891 = alloca i1, i1 0
+ %nop892 = alloca i1, i1 0
+ %nop893 = alloca i1, i1 0
+ %nop894 = alloca i1, i1 0
+ %nop895 = alloca i1, i1 0
+ %nop896 = alloca i1, i1 0
+ %nop897 = alloca i1, i1 0
+ %nop898 = alloca i1, i1 0
+ %nop899 = alloca i1, i1 0
+ %nop900 = alloca i1, i1 0
+ %nop901 = alloca i1, i1 0
+ %nop902 = alloca i1, i1 0
+ %nop903 = alloca i1, i1 0
+ %nop904 = alloca i1, i1 0
+ %nop905 = alloca i1, i1 0
+ %nop906 = alloca i1, i1 0
+ %nop907 = alloca i1, i1 0
+ %nop908 = alloca i1, i1 0
+ %nop909 = alloca i1, i1 0
+ %nop910 = alloca i1, i1 0
+ %nop911 = alloca i1, i1 0
+ %nop912 = alloca i1, i1 0
+ %nop913 = alloca i1, i1 0
+ %nop914 = alloca i1, i1 0
+ %nop915 = alloca i1, i1 0
+ %nop916 = alloca i1, i1 0
+ %nop917 = alloca i1, i1 0
+ %nop918 = alloca i1, i1 0
+ %nop919 = alloca i1, i1 0
+ %nop920 = alloca i1, i1 0
+ %nop921 = alloca i1, i1 0
+ %nop922 = alloca i1, i1 0
+ %nop923 = alloca i1, i1 0
+ %nop924 = alloca i1, i1 0
+ %nop925 = alloca i1, i1 0
+ %nop926 = alloca i1, i1 0
+ %nop927 = alloca i1, i1 0
+ %nop928 = alloca i1, i1 0
+ %nop929 = alloca i1, i1 0
+ %nop930 = alloca i1, i1 0
+ %nop931 = alloca i1, i1 0
+ %nop932 = alloca i1, i1 0
+ %nop933 = alloca i1, i1 0
+ %nop934 = alloca i1, i1 0
+ %nop935 = alloca i1, i1 0
+ %nop936 = alloca i1, i1 0
+ %nop937 = alloca i1, i1 0
+ %nop938 = alloca i1, i1 0
+ %nop939 = alloca i1, i1 0
+ %nop940 = alloca i1, i1 0
+ %nop941 = alloca i1, i1 0
+ %nop942 = alloca i1, i1 0
+ %nop943 = alloca i1, i1 0
+ %nop944 = alloca i1, i1 0
+ %nop945 = alloca i1, i1 0
+ %nop946 = alloca i1, i1 0
+ %nop947 = alloca i1, i1 0
+ %nop948 = alloca i1, i1 0
+ %nop949 = alloca i1, i1 0
+ %nop950 = alloca i1, i1 0
+ %nop951 = alloca i1, i1 0
+ %nop952 = alloca i1, i1 0
+ %nop953 = alloca i1, i1 0
+ %nop954 = alloca i1, i1 0
+ %nop955 = alloca i1, i1 0
+ %nop956 = alloca i1, i1 0
+ %nop957 = alloca i1, i1 0
+ %nop958 = alloca i1, i1 0
+ %nop959 = alloca i1, i1 0
+ %nop960 = alloca i1, i1 0
+ %nop961 = alloca i1, i1 0
+ %nop962 = alloca i1, i1 0
+ %nop963 = alloca i1, i1 0
+ %nop964 = alloca i1, i1 0
+ %nop965 = alloca i1, i1 0
+ %nop966 = alloca i1, i1 0
+ %nop967 = alloca i1, i1 0
+ %nop968 = alloca i1, i1 0
+ %nop969 = alloca i1, i1 0
+ %nop970 = alloca i1, i1 0
+ %nop971 = alloca i1, i1 0
+ %nop972 = alloca i1, i1 0
+ %nop973 = alloca i1, i1 0
+ %nop974 = alloca i1, i1 0
+ %nop975 = alloca i1, i1 0
+ %nop976 = alloca i1, i1 0
+ %nop977 = alloca i1, i1 0
+ %nop978 = alloca i1, i1 0
+ %nop979 = alloca i1, i1 0
+ %nop980 = alloca i1, i1 0
+ %nop981 = alloca i1, i1 0
+ %nop982 = alloca i1, i1 0
+ %nop983 = alloca i1, i1 0
+ %nop984 = alloca i1, i1 0
+ %nop985 = alloca i1, i1 0
+ %nop986 = alloca i1, i1 0
+ %nop987 = alloca i1, i1 0
+ %nop988 = alloca i1, i1 0
+ %nop989 = alloca i1, i1 0
+ %nop990 = alloca i1, i1 0
+ %nop991 = alloca i1, i1 0
+ %nop992 = alloca i1, i1 0
+ %nop993 = alloca i1, i1 0
+ %nop994 = alloca i1, i1 0
+ %nop995 = alloca i1, i1 0
+ %nop996 = alloca i1, i1 0
+ %nop997 = alloca i1, i1 0
+ %nop998 = alloca i1, i1 0
+ %nop999 = alloca i1, i1 0
+ %nop1000 = alloca i1, i1 0
+ %nop1001 = alloca i1, i1 0
+ %nop1002 = alloca i1, i1 0
+ %nop1003 = alloca i1, i1 0
+ %nop1004 = alloca i1, i1 0
+ %nop1005 = alloca i1, i1 0
+ %nop1006 = alloca i1, i1 0
+ %nop1007 = alloca i1, i1 0
+ %nop1008 = alloca i1, i1 0
+ %nop1009 = alloca i1, i1 0
+ %nop1010 = alloca i1, i1 0
+ %nop1011 = alloca i1, i1 0
+ %nop1012 = alloca i1, i1 0
+ %nop1013 = alloca i1, i1 0
+ %nop1014 = alloca i1, i1 0
+ %nop1015 = alloca i1, i1 0
+ %nop1016 = alloca i1, i1 0
+ %nop1017 = alloca i1, i1 0
+ %nop1018 = alloca i1, i1 0
+ %nop1019 = alloca i1, i1 0
+ %nop1020 = alloca i1, i1 0
+ %nop1021 = alloca i1, i1 0
+ %nop1022 = alloca i1, i1 0
+ %nop1023 = alloca i1, i1 0
+ %nop1024 = alloca i1, i1 0
+ %nop1025 = alloca i1, i1 0
+ %nop1026 = alloca i1, i1 0
+ %nop1027 = alloca i1, i1 0
+ %nop1028 = alloca i1, i1 0
+ %nop1029 = alloca i1, i1 0
+ %nop1030 = alloca i1, i1 0
+ %nop1031 = alloca i1, i1 0
+ %nop1032 = alloca i1, i1 0
+ %nop1033 = alloca i1, i1 0
+ %nop1034 = alloca i1, i1 0
+ %nop1035 = alloca i1, i1 0
+ %nop1036 = alloca i1, i1 0
+ %nop1037 = alloca i1, i1 0
+ %nop1038 = alloca i1, i1 0
+ %nop1039 = alloca i1, i1 0
+ %nop1040 = alloca i1, i1 0
+ %nop1041 = alloca i1, i1 0
+ %nop1042 = alloca i1, i1 0
+ %nop1043 = alloca i1, i1 0
+ %nop1044 = alloca i1, i1 0
+ %nop1045 = alloca i1, i1 0
+ %nop1046 = alloca i1, i1 0
+ %nop1047 = alloca i1, i1 0
+ %nop1048 = alloca i1, i1 0
+ %nop1049 = alloca i1, i1 0
+ %nop1050 = alloca i1, i1 0
+ %nop1051 = alloca i1, i1 0
+ %nop1052 = alloca i1, i1 0
+ %nop1053 = alloca i1, i1 0
+ %nop1054 = alloca i1, i1 0
+ %nop1055 = alloca i1, i1 0
+ %nop1056 = alloca i1, i1 0
+ %nop1057 = alloca i1, i1 0
+ %nop1058 = alloca i1, i1 0
+ %nop1059 = alloca i1, i1 0
+ %nop1060 = alloca i1, i1 0
+ %nop1061 = alloca i1, i1 0
+ %nop1062 = alloca i1, i1 0
+ %nop1063 = alloca i1, i1 0
+ %nop1064 = alloca i1, i1 0
+ %nop1065 = alloca i1, i1 0
+ %nop1066 = alloca i1, i1 0
+ %nop1067 = alloca i1, i1 0
+ %nop1068 = alloca i1, i1 0
+ %nop1069 = alloca i1, i1 0
+ %nop1070 = alloca i1, i1 0
+ %nop1071 = alloca i1, i1 0
+ %nop1072 = alloca i1, i1 0
+ %nop1073 = alloca i1, i1 0
+ %nop1074 = alloca i1, i1 0
+ %nop1075 = alloca i1, i1 0
+ %nop1076 = alloca i1, i1 0
+ %nop1077 = alloca i1, i1 0
+ %nop1078 = alloca i1, i1 0
+ %nop1079 = alloca i1, i1 0
+ %nop1080 = alloca i1, i1 0
+ %nop1081 = alloca i1, i1 0
+ %nop1082 = alloca i1, i1 0
+ %nop1083 = alloca i1, i1 0
+ %nop1084 = alloca i1, i1 0
+ %nop1085 = alloca i1, i1 0
+ %nop1086 = alloca i1, i1 0
+ %nop1087 = alloca i1, i1 0
+ %nop1088 = alloca i1, i1 0
+ %nop1089 = alloca i1, i1 0
+ %nop1090 = alloca i1, i1 0
+ %nop1091 = alloca i1, i1 0
+ %nop1092 = alloca i1, i1 0
+ %nop1093 = alloca i1, i1 0
+ %nop1094 = alloca i1, i1 0
+ %nop1095 = alloca i1, i1 0
+ %nop1096 = alloca i1, i1 0
+ %nop1097 = alloca i1, i1 0
+ %nop1098 = alloca i1, i1 0
+ %nop1099 = alloca i1, i1 0
+ %nop1100 = alloca i1, i1 0
+ %nop1101 = alloca i1, i1 0
+ %nop1102 = alloca i1, i1 0
+ %nop1103 = alloca i1, i1 0
+ %nop1104 = alloca i1, i1 0
+ %nop1105 = alloca i1, i1 0
+ %nop1106 = alloca i1, i1 0
+ %nop1107 = alloca i1, i1 0
+ %nop1108 = alloca i1, i1 0
+ %nop1109 = alloca i1, i1 0
+ %nop1110 = alloca i1, i1 0
+ %nop1111 = alloca i1, i1 0
+ %nop1112 = alloca i1, i1 0
+ %nop1113 = alloca i1, i1 0
+ %nop1114 = alloca i1, i1 0
+ %nop1115 = alloca i1, i1 0
+ %nop1116 = alloca i1, i1 0
+ %nop1117 = alloca i1, i1 0
+ %nop1118 = alloca i1, i1 0
+ %nop1119 = alloca i1, i1 0
+ %nop1120 = alloca i1, i1 0
+ %nop1121 = alloca i1, i1 0
+ %nop1122 = alloca i1, i1 0
+ %nop1123 = alloca i1, i1 0
+ %nop1124 = alloca i1, i1 0
+ %nop1125 = alloca i1, i1 0
+ %nop1126 = alloca i1, i1 0
+ %nop1127 = alloca i1, i1 0
+ %nop1128 = alloca i1, i1 0
+ %nop1129 = alloca i1, i1 0
+ %nop1130 = alloca i1, i1 0
+ %nop1131 = alloca i1, i1 0
+ %nop1132 = alloca i1, i1 0
+ %nop1133 = alloca i1, i1 0
+ %nop1134 = alloca i1, i1 0
+ %nop1135 = alloca i1, i1 0
+ %nop1136 = alloca i1, i1 0
+ %nop1137 = alloca i1, i1 0
+ %nop1138 = alloca i1, i1 0
+ %nop1139 = alloca i1, i1 0
+ %nop1140 = alloca i1, i1 0
+ %nop1141 = alloca i1, i1 0
+ %nop1142 = alloca i1, i1 0
+ %nop1143 = alloca i1, i1 0
+ %nop1144 = alloca i1, i1 0
+ %nop1145 = alloca i1, i1 0
+ %nop1146 = alloca i1, i1 0
+ %nop1147 = alloca i1, i1 0
+ %nop1148 = alloca i1, i1 0
+ %nop1149 = alloca i1, i1 0
+ %nop1150 = alloca i1, i1 0
+ %nop1151 = alloca i1, i1 0
+ %nop1152 = alloca i1, i1 0
+ %nop1153 = alloca i1, i1 0
+ %nop1154 = alloca i1, i1 0
+ %nop1155 = alloca i1, i1 0
+ %nop1156 = alloca i1, i1 0
+ %nop1157 = alloca i1, i1 0
+ %nop1158 = alloca i1, i1 0
+ %nop1159 = alloca i1, i1 0
+ %nop1160 = alloca i1, i1 0
+ %nop1161 = alloca i1, i1 0
+ %nop1162 = alloca i1, i1 0
+ %nop1163 = alloca i1, i1 0
+ %nop1164 = alloca i1, i1 0
+ %nop1165 = alloca i1, i1 0
+ %nop1166 = alloca i1, i1 0
+ %nop1167 = alloca i1, i1 0
+ %nop1168 = alloca i1, i1 0
+ %nop1169 = alloca i1, i1 0
+ %nop1170 = alloca i1, i1 0
+ %nop1171 = alloca i1, i1 0
+ %nop1172 = alloca i1, i1 0
+ %nop1173 = alloca i1, i1 0
+ %nop1174 = alloca i1, i1 0
+ %nop1175 = alloca i1, i1 0
+ %nop1176 = alloca i1, i1 0
+ %nop1177 = alloca i1, i1 0
+ %nop1178 = alloca i1, i1 0
+ %nop1179 = alloca i1, i1 0
+ %nop1180 = alloca i1, i1 0
+ %nop1181 = alloca i1, i1 0
+ %nop1182 = alloca i1, i1 0
+ %nop1183 = alloca i1, i1 0
+ %nop1184 = alloca i1, i1 0
+ %nop1185 = alloca i1, i1 0
+ %nop1186 = alloca i1, i1 0
+ %nop1187 = alloca i1, i1 0
+ %nop1188 = alloca i1, i1 0
+ %nop1189 = alloca i1, i1 0
+ %nop1190 = alloca i1, i1 0
+ %nop1191 = alloca i1, i1 0
+ %nop1192 = alloca i1, i1 0
+ %nop1193 = alloca i1, i1 0
+ %nop1194 = alloca i1, i1 0
+ %nop1195 = alloca i1, i1 0
+ %nop1196 = alloca i1, i1 0
+ %nop1197 = alloca i1, i1 0
+ %nop1198 = alloca i1, i1 0
+ %nop1199 = alloca i1, i1 0
+ %nop1200 = alloca i1, i1 0
+ %nop1201 = alloca i1, i1 0
+ %nop1202 = alloca i1, i1 0
+ %nop1203 = alloca i1, i1 0
+ %nop1204 = alloca i1, i1 0
+ %nop1205 = alloca i1, i1 0
+ %nop1206 = alloca i1, i1 0
+ %nop1207 = alloca i1, i1 0
+ %nop1208 = alloca i1, i1 0
+ %nop1209 = alloca i1, i1 0
+ %nop1210 = alloca i1, i1 0
+ %nop1211 = alloca i1, i1 0
+ %nop1212 = alloca i1, i1 0
+ %nop1213 = alloca i1, i1 0
+ %nop1214 = alloca i1, i1 0
+ %nop1215 = alloca i1, i1 0
+ %nop1216 = alloca i1, i1 0
+ %nop1217 = alloca i1, i1 0
+ %nop1218 = alloca i1, i1 0
+ %nop1219 = alloca i1, i1 0
+ %nop1220 = alloca i1, i1 0
+ %nop1221 = alloca i1, i1 0
+ %nop1222 = alloca i1, i1 0
+ %nop1223 = alloca i1, i1 0
+ %nop1224 = alloca i1, i1 0
+ %nop1225 = alloca i1, i1 0
+ %nop1226 = alloca i1, i1 0
+ %nop1227 = alloca i1, i1 0
+ %nop1228 = alloca i1, i1 0
+ %nop1229 = alloca i1, i1 0
+ %nop1230 = alloca i1, i1 0
+ %nop1231 = alloca i1, i1 0
+ %nop1232 = alloca i1, i1 0
+ %nop1233 = alloca i1, i1 0
+ %nop1234 = alloca i1, i1 0
+ %nop1235 = alloca i1, i1 0
+ %nop1236 = alloca i1, i1 0
+ %nop1237 = alloca i1, i1 0
+ %nop1238 = alloca i1, i1 0
+ %nop1239 = alloca i1, i1 0
+ %nop1240 = alloca i1, i1 0
+ %nop1241 = alloca i1, i1 0
+ %nop1242 = alloca i1, i1 0
+ %nop1243 = alloca i1, i1 0
+ %nop1244 = alloca i1, i1 0
+ %nop1245 = alloca i1, i1 0
+ %nop1246 = alloca i1, i1 0
+ %nop1247 = alloca i1, i1 0
+ %nop1248 = alloca i1, i1 0
+ %nop1249 = alloca i1, i1 0
+ %nop1250 = alloca i1, i1 0
+ %nop1251 = alloca i1, i1 0
+ %nop1252 = alloca i1, i1 0
+ %nop1253 = alloca i1, i1 0
+ %nop1254 = alloca i1, i1 0
+ %nop1255 = alloca i1, i1 0
+ %nop1256 = alloca i1, i1 0
+ %nop1257 = alloca i1, i1 0
+ %nop1258 = alloca i1, i1 0
+ %nop1259 = alloca i1, i1 0
+ %nop1260 = alloca i1, i1 0
+ %nop1261 = alloca i1, i1 0
+ %nop1262 = alloca i1, i1 0
+ %nop1263 = alloca i1, i1 0
+ %nop1264 = alloca i1, i1 0
+ %nop1265 = alloca i1, i1 0
+ %nop1266 = alloca i1, i1 0
+ %nop1267 = alloca i1, i1 0
+ %nop1268 = alloca i1, i1 0
+ %nop1269 = alloca i1, i1 0
+ %nop1270 = alloca i1, i1 0
+ %nop1271 = alloca i1, i1 0
+ %nop1272 = alloca i1, i1 0
+ %nop1273 = alloca i1, i1 0
+ %nop1274 = alloca i1, i1 0
+ %nop1275 = alloca i1, i1 0
+ %nop1276 = alloca i1, i1 0
+ %nop1277 = alloca i1, i1 0
+ %nop1278 = alloca i1, i1 0
+ %nop1279 = alloca i1, i1 0
+ %nop1280 = alloca i1, i1 0
+ %nop1281 = alloca i1, i1 0
+ %nop1282 = alloca i1, i1 0
+ %nop1283 = alloca i1, i1 0
+ %nop1284 = alloca i1, i1 0
+ %nop1285 = alloca i1, i1 0
+ %nop1286 = alloca i1, i1 0
+ %nop1287 = alloca i1, i1 0
+ %nop1288 = alloca i1, i1 0
+ %nop1289 = alloca i1, i1 0
+ %nop1290 = alloca i1, i1 0
+ %nop1291 = alloca i1, i1 0
+ %nop1292 = alloca i1, i1 0
+ %nop1293 = alloca i1, i1 0
+ %nop1294 = alloca i1, i1 0
+ %nop1295 = alloca i1, i1 0
+ %nop1296 = alloca i1, i1 0
+ %nop1297 = alloca i1, i1 0
+ %nop1298 = alloca i1, i1 0
+ %nop1299 = alloca i1, i1 0
+ %nop1300 = alloca i1, i1 0
+ %nop1301 = alloca i1, i1 0
+ %nop1302 = alloca i1, i1 0
+ %nop1303 = alloca i1, i1 0
+ %nop1304 = alloca i1, i1 0
+ %nop1305 = alloca i1, i1 0
+ %nop1306 = alloca i1, i1 0
+ %nop1307 = alloca i1, i1 0
+ %nop1308 = alloca i1, i1 0
+ %nop1309 = alloca i1, i1 0
+ %nop1310 = alloca i1, i1 0
+ %nop1311 = alloca i1, i1 0
+ %nop1312 = alloca i1, i1 0
+ %nop1313 = alloca i1, i1 0
+ %nop1314 = alloca i1, i1 0
+ %nop1315 = alloca i1, i1 0
+ %nop1316 = alloca i1, i1 0
+ %nop1317 = alloca i1, i1 0
+ %nop1318 = alloca i1, i1 0
+ %nop1319 = alloca i1, i1 0
+ %nop1320 = alloca i1, i1 0
+ %nop1321 = alloca i1, i1 0
+ %nop1322 = alloca i1, i1 0
+ %nop1323 = alloca i1, i1 0
+ %nop1324 = alloca i1, i1 0
+ %nop1325 = alloca i1, i1 0
+ %nop1326 = alloca i1, i1 0
+ %nop1327 = alloca i1, i1 0
+ %nop1328 = alloca i1, i1 0
+ %nop1329 = alloca i1, i1 0
+ %nop1330 = alloca i1, i1 0
+ %nop1331 = alloca i1, i1 0
+ %nop1332 = alloca i1, i1 0
+ %nop1333 = alloca i1, i1 0
+ %nop1334 = alloca i1, i1 0
+ %nop1335 = alloca i1, i1 0
+ %nop1336 = alloca i1, i1 0
+ %nop1337 = alloca i1, i1 0
+ %nop1338 = alloca i1, i1 0
+ %nop1339 = alloca i1, i1 0
+ %nop1340 = alloca i1, i1 0
+ %nop1341 = alloca i1, i1 0
+ %nop1342 = alloca i1, i1 0
+ %nop1343 = alloca i1, i1 0
+ %nop1344 = alloca i1, i1 0
+ %nop1345 = alloca i1, i1 0
+ %nop1346 = alloca i1, i1 0
+ %nop1347 = alloca i1, i1 0
+ %nop1348 = alloca i1, i1 0
+ %nop1349 = alloca i1, i1 0
+ %nop1350 = alloca i1, i1 0
+ %nop1351 = alloca i1, i1 0
+ %nop1352 = alloca i1, i1 0
+ %nop1353 = alloca i1, i1 0
+ %nop1354 = alloca i1, i1 0
+ %nop1355 = alloca i1, i1 0
+ %nop1356 = alloca i1, i1 0
+ %nop1357 = alloca i1, i1 0
+ %nop1358 = alloca i1, i1 0
+ %nop1359 = alloca i1, i1 0
+ %nop1360 = alloca i1, i1 0
+ %nop1361 = alloca i1, i1 0
+ %nop1362 = alloca i1, i1 0
+ %nop1363 = alloca i1, i1 0
+ %nop1364 = alloca i1, i1 0
+ %nop1365 = alloca i1, i1 0
+ %nop1366 = alloca i1, i1 0
+ %nop1367 = alloca i1, i1 0
+ %nop1368 = alloca i1, i1 0
+ %nop1369 = alloca i1, i1 0
+ %nop1370 = alloca i1, i1 0
+ %nop1371 = alloca i1, i1 0
+ %nop1372 = alloca i1, i1 0
+ %nop1373 = alloca i1, i1 0
+ %nop1374 = alloca i1, i1 0
+ %nop1375 = alloca i1, i1 0
+ %nop1376 = alloca i1, i1 0
+ %nop1377 = alloca i1, i1 0
+ %nop1378 = alloca i1, i1 0
+ %nop1379 = alloca i1, i1 0
+ %nop1380 = alloca i1, i1 0
+ %nop1381 = alloca i1, i1 0
+ %nop1382 = alloca i1, i1 0
+ %nop1383 = alloca i1, i1 0
+ %nop1384 = alloca i1, i1 0
+ %nop1385 = alloca i1, i1 0
+ %nop1386 = alloca i1, i1 0
+ %nop1387 = alloca i1, i1 0
+ %nop1388 = alloca i1, i1 0
+ %nop1389 = alloca i1, i1 0
+ %nop1390 = alloca i1, i1 0
+ %nop1391 = alloca i1, i1 0
+ %nop1392 = alloca i1, i1 0
+ %nop1393 = alloca i1, i1 0
+ %nop1394 = alloca i1, i1 0
+ %nop1395 = alloca i1, i1 0
+ %nop1396 = alloca i1, i1 0
+ %nop1397 = alloca i1, i1 0
+ %nop1398 = alloca i1, i1 0
+ %nop1399 = alloca i1, i1 0
+ %nop1400 = alloca i1, i1 0
+ %nop1401 = alloca i1, i1 0
+ %nop1402 = alloca i1, i1 0
+ %nop1403 = alloca i1, i1 0
+ %nop1404 = alloca i1, i1 0
+ %nop1405 = alloca i1, i1 0
+ %nop1406 = alloca i1, i1 0
+ %nop1407 = alloca i1, i1 0
+ %nop1408 = alloca i1, i1 0
+ %nop1409 = alloca i1, i1 0
+ %nop1410 = alloca i1, i1 0
+ %nop1411 = alloca i1, i1 0
+ %nop1412 = alloca i1, i1 0
+ %nop1413 = alloca i1, i1 0
+ %nop1414 = alloca i1, i1 0
+ %nop1415 = alloca i1, i1 0
+ %nop1416 = alloca i1, i1 0
+ %nop1417 = alloca i1, i1 0
+ %nop1418 = alloca i1, i1 0
+ %nop1419 = alloca i1, i1 0
+ %nop1420 = alloca i1, i1 0
+ %nop1421 = alloca i1, i1 0
+ %nop1422 = alloca i1, i1 0
+ %nop1423 = alloca i1, i1 0
+ %nop1424 = alloca i1, i1 0
+ %nop1425 = alloca i1, i1 0
+ %nop1426 = alloca i1, i1 0
+ %nop1427 = alloca i1, i1 0
+ %nop1428 = alloca i1, i1 0
+ %nop1429 = alloca i1, i1 0
+ %nop1430 = alloca i1, i1 0
+ %nop1431 = alloca i1, i1 0
+ %nop1432 = alloca i1, i1 0
+ %nop1433 = alloca i1, i1 0
+ %nop1434 = alloca i1, i1 0
+ %nop1435 = alloca i1, i1 0
+ %nop1436 = alloca i1, i1 0
+ %nop1437 = alloca i1, i1 0
+ %nop1438 = alloca i1, i1 0
+ %nop1439 = alloca i1, i1 0
+ %nop1440 = alloca i1, i1 0
+ %nop1441 = alloca i1, i1 0
+ %nop1442 = alloca i1, i1 0
+ %nop1443 = alloca i1, i1 0
+ %nop1444 = alloca i1, i1 0
+ %nop1445 = alloca i1, i1 0
+ %nop1446 = alloca i1, i1 0
+ %nop1447 = alloca i1, i1 0
+ %nop1448 = alloca i1, i1 0
+ %nop1449 = alloca i1, i1 0
+ %nop1450 = alloca i1, i1 0
+ %nop1451 = alloca i1, i1 0
+ %nop1452 = alloca i1, i1 0
+ %nop1453 = alloca i1, i1 0
+ %nop1454 = alloca i1, i1 0
+ %nop1455 = alloca i1, i1 0
+ %nop1456 = alloca i1, i1 0
+ %nop1457 = alloca i1, i1 0
+ %nop1458 = alloca i1, i1 0
+ %nop1459 = alloca i1, i1 0
+ %nop1460 = alloca i1, i1 0
+ %nop1461 = alloca i1, i1 0
+ %nop1462 = alloca i1, i1 0
+ %nop1463 = alloca i1, i1 0
+ %nop1464 = alloca i1, i1 0
+ %nop1465 = alloca i1, i1 0
+ %nop1466 = alloca i1, i1 0
+ %nop1467 = alloca i1, i1 0
+ %nop1468 = alloca i1, i1 0
+ %nop1469 = alloca i1, i1 0
+ %nop1470 = alloca i1, i1 0
+ %nop1471 = alloca i1, i1 0
+ %nop1472 = alloca i1, i1 0
+ %nop1473 = alloca i1, i1 0
+ %nop1474 = alloca i1, i1 0
+ %nop1475 = alloca i1, i1 0
+ %nop1476 = alloca i1, i1 0
+ %nop1477 = alloca i1, i1 0
+ %nop1478 = alloca i1, i1 0
+ %nop1479 = alloca i1, i1 0
+ %nop1480 = alloca i1, i1 0
+ %nop1481 = alloca i1, i1 0
+ %nop1482 = alloca i1, i1 0
+ %nop1483 = alloca i1, i1 0
+ %nop1484 = alloca i1, i1 0
+ %nop1485 = alloca i1, i1 0
+ %nop1486 = alloca i1, i1 0
+ %nop1487 = alloca i1, i1 0
+ %nop1488 = alloca i1, i1 0
+ %nop1489 = alloca i1, i1 0
+ %nop1490 = alloca i1, i1 0
+ %nop1491 = alloca i1, i1 0
+ %nop1492 = alloca i1, i1 0
+ %nop1493 = alloca i1, i1 0
+ %nop1494 = alloca i1, i1 0
+ %nop1495 = alloca i1, i1 0
+ %nop1496 = alloca i1, i1 0
+ %nop1497 = alloca i1, i1 0
+ %nop1498 = alloca i1, i1 0
+ %nop1499 = alloca i1, i1 0
+ %nop1500 = alloca i1, i1 0
+ %nop1501 = alloca i1, i1 0
+ %nop1502 = alloca i1, i1 0
+ %nop1503 = alloca i1, i1 0
+ %nop1504 = alloca i1, i1 0
+ %nop1505 = alloca i1, i1 0
+ %nop1506 = alloca i1, i1 0
+ %nop1507 = alloca i1, i1 0
+ %nop1508 = alloca i1, i1 0
+ %nop1509 = alloca i1, i1 0
+ %nop1510 = alloca i1, i1 0
+ %nop1511 = alloca i1, i1 0
+ %nop1512 = alloca i1, i1 0
+ %nop1513 = alloca i1, i1 0
+ %nop1514 = alloca i1, i1 0
+ %nop1515 = alloca i1, i1 0
+ %nop1516 = alloca i1, i1 0
+ %nop1517 = alloca i1, i1 0
+ %nop1518 = alloca i1, i1 0
+ %nop1519 = alloca i1, i1 0
+ %nop1520 = alloca i1, i1 0
+ %nop1521 = alloca i1, i1 0
+ %nop1522 = alloca i1, i1 0
+ %nop1523 = alloca i1, i1 0
+ %nop1524 = alloca i1, i1 0
+ %nop1525 = alloca i1, i1 0
+ %nop1526 = alloca i1, i1 0
+ %nop1527 = alloca i1, i1 0
+ %nop1528 = alloca i1, i1 0
+ %nop1529 = alloca i1, i1 0
+ %nop1530 = alloca i1, i1 0
+ %nop1531 = alloca i1, i1 0
+ %nop1532 = alloca i1, i1 0
+ %nop1533 = alloca i1, i1 0
+ %nop1534 = alloca i1, i1 0
+ %nop1535 = alloca i1, i1 0
+ %nop1536 = alloca i1, i1 0
+ %nop1537 = alloca i1, i1 0
+ %nop1538 = alloca i1, i1 0
+ %nop1539 = alloca i1, i1 0
+ %nop1540 = alloca i1, i1 0
+ %nop1541 = alloca i1, i1 0
+ %nop1542 = alloca i1, i1 0
+ %nop1543 = alloca i1, i1 0
+ %nop1544 = alloca i1, i1 0
+ %nop1545 = alloca i1, i1 0
+ %nop1546 = alloca i1, i1 0
+ %nop1547 = alloca i1, i1 0
+ %nop1548 = alloca i1, i1 0
+ %nop1549 = alloca i1, i1 0
+ %nop1550 = alloca i1, i1 0
+ %nop1551 = alloca i1, i1 0
+ %nop1552 = alloca i1, i1 0
+ %nop1553 = alloca i1, i1 0
+ %nop1554 = alloca i1, i1 0
+ %nop1555 = alloca i1, i1 0
+ %nop1556 = alloca i1, i1 0
+ %nop1557 = alloca i1, i1 0
+ %nop1558 = alloca i1, i1 0
+ %nop1559 = alloca i1, i1 0
+ %nop1560 = alloca i1, i1 0
+ %nop1561 = alloca i1, i1 0
+ %nop1562 = alloca i1, i1 0
+ %nop1563 = alloca i1, i1 0
+ %nop1564 = alloca i1, i1 0
+ %nop1565 = alloca i1, i1 0
+ %nop1566 = alloca i1, i1 0
+ %nop1567 = alloca i1, i1 0
+ %nop1568 = alloca i1, i1 0
+ %nop1569 = alloca i1, i1 0
+ %nop1570 = alloca i1, i1 0
+ %nop1571 = alloca i1, i1 0
+ %nop1572 = alloca i1, i1 0
+ %nop1573 = alloca i1, i1 0
+ %nop1574 = alloca i1, i1 0
+ %nop1575 = alloca i1, i1 0
+ %nop1576 = alloca i1, i1 0
+ %nop1577 = alloca i1, i1 0
+ %nop1578 = alloca i1, i1 0
+ %nop1579 = alloca i1, i1 0
+ %nop1580 = alloca i1, i1 0
+ %nop1581 = alloca i1, i1 0
+ %nop1582 = alloca i1, i1 0
+ %nop1583 = alloca i1, i1 0
+ %nop1584 = alloca i1, i1 0
+ %nop1585 = alloca i1, i1 0
+ %nop1586 = alloca i1, i1 0
+ %nop1587 = alloca i1, i1 0
+ %nop1588 = alloca i1, i1 0
+ %nop1589 = alloca i1, i1 0
+ %nop1590 = alloca i1, i1 0
+ %nop1591 = alloca i1, i1 0
+ %nop1592 = alloca i1, i1 0
+ %nop1593 = alloca i1, i1 0
+ %nop1594 = alloca i1, i1 0
+ %nop1595 = alloca i1, i1 0
+ %nop1596 = alloca i1, i1 0
+ %nop1597 = alloca i1, i1 0
+ %nop1598 = alloca i1, i1 0
+ %nop1599 = alloca i1, i1 0
+ %nop1600 = alloca i1, i1 0
+ %nop1601 = alloca i1, i1 0
+ %nop1602 = alloca i1, i1 0
+ %nop1603 = alloca i1, i1 0
+ %nop1604 = alloca i1, i1 0
+ %nop1605 = alloca i1, i1 0
+ %nop1606 = alloca i1, i1 0
+ %nop1607 = alloca i1, i1 0
+ %nop1608 = alloca i1, i1 0
+ %nop1609 = alloca i1, i1 0
+ %nop1610 = alloca i1, i1 0
+ %nop1611 = alloca i1, i1 0
+ %nop1612 = alloca i1, i1 0
+ %nop1613 = alloca i1, i1 0
+ %nop1614 = alloca i1, i1 0
+ %nop1615 = alloca i1, i1 0
+ %nop1616 = alloca i1, i1 0
+ %nop1617 = alloca i1, i1 0
+ %nop1618 = alloca i1, i1 0
+ %nop1619 = alloca i1, i1 0
+ %nop1620 = alloca i1, i1 0
+ %nop1621 = alloca i1, i1 0
+ %nop1622 = alloca i1, i1 0
+ %nop1623 = alloca i1, i1 0
+ %nop1624 = alloca i1, i1 0
+ %nop1625 = alloca i1, i1 0
+ %nop1626 = alloca i1, i1 0
+ %nop1627 = alloca i1, i1 0
+ %nop1628 = alloca i1, i1 0
+ %nop1629 = alloca i1, i1 0
+ %nop1630 = alloca i1, i1 0
+ %nop1631 = alloca i1, i1 0
+ %nop1632 = alloca i1, i1 0
+ %nop1633 = alloca i1, i1 0
+ %nop1634 = alloca i1, i1 0
+ %nop1635 = alloca i1, i1 0
+ %nop1636 = alloca i1, i1 0
+ %nop1637 = alloca i1, i1 0
+ %nop1638 = alloca i1, i1 0
+ %nop1639 = alloca i1, i1 0
+ %nop1640 = alloca i1, i1 0
+ %nop1641 = alloca i1, i1 0
+ %nop1642 = alloca i1, i1 0
+ %nop1643 = alloca i1, i1 0
+ %nop1644 = alloca i1, i1 0
+ %nop1645 = alloca i1, i1 0
+ %nop1646 = alloca i1, i1 0
+ %nop1647 = alloca i1, i1 0
+ %nop1648 = alloca i1, i1 0
+ %nop1649 = alloca i1, i1 0
+ %nop1650 = alloca i1, i1 0
+ %nop1651 = alloca i1, i1 0
+ %nop1652 = alloca i1, i1 0
+ %nop1653 = alloca i1, i1 0
+ %nop1654 = alloca i1, i1 0
+ %nop1655 = alloca i1, i1 0
+ %nop1656 = alloca i1, i1 0
+ %nop1657 = alloca i1, i1 0
+ %nop1658 = alloca i1, i1 0
+ %nop1659 = alloca i1, i1 0
+ %nop1660 = alloca i1, i1 0
+ %nop1661 = alloca i1, i1 0
+ %nop1662 = alloca i1, i1 0
+ %nop1663 = alloca i1, i1 0
+ %nop1664 = alloca i1, i1 0
+ %nop1665 = alloca i1, i1 0
+ %nop1666 = alloca i1, i1 0
+ %nop1667 = alloca i1, i1 0
+ %nop1668 = alloca i1, i1 0
+ %nop1669 = alloca i1, i1 0
+ %nop1670 = alloca i1, i1 0
+ %nop1671 = alloca i1, i1 0
+ %nop1672 = alloca i1, i1 0
+ %nop1673 = alloca i1, i1 0
+ %nop1674 = alloca i1, i1 0
+ %nop1675 = alloca i1, i1 0
+ %nop1676 = alloca i1, i1 0
+ %nop1677 = alloca i1, i1 0
+ %nop1678 = alloca i1, i1 0
+ %nop1679 = alloca i1, i1 0
+ %nop1680 = alloca i1, i1 0
+ %nop1681 = alloca i1, i1 0
+ %nop1682 = alloca i1, i1 0
+ %nop1683 = alloca i1, i1 0
+ %nop1684 = alloca i1, i1 0
+ %nop1685 = alloca i1, i1 0
+ %nop1686 = alloca i1, i1 0
+ %nop1687 = alloca i1, i1 0
+ %nop1688 = alloca i1, i1 0
+ %nop1689 = alloca i1, i1 0
+ %nop1690 = alloca i1, i1 0
+ %nop1691 = alloca i1, i1 0
+ %nop1692 = alloca i1, i1 0
+ %nop1693 = alloca i1, i1 0
+ %nop1694 = alloca i1, i1 0
+ %nop1695 = alloca i1, i1 0
+ %nop1696 = alloca i1, i1 0
+ %nop1697 = alloca i1, i1 0
+ %nop1698 = alloca i1, i1 0
+ %nop1699 = alloca i1, i1 0
+ %nop1700 = alloca i1, i1 0
+ %nop1701 = alloca i1, i1 0
+ %nop1702 = alloca i1, i1 0
+ %nop1703 = alloca i1, i1 0
+ %nop1704 = alloca i1, i1 0
+ %nop1705 = alloca i1, i1 0
+ %nop1706 = alloca i1, i1 0
+ %nop1707 = alloca i1, i1 0
+ %nop1708 = alloca i1, i1 0
+ %nop1709 = alloca i1, i1 0
+ %nop1710 = alloca i1, i1 0
+ %nop1711 = alloca i1, i1 0
+ %nop1712 = alloca i1, i1 0
+ %nop1713 = alloca i1, i1 0
+ %nop1714 = alloca i1, i1 0
+ %nop1715 = alloca i1, i1 0
+ %nop1716 = alloca i1, i1 0
+ %nop1717 = alloca i1, i1 0
+ %nop1718 = alloca i1, i1 0
+ %nop1719 = alloca i1, i1 0
+ %nop1720 = alloca i1, i1 0
+ %nop1721 = alloca i1, i1 0
+ %nop1722 = alloca i1, i1 0
+ %nop1723 = alloca i1, i1 0
+ %nop1724 = alloca i1, i1 0
+ %nop1725 = alloca i1, i1 0
+ %nop1726 = alloca i1, i1 0
+ %nop1727 = alloca i1, i1 0
+ %nop1728 = alloca i1, i1 0
+ %nop1729 = alloca i1, i1 0
+ %nop1730 = alloca i1, i1 0
+ %nop1731 = alloca i1, i1 0
+ %nop1732 = alloca i1, i1 0
+ %nop1733 = alloca i1, i1 0
+ %nop1734 = alloca i1, i1 0
+ %nop1735 = alloca i1, i1 0
+ %nop1736 = alloca i1, i1 0
+ %nop1737 = alloca i1, i1 0
+ %nop1738 = alloca i1, i1 0
+ %nop1739 = alloca i1, i1 0
+ %nop1740 = alloca i1, i1 0
+ %nop1741 = alloca i1, i1 0
+ %nop1742 = alloca i1, i1 0
+ %nop1743 = alloca i1, i1 0
+ %nop1744 = alloca i1, i1 0
+ %nop1745 = alloca i1, i1 0
+ %nop1746 = alloca i1, i1 0
+ %nop1747 = alloca i1, i1 0
+ %nop1748 = alloca i1, i1 0
+ %nop1749 = alloca i1, i1 0
+ %nop1750 = alloca i1, i1 0
+ %nop1751 = alloca i1, i1 0
+ %nop1752 = alloca i1, i1 0
+ %nop1753 = alloca i1, i1 0
+ %nop1754 = alloca i1, i1 0
+ %nop1755 = alloca i1, i1 0
+ %nop1756 = alloca i1, i1 0
+ %nop1757 = alloca i1, i1 0
+ %nop1758 = alloca i1, i1 0
+ %nop1759 = alloca i1, i1 0
+ %nop1760 = alloca i1, i1 0
+ %nop1761 = alloca i1, i1 0
+ %nop1762 = alloca i1, i1 0
+ %nop1763 = alloca i1, i1 0
+ %nop1764 = alloca i1, i1 0
+ %nop1765 = alloca i1, i1 0
+ %nop1766 = alloca i1, i1 0
+ %nop1767 = alloca i1, i1 0
+ %nop1768 = alloca i1, i1 0
+ %nop1769 = alloca i1, i1 0
+ %nop1770 = alloca i1, i1 0
+ %nop1771 = alloca i1, i1 0
+ %nop1772 = alloca i1, i1 0
+ %nop1773 = alloca i1, i1 0
+ %nop1774 = alloca i1, i1 0
+ %nop1775 = alloca i1, i1 0
+ %nop1776 = alloca i1, i1 0
+ %nop1777 = alloca i1, i1 0
+ %nop1778 = alloca i1, i1 0
+ %nop1779 = alloca i1, i1 0
+ %nop1780 = alloca i1, i1 0
+ %nop1781 = alloca i1, i1 0
+ %nop1782 = alloca i1, i1 0
+ %nop1783 = alloca i1, i1 0
+ %nop1784 = alloca i1, i1 0
+ %nop1785 = alloca i1, i1 0
+ %nop1786 = alloca i1, i1 0
+ %nop1787 = alloca i1, i1 0
+ %nop1788 = alloca i1, i1 0
+ %nop1789 = alloca i1, i1 0
+ %nop1790 = alloca i1, i1 0
+ %nop1791 = alloca i1, i1 0
+ %nop1792 = alloca i1, i1 0
+ %nop1793 = alloca i1, i1 0
+ %nop1794 = alloca i1, i1 0
+ %nop1795 = alloca i1, i1 0
+ %nop1796 = alloca i1, i1 0
+ %nop1797 = alloca i1, i1 0
+ %nop1798 = alloca i1, i1 0
+ %nop1799 = alloca i1, i1 0
+ %nop1800 = alloca i1, i1 0
+ %nop1801 = alloca i1, i1 0
+ %nop1802 = alloca i1, i1 0
+ %nop1803 = alloca i1, i1 0
+ %nop1804 = alloca i1, i1 0
+ %nop1805 = alloca i1, i1 0
+ %nop1806 = alloca i1, i1 0
+ %nop1807 = alloca i1, i1 0
+ %nop1808 = alloca i1, i1 0
+ %nop1809 = alloca i1, i1 0
+ %nop1810 = alloca i1, i1 0
+ %nop1811 = alloca i1, i1 0
+ %nop1812 = alloca i1, i1 0
+ %nop1813 = alloca i1, i1 0
+ %nop1814 = alloca i1, i1 0
+ %nop1815 = alloca i1, i1 0
+ %nop1816 = alloca i1, i1 0
+ %nop1817 = alloca i1, i1 0
+ %nop1818 = alloca i1, i1 0
+ %nop1819 = alloca i1, i1 0
+ %nop1820 = alloca i1, i1 0
+ %nop1821 = alloca i1, i1 0
+ %nop1822 = alloca i1, i1 0
+ %nop1823 = alloca i1, i1 0
+ %nop1824 = alloca i1, i1 0
+ %nop1825 = alloca i1, i1 0
+ %nop1826 = alloca i1, i1 0
+ %nop1827 = alloca i1, i1 0
+ %nop1828 = alloca i1, i1 0
+ %nop1829 = alloca i1, i1 0
+ %nop1830 = alloca i1, i1 0
+ %nop1831 = alloca i1, i1 0
+ %nop1832 = alloca i1, i1 0
+ %nop1833 = alloca i1, i1 0
+ %nop1834 = alloca i1, i1 0
+ %nop1835 = alloca i1, i1 0
+ %nop1836 = alloca i1, i1 0
+ %nop1837 = alloca i1, i1 0
+ %nop1838 = alloca i1, i1 0
+ %nop1839 = alloca i1, i1 0
+ %nop1840 = alloca i1, i1 0
+ %nop1841 = alloca i1, i1 0
+ %nop1842 = alloca i1, i1 0
+ %nop1843 = alloca i1, i1 0
+ %nop1844 = alloca i1, i1 0
+ %nop1845 = alloca i1, i1 0
+ %nop1846 = alloca i1, i1 0
+ %nop1847 = alloca i1, i1 0
+ %nop1848 = alloca i1, i1 0
+ %nop1849 = alloca i1, i1 0
+ %nop1850 = alloca i1, i1 0
+ %nop1851 = alloca i1, i1 0
+ %nop1852 = alloca i1, i1 0
+ %nop1853 = alloca i1, i1 0
+ %nop1854 = alloca i1, i1 0
+ %nop1855 = alloca i1, i1 0
+ %nop1856 = alloca i1, i1 0
+ %nop1857 = alloca i1, i1 0
+ %nop1858 = alloca i1, i1 0
+ %nop1859 = alloca i1, i1 0
+ %nop1860 = alloca i1, i1 0
+ %nop1861 = alloca i1, i1 0
+ %nop1862 = alloca i1, i1 0
+ %nop1863 = alloca i1, i1 0
+ %nop1864 = alloca i1, i1 0
+ %nop1865 = alloca i1, i1 0
+ %nop1866 = alloca i1, i1 0
+ %nop1867 = alloca i1, i1 0
+ %nop1868 = alloca i1, i1 0
+ %nop1869 = alloca i1, i1 0
+ %nop1870 = alloca i1, i1 0
+ %nop1871 = alloca i1, i1 0
+ %nop1872 = alloca i1, i1 0
+ %nop1873 = alloca i1, i1 0
+ %nop1874 = alloca i1, i1 0
+ %nop1875 = alloca i1, i1 0
+ %nop1876 = alloca i1, i1 0
+ %nop1877 = alloca i1, i1 0
+ %nop1878 = alloca i1, i1 0
+ %nop1879 = alloca i1, i1 0
+ %nop1880 = alloca i1, i1 0
+ %nop1881 = alloca i1, i1 0
+ %nop1882 = alloca i1, i1 0
+ %nop1883 = alloca i1, i1 0
+ %nop1884 = alloca i1, i1 0
+ %nop1885 = alloca i1, i1 0
+ %nop1886 = alloca i1, i1 0
+ %nop1887 = alloca i1, i1 0
+ %nop1888 = alloca i1, i1 0
+ %nop1889 = alloca i1, i1 0
+ %nop1890 = alloca i1, i1 0
+ %nop1891 = alloca i1, i1 0
+ %nop1892 = alloca i1, i1 0
+ %nop1893 = alloca i1, i1 0
+ %nop1894 = alloca i1, i1 0
+ %nop1895 = alloca i1, i1 0
+ %nop1896 = alloca i1, i1 0
+ %nop1897 = alloca i1, i1 0
+ %nop1898 = alloca i1, i1 0
+ %nop1899 = alloca i1, i1 0
+ %nop1900 = alloca i1, i1 0
+ %nop1901 = alloca i1, i1 0
+ %nop1902 = alloca i1, i1 0
+ %nop1903 = alloca i1, i1 0
+ %nop1904 = alloca i1, i1 0
+ %nop1905 = alloca i1, i1 0
+ %nop1906 = alloca i1, i1 0
+ %nop1907 = alloca i1, i1 0
+ %nop1908 = alloca i1, i1 0
+ %nop1909 = alloca i1, i1 0
+ %nop1910 = alloca i1, i1 0
+ %nop1911 = alloca i1, i1 0
+ %nop1912 = alloca i1, i1 0
+ %nop1913 = alloca i1, i1 0
+ %nop1914 = alloca i1, i1 0
+ %nop1915 = alloca i1, i1 0
+ %nop1916 = alloca i1, i1 0
+ %nop1917 = alloca i1, i1 0
+ %nop1918 = alloca i1, i1 0
+ %nop1919 = alloca i1, i1 0
+ %nop1920 = alloca i1, i1 0
+ %nop1921 = alloca i1, i1 0
+ %nop1922 = alloca i1, i1 0
+ %nop1923 = alloca i1, i1 0
+ %nop1924 = alloca i1, i1 0
+ %nop1925 = alloca i1, i1 0
+ %nop1926 = alloca i1, i1 0
+ %nop1927 = alloca i1, i1 0
+ %nop1928 = alloca i1, i1 0
+ %nop1929 = alloca i1, i1 0
+ %nop1930 = alloca i1, i1 0
+ %nop1931 = alloca i1, i1 0
+ %nop1932 = alloca i1, i1 0
+ %nop1933 = alloca i1, i1 0
+ %nop1934 = alloca i1, i1 0
+ %nop1935 = alloca i1, i1 0
+ %nop1936 = alloca i1, i1 0
+ %nop1937 = alloca i1, i1 0
+ %nop1938 = alloca i1, i1 0
+ %nop1939 = alloca i1, i1 0
+ %nop1940 = alloca i1, i1 0
+ %nop1941 = alloca i1, i1 0
+ %nop1942 = alloca i1, i1 0
+ %nop1943 = alloca i1, i1 0
+ %nop1944 = alloca i1, i1 0
+ %nop1945 = alloca i1, i1 0
+ %nop1946 = alloca i1, i1 0
+ %nop1947 = alloca i1, i1 0
+ %nop1948 = alloca i1, i1 0
+ %nop1949 = alloca i1, i1 0
+ %nop1950 = alloca i1, i1 0
+ %nop1951 = alloca i1, i1 0
+ %nop1952 = alloca i1, i1 0
+ %nop1953 = alloca i1, i1 0
+ %nop1954 = alloca i1, i1 0
+ %nop1955 = alloca i1, i1 0
+ %nop1956 = alloca i1, i1 0
+ %nop1957 = alloca i1, i1 0
+ %nop1958 = alloca i1, i1 0
+ %nop1959 = alloca i1, i1 0
+ %nop1960 = alloca i1, i1 0
+ %nop1961 = alloca i1, i1 0
+ %nop1962 = alloca i1, i1 0
+ %nop1963 = alloca i1, i1 0
+ %nop1964 = alloca i1, i1 0
+ %nop1965 = alloca i1, i1 0
+ %nop1966 = alloca i1, i1 0
+ %nop1967 = alloca i1, i1 0
+ %nop1968 = alloca i1, i1 0
+ %nop1969 = alloca i1, i1 0
+ %nop1970 = alloca i1, i1 0
+ %nop1971 = alloca i1, i1 0
+ %nop1972 = alloca i1, i1 0
+ %nop1973 = alloca i1, i1 0
+ %nop1974 = alloca i1, i1 0
+ %nop1975 = alloca i1, i1 0
+ %nop1976 = alloca i1, i1 0
+ %nop1977 = alloca i1, i1 0
+ %nop1978 = alloca i1, i1 0
+ %nop1979 = alloca i1, i1 0
+ %nop1980 = alloca i1, i1 0
+ %nop1981 = alloca i1, i1 0
+ %nop1982 = alloca i1, i1 0
+ %nop1983 = alloca i1, i1 0
+ %nop1984 = alloca i1, i1 0
+ %nop1985 = alloca i1, i1 0
+ %nop1986 = alloca i1, i1 0
+ %nop1987 = alloca i1, i1 0
+ %nop1988 = alloca i1, i1 0
+ %nop1989 = alloca i1, i1 0
+ %nop1990 = alloca i1, i1 0
+ %nop1991 = alloca i1, i1 0
+ %nop1992 = alloca i1, i1 0
+ %nop1993 = alloca i1, i1 0
+ %nop1994 = alloca i1, i1 0
+ %nop1995 = alloca i1, i1 0
+ %nop1996 = alloca i1, i1 0
+ %nop1997 = alloca i1, i1 0
+ %nop1998 = alloca i1, i1 0
+ %nop1999 = alloca i1, i1 0
+ %nop2000 = alloca i1, i1 0
+ %nop2001 = alloca i1, i1 0
+ %nop2002 = alloca i1, i1 0
+ %nop2003 = alloca i1, i1 0
+ %nop2004 = alloca i1, i1 0
+ %nop2005 = alloca i1, i1 0
+ %nop2006 = alloca i1, i1 0
+ %nop2007 = alloca i1, i1 0
+ %nop2008 = alloca i1, i1 0
+ %nop2009 = alloca i1, i1 0
+ %nop2010 = alloca i1, i1 0
+ %nop2011 = alloca i1, i1 0
+ %nop2012 = alloca i1, i1 0
+ %nop2013 = alloca i1, i1 0
+ %nop2014 = alloca i1, i1 0
+ %nop2015 = alloca i1, i1 0
+ %nop2016 = alloca i1, i1 0
+ %nop2017 = alloca i1, i1 0
+ %nop2018 = alloca i1, i1 0
+ %nop2019 = alloca i1, i1 0
+ %nop2020 = alloca i1, i1 0
+ %nop2021 = alloca i1, i1 0
+ %nop2022 = alloca i1, i1 0
+ %nop2023 = alloca i1, i1 0
+ %nop2024 = alloca i1, i1 0
+ %nop2025 = alloca i1, i1 0
+ %nop2026 = alloca i1, i1 0
+ %nop2027 = alloca i1, i1 0
+ %nop2028 = alloca i1, i1 0
+ %nop2029 = alloca i1, i1 0
+ %nop2030 = alloca i1, i1 0
+ %nop2031 = alloca i1, i1 0
+ %nop2032 = alloca i1, i1 0
+ %nop2033 = alloca i1, i1 0
+ %nop2034 = alloca i1, i1 0
+ %nop2035 = alloca i1, i1 0
+ %nop2036 = alloca i1, i1 0
+ %nop2037 = alloca i1, i1 0
+ %nop2038 = alloca i1, i1 0
+ %nop2039 = alloca i1, i1 0
+ %nop2040 = alloca i1, i1 0
+ %nop2041 = alloca i1, i1 0
+ %nop2042 = alloca i1, i1 0
+ %nop2043 = alloca i1, i1 0
+ %nop2044 = alloca i1, i1 0
+ %nop2045 = alloca i1, i1 0
+ %nop2046 = alloca i1, i1 0
+ %nop2047 = alloca i1, i1 0
+ %nop2048 = alloca i1, i1 0
+ %nop2049 = alloca i1, i1 0
+ %nop2050 = alloca i1, i1 0
+ %nop2051 = alloca i1, i1 0
+ %nop2052 = alloca i1, i1 0
+ %nop2053 = alloca i1, i1 0
+ %nop2054 = alloca i1, i1 0
+ %nop2055 = alloca i1, i1 0
+ %nop2056 = alloca i1, i1 0
+ %nop2057 = alloca i1, i1 0
+ %nop2058 = alloca i1, i1 0
+ %nop2059 = alloca i1, i1 0
+ %nop2060 = alloca i1, i1 0
+ %nop2061 = alloca i1, i1 0
+ %nop2062 = alloca i1, i1 0
+ %nop2063 = alloca i1, i1 0
+ %nop2064 = alloca i1, i1 0
+ %nop2065 = alloca i1, i1 0
+ %nop2066 = alloca i1, i1 0
+ %nop2067 = alloca i1, i1 0
+ %nop2068 = alloca i1, i1 0
+ %nop2069 = alloca i1, i1 0
+ %nop2070 = alloca i1, i1 0
+ %nop2071 = alloca i1, i1 0
+ %nop2072 = alloca i1, i1 0
+ %nop2073 = alloca i1, i1 0
+ %nop2074 = alloca i1, i1 0
+ %nop2075 = alloca i1, i1 0
+ %nop2076 = alloca i1, i1 0
+ %nop2077 = alloca i1, i1 0
+ %nop2078 = alloca i1, i1 0
+ %nop2079 = alloca i1, i1 0
+ %nop2080 = alloca i1, i1 0
+ %nop2081 = alloca i1, i1 0
+ %nop2082 = alloca i1, i1 0
+ %nop2083 = alloca i1, i1 0
+ %nop2084 = alloca i1, i1 0
+ %nop2085 = alloca i1, i1 0
+ %nop2086 = alloca i1, i1 0
+ %nop2087 = alloca i1, i1 0
+ %nop2088 = alloca i1, i1 0
+ %nop2089 = alloca i1, i1 0
+ %nop2090 = alloca i1, i1 0
+ %nop2091 = alloca i1, i1 0
+ %nop2092 = alloca i1, i1 0
+ %nop2093 = alloca i1, i1 0
+ %nop2094 = alloca i1, i1 0
+ %nop2095 = alloca i1, i1 0
+ %nop2096 = alloca i1, i1 0
+ %nop2097 = alloca i1, i1 0
+ %nop2098 = alloca i1, i1 0
+ %nop2099 = alloca i1, i1 0
+ %nop2100 = alloca i1, i1 0
+ %nop2101 = alloca i1, i1 0
+ %nop2102 = alloca i1, i1 0
+ %nop2103 = alloca i1, i1 0
+ %nop2104 = alloca i1, i1 0
+ %nop2105 = alloca i1, i1 0
+ %nop2106 = alloca i1, i1 0
+ %nop2107 = alloca i1, i1 0
+ %nop2108 = alloca i1, i1 0
+ %nop2109 = alloca i1, i1 0
+ %nop2110 = alloca i1, i1 0
+ %nop2111 = alloca i1, i1 0
+ %nop2112 = alloca i1, i1 0
+ %nop2113 = alloca i1, i1 0
+ %nop2114 = alloca i1, i1 0
+ %nop2115 = alloca i1, i1 0
+ %nop2116 = alloca i1, i1 0
+ %nop2117 = alloca i1, i1 0
+ %nop2118 = alloca i1, i1 0
+ %nop2119 = alloca i1, i1 0
+ %nop2120 = alloca i1, i1 0
+ %nop2121 = alloca i1, i1 0
+ %nop2122 = alloca i1, i1 0
+ %nop2123 = alloca i1, i1 0
+ %nop2124 = alloca i1, i1 0
+ %nop2125 = alloca i1, i1 0
+ %nop2126 = alloca i1, i1 0
+ %nop2127 = alloca i1, i1 0
+ %nop2128 = alloca i1, i1 0
+ %nop2129 = alloca i1, i1 0
+ %nop2130 = alloca i1, i1 0
+ %nop2131 = alloca i1, i1 0
+ %nop2132 = alloca i1, i1 0
+ %nop2133 = alloca i1, i1 0
+ %nop2134 = alloca i1, i1 0
+ %nop2135 = alloca i1, i1 0
+ %nop2136 = alloca i1, i1 0
+ %nop2137 = alloca i1, i1 0
+ %nop2138 = alloca i1, i1 0
+ %nop2139 = alloca i1, i1 0
+ %nop2140 = alloca i1, i1 0
+ %nop2141 = alloca i1, i1 0
+ %nop2142 = alloca i1, i1 0
+ %nop2143 = alloca i1, i1 0
+ %nop2144 = alloca i1, i1 0
+ %nop2145 = alloca i1, i1 0
+ %nop2146 = alloca i1, i1 0
+ %nop2147 = alloca i1, i1 0
+ %nop2148 = alloca i1, i1 0
+ %nop2149 = alloca i1, i1 0
+ %nop2150 = alloca i1, i1 0
+ %nop2151 = alloca i1, i1 0
+ %nop2152 = alloca i1, i1 0
+ %nop2153 = alloca i1, i1 0
+ %nop2154 = alloca i1, i1 0
+ %nop2155 = alloca i1, i1 0
+ %nop2156 = alloca i1, i1 0
+ %nop2157 = alloca i1, i1 0
+ %nop2158 = alloca i1, i1 0
+ %nop2159 = alloca i1, i1 0
+ %nop2160 = alloca i1, i1 0
+ %nop2161 = alloca i1, i1 0
+ %nop2162 = alloca i1, i1 0
+ %nop2163 = alloca i1, i1 0
+ %nop2164 = alloca i1, i1 0
+ %nop2165 = alloca i1, i1 0
+ %nop2166 = alloca i1, i1 0
+ %nop2167 = alloca i1, i1 0
+ %nop2168 = alloca i1, i1 0
+ %nop2169 = alloca i1, i1 0
+ %nop2170 = alloca i1, i1 0
+ %nop2171 = alloca i1, i1 0
+ %nop2172 = alloca i1, i1 0
+ %nop2173 = alloca i1, i1 0
+ %nop2174 = alloca i1, i1 0
+ %nop2175 = alloca i1, i1 0
+ %nop2176 = alloca i1, i1 0
+ %nop2177 = alloca i1, i1 0
+ %nop2178 = alloca i1, i1 0
+ %nop2179 = alloca i1, i1 0
+ %nop2180 = alloca i1, i1 0
+ %nop2181 = alloca i1, i1 0
+ %nop2182 = alloca i1, i1 0
+ %nop2183 = alloca i1, i1 0
+ %nop2184 = alloca i1, i1 0
+ %nop2185 = alloca i1, i1 0
+ %nop2186 = alloca i1, i1 0
+ %nop2187 = alloca i1, i1 0
+ %nop2188 = alloca i1, i1 0
+ %nop2189 = alloca i1, i1 0
+ %nop2190 = alloca i1, i1 0
+ %nop2191 = alloca i1, i1 0
+ %nop2192 = alloca i1, i1 0
+ %nop2193 = alloca i1, i1 0
+ %nop2194 = alloca i1, i1 0
+ %nop2195 = alloca i1, i1 0
+ %nop2196 = alloca i1, i1 0
+ %nop2197 = alloca i1, i1 0
+ %nop2198 = alloca i1, i1 0
+ %nop2199 = alloca i1, i1 0
+ %nop2200 = alloca i1, i1 0
+ %nop2201 = alloca i1, i1 0
+ %nop2202 = alloca i1, i1 0
+ %nop2203 = alloca i1, i1 0
+ %nop2204 = alloca i1, i1 0
+ %nop2205 = alloca i1, i1 0
+ %nop2206 = alloca i1, i1 0
+ %nop2207 = alloca i1, i1 0
+ %nop2208 = alloca i1, i1 0
+ %nop2209 = alloca i1, i1 0
+ %nop2210 = alloca i1, i1 0
+ %nop2211 = alloca i1, i1 0
+ %nop2212 = alloca i1, i1 0
+ %nop2213 = alloca i1, i1 0
+ %nop2214 = alloca i1, i1 0
+ %nop2215 = alloca i1, i1 0
+ %nop2216 = alloca i1, i1 0
+ %nop2217 = alloca i1, i1 0
+ %nop2218 = alloca i1, i1 0
+ %nop2219 = alloca i1, i1 0
+ %nop2220 = alloca i1, i1 0
+ %nop2221 = alloca i1, i1 0
+ %nop2222 = alloca i1, i1 0
+ %nop2223 = alloca i1, i1 0
+ %nop2224 = alloca i1, i1 0
+ %nop2225 = alloca i1, i1 0
+ %nop2226 = alloca i1, i1 0
+ %nop2227 = alloca i1, i1 0
+ %nop2228 = alloca i1, i1 0
+ %nop2229 = alloca i1, i1 0
+ %nop2230 = alloca i1, i1 0
+ %nop2231 = alloca i1, i1 0
+ %nop2232 = alloca i1, i1 0
+ %nop2233 = alloca i1, i1 0
+ %nop2234 = alloca i1, i1 0
+ %nop2235 = alloca i1, i1 0
+ %nop2236 = alloca i1, i1 0
+ %nop2237 = alloca i1, i1 0
+ %nop2238 = alloca i1, i1 0
+ %nop2239 = alloca i1, i1 0
+ %nop2240 = alloca i1, i1 0
+ %nop2241 = alloca i1, i1 0
+ %nop2242 = alloca i1, i1 0
+ %nop2243 = alloca i1, i1 0
+ %nop2244 = alloca i1, i1 0
+ %nop2245 = alloca i1, i1 0
+ %nop2246 = alloca i1, i1 0
+ %nop2247 = alloca i1, i1 0
+ %nop2248 = alloca i1, i1 0
+ %nop2249 = alloca i1, i1 0
+ %nop2250 = alloca i1, i1 0
+ %nop2251 = alloca i1, i1 0
+ %nop2252 = alloca i1, i1 0
+ %nop2253 = alloca i1, i1 0
+ %nop2254 = alloca i1, i1 0
+ %nop2255 = alloca i1, i1 0
+ %nop2256 = alloca i1, i1 0
+ %nop2257 = alloca i1, i1 0
+ %nop2258 = alloca i1, i1 0
+ %nop2259 = alloca i1, i1 0
+ %nop2260 = alloca i1, i1 0
+ %nop2261 = alloca i1, i1 0
+ %nop2262 = alloca i1, i1 0
+ %nop2263 = alloca i1, i1 0
+ %nop2264 = alloca i1, i1 0
+ %nop2265 = alloca i1, i1 0
+ %nop2266 = alloca i1, i1 0
+ %nop2267 = alloca i1, i1 0
+ %nop2268 = alloca i1, i1 0
+ %nop2269 = alloca i1, i1 0
+ %nop2270 = alloca i1, i1 0
+ %nop2271 = alloca i1, i1 0
+ %nop2272 = alloca i1, i1 0
+ %nop2273 = alloca i1, i1 0
+ %nop2274 = alloca i1, i1 0
+ %nop2275 = alloca i1, i1 0
+ %nop2276 = alloca i1, i1 0
+ %nop2277 = alloca i1, i1 0
+ %nop2278 = alloca i1, i1 0
+ %nop2279 = alloca i1, i1 0
+ %nop2280 = alloca i1, i1 0
+ %nop2281 = alloca i1, i1 0
+ %nop2282 = alloca i1, i1 0
+ %nop2283 = alloca i1, i1 0
+ %nop2284 = alloca i1, i1 0
+ %nop2285 = alloca i1, i1 0
+ %nop2286 = alloca i1, i1 0
+ %nop2287 = alloca i1, i1 0
+ %nop2288 = alloca i1, i1 0
+ %nop2289 = alloca i1, i1 0
+ %nop2290 = alloca i1, i1 0
+ %nop2291 = alloca i1, i1 0
+ %nop2292 = alloca i1, i1 0
+ %nop2293 = alloca i1, i1 0
+ %nop2294 = alloca i1, i1 0
+ %nop2295 = alloca i1, i1 0
+ %nop2296 = alloca i1, i1 0
+ %nop2297 = alloca i1, i1 0
+ %nop2298 = alloca i1, i1 0
+ %nop2299 = alloca i1, i1 0
+ %nop2300 = alloca i1, i1 0
+ %nop2301 = alloca i1, i1 0
+ %nop2302 = alloca i1, i1 0
+ %nop2303 = alloca i1, i1 0
+ %nop2304 = alloca i1, i1 0
+ %nop2305 = alloca i1, i1 0
+ %nop2306 = alloca i1, i1 0
+ %nop2307 = alloca i1, i1 0
+ %nop2308 = alloca i1, i1 0
+ %nop2309 = alloca i1, i1 0
+ %nop2310 = alloca i1, i1 0
+ %nop2311 = alloca i1, i1 0
+ %nop2312 = alloca i1, i1 0
+ %nop2313 = alloca i1, i1 0
+ %nop2314 = alloca i1, i1 0
+ %nop2315 = alloca i1, i1 0
+ %nop2316 = alloca i1, i1 0
+ %nop2317 = alloca i1, i1 0
+ %nop2318 = alloca i1, i1 0
+ %nop2319 = alloca i1, i1 0
+ %nop2320 = alloca i1, i1 0
+ %nop2321 = alloca i1, i1 0
+ %nop2322 = alloca i1, i1 0
+ %nop2323 = alloca i1, i1 0
+ %nop2324 = alloca i1, i1 0
+ %nop2325 = alloca i1, i1 0
+ %nop2326 = alloca i1, i1 0
+ %nop2327 = alloca i1, i1 0
+ %nop2328 = alloca i1, i1 0
+ %nop2329 = alloca i1, i1 0
+ %nop2330 = alloca i1, i1 0
+ %nop2331 = alloca i1, i1 0
+ %nop2332 = alloca i1, i1 0
+ %nop2333 = alloca i1, i1 0
+ %nop2334 = alloca i1, i1 0
+ %nop2335 = alloca i1, i1 0
+ %nop2336 = alloca i1, i1 0
+ %nop2337 = alloca i1, i1 0
+ %nop2338 = alloca i1, i1 0
+ %nop2339 = alloca i1, i1 0
+ %nop2340 = alloca i1, i1 0
+ %nop2341 = alloca i1, i1 0
+ %nop2342 = alloca i1, i1 0
+ %nop2343 = alloca i1, i1 0
+ %nop2344 = alloca i1, i1 0
+ %nop2345 = alloca i1, i1 0
+ %nop2346 = alloca i1, i1 0
+ %nop2347 = alloca i1, i1 0
+ %nop2348 = alloca i1, i1 0
+ %nop2349 = alloca i1, i1 0
+ %nop2350 = alloca i1, i1 0
+ %nop2351 = alloca i1, i1 0
+ %nop2352 = alloca i1, i1 0
+ %nop2353 = alloca i1, i1 0
+ %nop2354 = alloca i1, i1 0
+ %nop2355 = alloca i1, i1 0
+ %nop2356 = alloca i1, i1 0
+ %nop2357 = alloca i1, i1 0
+ %nop2358 = alloca i1, i1 0
+ %nop2359 = alloca i1, i1 0
+ %nop2360 = alloca i1, i1 0
+ %nop2361 = alloca i1, i1 0
+ %nop2362 = alloca i1, i1 0
+ %nop2363 = alloca i1, i1 0
+ %nop2364 = alloca i1, i1 0
+ %nop2365 = alloca i1, i1 0
+ %nop2366 = alloca i1, i1 0
+ %nop2367 = alloca i1, i1 0
+ %nop2368 = alloca i1, i1 0
+ %nop2369 = alloca i1, i1 0
+ %nop2370 = alloca i1, i1 0
+ %nop2371 = alloca i1, i1 0
+ %nop2372 = alloca i1, i1 0
+ %nop2373 = alloca i1, i1 0
+ %nop2374 = alloca i1, i1 0
+ %nop2375 = alloca i1, i1 0
+ %nop2376 = alloca i1, i1 0
+ %nop2377 = alloca i1, i1 0
+ %nop2378 = alloca i1, i1 0
+ %nop2379 = alloca i1, i1 0
+ %nop2380 = alloca i1, i1 0
+ %nop2381 = alloca i1, i1 0
+ %nop2382 = alloca i1, i1 0
+ %nop2383 = alloca i1, i1 0
+ %nop2384 = alloca i1, i1 0
+ %nop2385 = alloca i1, i1 0
+ %nop2386 = alloca i1, i1 0
+ %nop2387 = alloca i1, i1 0
+ %nop2388 = alloca i1, i1 0
+ %nop2389 = alloca i1, i1 0
+ %nop2390 = alloca i1, i1 0
+ %nop2391 = alloca i1, i1 0
+ %nop2392 = alloca i1, i1 0
+ %nop2393 = alloca i1, i1 0
+ %nop2394 = alloca i1, i1 0
+ %nop2395 = alloca i1, i1 0
+ %nop2396 = alloca i1, i1 0
+ %nop2397 = alloca i1, i1 0
+ %nop2398 = alloca i1, i1 0
+ %nop2399 = alloca i1, i1 0
+ %nop2400 = alloca i1, i1 0
+ %nop2401 = alloca i1, i1 0
+ %nop2402 = alloca i1, i1 0
+ %nop2403 = alloca i1, i1 0
+ %nop2404 = alloca i1, i1 0
+ %nop2405 = alloca i1, i1 0
+ %nop2406 = alloca i1, i1 0
+ %nop2407 = alloca i1, i1 0
+ %nop2408 = alloca i1, i1 0
+ %nop2409 = alloca i1, i1 0
+ %nop2410 = alloca i1, i1 0
+ %nop2411 = alloca i1, i1 0
+ %nop2412 = alloca i1, i1 0
+ %nop2413 = alloca i1, i1 0
+ %nop2414 = alloca i1, i1 0
+ %nop2415 = alloca i1, i1 0
+ %nop2416 = alloca i1, i1 0
+ %nop2417 = alloca i1, i1 0
+ %nop2418 = alloca i1, i1 0
+ %nop2419 = alloca i1, i1 0
+ %nop2420 = alloca i1, i1 0
+ %nop2421 = alloca i1, i1 0
+ %nop2422 = alloca i1, i1 0
+ %nop2423 = alloca i1, i1 0
+ %nop2424 = alloca i1, i1 0
+ %nop2425 = alloca i1, i1 0
+ %nop2426 = alloca i1, i1 0
+ %nop2427 = alloca i1, i1 0
+ %nop2428 = alloca i1, i1 0
+ %nop2429 = alloca i1, i1 0
+ %nop2430 = alloca i1, i1 0
+ %nop2431 = alloca i1, i1 0
+ %nop2432 = alloca i1, i1 0
+ %nop2433 = alloca i1, i1 0
+ %nop2434 = alloca i1, i1 0
+ %nop2435 = alloca i1, i1 0
+ %nop2436 = alloca i1, i1 0
+ %nop2437 = alloca i1, i1 0
+ %nop2438 = alloca i1, i1 0
+ %nop2439 = alloca i1, i1 0
+ %nop2440 = alloca i1, i1 0
+ %nop2441 = alloca i1, i1 0
+ %nop2442 = alloca i1, i1 0
+ %nop2443 = alloca i1, i1 0
+ %nop2444 = alloca i1, i1 0
+ %nop2445 = alloca i1, i1 0
+ %nop2446 = alloca i1, i1 0
+ %nop2447 = alloca i1, i1 0
+ %nop2448 = alloca i1, i1 0
+ %nop2449 = alloca i1, i1 0
+ %nop2450 = alloca i1, i1 0
+ %nop2451 = alloca i1, i1 0
+ %nop2452 = alloca i1, i1 0
+ %nop2453 = alloca i1, i1 0
+ %nop2454 = alloca i1, i1 0
+ %nop2455 = alloca i1, i1 0
+ %nop2456 = alloca i1, i1 0
+ %nop2457 = alloca i1, i1 0
+ %nop2458 = alloca i1, i1 0
+ %nop2459 = alloca i1, i1 0
+ %nop2460 = alloca i1, i1 0
+ %nop2461 = alloca i1, i1 0
+ %nop2462 = alloca i1, i1 0
+ %nop2463 = alloca i1, i1 0
+ %nop2464 = alloca i1, i1 0
+ %nop2465 = alloca i1, i1 0
+ %nop2466 = alloca i1, i1 0
+ %nop2467 = alloca i1, i1 0
+ %nop2468 = alloca i1, i1 0
+ %nop2469 = alloca i1, i1 0
+ %nop2470 = alloca i1, i1 0
+ %nop2471 = alloca i1, i1 0
+ %nop2472 = alloca i1, i1 0
+ %nop2473 = alloca i1, i1 0
+ %nop2474 = alloca i1, i1 0
+ %nop2475 = alloca i1, i1 0
+ %nop2476 = alloca i1, i1 0
+ %nop2477 = alloca i1, i1 0
+ %nop2478 = alloca i1, i1 0
+ %nop2479 = alloca i1, i1 0
+ %nop2480 = alloca i1, i1 0
+ %nop2481 = alloca i1, i1 0
+ %nop2482 = alloca i1, i1 0
+ %nop2483 = alloca i1, i1 0
+ %nop2484 = alloca i1, i1 0
+ %nop2485 = alloca i1, i1 0
+ %nop2486 = alloca i1, i1 0
+ %nop2487 = alloca i1, i1 0
+ %nop2488 = alloca i1, i1 0
+ %nop2489 = alloca i1, i1 0
+ %nop2490 = alloca i1, i1 0
+ %nop2491 = alloca i1, i1 0
+ %nop2492 = alloca i1, i1 0
+ %nop2493 = alloca i1, i1 0
+ %nop2494 = alloca i1, i1 0
+ %nop2495 = alloca i1, i1 0
+ %nop2496 = alloca i1, i1 0
+ %nop2497 = alloca i1, i1 0
+ %nop2498 = alloca i1, i1 0
+ %nop2499 = alloca i1, i1 0
+ %nop2500 = alloca i1, i1 0
+ %nop2501 = alloca i1, i1 0
+ %nop2502 = alloca i1, i1 0
+ %nop2503 = alloca i1, i1 0
+ %nop2504 = alloca i1, i1 0
+ %nop2505 = alloca i1, i1 0
+ %nop2506 = alloca i1, i1 0
+ %nop2507 = alloca i1, i1 0
+ %nop2508 = alloca i1, i1 0
+ %nop2509 = alloca i1, i1 0
+ %nop2510 = alloca i1, i1 0
+ %nop2511 = alloca i1, i1 0
+ %nop2512 = alloca i1, i1 0
+ %nop2513 = alloca i1, i1 0
+ %nop2514 = alloca i1, i1 0
+ %nop2515 = alloca i1, i1 0
+ %nop2516 = alloca i1, i1 0
+ %nop2517 = alloca i1, i1 0
+ %nop2518 = alloca i1, i1 0
+ %nop2519 = alloca i1, i1 0
+ %nop2520 = alloca i1, i1 0
+ %nop2521 = alloca i1, i1 0
+ %nop2522 = alloca i1, i1 0
+ %nop2523 = alloca i1, i1 0
+ %nop2524 = alloca i1, i1 0
+ %nop2525 = alloca i1, i1 0
+ %nop2526 = alloca i1, i1 0
+ %nop2527 = alloca i1, i1 0
+ %nop2528 = alloca i1, i1 0
+ %nop2529 = alloca i1, i1 0
+ %nop2530 = alloca i1, i1 0
+ %nop2531 = alloca i1, i1 0
+ %nop2532 = alloca i1, i1 0
+ %nop2533 = alloca i1, i1 0
+ %nop2534 = alloca i1, i1 0
+ %nop2535 = alloca i1, i1 0
+ %nop2536 = alloca i1, i1 0
+ %nop2537 = alloca i1, i1 0
+ %nop2538 = alloca i1, i1 0
+ %nop2539 = alloca i1, i1 0
+ %nop2540 = alloca i1, i1 0
+ %nop2541 = alloca i1, i1 0
+ %nop2542 = alloca i1, i1 0
+ %nop2543 = alloca i1, i1 0
+ %nop2544 = alloca i1, i1 0
+ %nop2545 = alloca i1, i1 0
+ %nop2546 = alloca i1, i1 0
+ %nop2547 = alloca i1, i1 0
+ %nop2548 = alloca i1, i1 0
+ %nop2549 = alloca i1, i1 0
+ %nop2550 = alloca i1, i1 0
+ %nop2551 = alloca i1, i1 0
+ %nop2552 = alloca i1, i1 0
+ %nop2553 = alloca i1, i1 0
+ %nop2554 = alloca i1, i1 0
+ %nop2555 = alloca i1, i1 0
+ %nop2556 = alloca i1, i1 0
+ %nop2557 = alloca i1, i1 0
+ %nop2558 = alloca i1, i1 0
+ %nop2559 = alloca i1, i1 0
+ %nop2560 = alloca i1, i1 0
+ %nop2561 = alloca i1, i1 0
+ %nop2562 = alloca i1, i1 0
+ %nop2563 = alloca i1, i1 0
+ %nop2564 = alloca i1, i1 0
+ %nop2565 = alloca i1, i1 0
+ %nop2566 = alloca i1, i1 0
+ %nop2567 = alloca i1, i1 0
+ %nop2568 = alloca i1, i1 0
+ %nop2569 = alloca i1, i1 0
+ %nop2570 = alloca i1, i1 0
+ %nop2571 = alloca i1, i1 0
+ %nop2572 = alloca i1, i1 0
+ %nop2573 = alloca i1, i1 0
+ %nop2574 = alloca i1, i1 0
+ %nop2575 = alloca i1, i1 0
+ %nop2576 = alloca i1, i1 0
+ %nop2577 = alloca i1, i1 0
+ %nop2578 = alloca i1, i1 0
+ %nop2579 = alloca i1, i1 0
+ %nop2580 = alloca i1, i1 0
+ %nop2581 = alloca i1, i1 0
+ %nop2582 = alloca i1, i1 0
+ %nop2583 = alloca i1, i1 0
+ %nop2584 = alloca i1, i1 0
+ %nop2585 = alloca i1, i1 0
+ %nop2586 = alloca i1, i1 0
+ %nop2587 = alloca i1, i1 0
+ %nop2588 = alloca i1, i1 0
+ %nop2589 = alloca i1, i1 0
+ %nop2590 = alloca i1, i1 0
+ %nop2591 = alloca i1, i1 0
+ %nop2592 = alloca i1, i1 0
+ %nop2593 = alloca i1, i1 0
+ %nop2594 = alloca i1, i1 0
+ %nop2595 = alloca i1, i1 0
+ %nop2596 = alloca i1, i1 0
+ %nop2597 = alloca i1, i1 0
+ %nop2598 = alloca i1, i1 0
+ %nop2599 = alloca i1, i1 0
+ %nop2600 = alloca i1, i1 0
+ %nop2601 = alloca i1, i1 0
+ %nop2602 = alloca i1, i1 0
+ %nop2603 = alloca i1, i1 0
+ %nop2604 = alloca i1, i1 0
+ %nop2605 = alloca i1, i1 0
+ %nop2606 = alloca i1, i1 0
+ %nop2607 = alloca i1, i1 0
+ %nop2608 = alloca i1, i1 0
+ %nop2609 = alloca i1, i1 0
+ %nop2610 = alloca i1, i1 0
+ %nop2611 = alloca i1, i1 0
+ %nop2612 = alloca i1, i1 0
+ %nop2613 = alloca i1, i1 0
+ %nop2614 = alloca i1, i1 0
+ %nop2615 = alloca i1, i1 0
+ %nop2616 = alloca i1, i1 0
+ %nop2617 = alloca i1, i1 0
+ %nop2618 = alloca i1, i1 0
+ %nop2619 = alloca i1, i1 0
+ %nop2620 = alloca i1, i1 0
+ %nop2621 = alloca i1, i1 0
+ %nop2622 = alloca i1, i1 0
+ %nop2623 = alloca i1, i1 0
+ %nop2624 = alloca i1, i1 0
+ %nop2625 = alloca i1, i1 0
+ %nop2626 = alloca i1, i1 0
+ %nop2627 = alloca i1, i1 0
+ %nop2628 = alloca i1, i1 0
+ %nop2629 = alloca i1, i1 0
+ %nop2630 = alloca i1, i1 0
+ %nop2631 = alloca i1, i1 0
+ %nop2632 = alloca i1, i1 0
+ %nop2633 = alloca i1, i1 0
+ %nop2634 = alloca i1, i1 0
+ %nop2635 = alloca i1, i1 0
+ %nop2636 = alloca i1, i1 0
+ %nop2637 = alloca i1, i1 0
+ %nop2638 = alloca i1, i1 0
+ %nop2639 = alloca i1, i1 0
+ %nop2640 = alloca i1, i1 0
+ %nop2641 = alloca i1, i1 0
+ %nop2642 = alloca i1, i1 0
+ %nop2643 = alloca i1, i1 0
+ %nop2644 = alloca i1, i1 0
+ %nop2645 = alloca i1, i1 0
+ %nop2646 = alloca i1, i1 0
+ %nop2647 = alloca i1, i1 0
+ %nop2648 = alloca i1, i1 0
+ %nop2649 = alloca i1, i1 0
+ %nop2650 = alloca i1, i1 0
+ %nop2651 = alloca i1, i1 0
+ %nop2652 = alloca i1, i1 0
+ %nop2653 = alloca i1, i1 0
+ %nop2654 = alloca i1, i1 0
+ %nop2655 = alloca i1, i1 0
+ %nop2656 = alloca i1, i1 0
+ %nop2657 = alloca i1, i1 0
+ %nop2658 = alloca i1, i1 0
+ %nop2659 = alloca i1, i1 0
+ %nop2660 = alloca i1, i1 0
+ %nop2661 = alloca i1, i1 0
+ %nop2662 = alloca i1, i1 0
+ %nop2663 = alloca i1, i1 0
+ %nop2664 = alloca i1, i1 0
+ %nop2665 = alloca i1, i1 0
+ %nop2666 = alloca i1, i1 0
+ %nop2667 = alloca i1, i1 0
+ %nop2668 = alloca i1, i1 0
+ %nop2669 = alloca i1, i1 0
+ %nop2670 = alloca i1, i1 0
+ %nop2671 = alloca i1, i1 0
+ %nop2672 = alloca i1, i1 0
+ %nop2673 = alloca i1, i1 0
+ %nop2674 = alloca i1, i1 0
+ %nop2675 = alloca i1, i1 0
+ %nop2676 = alloca i1, i1 0
+ %nop2677 = alloca i1, i1 0
+ %nop2678 = alloca i1, i1 0
+ %nop2679 = alloca i1, i1 0
+ %nop2680 = alloca i1, i1 0
+ %nop2681 = alloca i1, i1 0
+ %nop2682 = alloca i1, i1 0
+ %nop2683 = alloca i1, i1 0
+ %nop2684 = alloca i1, i1 0
+ %nop2685 = alloca i1, i1 0
+ %nop2686 = alloca i1, i1 0
+ %nop2687 = alloca i1, i1 0
+ %nop2688 = alloca i1, i1 0
+ %nop2689 = alloca i1, i1 0
+ %nop2690 = alloca i1, i1 0
+ %nop2691 = alloca i1, i1 0
+ %nop2692 = alloca i1, i1 0
+ %nop2693 = alloca i1, i1 0
+ %nop2694 = alloca i1, i1 0
+ %nop2695 = alloca i1, i1 0
+ %nop2696 = alloca i1, i1 0
+ %nop2697 = alloca i1, i1 0
+ %nop2698 = alloca i1, i1 0
+ %nop2699 = alloca i1, i1 0
+ %nop2700 = alloca i1, i1 0
+ %nop2701 = alloca i1, i1 0
+ %nop2702 = alloca i1, i1 0
+ %nop2703 = alloca i1, i1 0
+ %nop2704 = alloca i1, i1 0
+ %nop2705 = alloca i1, i1 0
+ %nop2706 = alloca i1, i1 0
+ %nop2707 = alloca i1, i1 0
+ %nop2708 = alloca i1, i1 0
+ %nop2709 = alloca i1, i1 0
+ %nop2710 = alloca i1, i1 0
+ %nop2711 = alloca i1, i1 0
+ %nop2712 = alloca i1, i1 0
+ %nop2713 = alloca i1, i1 0
+ %nop2714 = alloca i1, i1 0
+ %nop2715 = alloca i1, i1 0
+ %nop2716 = alloca i1, i1 0
+ %nop2717 = alloca i1, i1 0
+ %nop2718 = alloca i1, i1 0
+ %nop2719 = alloca i1, i1 0
+ %nop2720 = alloca i1, i1 0
+ %nop2721 = alloca i1, i1 0
+ %nop2722 = alloca i1, i1 0
+ %nop2723 = alloca i1, i1 0
+ %nop2724 = alloca i1, i1 0
+ %nop2725 = alloca i1, i1 0
+ %nop2726 = alloca i1, i1 0
+ %nop2727 = alloca i1, i1 0
+ %nop2728 = alloca i1, i1 0
+ %nop2729 = alloca i1, i1 0
+ %nop2730 = alloca i1, i1 0
+ %nop2731 = alloca i1, i1 0
+ %nop2732 = alloca i1, i1 0
+ %nop2733 = alloca i1, i1 0
+ %nop2734 = alloca i1, i1 0
+ %nop2735 = alloca i1, i1 0
+ %nop2736 = alloca i1, i1 0
+ %nop2737 = alloca i1, i1 0
+ %nop2738 = alloca i1, i1 0
+ %nop2739 = alloca i1, i1 0
+ %nop2740 = alloca i1, i1 0
+ %nop2741 = alloca i1, i1 0
+ %nop2742 = alloca i1, i1 0
+ %nop2743 = alloca i1, i1 0
+ %nop2744 = alloca i1, i1 0
+ %nop2745 = alloca i1, i1 0
+ %nop2746 = alloca i1, i1 0
+ %nop2747 = alloca i1, i1 0
+ %nop2748 = alloca i1, i1 0
+ %nop2749 = alloca i1, i1 0
+ %nop2750 = alloca i1, i1 0
+ %nop2751 = alloca i1, i1 0
+ %nop2752 = alloca i1, i1 0
+ %nop2753 = alloca i1, i1 0
+ %nop2754 = alloca i1, i1 0
+ %nop2755 = alloca i1, i1 0
+ %nop2756 = alloca i1, i1 0
+ %nop2757 = alloca i1, i1 0
+ %nop2758 = alloca i1, i1 0
+ %nop2759 = alloca i1, i1 0
+ %nop2760 = alloca i1, i1 0
+ %nop2761 = alloca i1, i1 0
+ %nop2762 = alloca i1, i1 0
+ %nop2763 = alloca i1, i1 0
+ %nop2764 = alloca i1, i1 0
+ %nop2765 = alloca i1, i1 0
+ %nop2766 = alloca i1, i1 0
+ %nop2767 = alloca i1, i1 0
+ %nop2768 = alloca i1, i1 0
+ %nop2769 = alloca i1, i1 0
+ %nop2770 = alloca i1, i1 0
+ %nop2771 = alloca i1, i1 0
+ %nop2772 = alloca i1, i1 0
+ %nop2773 = alloca i1, i1 0
+ %nop2774 = alloca i1, i1 0
+ %nop2775 = alloca i1, i1 0
+ %nop2776 = alloca i1, i1 0
+ %nop2777 = alloca i1, i1 0
+ %nop2778 = alloca i1, i1 0
+ %nop2779 = alloca i1, i1 0
+ %nop2780 = alloca i1, i1 0
+ %nop2781 = alloca i1, i1 0
+ %nop2782 = alloca i1, i1 0
+ %nop2783 = alloca i1, i1 0
+ %nop2784 = alloca i1, i1 0
+ %nop2785 = alloca i1, i1 0
+ %nop2786 = alloca i1, i1 0
+ %nop2787 = alloca i1, i1 0
+ %nop2788 = alloca i1, i1 0
+ %nop2789 = alloca i1, i1 0
+ %nop2790 = alloca i1, i1 0
+ %nop2791 = alloca i1, i1 0
+ %nop2792 = alloca i1, i1 0
+ %nop2793 = alloca i1, i1 0
+ %nop2794 = alloca i1, i1 0
+ %nop2795 = alloca i1, i1 0
+ %nop2796 = alloca i1, i1 0
+ %nop2797 = alloca i1, i1 0
+ %nop2798 = alloca i1, i1 0
+ %nop2799 = alloca i1, i1 0
+ %nop2800 = alloca i1, i1 0
+ %nop2801 = alloca i1, i1 0
+ %nop2802 = alloca i1, i1 0
+ %nop2803 = alloca i1, i1 0
+ %nop2804 = alloca i1, i1 0
+ %nop2805 = alloca i1, i1 0
+ %nop2806 = alloca i1, i1 0
+ %nop2807 = alloca i1, i1 0
+ %nop2808 = alloca i1, i1 0
+ %nop2809 = alloca i1, i1 0
+ %nop2810 = alloca i1, i1 0
+ %nop2811 = alloca i1, i1 0
+ %nop2812 = alloca i1, i1 0
+ %nop2813 = alloca i1, i1 0
+ %nop2814 = alloca i1, i1 0
+ %nop2815 = alloca i1, i1 0
+ %nop2816 = alloca i1, i1 0
+ %nop2817 = alloca i1, i1 0
+ %nop2818 = alloca i1, i1 0
+ %nop2819 = alloca i1, i1 0
+ %nop2820 = alloca i1, i1 0
+ %nop2821 = alloca i1, i1 0
+ %nop2822 = alloca i1, i1 0
+ %nop2823 = alloca i1, i1 0
+ %nop2824 = alloca i1, i1 0
+ %nop2825 = alloca i1, i1 0
+ %nop2826 = alloca i1, i1 0
+ %nop2827 = alloca i1, i1 0
+ %nop2828 = alloca i1, i1 0
+ %nop2829 = alloca i1, i1 0
+ %nop2830 = alloca i1, i1 0
+ %nop2831 = alloca i1, i1 0
+ %nop2832 = alloca i1, i1 0
+ %nop2833 = alloca i1, i1 0
+ %nop2834 = alloca i1, i1 0
+ %nop2835 = alloca i1, i1 0
+ %nop2836 = alloca i1, i1 0
+ %nop2837 = alloca i1, i1 0
+ %nop2838 = alloca i1, i1 0
+ %nop2839 = alloca i1, i1 0
+ %nop2840 = alloca i1, i1 0
+ %nop2841 = alloca i1, i1 0
+ %nop2842 = alloca i1, i1 0
+ %nop2843 = alloca i1, i1 0
+ %nop2844 = alloca i1, i1 0
+ %nop2845 = alloca i1, i1 0
+ %nop2846 = alloca i1, i1 0
+ %nop2847 = alloca i1, i1 0
+ %nop2848 = alloca i1, i1 0
+ %nop2849 = alloca i1, i1 0
+ %nop2850 = alloca i1, i1 0
+ %nop2851 = alloca i1, i1 0
+ %nop2852 = alloca i1, i1 0
+ %nop2853 = alloca i1, i1 0
+ %nop2854 = alloca i1, i1 0
+ %nop2855 = alloca i1, i1 0
+ %nop2856 = alloca i1, i1 0
+ %nop2857 = alloca i1, i1 0
+ %nop2858 = alloca i1, i1 0
+ %nop2859 = alloca i1, i1 0
+ %nop2860 = alloca i1, i1 0
+ %nop2861 = alloca i1, i1 0
+ %nop2862 = alloca i1, i1 0
+ %nop2863 = alloca i1, i1 0
+ %nop2864 = alloca i1, i1 0
+ %nop2865 = alloca i1, i1 0
+ %nop2866 = alloca i1, i1 0
+ %nop2867 = alloca i1, i1 0
+ %nop2868 = alloca i1, i1 0
+ %nop2869 = alloca i1, i1 0
+ %nop2870 = alloca i1, i1 0
+ %nop2871 = alloca i1, i1 0
+ %nop2872 = alloca i1, i1 0
+ %nop2873 = alloca i1, i1 0
+ %nop2874 = alloca i1, i1 0
+ %nop2875 = alloca i1, i1 0
+ %nop2876 = alloca i1, i1 0
+ %nop2877 = alloca i1, i1 0
+ %nop2878 = alloca i1, i1 0
+ %nop2879 = alloca i1, i1 0
+ %nop2880 = alloca i1, i1 0
+ %nop2881 = alloca i1, i1 0
+ %nop2882 = alloca i1, i1 0
+ %nop2883 = alloca i1, i1 0
+ %nop2884 = alloca i1, i1 0
+ %nop2885 = alloca i1, i1 0
+ %nop2886 = alloca i1, i1 0
+ %nop2887 = alloca i1, i1 0
+ %nop2888 = alloca i1, i1 0
+ %nop2889 = alloca i1, i1 0
+ %nop2890 = alloca i1, i1 0
+ %nop2891 = alloca i1, i1 0
+ %nop2892 = alloca i1, i1 0
+ %nop2893 = alloca i1, i1 0
+ %nop2894 = alloca i1, i1 0
+ %nop2895 = alloca i1, i1 0
+ %nop2896 = alloca i1, i1 0
+ %nop2897 = alloca i1, i1 0
+ %nop2898 = alloca i1, i1 0
+ %nop2899 = alloca i1, i1 0
+ %nop2900 = alloca i1, i1 0
+ %nop2901 = alloca i1, i1 0
+ %nop2902 = alloca i1, i1 0
+ %nop2903 = alloca i1, i1 0
+ %nop2904 = alloca i1, i1 0
+ %nop2905 = alloca i1, i1 0
+ %nop2906 = alloca i1, i1 0
+ %nop2907 = alloca i1, i1 0
+ %nop2908 = alloca i1, i1 0
+ %nop2909 = alloca i1, i1 0
+ %nop2910 = alloca i1, i1 0
+ %nop2911 = alloca i1, i1 0
+ %nop2912 = alloca i1, i1 0
+ %nop2913 = alloca i1, i1 0
+ %nop2914 = alloca i1, i1 0
+ %nop2915 = alloca i1, i1 0
+ %nop2916 = alloca i1, i1 0
+ %nop2917 = alloca i1, i1 0
+ %nop2918 = alloca i1, i1 0
+ %nop2919 = alloca i1, i1 0
+ %nop2920 = alloca i1, i1 0
+ %nop2921 = alloca i1, i1 0
+ %nop2922 = alloca i1, i1 0
+ %nop2923 = alloca i1, i1 0
+ %nop2924 = alloca i1, i1 0
+ %nop2925 = alloca i1, i1 0
+ %nop2926 = alloca i1, i1 0
+ %nop2927 = alloca i1, i1 0
+ %nop2928 = alloca i1, i1 0
+ %nop2929 = alloca i1, i1 0
+ %nop2930 = alloca i1, i1 0
+ %nop2931 = alloca i1, i1 0
+ %nop2932 = alloca i1, i1 0
+ %nop2933 = alloca i1, i1 0
+ %nop2934 = alloca i1, i1 0
+ %nop2935 = alloca i1, i1 0
+ %nop2936 = alloca i1, i1 0
+ %nop2937 = alloca i1, i1 0
+ %nop2938 = alloca i1, i1 0
+ %nop2939 = alloca i1, i1 0
+ %nop2940 = alloca i1, i1 0
+ %nop2941 = alloca i1, i1 0
+ %nop2942 = alloca i1, i1 0
+ %nop2943 = alloca i1, i1 0
+ %nop2944 = alloca i1, i1 0
+ %nop2945 = alloca i1, i1 0
+ %nop2946 = alloca i1, i1 0
+ %nop2947 = alloca i1, i1 0
+ %nop2948 = alloca i1, i1 0
+ %nop2949 = alloca i1, i1 0
+ %nop2950 = alloca i1, i1 0
+ %nop2951 = alloca i1, i1 0
+ %nop2952 = alloca i1, i1 0
+ %nop2953 = alloca i1, i1 0
+ %nop2954 = alloca i1, i1 0
+ %nop2955 = alloca i1, i1 0
+ %nop2956 = alloca i1, i1 0
+ %nop2957 = alloca i1, i1 0
+ %nop2958 = alloca i1, i1 0
+ %nop2959 = alloca i1, i1 0
+ %nop2960 = alloca i1, i1 0
+ %nop2961 = alloca i1, i1 0
+ %nop2962 = alloca i1, i1 0
+ %nop2963 = alloca i1, i1 0
+ %nop2964 = alloca i1, i1 0
+ %nop2965 = alloca i1, i1 0
+ %nop2966 = alloca i1, i1 0
+ %nop2967 = alloca i1, i1 0
+ %nop2968 = alloca i1, i1 0
+ %nop2969 = alloca i1, i1 0
+ %nop2970 = alloca i1, i1 0
+ %nop2971 = alloca i1, i1 0
+ %nop2972 = alloca i1, i1 0
+ %nop2973 = alloca i1, i1 0
+ %nop2974 = alloca i1, i1 0
+ %nop2975 = alloca i1, i1 0
+ %nop2976 = alloca i1, i1 0
+ %nop2977 = alloca i1, i1 0
+ %nop2978 = alloca i1, i1 0
+ %nop2979 = alloca i1, i1 0
+ %nop2980 = alloca i1, i1 0
+ %nop2981 = alloca i1, i1 0
+ %nop2982 = alloca i1, i1 0
+ %nop2983 = alloca i1, i1 0
+ %nop2984 = alloca i1, i1 0
+ %nop2985 = alloca i1, i1 0
+ %nop2986 = alloca i1, i1 0
+ %nop2987 = alloca i1, i1 0
+ %nop2988 = alloca i1, i1 0
+ %nop2989 = alloca i1, i1 0
+ %nop2990 = alloca i1, i1 0
+ %nop2991 = alloca i1, i1 0
+ %nop2992 = alloca i1, i1 0
+ %nop2993 = alloca i1, i1 0
+ %nop2994 = alloca i1, i1 0
+ %nop2995 = alloca i1, i1 0
+ %nop2996 = alloca i1, i1 0
+ %nop2997 = alloca i1, i1 0
+ %nop2998 = alloca i1, i1 0
+ %nop2999 = alloca i1, i1 0
+ %nop3000 = alloca i1, i1 0
+ %nop3001 = alloca i1, i1 0
+ %nop3002 = alloca i1, i1 0
+ %nop3003 = alloca i1, i1 0
+ %nop3004 = alloca i1, i1 0
+ %nop3005 = alloca i1, i1 0
+ %nop3006 = alloca i1, i1 0
+ %nop3007 = alloca i1, i1 0
+ %nop3008 = alloca i1, i1 0
+ %nop3009 = alloca i1, i1 0
+ %nop3010 = alloca i1, i1 0
+ %nop3011 = alloca i1, i1 0
+ %nop3012 = alloca i1, i1 0
+ %nop3013 = alloca i1, i1 0
+ %nop3014 = alloca i1, i1 0
+ %nop3015 = alloca i1, i1 0
+ %nop3016 = alloca i1, i1 0
+ %nop3017 = alloca i1, i1 0
+ %nop3018 = alloca i1, i1 0
+ %nop3019 = alloca i1, i1 0
+ %nop3020 = alloca i1, i1 0
+ %nop3021 = alloca i1, i1 0
+ %nop3022 = alloca i1, i1 0
+ %nop3023 = alloca i1, i1 0
+ %nop3024 = alloca i1, i1 0
+ %nop3025 = alloca i1, i1 0
+ %nop3026 = alloca i1, i1 0
+ %nop3027 = alloca i1, i1 0
+ %nop3028 = alloca i1, i1 0
+ %nop3029 = alloca i1, i1 0
+ %nop3030 = alloca i1, i1 0
+ %nop3031 = alloca i1, i1 0
+ %nop3032 = alloca i1, i1 0
+ %nop3033 = alloca i1, i1 0
+ %nop3034 = alloca i1, i1 0
+ %nop3035 = alloca i1, i1 0
+ %nop3036 = alloca i1, i1 0
+ %nop3037 = alloca i1, i1 0
+ %nop3038 = alloca i1, i1 0
+ %nop3039 = alloca i1, i1 0
+ %nop3040 = alloca i1, i1 0
+ %nop3041 = alloca i1, i1 0
+ %nop3042 = alloca i1, i1 0
+ %nop3043 = alloca i1, i1 0
+ %nop3044 = alloca i1, i1 0
+ %nop3045 = alloca i1, i1 0
+ %nop3046 = alloca i1, i1 0
+ %nop3047 = alloca i1, i1 0
+ %nop3048 = alloca i1, i1 0
+ %nop3049 = alloca i1, i1 0
+ %nop3050 = alloca i1, i1 0
+ %nop3051 = alloca i1, i1 0
+ %nop3052 = alloca i1, i1 0
+ %nop3053 = alloca i1, i1 0
+ %nop3054 = alloca i1, i1 0
+ %nop3055 = alloca i1, i1 0
+ %nop3056 = alloca i1, i1 0
+ %nop3057 = alloca i1, i1 0
+ %nop3058 = alloca i1, i1 0
+ %nop3059 = alloca i1, i1 0
+ %nop3060 = alloca i1, i1 0
+ %nop3061 = alloca i1, i1 0
+ %nop3062 = alloca i1, i1 0
+ %nop3063 = alloca i1, i1 0
+ %nop3064 = alloca i1, i1 0
+ %nop3065 = alloca i1, i1 0
+ %nop3066 = alloca i1, i1 0
+ %nop3067 = alloca i1, i1 0
+ %nop3068 = alloca i1, i1 0
+ %nop3069 = alloca i1, i1 0
+ %nop3070 = alloca i1, i1 0
+ %nop3071 = alloca i1, i1 0
+ %nop3072 = alloca i1, i1 0
+ %nop3073 = alloca i1, i1 0
+ %nop3074 = alloca i1, i1 0
+ %nop3075 = alloca i1, i1 0
+ %nop3076 = alloca i1, i1 0
+ %nop3077 = alloca i1, i1 0
+ %nop3078 = alloca i1, i1 0
+ %nop3079 = alloca i1, i1 0
+ %nop3080 = alloca i1, i1 0
+ %nop3081 = alloca i1, i1 0
+ %nop3082 = alloca i1, i1 0
+ %nop3083 = alloca i1, i1 0
+ %nop3084 = alloca i1, i1 0
+ %nop3085 = alloca i1, i1 0
+ %nop3086 = alloca i1, i1 0
+ %nop3087 = alloca i1, i1 0
+ %nop3088 = alloca i1, i1 0
+ %nop3089 = alloca i1, i1 0
+ %nop3090 = alloca i1, i1 0
+ %nop3091 = alloca i1, i1 0
+ %nop3092 = alloca i1, i1 0
+ %nop3093 = alloca i1, i1 0
+ %nop3094 = alloca i1, i1 0
+ %nop3095 = alloca i1, i1 0
+ %nop3096 = alloca i1, i1 0
+ %nop3097 = alloca i1, i1 0
+ %nop3098 = alloca i1, i1 0
+ %nop3099 = alloca i1, i1 0
+ %nop3100 = alloca i1, i1 0
+ %nop3101 = alloca i1, i1 0
+ %nop3102 = alloca i1, i1 0
+ %nop3103 = alloca i1, i1 0
+ %nop3104 = alloca i1, i1 0
+ %nop3105 = alloca i1, i1 0
+ %nop3106 = alloca i1, i1 0
+ %nop3107 = alloca i1, i1 0
+ %nop3108 = alloca i1, i1 0
+ %nop3109 = alloca i1, i1 0
+ %nop3110 = alloca i1, i1 0
+ %nop3111 = alloca i1, i1 0
+ %nop3112 = alloca i1, i1 0
+ %nop3113 = alloca i1, i1 0
+ %nop3114 = alloca i1, i1 0
+ %nop3115 = alloca i1, i1 0
+ %nop3116 = alloca i1, i1 0
+ %nop3117 = alloca i1, i1 0
+ %nop3118 = alloca i1, i1 0
+ %nop3119 = alloca i1, i1 0
+ %nop3120 = alloca i1, i1 0
+ %nop3121 = alloca i1, i1 0
+ %nop3122 = alloca i1, i1 0
+ %nop3123 = alloca i1, i1 0
+ %nop3124 = alloca i1, i1 0
+ %nop3125 = alloca i1, i1 0
+ %nop3126 = alloca i1, i1 0
+ %nop3127 = alloca i1, i1 0
+ %nop3128 = alloca i1, i1 0
+ %nop3129 = alloca i1, i1 0
+ %nop3130 = alloca i1, i1 0
+ %nop3131 = alloca i1, i1 0
+ %nop3132 = alloca i1, i1 0
+ %nop3133 = alloca i1, i1 0
+ %nop3134 = alloca i1, i1 0
+ %nop3135 = alloca i1, i1 0
+ %nop3136 = alloca i1, i1 0
+ %nop3137 = alloca i1, i1 0
+ %nop3138 = alloca i1, i1 0
+ %nop3139 = alloca i1, i1 0
+ %nop3140 = alloca i1, i1 0
+ %nop3141 = alloca i1, i1 0
+ %nop3142 = alloca i1, i1 0
+ %nop3143 = alloca i1, i1 0
+ %nop3144 = alloca i1, i1 0
+ %nop3145 = alloca i1, i1 0
+ %nop3146 = alloca i1, i1 0
+ %nop3147 = alloca i1, i1 0
+ %nop3148 = alloca i1, i1 0
+ %nop3149 = alloca i1, i1 0
+ %nop3150 = alloca i1, i1 0
+ %nop3151 = alloca i1, i1 0
+ %nop3152 = alloca i1, i1 0
+ %nop3153 = alloca i1, i1 0
+ %nop3154 = alloca i1, i1 0
+ %nop3155 = alloca i1, i1 0
+ %nop3156 = alloca i1, i1 0
+ %nop3157 = alloca i1, i1 0
+ %nop3158 = alloca i1, i1 0
+ %nop3159 = alloca i1, i1 0
+ %nop3160 = alloca i1, i1 0
+ %nop3161 = alloca i1, i1 0
+ %nop3162 = alloca i1, i1 0
+ %nop3163 = alloca i1, i1 0
+ %nop3164 = alloca i1, i1 0
+ %nop3165 = alloca i1, i1 0
+ %nop3166 = alloca i1, i1 0
+ %nop3167 = alloca i1, i1 0
+ %nop3168 = alloca i1, i1 0
+ %nop3169 = alloca i1, i1 0
+ %nop3170 = alloca i1, i1 0
+ %nop3171 = alloca i1, i1 0
+ %nop3172 = alloca i1, i1 0
+ %nop3173 = alloca i1, i1 0
+ %nop3174 = alloca i1, i1 0
+ %nop3175 = alloca i1, i1 0
+ %nop3176 = alloca i1, i1 0
+ %nop3177 = alloca i1, i1 0
+ %nop3178 = alloca i1, i1 0
+ %nop3179 = alloca i1, i1 0
+ %nop3180 = alloca i1, i1 0
+ %nop3181 = alloca i1, i1 0
+ %nop3182 = alloca i1, i1 0
+ %nop3183 = alloca i1, i1 0
+ %nop3184 = alloca i1, i1 0
+ %nop3185 = alloca i1, i1 0
+ %nop3186 = alloca i1, i1 0
+ %nop3187 = alloca i1, i1 0
+ %nop3188 = alloca i1, i1 0
+ %nop3189 = alloca i1, i1 0
+ %nop3190 = alloca i1, i1 0
+ %nop3191 = alloca i1, i1 0
+ %nop3192 = alloca i1, i1 0
+ %nop3193 = alloca i1, i1 0
+ %nop3194 = alloca i1, i1 0
+ %nop3195 = alloca i1, i1 0
+ %nop3196 = alloca i1, i1 0
+ %nop3197 = alloca i1, i1 0
+ %nop3198 = alloca i1, i1 0
+ %nop3199 = alloca i1, i1 0
+ %nop3200 = alloca i1, i1 0
+ %nop3201 = alloca i1, i1 0
+ %nop3202 = alloca i1, i1 0
+ %nop3203 = alloca i1, i1 0
+ %nop3204 = alloca i1, i1 0
+ %nop3205 = alloca i1, i1 0
+ %nop3206 = alloca i1, i1 0
+ %nop3207 = alloca i1, i1 0
+ %nop3208 = alloca i1, i1 0
+ %nop3209 = alloca i1, i1 0
+ %nop3210 = alloca i1, i1 0
+ %nop3211 = alloca i1, i1 0
+ %nop3212 = alloca i1, i1 0
+ %nop3213 = alloca i1, i1 0
+ %nop3214 = alloca i1, i1 0
+ %nop3215 = alloca i1, i1 0
+ %nop3216 = alloca i1, i1 0
+ %nop3217 = alloca i1, i1 0
+ %nop3218 = alloca i1, i1 0
+ %nop3219 = alloca i1, i1 0
+ %nop3220 = alloca i1, i1 0
+ %nop3221 = alloca i1, i1 0
+ %nop3222 = alloca i1, i1 0
+ %nop3223 = alloca i1, i1 0
+ %nop3224 = alloca i1, i1 0
+ %nop3225 = alloca i1, i1 0
+ %nop3226 = alloca i1, i1 0
+ %nop3227 = alloca i1, i1 0
+ %nop3228 = alloca i1, i1 0
+ %nop3229 = alloca i1, i1 0
+ %nop3230 = alloca i1, i1 0
+ %nop3231 = alloca i1, i1 0
+ %nop3232 = alloca i1, i1 0
+ %nop3233 = alloca i1, i1 0
+ %nop3234 = alloca i1, i1 0
+ %nop3235 = alloca i1, i1 0
+ %nop3236 = alloca i1, i1 0
+ %nop3237 = alloca i1, i1 0
+ %nop3238 = alloca i1, i1 0
+ %nop3239 = alloca i1, i1 0
+ %nop3240 = alloca i1, i1 0
+ %nop3241 = alloca i1, i1 0
+ %nop3242 = alloca i1, i1 0
+ %nop3243 = alloca i1, i1 0
+ %nop3244 = alloca i1, i1 0
+ %nop3245 = alloca i1, i1 0
+ %nop3246 = alloca i1, i1 0
+ %nop3247 = alloca i1, i1 0
+ %nop3248 = alloca i1, i1 0
+ %nop3249 = alloca i1, i1 0
+ %nop3250 = alloca i1, i1 0
+ %nop3251 = alloca i1, i1 0
+ %nop3252 = alloca i1, i1 0
+ %nop3253 = alloca i1, i1 0
+ %nop3254 = alloca i1, i1 0
+ %nop3255 = alloca i1, i1 0
+ %nop3256 = alloca i1, i1 0
+ %nop3257 = alloca i1, i1 0
+ %nop3258 = alloca i1, i1 0
+ %nop3259 = alloca i1, i1 0
+ %nop3260 = alloca i1, i1 0
+ %nop3261 = alloca i1, i1 0
+ %nop3262 = alloca i1, i1 0
+ %nop3263 = alloca i1, i1 0
+ %nop3264 = alloca i1, i1 0
+ %nop3265 = alloca i1, i1 0
+ %nop3266 = alloca i1, i1 0
+ %nop3267 = alloca i1, i1 0
+ %nop3268 = alloca i1, i1 0
+ %nop3269 = alloca i1, i1 0
+ %nop3270 = alloca i1, i1 0
+ %nop3271 = alloca i1, i1 0
+ %nop3272 = alloca i1, i1 0
+ %nop3273 = alloca i1, i1 0
+ %nop3274 = alloca i1, i1 0
+ %nop3275 = alloca i1, i1 0
+ %nop3276 = alloca i1, i1 0
+ %nop3277 = alloca i1, i1 0
+ %nop3278 = alloca i1, i1 0
+ %nop3279 = alloca i1, i1 0
+ %nop3280 = alloca i1, i1 0
+ %nop3281 = alloca i1, i1 0
+ %nop3282 = alloca i1, i1 0
+ %nop3283 = alloca i1, i1 0
+ %nop3284 = alloca i1, i1 0
+ %nop3285 = alloca i1, i1 0
+ %nop3286 = alloca i1, i1 0
+ %nop3287 = alloca i1, i1 0
+ %nop3288 = alloca i1, i1 0
+ %nop3289 = alloca i1, i1 0
+ %nop3290 = alloca i1, i1 0
+ %nop3291 = alloca i1, i1 0
+ %nop3292 = alloca i1, i1 0
+ %nop3293 = alloca i1, i1 0
+ %nop3294 = alloca i1, i1 0
+ %nop3295 = alloca i1, i1 0
+ %nop3296 = alloca i1, i1 0
+ %nop3297 = alloca i1, i1 0
+ %nop3298 = alloca i1, i1 0
+ %nop3299 = alloca i1, i1 0
+ %nop3300 = alloca i1, i1 0
+ %nop3301 = alloca i1, i1 0
+ %nop3302 = alloca i1, i1 0
+ %nop3303 = alloca i1, i1 0
+ %nop3304 = alloca i1, i1 0
+ %nop3305 = alloca i1, i1 0
+ %nop3306 = alloca i1, i1 0
+ %nop3307 = alloca i1, i1 0
+ %nop3308 = alloca i1, i1 0
+ %nop3309 = alloca i1, i1 0
+ %nop3310 = alloca i1, i1 0
+ %nop3311 = alloca i1, i1 0
+ %nop3312 = alloca i1, i1 0
+ %nop3313 = alloca i1, i1 0
+ %nop3314 = alloca i1, i1 0
+ %nop3315 = alloca i1, i1 0
+ %nop3316 = alloca i1, i1 0
+ %nop3317 = alloca i1, i1 0
+ %nop3318 = alloca i1, i1 0
+ %nop3319 = alloca i1, i1 0
+ %nop3320 = alloca i1, i1 0
+ %nop3321 = alloca i1, i1 0
+ %nop3322 = alloca i1, i1 0
+ %nop3323 = alloca i1, i1 0
+ %nop3324 = alloca i1, i1 0
+ %nop3325 = alloca i1, i1 0
+ %nop3326 = alloca i1, i1 0
+ %nop3327 = alloca i1, i1 0
+ %nop3328 = alloca i1, i1 0
+ %nop3329 = alloca i1, i1 0
+ %nop3330 = alloca i1, i1 0
+ %nop3331 = alloca i1, i1 0
+ %nop3332 = alloca i1, i1 0
+ %nop3333 = alloca i1, i1 0
+ %nop3334 = alloca i1, i1 0
+ %nop3335 = alloca i1, i1 0
+ %nop3336 = alloca i1, i1 0
+ %nop3337 = alloca i1, i1 0
+ %nop3338 = alloca i1, i1 0
+ %nop3339 = alloca i1, i1 0
+ %nop3340 = alloca i1, i1 0
+ %nop3341 = alloca i1, i1 0
+ %nop3342 = alloca i1, i1 0
+ %nop3343 = alloca i1, i1 0
+ %nop3344 = alloca i1, i1 0
+ %nop3345 = alloca i1, i1 0
+ %nop3346 = alloca i1, i1 0
+ %nop3347 = alloca i1, i1 0
+ %nop3348 = alloca i1, i1 0
+ %nop3349 = alloca i1, i1 0
+ %nop3350 = alloca i1, i1 0
+ %nop3351 = alloca i1, i1 0
+ %nop3352 = alloca i1, i1 0
+ %nop3353 = alloca i1, i1 0
+ %nop3354 = alloca i1, i1 0
+ %nop3355 = alloca i1, i1 0
+ %nop3356 = alloca i1, i1 0
+ %nop3357 = alloca i1, i1 0
+ %nop3358 = alloca i1, i1 0
+ %nop3359 = alloca i1, i1 0
+ %nop3360 = alloca i1, i1 0
+ %nop3361 = alloca i1, i1 0
+ %nop3362 = alloca i1, i1 0
+ %nop3363 = alloca i1, i1 0
+ %nop3364 = alloca i1, i1 0
+ %nop3365 = alloca i1, i1 0
+ %nop3366 = alloca i1, i1 0
+ %nop3367 = alloca i1, i1 0
+ %nop3368 = alloca i1, i1 0
+ %nop3369 = alloca i1, i1 0
+ %nop3370 = alloca i1, i1 0
+ %nop3371 = alloca i1, i1 0
+ %nop3372 = alloca i1, i1 0
+ %nop3373 = alloca i1, i1 0
+ %nop3374 = alloca i1, i1 0
+ %nop3375 = alloca i1, i1 0
+ %nop3376 = alloca i1, i1 0
+ %nop3377 = alloca i1, i1 0
+ %nop3378 = alloca i1, i1 0
+ %nop3379 = alloca i1, i1 0
+ %nop3380 = alloca i1, i1 0
+ %nop3381 = alloca i1, i1 0
+ %nop3382 = alloca i1, i1 0
+ %nop3383 = alloca i1, i1 0
+ %nop3384 = alloca i1, i1 0
+ %nop3385 = alloca i1, i1 0
+ %nop3386 = alloca i1, i1 0
+ %nop3387 = alloca i1, i1 0
+ %nop3388 = alloca i1, i1 0
+ %nop3389 = alloca i1, i1 0
+ %nop3390 = alloca i1, i1 0
+ %nop3391 = alloca i1, i1 0
+ %nop3392 = alloca i1, i1 0
+ %nop3393 = alloca i1, i1 0
+ %nop3394 = alloca i1, i1 0
+ %nop3395 = alloca i1, i1 0
+ %nop3396 = alloca i1, i1 0
+ %nop3397 = alloca i1, i1 0
+ %nop3398 = alloca i1, i1 0
+ %nop3399 = alloca i1, i1 0
+ %nop3400 = alloca i1, i1 0
+ %nop3401 = alloca i1, i1 0
+ %nop3402 = alloca i1, i1 0
+ %nop3403 = alloca i1, i1 0
+ %nop3404 = alloca i1, i1 0
+ %nop3405 = alloca i1, i1 0
+ %nop3406 = alloca i1, i1 0
+ %nop3407 = alloca i1, i1 0
+ %nop3408 = alloca i1, i1 0
+ %nop3409 = alloca i1, i1 0
+ %nop3410 = alloca i1, i1 0
+ %nop3411 = alloca i1, i1 0
+ %nop3412 = alloca i1, i1 0
+ %nop3413 = alloca i1, i1 0
+ %nop3414 = alloca i1, i1 0
+ %nop3415 = alloca i1, i1 0
+ %nop3416 = alloca i1, i1 0
+ %nop3417 = alloca i1, i1 0
+ %nop3418 = alloca i1, i1 0
+ %nop3419 = alloca i1, i1 0
+ %nop3420 = alloca i1, i1 0
+ %nop3421 = alloca i1, i1 0
+ %nop3422 = alloca i1, i1 0
+ %nop3423 = alloca i1, i1 0
+ %nop3424 = alloca i1, i1 0
+ %nop3425 = alloca i1, i1 0
+ %nop3426 = alloca i1, i1 0
+ %nop3427 = alloca i1, i1 0
+ %nop3428 = alloca i1, i1 0
+ %nop3429 = alloca i1, i1 0
+ %nop3430 = alloca i1, i1 0
+ %nop3431 = alloca i1, i1 0
+ %nop3432 = alloca i1, i1 0
+ %nop3433 = alloca i1, i1 0
+ %nop3434 = alloca i1, i1 0
+ %nop3435 = alloca i1, i1 0
+ %nop3436 = alloca i1, i1 0
+ %nop3437 = alloca i1, i1 0
+ %nop3438 = alloca i1, i1 0
+ %nop3439 = alloca i1, i1 0
+ %nop3440 = alloca i1, i1 0
+ %nop3441 = alloca i1, i1 0
+ %nop3442 = alloca i1, i1 0
+ %nop3443 = alloca i1, i1 0
+ %nop3444 = alloca i1, i1 0
+ %nop3445 = alloca i1, i1 0
+ %nop3446 = alloca i1, i1 0
+ %nop3447 = alloca i1, i1 0
+ %nop3448 = alloca i1, i1 0
+ %nop3449 = alloca i1, i1 0
+ %nop3450 = alloca i1, i1 0
+ %nop3451 = alloca i1, i1 0
+ %nop3452 = alloca i1, i1 0
+ %nop3453 = alloca i1, i1 0
+ %nop3454 = alloca i1, i1 0
+ %nop3455 = alloca i1, i1 0
+ %nop3456 = alloca i1, i1 0
+ %nop3457 = alloca i1, i1 0
+ %nop3458 = alloca i1, i1 0
+ %nop3459 = alloca i1, i1 0
+ %nop3460 = alloca i1, i1 0
+ %nop3461 = alloca i1, i1 0
+ %nop3462 = alloca i1, i1 0
+ %nop3463 = alloca i1, i1 0
+ %nop3464 = alloca i1, i1 0
+ %nop3465 = alloca i1, i1 0
+ %nop3466 = alloca i1, i1 0
+ %nop3467 = alloca i1, i1 0
+ %nop3468 = alloca i1, i1 0
+ %nop3469 = alloca i1, i1 0
+ %nop3470 = alloca i1, i1 0
+ %nop3471 = alloca i1, i1 0
+ %nop3472 = alloca i1, i1 0
+ %nop3473 = alloca i1, i1 0
+ %nop3474 = alloca i1, i1 0
+ %nop3475 = alloca i1, i1 0
+ %nop3476 = alloca i1, i1 0
+ %nop3477 = alloca i1, i1 0
+ %nop3478 = alloca i1, i1 0
+ %nop3479 = alloca i1, i1 0
+ %nop3480 = alloca i1, i1 0
+ %nop3481 = alloca i1, i1 0
+ %nop3482 = alloca i1, i1 0
+ %nop3483 = alloca i1, i1 0
+ %nop3484 = alloca i1, i1 0
+ %nop3485 = alloca i1, i1 0
+ %nop3486 = alloca i1, i1 0
+ %nop3487 = alloca i1, i1 0
+ %nop3488 = alloca i1, i1 0
+ %nop3489 = alloca i1, i1 0
+ %nop3490 = alloca i1, i1 0
+ %nop3491 = alloca i1, i1 0
+ %nop3492 = alloca i1, i1 0
+ %nop3493 = alloca i1, i1 0
+ %nop3494 = alloca i1, i1 0
+ %nop3495 = alloca i1, i1 0
+ %nop3496 = alloca i1, i1 0
+ %nop3497 = alloca i1, i1 0
+ %nop3498 = alloca i1, i1 0
+ %nop3499 = alloca i1, i1 0
+ %nop3500 = alloca i1, i1 0
+ %nop3501 = alloca i1, i1 0
+ %nop3502 = alloca i1, i1 0
+ %nop3503 = alloca i1, i1 0
+ %nop3504 = alloca i1, i1 0
+ %nop3505 = alloca i1, i1 0
+ %nop3506 = alloca i1, i1 0
+ %nop3507 = alloca i1, i1 0
+ %nop3508 = alloca i1, i1 0
+ %nop3509 = alloca i1, i1 0
+ %nop3510 = alloca i1, i1 0
+ %nop3511 = alloca i1, i1 0
+ %nop3512 = alloca i1, i1 0
+ %nop3513 = alloca i1, i1 0
+ %nop3514 = alloca i1, i1 0
+ %nop3515 = alloca i1, i1 0
+ %nop3516 = alloca i1, i1 0
+ %nop3517 = alloca i1, i1 0
+ %nop3518 = alloca i1, i1 0
+ %nop3519 = alloca i1, i1 0
+ %nop3520 = alloca i1, i1 0
+ %nop3521 = alloca i1, i1 0
+ %nop3522 = alloca i1, i1 0
+ %nop3523 = alloca i1, i1 0
+ %nop3524 = alloca i1, i1 0
+ %nop3525 = alloca i1, i1 0
+ %nop3526 = alloca i1, i1 0
+ %nop3527 = alloca i1, i1 0
+ %nop3528 = alloca i1, i1 0
+ %nop3529 = alloca i1, i1 0
+ %nop3530 = alloca i1, i1 0
+ %nop3531 = alloca i1, i1 0
+ %nop3532 = alloca i1, i1 0
+ %nop3533 = alloca i1, i1 0
+ %nop3534 = alloca i1, i1 0
+ %nop3535 = alloca i1, i1 0
+ %nop3536 = alloca i1, i1 0
+ %nop3537 = alloca i1, i1 0
+ %nop3538 = alloca i1, i1 0
+ %nop3539 = alloca i1, i1 0
+ %nop3540 = alloca i1, i1 0
+ %nop3541 = alloca i1, i1 0
+ %nop3542 = alloca i1, i1 0
+ %nop3543 = alloca i1, i1 0
+ %nop3544 = alloca i1, i1 0
+ %nop3545 = alloca i1, i1 0
+ %nop3546 = alloca i1, i1 0
+ %nop3547 = alloca i1, i1 0
+ %nop3548 = alloca i1, i1 0
+ %nop3549 = alloca i1, i1 0
+ %nop3550 = alloca i1, i1 0
+ %nop3551 = alloca i1, i1 0
+ %nop3552 = alloca i1, i1 0
+ %nop3553 = alloca i1, i1 0
+ %nop3554 = alloca i1, i1 0
+ %nop3555 = alloca i1, i1 0
+ %nop3556 = alloca i1, i1 0
+ %nop3557 = alloca i1, i1 0
+ %nop3558 = alloca i1, i1 0
+ %nop3559 = alloca i1, i1 0
+ %nop3560 = alloca i1, i1 0
+ %nop3561 = alloca i1, i1 0
+ %nop3562 = alloca i1, i1 0
+ %nop3563 = alloca i1, i1 0
+ %nop3564 = alloca i1, i1 0
+ %nop3565 = alloca i1, i1 0
+ %nop3566 = alloca i1, i1 0
+ %nop3567 = alloca i1, i1 0
+ %nop3568 = alloca i1, i1 0
+ %nop3569 = alloca i1, i1 0
+ %nop3570 = alloca i1, i1 0
+ %nop3571 = alloca i1, i1 0
+ %nop3572 = alloca i1, i1 0
+ %nop3573 = alloca i1, i1 0
+ %nop3574 = alloca i1, i1 0
+ %nop3575 = alloca i1, i1 0
+ %nop3576 = alloca i1, i1 0
+ %nop3577 = alloca i1, i1 0
+ %nop3578 = alloca i1, i1 0
+ %nop3579 = alloca i1, i1 0
+ %nop3580 = alloca i1, i1 0
+ %nop3581 = alloca i1, i1 0
+ %nop3582 = alloca i1, i1 0
+ %nop3583 = alloca i1, i1 0
+ %nop3584 = alloca i1, i1 0
+ %nop3585 = alloca i1, i1 0
+ %nop3586 = alloca i1, i1 0
+ %nop3587 = alloca i1, i1 0
+ %nop3588 = alloca i1, i1 0
+ %nop3589 = alloca i1, i1 0
+ %nop3590 = alloca i1, i1 0
+ %nop3591 = alloca i1, i1 0
+ %nop3592 = alloca i1, i1 0
+ %nop3593 = alloca i1, i1 0
+ %nop3594 = alloca i1, i1 0
+ %nop3595 = alloca i1, i1 0
+ %nop3596 = alloca i1, i1 0
+ %nop3597 = alloca i1, i1 0
+ %nop3598 = alloca i1, i1 0
+ %nop3599 = alloca i1, i1 0
+ %nop3600 = alloca i1, i1 0
+ %nop3601 = alloca i1, i1 0
+ %nop3602 = alloca i1, i1 0
+ %nop3603 = alloca i1, i1 0
+ %nop3604 = alloca i1, i1 0
+ %nop3605 = alloca i1, i1 0
+ %nop3606 = alloca i1, i1 0
+ %nop3607 = alloca i1, i1 0
+ %nop3608 = alloca i1, i1 0
+ %nop3609 = alloca i1, i1 0
+ %nop3610 = alloca i1, i1 0
+ %nop3611 = alloca i1, i1 0
+ %nop3612 = alloca i1, i1 0
+ %nop3613 = alloca i1, i1 0
+ %nop3614 = alloca i1, i1 0
+ %nop3615 = alloca i1, i1 0
+ %nop3616 = alloca i1, i1 0
+ %nop3617 = alloca i1, i1 0
+ %nop3618 = alloca i1, i1 0
+ %nop3619 = alloca i1, i1 0
+ %nop3620 = alloca i1, i1 0
+ %nop3621 = alloca i1, i1 0
+ %nop3622 = alloca i1, i1 0
+ %nop3623 = alloca i1, i1 0
+ %nop3624 = alloca i1, i1 0
+ %nop3625 = alloca i1, i1 0
+ %nop3626 = alloca i1, i1 0
+ %nop3627 = alloca i1, i1 0
+ %nop3628 = alloca i1, i1 0
+ %nop3629 = alloca i1, i1 0
+ %nop3630 = alloca i1, i1 0
+ %nop3631 = alloca i1, i1 0
+ %nop3632 = alloca i1, i1 0
+ %nop3633 = alloca i1, i1 0
+ %nop3634 = alloca i1, i1 0
+ %nop3635 = alloca i1, i1 0
+ %nop3636 = alloca i1, i1 0
+ %nop3637 = alloca i1, i1 0
+ %nop3638 = alloca i1, i1 0
+ %nop3639 = alloca i1, i1 0
+ %nop3640 = alloca i1, i1 0
+ %nop3641 = alloca i1, i1 0
+ %nop3642 = alloca i1, i1 0
+ %nop3643 = alloca i1, i1 0
+ %nop3644 = alloca i1, i1 0
+ %nop3645 = alloca i1, i1 0
+ %nop3646 = alloca i1, i1 0
+ %nop3647 = alloca i1, i1 0
+ %nop3648 = alloca i1, i1 0
+ %nop3649 = alloca i1, i1 0
+ %nop3650 = alloca i1, i1 0
+ %nop3651 = alloca i1, i1 0
+ %nop3652 = alloca i1, i1 0
+ %nop3653 = alloca i1, i1 0
+ %nop3654 = alloca i1, i1 0
+ %nop3655 = alloca i1, i1 0
+ %nop3656 = alloca i1, i1 0
+ %nop3657 = alloca i1, i1 0
+ %nop3658 = alloca i1, i1 0
+ %nop3659 = alloca i1, i1 0
+ %nop3660 = alloca i1, i1 0
+ %nop3661 = alloca i1, i1 0
+ %nop3662 = alloca i1, i1 0
+ %nop3663 = alloca i1, i1 0
+ %nop3664 = alloca i1, i1 0
+ %nop3665 = alloca i1, i1 0
+ %nop3666 = alloca i1, i1 0
+ %nop3667 = alloca i1, i1 0
+ %nop3668 = alloca i1, i1 0
+ %nop3669 = alloca i1, i1 0
+ %nop3670 = alloca i1, i1 0
+ %nop3671 = alloca i1, i1 0
+ %nop3672 = alloca i1, i1 0
+ %nop3673 = alloca i1, i1 0
+ %nop3674 = alloca i1, i1 0
+ %nop3675 = alloca i1, i1 0
+ %nop3676 = alloca i1, i1 0
+ %nop3677 = alloca i1, i1 0
+ %nop3678 = alloca i1, i1 0
+ %nop3679 = alloca i1, i1 0
+ %nop3680 = alloca i1, i1 0
+ %nop3681 = alloca i1, i1 0
+ %nop3682 = alloca i1, i1 0
+ %nop3683 = alloca i1, i1 0
+ %nop3684 = alloca i1, i1 0
+ %nop3685 = alloca i1, i1 0
+ %nop3686 = alloca i1, i1 0
+ %nop3687 = alloca i1, i1 0
+ %nop3688 = alloca i1, i1 0
+ %nop3689 = alloca i1, i1 0
+ %nop3690 = alloca i1, i1 0
+ %nop3691 = alloca i1, i1 0
+ %nop3692 = alloca i1, i1 0
+ %nop3693 = alloca i1, i1 0
+ %nop3694 = alloca i1, i1 0
+ %nop3695 = alloca i1, i1 0
+ %nop3696 = alloca i1, i1 0
+ %nop3697 = alloca i1, i1 0
+ %nop3698 = alloca i1, i1 0
+ %nop3699 = alloca i1, i1 0
+ %nop3700 = alloca i1, i1 0
+ %nop3701 = alloca i1, i1 0
+ %nop3702 = alloca i1, i1 0
+ %nop3703 = alloca i1, i1 0
+ %nop3704 = alloca i1, i1 0
+ %nop3705 = alloca i1, i1 0
+ %nop3706 = alloca i1, i1 0
+ %nop3707 = alloca i1, i1 0
+ %nop3708 = alloca i1, i1 0
+ %nop3709 = alloca i1, i1 0
+ %nop3710 = alloca i1, i1 0
+ %nop3711 = alloca i1, i1 0
+ %nop3712 = alloca i1, i1 0
+ %nop3713 = alloca i1, i1 0
+ %nop3714 = alloca i1, i1 0
+ %nop3715 = alloca i1, i1 0
+ %nop3716 = alloca i1, i1 0
+ %nop3717 = alloca i1, i1 0
+ %nop3718 = alloca i1, i1 0
+ %nop3719 = alloca i1, i1 0
+ %nop3720 = alloca i1, i1 0
+ %nop3721 = alloca i1, i1 0
+ %nop3722 = alloca i1, i1 0
+ %nop3723 = alloca i1, i1 0
+ %nop3724 = alloca i1, i1 0
+ %nop3725 = alloca i1, i1 0
+ %nop3726 = alloca i1, i1 0
+ %nop3727 = alloca i1, i1 0
+ %nop3728 = alloca i1, i1 0
+ %nop3729 = alloca i1, i1 0
+ %nop3730 = alloca i1, i1 0
+ %nop3731 = alloca i1, i1 0
+ %nop3732 = alloca i1, i1 0
+ %nop3733 = alloca i1, i1 0
+ %nop3734 = alloca i1, i1 0
+ %nop3735 = alloca i1, i1 0
+ %nop3736 = alloca i1, i1 0
+ %nop3737 = alloca i1, i1 0
+ %nop3738 = alloca i1, i1 0
+ %nop3739 = alloca i1, i1 0
+ %nop3740 = alloca i1, i1 0
+ %nop3741 = alloca i1, i1 0
+ %nop3742 = alloca i1, i1 0
+ %nop3743 = alloca i1, i1 0
+ %nop3744 = alloca i1, i1 0
+ %nop3745 = alloca i1, i1 0
+ %nop3746 = alloca i1, i1 0
+ %nop3747 = alloca i1, i1 0
+ %nop3748 = alloca i1, i1 0
+ %nop3749 = alloca i1, i1 0
+ %nop3750 = alloca i1, i1 0
+ %nop3751 = alloca i1, i1 0
+ %nop3752 = alloca i1, i1 0
+ %nop3753 = alloca i1, i1 0
+ %nop3754 = alloca i1, i1 0
+ %nop3755 = alloca i1, i1 0
+ %nop3756 = alloca i1, i1 0
+ %nop3757 = alloca i1, i1 0
+ %nop3758 = alloca i1, i1 0
+ %nop3759 = alloca i1, i1 0
+ %nop3760 = alloca i1, i1 0
+ %nop3761 = alloca i1, i1 0
+ %nop3762 = alloca i1, i1 0
+ %nop3763 = alloca i1, i1 0
+ %nop3764 = alloca i1, i1 0
+ %nop3765 = alloca i1, i1 0
+ %nop3766 = alloca i1, i1 0
+ %nop3767 = alloca i1, i1 0
+ %nop3768 = alloca i1, i1 0
+ %nop3769 = alloca i1, i1 0
+ %nop3770 = alloca i1, i1 0
+ %nop3771 = alloca i1, i1 0
+ %nop3772 = alloca i1, i1 0
+ %nop3773 = alloca i1, i1 0
+ %nop3774 = alloca i1, i1 0
+ %nop3775 = alloca i1, i1 0
+ %nop3776 = alloca i1, i1 0
+ %nop3777 = alloca i1, i1 0
+ %nop3778 = alloca i1, i1 0
+ %nop3779 = alloca i1, i1 0
+ %nop3780 = alloca i1, i1 0
+ %nop3781 = alloca i1, i1 0
+ %nop3782 = alloca i1, i1 0
+ %nop3783 = alloca i1, i1 0
+ %nop3784 = alloca i1, i1 0
+ %nop3785 = alloca i1, i1 0
+ %nop3786 = alloca i1, i1 0
+ %nop3787 = alloca i1, i1 0
+ %nop3788 = alloca i1, i1 0
+ %nop3789 = alloca i1, i1 0
+ %nop3790 = alloca i1, i1 0
+ %nop3791 = alloca i1, i1 0
+ %nop3792 = alloca i1, i1 0
+ %nop3793 = alloca i1, i1 0
+ %nop3794 = alloca i1, i1 0
+ %nop3795 = alloca i1, i1 0
+ %nop3796 = alloca i1, i1 0
+ %nop3797 = alloca i1, i1 0
+ %nop3798 = alloca i1, i1 0
+ %nop3799 = alloca i1, i1 0
+ %nop3800 = alloca i1, i1 0
+ %nop3801 = alloca i1, i1 0
+ %nop3802 = alloca i1, i1 0
+ %nop3803 = alloca i1, i1 0
+ %nop3804 = alloca i1, i1 0
+ %nop3805 = alloca i1, i1 0
+ %nop3806 = alloca i1, i1 0
+ %nop3807 = alloca i1, i1 0
+ %nop3808 = alloca i1, i1 0
+ %nop3809 = alloca i1, i1 0
+ %nop3810 = alloca i1, i1 0
+ %nop3811 = alloca i1, i1 0
+ %nop3812 = alloca i1, i1 0
+ %nop3813 = alloca i1, i1 0
+ %nop3814 = alloca i1, i1 0
+ %nop3815 = alloca i1, i1 0
+ %nop3816 = alloca i1, i1 0
+ %nop3817 = alloca i1, i1 0
+ %nop3818 = alloca i1, i1 0
+ %nop3819 = alloca i1, i1 0
+ %nop3820 = alloca i1, i1 0
+ %nop3821 = alloca i1, i1 0
+ %nop3822 = alloca i1, i1 0
+ %nop3823 = alloca i1, i1 0
+ %nop3824 = alloca i1, i1 0
+ %nop3825 = alloca i1, i1 0
+ %nop3826 = alloca i1, i1 0
+ %nop3827 = alloca i1, i1 0
+ %nop3828 = alloca i1, i1 0
+ %nop3829 = alloca i1, i1 0
+ %nop3830 = alloca i1, i1 0
+ %nop3831 = alloca i1, i1 0
+ %nop3832 = alloca i1, i1 0
+ %nop3833 = alloca i1, i1 0
+ %nop3834 = alloca i1, i1 0
+ %nop3835 = alloca i1, i1 0
+ %nop3836 = alloca i1, i1 0
+ %nop3837 = alloca i1, i1 0
+ %nop3838 = alloca i1, i1 0
+ %nop3839 = alloca i1, i1 0
+ %nop3840 = alloca i1, i1 0
+ %nop3841 = alloca i1, i1 0
+ %nop3842 = alloca i1, i1 0
+ %nop3843 = alloca i1, i1 0
+ %nop3844 = alloca i1, i1 0
+ %nop3845 = alloca i1, i1 0
+ %nop3846 = alloca i1, i1 0
+ %nop3847 = alloca i1, i1 0
+ %nop3848 = alloca i1, i1 0
+ %nop3849 = alloca i1, i1 0
+ %nop3850 = alloca i1, i1 0
+ %nop3851 = alloca i1, i1 0
+ %nop3852 = alloca i1, i1 0
+ %nop3853 = alloca i1, i1 0
+ %nop3854 = alloca i1, i1 0
+ %nop3855 = alloca i1, i1 0
+ %nop3856 = alloca i1, i1 0
+ %nop3857 = alloca i1, i1 0
+ %nop3858 = alloca i1, i1 0
+ %nop3859 = alloca i1, i1 0
+ %nop3860 = alloca i1, i1 0
+ %nop3861 = alloca i1, i1 0
+ %nop3862 = alloca i1, i1 0
+ %nop3863 = alloca i1, i1 0
+ %nop3864 = alloca i1, i1 0
+ %nop3865 = alloca i1, i1 0
+ %nop3866 = alloca i1, i1 0
+ %nop3867 = alloca i1, i1 0
+ %nop3868 = alloca i1, i1 0
+ %nop3869 = alloca i1, i1 0
+ %nop3870 = alloca i1, i1 0
+ %nop3871 = alloca i1, i1 0
+ %nop3872 = alloca i1, i1 0
+ %nop3873 = alloca i1, i1 0
+ %nop3874 = alloca i1, i1 0
+ %nop3875 = alloca i1, i1 0
+ %nop3876 = alloca i1, i1 0
+ %nop3877 = alloca i1, i1 0
+ %nop3878 = alloca i1, i1 0
+ %nop3879 = alloca i1, i1 0
+ %nop3880 = alloca i1, i1 0
+ %nop3881 = alloca i1, i1 0
+ %nop3882 = alloca i1, i1 0
+ %nop3883 = alloca i1, i1 0
+ %nop3884 = alloca i1, i1 0
+ %nop3885 = alloca i1, i1 0
+ %nop3886 = alloca i1, i1 0
+ %nop3887 = alloca i1, i1 0
+ %nop3888 = alloca i1, i1 0
+ %nop3889 = alloca i1, i1 0
+ %nop3890 = alloca i1, i1 0
+ %nop3891 = alloca i1, i1 0
+ %nop3892 = alloca i1, i1 0
+ %nop3893 = alloca i1, i1 0
+ %nop3894 = alloca i1, i1 0
+ %nop3895 = alloca i1, i1 0
+ %nop3896 = alloca i1, i1 0
+ %nop3897 = alloca i1, i1 0
+ %nop3898 = alloca i1, i1 0
+ %nop3899 = alloca i1, i1 0
+ %nop3900 = alloca i1, i1 0
+ %nop3901 = alloca i1, i1 0
+ %nop3902 = alloca i1, i1 0
+ %nop3903 = alloca i1, i1 0
+ %nop3904 = alloca i1, i1 0
+ %nop3905 = alloca i1, i1 0
+ %nop3906 = alloca i1, i1 0
+ %nop3907 = alloca i1, i1 0
+ %nop3908 = alloca i1, i1 0
+ %nop3909 = alloca i1, i1 0
+ %nop3910 = alloca i1, i1 0
+ %nop3911 = alloca i1, i1 0
+ %nop3912 = alloca i1, i1 0
+ %nop3913 = alloca i1, i1 0
+ %nop3914 = alloca i1, i1 0
+ %nop3915 = alloca i1, i1 0
+ %nop3916 = alloca i1, i1 0
+ %nop3917 = alloca i1, i1 0
+ %nop3918 = alloca i1, i1 0
+ %nop3919 = alloca i1, i1 0
+ %nop3920 = alloca i1, i1 0
+ %nop3921 = alloca i1, i1 0
+ %nop3922 = alloca i1, i1 0
+ %nop3923 = alloca i1, i1 0
+ %nop3924 = alloca i1, i1 0
+ %nop3925 = alloca i1, i1 0
+ %nop3926 = alloca i1, i1 0
+ %nop3927 = alloca i1, i1 0
+ %nop3928 = alloca i1, i1 0
+ %nop3929 = alloca i1, i1 0
+ %nop3930 = alloca i1, i1 0
+ %nop3931 = alloca i1, i1 0
+ %nop3932 = alloca i1, i1 0
+ %nop3933 = alloca i1, i1 0
+ %nop3934 = alloca i1, i1 0
+ %nop3935 = alloca i1, i1 0
+ %nop3936 = alloca i1, i1 0
+ %nop3937 = alloca i1, i1 0
+ %nop3938 = alloca i1, i1 0
+ %nop3939 = alloca i1, i1 0
+ %nop3940 = alloca i1, i1 0
+ %nop3941 = alloca i1, i1 0
+ %nop3942 = alloca i1, i1 0
+ %nop3943 = alloca i1, i1 0
+ %nop3944 = alloca i1, i1 0
+ %nop3945 = alloca i1, i1 0
+ %nop3946 = alloca i1, i1 0
+ %nop3947 = alloca i1, i1 0
+ %nop3948 = alloca i1, i1 0
+ %nop3949 = alloca i1, i1 0
+ %nop3950 = alloca i1, i1 0
+ %nop3951 = alloca i1, i1 0
+ %nop3952 = alloca i1, i1 0
+ %nop3953 = alloca i1, i1 0
+ %nop3954 = alloca i1, i1 0
+ %nop3955 = alloca i1, i1 0
+ %nop3956 = alloca i1, i1 0
+ %nop3957 = alloca i1, i1 0
+ %nop3958 = alloca i1, i1 0
+ %nop3959 = alloca i1, i1 0
+ %nop3960 = alloca i1, i1 0
+ %nop3961 = alloca i1, i1 0
+ %nop3962 = alloca i1, i1 0
+ %nop3963 = alloca i1, i1 0
+ %nop3964 = alloca i1, i1 0
+ %nop3965 = alloca i1, i1 0
+ %nop3966 = alloca i1, i1 0
+ %nop3967 = alloca i1, i1 0
+ %nop3968 = alloca i1, i1 0
+ %nop3969 = alloca i1, i1 0
+ %nop3970 = alloca i1, i1 0
+ %nop3971 = alloca i1, i1 0
+ %nop3972 = alloca i1, i1 0
+ %nop3973 = alloca i1, i1 0
+ %nop3974 = alloca i1, i1 0
+ %nop3975 = alloca i1, i1 0
+ %nop3976 = alloca i1, i1 0
+ %nop3977 = alloca i1, i1 0
+ %nop3978 = alloca i1, i1 0
+ %nop3979 = alloca i1, i1 0
+ %nop3980 = alloca i1, i1 0
+ %nop3981 = alloca i1, i1 0
+ %nop3982 = alloca i1, i1 0
+ %nop3983 = alloca i1, i1 0
+ %nop3984 = alloca i1, i1 0
+ %nop3985 = alloca i1, i1 0
+ %nop3986 = alloca i1, i1 0
+ %nop3987 = alloca i1, i1 0
+ %nop3988 = alloca i1, i1 0
+ %nop3989 = alloca i1, i1 0
+ %nop3990 = alloca i1, i1 0
+ %nop3991 = alloca i1, i1 0
+ %nop3992 = alloca i1, i1 0
+ %nop3993 = alloca i1, i1 0
+ %nop3994 = alloca i1, i1 0
+ %nop3995 = alloca i1, i1 0
+ %nop3996 = alloca i1, i1 0
+ %nop3997 = alloca i1, i1 0
+ %nop3998 = alloca i1, i1 0
+ %nop3999 = alloca i1, i1 0
+ %nop4000 = alloca i1, i1 0
+ %nop4001 = alloca i1, i1 0
+ %nop4002 = alloca i1, i1 0
+ %nop4003 = alloca i1, i1 0
+ %nop4004 = alloca i1, i1 0
+ %nop4005 = alloca i1, i1 0
+ %nop4006 = alloca i1, i1 0
+ %nop4007 = alloca i1, i1 0
+ %nop4008 = alloca i1, i1 0
+ %nop4009 = alloca i1, i1 0
+ %nop4010 = alloca i1, i1 0
+ %nop4011 = alloca i1, i1 0
+ %nop4012 = alloca i1, i1 0
+ %nop4013 = alloca i1, i1 0
+ %nop4014 = alloca i1, i1 0
+ %nop4015 = alloca i1, i1 0
+ %nop4016 = alloca i1, i1 0
+ %nop4017 = alloca i1, i1 0
+ %nop4018 = alloca i1, i1 0
+ %nop4019 = alloca i1, i1 0
+ %nop4020 = alloca i1, i1 0
+ %nop4021 = alloca i1, i1 0
+ %nop4022 = alloca i1, i1 0
+ %nop4023 = alloca i1, i1 0
+ %nop4024 = alloca i1, i1 0
+ %nop4025 = alloca i1, i1 0
+ %nop4026 = alloca i1, i1 0
+ %nop4027 = alloca i1, i1 0
+ %nop4028 = alloca i1, i1 0
+ %nop4029 = alloca i1, i1 0
+ %nop4030 = alloca i1, i1 0
+ %nop4031 = alloca i1, i1 0
+ %nop4032 = alloca i1, i1 0
+ %nop4033 = alloca i1, i1 0
+ %nop4034 = alloca i1, i1 0
+ %nop4035 = alloca i1, i1 0
+ %nop4036 = alloca i1, i1 0
+ %nop4037 = alloca i1, i1 0
+ %nop4038 = alloca i1, i1 0
+ %nop4039 = alloca i1, i1 0
+ %nop4040 = alloca i1, i1 0
+ %nop4041 = alloca i1, i1 0
+ %nop4042 = alloca i1, i1 0
+ %nop4043 = alloca i1, i1 0
+ %nop4044 = alloca i1, i1 0
+ %nop4045 = alloca i1, i1 0
+ %nop4046 = alloca i1, i1 0
+ %nop4047 = alloca i1, i1 0
+ %nop4048 = alloca i1, i1 0
+ %nop4049 = alloca i1, i1 0
+ %nop4050 = alloca i1, i1 0
+ %nop4051 = alloca i1, i1 0
+ %nop4052 = alloca i1, i1 0
+ %nop4053 = alloca i1, i1 0
+ %nop4054 = alloca i1, i1 0
+ %nop4055 = alloca i1, i1 0
+ %nop4056 = alloca i1, i1 0
+ %nop4057 = alloca i1, i1 0
+ %nop4058 = alloca i1, i1 0
+ %nop4059 = alloca i1, i1 0
+ %nop4060 = alloca i1, i1 0
+ %nop4061 = alloca i1, i1 0
+ %nop4062 = alloca i1, i1 0
+ %nop4063 = alloca i1, i1 0
+ %nop4064 = alloca i1, i1 0
+ %nop4065 = alloca i1, i1 0
+ %nop4066 = alloca i1, i1 0
+ %nop4067 = alloca i1, i1 0
+ %nop4068 = alloca i1, i1 0
+ %nop4069 = alloca i1, i1 0
+ %nop4070 = alloca i1, i1 0
+ %nop4071 = alloca i1, i1 0
+ %nop4072 = alloca i1, i1 0
+ %nop4073 = alloca i1, i1 0
+ %nop4074 = alloca i1, i1 0
+ %nop4075 = alloca i1, i1 0
+ %nop4076 = alloca i1, i1 0
+ %nop4077 = alloca i1, i1 0
+ %nop4078 = alloca i1, i1 0
+ %nop4079 = alloca i1, i1 0
+ %nop4080 = alloca i1, i1 0
+ %nop4081 = alloca i1, i1 0
+ %nop4082 = alloca i1, i1 0
+ %nop4083 = alloca i1, i1 0
+ %nop4084 = alloca i1, i1 0
+ %nop4085 = alloca i1, i1 0
+ %nop4086 = alloca i1, i1 0
+ %nop4087 = alloca i1, i1 0
+ %nop4088 = alloca i1, i1 0
+ %nop4089 = alloca i1, i1 0
+ %nop4090 = alloca i1, i1 0
+ %nop4091 = alloca i1, i1 0
+ %nop4092 = alloca i1, i1 0
+ %nop4093 = alloca i1, i1 0
+ %nop4094 = alloca i1, i1 0
+ %nop4095 = alloca i1, i1 0
+ %nop4096 = alloca i1, i1 0
+ %nop4097 = alloca i1, i1 0
+ %nop4098 = alloca i1, i1 0
+ %nop4099 = alloca i1, i1 0
+ %nop4100 = alloca i1, i1 0
+ %nop4101 = alloca i1, i1 0
+ %nop4102 = alloca i1, i1 0
+ %nop4103 = alloca i1, i1 0
+ %nop4104 = alloca i1, i1 0
+ %nop4105 = alloca i1, i1 0
+ %nop4106 = alloca i1, i1 0
+ %nop4107 = alloca i1, i1 0
+ %nop4108 = alloca i1, i1 0
+ %nop4109 = alloca i1, i1 0
+ %nop4110 = alloca i1, i1 0
+ %nop4111 = alloca i1, i1 0
+ %nop4112 = alloca i1, i1 0
+ %nop4113 = alloca i1, i1 0
+ %nop4114 = alloca i1, i1 0
+ %nop4115 = alloca i1, i1 0
+ %nop4116 = alloca i1, i1 0
+ %nop4117 = alloca i1, i1 0
+ %nop4118 = alloca i1, i1 0
+ %nop4119 = alloca i1, i1 0
+ %nop4120 = alloca i1, i1 0
+ %nop4121 = alloca i1, i1 0
+ %nop4122 = alloca i1, i1 0
+ %nop4123 = alloca i1, i1 0
+ %nop4124 = alloca i1, i1 0
+ %nop4125 = alloca i1, i1 0
+ %nop4126 = alloca i1, i1 0
+ %nop4127 = alloca i1, i1 0
+ %nop4128 = alloca i1, i1 0
+ %nop4129 = alloca i1, i1 0
+ %nop4130 = alloca i1, i1 0
+ %nop4131 = alloca i1, i1 0
+ %nop4132 = alloca i1, i1 0
+ %nop4133 = alloca i1, i1 0
+ %nop4134 = alloca i1, i1 0
+ %nop4135 = alloca i1, i1 0
+ %nop4136 = alloca i1, i1 0
+ %nop4137 = alloca i1, i1 0
+ %nop4138 = alloca i1, i1 0
+ %nop4139 = alloca i1, i1 0
+ %nop4140 = alloca i1, i1 0
+ %nop4141 = alloca i1, i1 0
+ %nop4142 = alloca i1, i1 0
+ %nop4143 = alloca i1, i1 0
+ %nop4144 = alloca i1, i1 0
+ %nop4145 = alloca i1, i1 0
+ %nop4146 = alloca i1, i1 0
+ %nop4147 = alloca i1, i1 0
+ %nop4148 = alloca i1, i1 0
+ %nop4149 = alloca i1, i1 0
+ %nop4150 = alloca i1, i1 0
+ %nop4151 = alloca i1, i1 0
+ %nop4152 = alloca i1, i1 0
+ %nop4153 = alloca i1, i1 0
+ %nop4154 = alloca i1, i1 0
+ %nop4155 = alloca i1, i1 0
+ %nop4156 = alloca i1, i1 0
+ %nop4157 = alloca i1, i1 0
+ %nop4158 = alloca i1, i1 0
+ %nop4159 = alloca i1, i1 0
+ %nop4160 = alloca i1, i1 0
+ %nop4161 = alloca i1, i1 0
+ %nop4162 = alloca i1, i1 0
+ %nop4163 = alloca i1, i1 0
+ %nop4164 = alloca i1, i1 0
+ %nop4165 = alloca i1, i1 0
+ %nop4166 = alloca i1, i1 0
+ %nop4167 = alloca i1, i1 0
+ %nop4168 = alloca i1, i1 0
+ %nop4169 = alloca i1, i1 0
+ %nop4170 = alloca i1, i1 0
+ %nop4171 = alloca i1, i1 0
+ %nop4172 = alloca i1, i1 0
+ %nop4173 = alloca i1, i1 0
+ %nop4174 = alloca i1, i1 0
+ %nop4175 = alloca i1, i1 0
+ %nop4176 = alloca i1, i1 0
+ %nop4177 = alloca i1, i1 0
+ %nop4178 = alloca i1, i1 0
+ %nop4179 = alloca i1, i1 0
+ %nop4180 = alloca i1, i1 0
+ %nop4181 = alloca i1, i1 0
+ %nop4182 = alloca i1, i1 0
+ %nop4183 = alloca i1, i1 0
+ %nop4184 = alloca i1, i1 0
+ %nop4185 = alloca i1, i1 0
+ %nop4186 = alloca i1, i1 0
+ %nop4187 = alloca i1, i1 0
+ %nop4188 = alloca i1, i1 0
+ %nop4189 = alloca i1, i1 0
+ %nop4190 = alloca i1, i1 0
+ %nop4191 = alloca i1, i1 0
+ %nop4192 = alloca i1, i1 0
+ %nop4193 = alloca i1, i1 0
+ %nop4194 = alloca i1, i1 0
+ %nop4195 = alloca i1, i1 0
+ %nop4196 = alloca i1, i1 0
+ %nop4197 = alloca i1, i1 0
+ %nop4198 = alloca i1, i1 0
+ %nop4199 = alloca i1, i1 0
+ %nop4200 = alloca i1, i1 0
+ %nop4201 = alloca i1, i1 0
+ %nop4202 = alloca i1, i1 0
+ %nop4203 = alloca i1, i1 0
+ %nop4204 = alloca i1, i1 0
+ %nop4205 = alloca i1, i1 0
+ %nop4206 = alloca i1, i1 0
+ %nop4207 = alloca i1, i1 0
+ %nop4208 = alloca i1, i1 0
+ %nop4209 = alloca i1, i1 0
+ %nop4210 = alloca i1, i1 0
+ %nop4211 = alloca i1, i1 0
+ %nop4212 = alloca i1, i1 0
+ %nop4213 = alloca i1, i1 0
+ %nop4214 = alloca i1, i1 0
+ %nop4215 = alloca i1, i1 0
+ %nop4216 = alloca i1, i1 0
+ %nop4217 = alloca i1, i1 0
+ %nop4218 = alloca i1, i1 0
+ %nop4219 = alloca i1, i1 0
+ %nop4220 = alloca i1, i1 0
+ %nop4221 = alloca i1, i1 0
+ %nop4222 = alloca i1, i1 0
+ %nop4223 = alloca i1, i1 0
+ %nop4224 = alloca i1, i1 0
+ %nop4225 = alloca i1, i1 0
+ %nop4226 = alloca i1, i1 0
+ %nop4227 = alloca i1, i1 0
+ %nop4228 = alloca i1, i1 0
+ %nop4229 = alloca i1, i1 0
+ %nop4230 = alloca i1, i1 0
+ %nop4231 = alloca i1, i1 0
+ %nop4232 = alloca i1, i1 0
+ %nop4233 = alloca i1, i1 0
+ %nop4234 = alloca i1, i1 0
+ %nop4235 = alloca i1, i1 0
+ %nop4236 = alloca i1, i1 0
+ %nop4237 = alloca i1, i1 0
+ %nop4238 = alloca i1, i1 0
+ %nop4239 = alloca i1, i1 0
+ %nop4240 = alloca i1, i1 0
+ %nop4241 = alloca i1, i1 0
+ %nop4242 = alloca i1, i1 0
+ %nop4243 = alloca i1, i1 0
+ %nop4244 = alloca i1, i1 0
+ %nop4245 = alloca i1, i1 0
+ %nop4246 = alloca i1, i1 0
+ %nop4247 = alloca i1, i1 0
+ %nop4248 = alloca i1, i1 0
+ %nop4249 = alloca i1, i1 0
+ %nop4250 = alloca i1, i1 0
+ %nop4251 = alloca i1, i1 0
+ %nop4252 = alloca i1, i1 0
+ %nop4253 = alloca i1, i1 0
+ %nop4254 = alloca i1, i1 0
+ %nop4255 = alloca i1, i1 0
+ %nop4256 = alloca i1, i1 0
+ %nop4257 = alloca i1, i1 0
+ %nop4258 = alloca i1, i1 0
+ %nop4259 = alloca i1, i1 0
+ %nop4260 = alloca i1, i1 0
+ %nop4261 = alloca i1, i1 0
+ %nop4262 = alloca i1, i1 0
+ %nop4263 = alloca i1, i1 0
+ %nop4264 = alloca i1, i1 0
+ %nop4265 = alloca i1, i1 0
+ %nop4266 = alloca i1, i1 0
+ %nop4267 = alloca i1, i1 0
+ %nop4268 = alloca i1, i1 0
+ %nop4269 = alloca i1, i1 0
+ %nop4270 = alloca i1, i1 0
+ %nop4271 = alloca i1, i1 0
+ %nop4272 = alloca i1, i1 0
+ %nop4273 = alloca i1, i1 0
+ %nop4274 = alloca i1, i1 0
+ %nop4275 = alloca i1, i1 0
+ %nop4276 = alloca i1, i1 0
+ %nop4277 = alloca i1, i1 0
+ %nop4278 = alloca i1, i1 0
+ %nop4279 = alloca i1, i1 0
+ %nop4280 = alloca i1, i1 0
+ %nop4281 = alloca i1, i1 0
+ %nop4282 = alloca i1, i1 0
+ %nop4283 = alloca i1, i1 0
+ %nop4284 = alloca i1, i1 0
+ %nop4285 = alloca i1, i1 0
+ %nop4286 = alloca i1, i1 0
+ %nop4287 = alloca i1, i1 0
+ %nop4288 = alloca i1, i1 0
+ %nop4289 = alloca i1, i1 0
+ %nop4290 = alloca i1, i1 0
+ %nop4291 = alloca i1, i1 0
+ %nop4292 = alloca i1, i1 0
+ %nop4293 = alloca i1, i1 0
+ %nop4294 = alloca i1, i1 0
+ %nop4295 = alloca i1, i1 0
+ %nop4296 = alloca i1, i1 0
+ %nop4297 = alloca i1, i1 0
+ %nop4298 = alloca i1, i1 0
+ %nop4299 = alloca i1, i1 0
+ %nop4300 = alloca i1, i1 0
+ %nop4301 = alloca i1, i1 0
+ %nop4302 = alloca i1, i1 0
+ %nop4303 = alloca i1, i1 0
+ %nop4304 = alloca i1, i1 0
+ %nop4305 = alloca i1, i1 0
+ %nop4306 = alloca i1, i1 0
+ %nop4307 = alloca i1, i1 0
+ %nop4308 = alloca i1, i1 0
+ %nop4309 = alloca i1, i1 0
+ %nop4310 = alloca i1, i1 0
+ %nop4311 = alloca i1, i1 0
+ %nop4312 = alloca i1, i1 0
+ %nop4313 = alloca i1, i1 0
+ %nop4314 = alloca i1, i1 0
+ %nop4315 = alloca i1, i1 0
+ %nop4316 = alloca i1, i1 0
+ %nop4317 = alloca i1, i1 0
+ %nop4318 = alloca i1, i1 0
+ %nop4319 = alloca i1, i1 0
+ %nop4320 = alloca i1, i1 0
+ %nop4321 = alloca i1, i1 0
+ %nop4322 = alloca i1, i1 0
+ %nop4323 = alloca i1, i1 0
+ %nop4324 = alloca i1, i1 0
+ %nop4325 = alloca i1, i1 0
+ %nop4326 = alloca i1, i1 0
+ %nop4327 = alloca i1, i1 0
+ %nop4328 = alloca i1, i1 0
+ %nop4329 = alloca i1, i1 0
+ %nop4330 = alloca i1, i1 0
+ %nop4331 = alloca i1, i1 0
+ %nop4332 = alloca i1, i1 0
+ %nop4333 = alloca i1, i1 0
+ %nop4334 = alloca i1, i1 0
+ %nop4335 = alloca i1, i1 0
+ %nop4336 = alloca i1, i1 0
+ %nop4337 = alloca i1, i1 0
+ %nop4338 = alloca i1, i1 0
+ %nop4339 = alloca i1, i1 0
+ %nop4340 = alloca i1, i1 0
+ %nop4341 = alloca i1, i1 0
+ %nop4342 = alloca i1, i1 0
+ %nop4343 = alloca i1, i1 0
+ %nop4344 = alloca i1, i1 0
+ %nop4345 = alloca i1, i1 0
+ %nop4346 = alloca i1, i1 0
+ %nop4347 = alloca i1, i1 0
+ %nop4348 = alloca i1, i1 0
+ %nop4349 = alloca i1, i1 0
+ %nop4350 = alloca i1, i1 0
+ %nop4351 = alloca i1, i1 0
+ %nop4352 = alloca i1, i1 0
+ %nop4353 = alloca i1, i1 0
+ %nop4354 = alloca i1, i1 0
+ %nop4355 = alloca i1, i1 0
+ %nop4356 = alloca i1, i1 0
+ %nop4357 = alloca i1, i1 0
+ %nop4358 = alloca i1, i1 0
+ %nop4359 = alloca i1, i1 0
+ %nop4360 = alloca i1, i1 0
+ %nop4361 = alloca i1, i1 0
+ %nop4362 = alloca i1, i1 0
+ %nop4363 = alloca i1, i1 0
+ %nop4364 = alloca i1, i1 0
+ %nop4365 = alloca i1, i1 0
+ %nop4366 = alloca i1, i1 0
+ %nop4367 = alloca i1, i1 0
+ %nop4368 = alloca i1, i1 0
+ %nop4369 = alloca i1, i1 0
+ %nop4370 = alloca i1, i1 0
+ %nop4371 = alloca i1, i1 0
+ %nop4372 = alloca i1, i1 0
+ %nop4373 = alloca i1, i1 0
+ %nop4374 = alloca i1, i1 0
+ %nop4375 = alloca i1, i1 0
+ %nop4376 = alloca i1, i1 0
+ %nop4377 = alloca i1, i1 0
+ %nop4378 = alloca i1, i1 0
+ %nop4379 = alloca i1, i1 0
+ %nop4380 = alloca i1, i1 0
+ %nop4381 = alloca i1, i1 0
+ %nop4382 = alloca i1, i1 0
+ %nop4383 = alloca i1, i1 0
+ %nop4384 = alloca i1, i1 0
+ %nop4385 = alloca i1, i1 0
+ %nop4386 = alloca i1, i1 0
+ %nop4387 = alloca i1, i1 0
+ %nop4388 = alloca i1, i1 0
+ %nop4389 = alloca i1, i1 0
+ %nop4390 = alloca i1, i1 0
+ %nop4391 = alloca i1, i1 0
+ %nop4392 = alloca i1, i1 0
+ %nop4393 = alloca i1, i1 0
+ %nop4394 = alloca i1, i1 0
+ %nop4395 = alloca i1, i1 0
+ %nop4396 = alloca i1, i1 0
+ %nop4397 = alloca i1, i1 0
+ %nop4398 = alloca i1, i1 0
+ %nop4399 = alloca i1, i1 0
+ %nop4400 = alloca i1, i1 0
+ %nop4401 = alloca i1, i1 0
+ %nop4402 = alloca i1, i1 0
+ %nop4403 = alloca i1, i1 0
+ %nop4404 = alloca i1, i1 0
+ %nop4405 = alloca i1, i1 0
+ %nop4406 = alloca i1, i1 0
+ %nop4407 = alloca i1, i1 0
+ %nop4408 = alloca i1, i1 0
+ %nop4409 = alloca i1, i1 0
+ %nop4410 = alloca i1, i1 0
+ %nop4411 = alloca i1, i1 0
+ %nop4412 = alloca i1, i1 0
+ %nop4413 = alloca i1, i1 0
+ %nop4414 = alloca i1, i1 0
+ %nop4415 = alloca i1, i1 0
+ %nop4416 = alloca i1, i1 0
+ %nop4417 = alloca i1, i1 0
+ %nop4418 = alloca i1, i1 0
+ %nop4419 = alloca i1, i1 0
+ %nop4420 = alloca i1, i1 0
+ %nop4421 = alloca i1, i1 0
+ %nop4422 = alloca i1, i1 0
+ %nop4423 = alloca i1, i1 0
+ %nop4424 = alloca i1, i1 0
+ %nop4425 = alloca i1, i1 0
+ %nop4426 = alloca i1, i1 0
+ %nop4427 = alloca i1, i1 0
+ %nop4428 = alloca i1, i1 0
+ %nop4429 = alloca i1, i1 0
+ %nop4430 = alloca i1, i1 0
+ %nop4431 = alloca i1, i1 0
+ %nop4432 = alloca i1, i1 0
+ %nop4433 = alloca i1, i1 0
+ %nop4434 = alloca i1, i1 0
+ %nop4435 = alloca i1, i1 0
+ %nop4436 = alloca i1, i1 0
+ %nop4437 = alloca i1, i1 0
+ %nop4438 = alloca i1, i1 0
+ %nop4439 = alloca i1, i1 0
+ %nop4440 = alloca i1, i1 0
+ %nop4441 = alloca i1, i1 0
+ %nop4442 = alloca i1, i1 0
+ %nop4443 = alloca i1, i1 0
+ %nop4444 = alloca i1, i1 0
+ %nop4445 = alloca i1, i1 0
+ %nop4446 = alloca i1, i1 0
+ %nop4447 = alloca i1, i1 0
+ %nop4448 = alloca i1, i1 0
+ %nop4449 = alloca i1, i1 0
+ %nop4450 = alloca i1, i1 0
+ %nop4451 = alloca i1, i1 0
+ %nop4452 = alloca i1, i1 0
+ %nop4453 = alloca i1, i1 0
+ %nop4454 = alloca i1, i1 0
+ %nop4455 = alloca i1, i1 0
+ %nop4456 = alloca i1, i1 0
+ %nop4457 = alloca i1, i1 0
+ %nop4458 = alloca i1, i1 0
+ %nop4459 = alloca i1, i1 0
+ %nop4460 = alloca i1, i1 0
+ %nop4461 = alloca i1, i1 0
+ %nop4462 = alloca i1, i1 0
+ %nop4463 = alloca i1, i1 0
+ %nop4464 = alloca i1, i1 0
+ %nop4465 = alloca i1, i1 0
+ %nop4466 = alloca i1, i1 0
+ %nop4467 = alloca i1, i1 0
+ %nop4468 = alloca i1, i1 0
+ %nop4469 = alloca i1, i1 0
+ %nop4470 = alloca i1, i1 0
+ %nop4471 = alloca i1, i1 0
+ %nop4472 = alloca i1, i1 0
+ %nop4473 = alloca i1, i1 0
+ %nop4474 = alloca i1, i1 0
+ %nop4475 = alloca i1, i1 0
+ %nop4476 = alloca i1, i1 0
+ %nop4477 = alloca i1, i1 0
+ %nop4478 = alloca i1, i1 0
+ %nop4479 = alloca i1, i1 0
+ %nop4480 = alloca i1, i1 0
+ %nop4481 = alloca i1, i1 0
+ %nop4482 = alloca i1, i1 0
+ %nop4483 = alloca i1, i1 0
+ %nop4484 = alloca i1, i1 0
+ %nop4485 = alloca i1, i1 0
+ %nop4486 = alloca i1, i1 0
+ %nop4487 = alloca i1, i1 0
+ %nop4488 = alloca i1, i1 0
+ %nop4489 = alloca i1, i1 0
+ %nop4490 = alloca i1, i1 0
+ %nop4491 = alloca i1, i1 0
+ %nop4492 = alloca i1, i1 0
+ %nop4493 = alloca i1, i1 0
+ %nop4494 = alloca i1, i1 0
+ %nop4495 = alloca i1, i1 0
+ %nop4496 = alloca i1, i1 0
+ %nop4497 = alloca i1, i1 0
+ %nop4498 = alloca i1, i1 0
+ %nop4499 = alloca i1, i1 0
+ %nop4500 = alloca i1, i1 0
+ %nop4501 = alloca i1, i1 0
+ %nop4502 = alloca i1, i1 0
+ %nop4503 = alloca i1, i1 0
+ %nop4504 = alloca i1, i1 0
+ %nop4505 = alloca i1, i1 0
+ %nop4506 = alloca i1, i1 0
+ %nop4507 = alloca i1, i1 0
+ %nop4508 = alloca i1, i1 0
+ %nop4509 = alloca i1, i1 0
+ %nop4510 = alloca i1, i1 0
+ %nop4511 = alloca i1, i1 0
+ %nop4512 = alloca i1, i1 0
+ %nop4513 = alloca i1, i1 0
+ %nop4514 = alloca i1, i1 0
+ %nop4515 = alloca i1, i1 0
+ %nop4516 = alloca i1, i1 0
+ %nop4517 = alloca i1, i1 0
+ %nop4518 = alloca i1, i1 0
+ %nop4519 = alloca i1, i1 0
+ %nop4520 = alloca i1, i1 0
+ %nop4521 = alloca i1, i1 0
+ %nop4522 = alloca i1, i1 0
+ %nop4523 = alloca i1, i1 0
+ %nop4524 = alloca i1, i1 0
+ %nop4525 = alloca i1, i1 0
+ %nop4526 = alloca i1, i1 0
+ %nop4527 = alloca i1, i1 0
+ %nop4528 = alloca i1, i1 0
+ %nop4529 = alloca i1, i1 0
+ %nop4530 = alloca i1, i1 0
+ %nop4531 = alloca i1, i1 0
+ %nop4532 = alloca i1, i1 0
+ %nop4533 = alloca i1, i1 0
+ %nop4534 = alloca i1, i1 0
+ %nop4535 = alloca i1, i1 0
+ %nop4536 = alloca i1, i1 0
+ %nop4537 = alloca i1, i1 0
+ %nop4538 = alloca i1, i1 0
+ %nop4539 = alloca i1, i1 0
+ %nop4540 = alloca i1, i1 0
+ %nop4541 = alloca i1, i1 0
+ %nop4542 = alloca i1, i1 0
+ %nop4543 = alloca i1, i1 0
+ %nop4544 = alloca i1, i1 0
+ %nop4545 = alloca i1, i1 0
+ %nop4546 = alloca i1, i1 0
+ %nop4547 = alloca i1, i1 0
+ %nop4548 = alloca i1, i1 0
+ %nop4549 = alloca i1, i1 0
+ %nop4550 = alloca i1, i1 0
+ %nop4551 = alloca i1, i1 0
+ %nop4552 = alloca i1, i1 0
+ %nop4553 = alloca i1, i1 0
+ %nop4554 = alloca i1, i1 0
+ %nop4555 = alloca i1, i1 0
+ %nop4556 = alloca i1, i1 0
+ %nop4557 = alloca i1, i1 0
+ %nop4558 = alloca i1, i1 0
+ %nop4559 = alloca i1, i1 0
+ %nop4560 = alloca i1, i1 0
+ %nop4561 = alloca i1, i1 0
+ %nop4562 = alloca i1, i1 0
+ %nop4563 = alloca i1, i1 0
+ %nop4564 = alloca i1, i1 0
+ %nop4565 = alloca i1, i1 0
+ %nop4566 = alloca i1, i1 0
+ %nop4567 = alloca i1, i1 0
+ %nop4568 = alloca i1, i1 0
+ %nop4569 = alloca i1, i1 0
+ %nop4570 = alloca i1, i1 0
+ %nop4571 = alloca i1, i1 0
+ %nop4572 = alloca i1, i1 0
+ %nop4573 = alloca i1, i1 0
+ %nop4574 = alloca i1, i1 0
+ %nop4575 = alloca i1, i1 0
+ %nop4576 = alloca i1, i1 0
+ %nop4577 = alloca i1, i1 0
+ %nop4578 = alloca i1, i1 0
+ %nop4579 = alloca i1, i1 0
+ %nop4580 = alloca i1, i1 0
+ %nop4581 = alloca i1, i1 0
+ %nop4582 = alloca i1, i1 0
+ %nop4583 = alloca i1, i1 0
+ %nop4584 = alloca i1, i1 0
+ %nop4585 = alloca i1, i1 0
+ %nop4586 = alloca i1, i1 0
+ %nop4587 = alloca i1, i1 0
+ %nop4588 = alloca i1, i1 0
+ %nop4589 = alloca i1, i1 0
+ %nop4590 = alloca i1, i1 0
+ %nop4591 = alloca i1, i1 0
+ %nop4592 = alloca i1, i1 0
+ %nop4593 = alloca i1, i1 0
+ %nop4594 = alloca i1, i1 0
+ %nop4595 = alloca i1, i1 0
+ %nop4596 = alloca i1, i1 0
+ %nop4597 = alloca i1, i1 0
+ %nop4598 = alloca i1, i1 0
+ %nop4599 = alloca i1, i1 0
+ %nop4600 = alloca i1, i1 0
+ %nop4601 = alloca i1, i1 0
+ %nop4602 = alloca i1, i1 0
+ %nop4603 = alloca i1, i1 0
+ %nop4604 = alloca i1, i1 0
+ %nop4605 = alloca i1, i1 0
+ %nop4606 = alloca i1, i1 0
+ %nop4607 = alloca i1, i1 0
+ %nop4608 = alloca i1, i1 0
+ %nop4609 = alloca i1, i1 0
+ %nop4610 = alloca i1, i1 0
+ %nop4611 = alloca i1, i1 0
+ %nop4612 = alloca i1, i1 0
+ %nop4613 = alloca i1, i1 0
+ %nop4614 = alloca i1, i1 0
+ %nop4615 = alloca i1, i1 0
+ %nop4616 = alloca i1, i1 0
+ %nop4617 = alloca i1, i1 0
+ %nop4618 = alloca i1, i1 0
+ %nop4619 = alloca i1, i1 0
+ %nop4620 = alloca i1, i1 0
+ %nop4621 = alloca i1, i1 0
+ %nop4622 = alloca i1, i1 0
+ %nop4623 = alloca i1, i1 0
+ %nop4624 = alloca i1, i1 0
+ %nop4625 = alloca i1, i1 0
+ %nop4626 = alloca i1, i1 0
+ %nop4627 = alloca i1, i1 0
+ %nop4628 = alloca i1, i1 0
+ %nop4629 = alloca i1, i1 0
+ %nop4630 = alloca i1, i1 0
+ %nop4631 = alloca i1, i1 0
+ %nop4632 = alloca i1, i1 0
+ %nop4633 = alloca i1, i1 0
+ %nop4634 = alloca i1, i1 0
+ %nop4635 = alloca i1, i1 0
+ %nop4636 = alloca i1, i1 0
+ %nop4637 = alloca i1, i1 0
+ %nop4638 = alloca i1, i1 0
+ %nop4639 = alloca i1, i1 0
+ %nop4640 = alloca i1, i1 0
+ %nop4641 = alloca i1, i1 0
+ %nop4642 = alloca i1, i1 0
+ %nop4643 = alloca i1, i1 0
+ %nop4644 = alloca i1, i1 0
+ %nop4645 = alloca i1, i1 0
+ %nop4646 = alloca i1, i1 0
+ %nop4647 = alloca i1, i1 0
+ %nop4648 = alloca i1, i1 0
+ %nop4649 = alloca i1, i1 0
+ %nop4650 = alloca i1, i1 0
+ %nop4651 = alloca i1, i1 0
+ %nop4652 = alloca i1, i1 0
+ %nop4653 = alloca i1, i1 0
+ %nop4654 = alloca i1, i1 0
+ %nop4655 = alloca i1, i1 0
+ %nop4656 = alloca i1, i1 0
+ %nop4657 = alloca i1, i1 0
+ %nop4658 = alloca i1, i1 0
+ %nop4659 = alloca i1, i1 0
+ %nop4660 = alloca i1, i1 0
+ %nop4661 = alloca i1, i1 0
+ %nop4662 = alloca i1, i1 0
+ %nop4663 = alloca i1, i1 0
+ %nop4664 = alloca i1, i1 0
+ %nop4665 = alloca i1, i1 0
+ %nop4666 = alloca i1, i1 0
+ %nop4667 = alloca i1, i1 0
+ %nop4668 = alloca i1, i1 0
+ %nop4669 = alloca i1, i1 0
+ %nop4670 = alloca i1, i1 0
+ %nop4671 = alloca i1, i1 0
+ %nop4672 = alloca i1, i1 0
+ %nop4673 = alloca i1, i1 0
+ %nop4674 = alloca i1, i1 0
+ %nop4675 = alloca i1, i1 0
+ %nop4676 = alloca i1, i1 0
+ %nop4677 = alloca i1, i1 0
+ %nop4678 = alloca i1, i1 0
+ %nop4679 = alloca i1, i1 0
+ %nop4680 = alloca i1, i1 0
+ %nop4681 = alloca i1, i1 0
+ %nop4682 = alloca i1, i1 0
+ %nop4683 = alloca i1, i1 0
+ %nop4684 = alloca i1, i1 0
+ %nop4685 = alloca i1, i1 0
+ %nop4686 = alloca i1, i1 0
+ %nop4687 = alloca i1, i1 0
+ %nop4688 = alloca i1, i1 0
+ %nop4689 = alloca i1, i1 0
+ %nop4690 = alloca i1, i1 0
+ %nop4691 = alloca i1, i1 0
+ %nop4692 = alloca i1, i1 0
+ %nop4693 = alloca i1, i1 0
+ %nop4694 = alloca i1, i1 0
+ %nop4695 = alloca i1, i1 0
+ %nop4696 = alloca i1, i1 0
+ %nop4697 = alloca i1, i1 0
+ %nop4698 = alloca i1, i1 0
+ %nop4699 = alloca i1, i1 0
+ %nop4700 = alloca i1, i1 0
+ %nop4701 = alloca i1, i1 0
+ %nop4702 = alloca i1, i1 0
+ %nop4703 = alloca i1, i1 0
+ %nop4704 = alloca i1, i1 0
+ %nop4705 = alloca i1, i1 0
+ %nop4706 = alloca i1, i1 0
+ %nop4707 = alloca i1, i1 0
+ %nop4708 = alloca i1, i1 0
+ %nop4709 = alloca i1, i1 0
+ %nop4710 = alloca i1, i1 0
+ %nop4711 = alloca i1, i1 0
+ %nop4712 = alloca i1, i1 0
+ %nop4713 = alloca i1, i1 0
+ %nop4714 = alloca i1, i1 0
+ %nop4715 = alloca i1, i1 0
+ %nop4716 = alloca i1, i1 0
+ %nop4717 = alloca i1, i1 0
+ %nop4718 = alloca i1, i1 0
+ %nop4719 = alloca i1, i1 0
+ %nop4720 = alloca i1, i1 0
+ %nop4721 = alloca i1, i1 0
+ %nop4722 = alloca i1, i1 0
+ %nop4723 = alloca i1, i1 0
+ %nop4724 = alloca i1, i1 0
+ %nop4725 = alloca i1, i1 0
+ %nop4726 = alloca i1, i1 0
+ %nop4727 = alloca i1, i1 0
+ %nop4728 = alloca i1, i1 0
+ %nop4729 = alloca i1, i1 0
+ %nop4730 = alloca i1, i1 0
+ %nop4731 = alloca i1, i1 0
+ %nop4732 = alloca i1, i1 0
+ %nop4733 = alloca i1, i1 0
+ %nop4734 = alloca i1, i1 0
+ %nop4735 = alloca i1, i1 0
+ %nop4736 = alloca i1, i1 0
+ %nop4737 = alloca i1, i1 0
+ %nop4738 = alloca i1, i1 0
+ %nop4739 = alloca i1, i1 0
+ %nop4740 = alloca i1, i1 0
+ %nop4741 = alloca i1, i1 0
+ %nop4742 = alloca i1, i1 0
+ %nop4743 = alloca i1, i1 0
+ %nop4744 = alloca i1, i1 0
+ %nop4745 = alloca i1, i1 0
+ %nop4746 = alloca i1, i1 0
+ %nop4747 = alloca i1, i1 0
+ %nop4748 = alloca i1, i1 0
+ %nop4749 = alloca i1, i1 0
+ %nop4750 = alloca i1, i1 0
+ %nop4751 = alloca i1, i1 0
+ %nop4752 = alloca i1, i1 0
+ %nop4753 = alloca i1, i1 0
+ %nop4754 = alloca i1, i1 0
+ %nop4755 = alloca i1, i1 0
+ %nop4756 = alloca i1, i1 0
+ %nop4757 = alloca i1, i1 0
+ %nop4758 = alloca i1, i1 0
+ %nop4759 = alloca i1, i1 0
+ %nop4760 = alloca i1, i1 0
+ %nop4761 = alloca i1, i1 0
+ %nop4762 = alloca i1, i1 0
+ %nop4763 = alloca i1, i1 0
+ %nop4764 = alloca i1, i1 0
+ %nop4765 = alloca i1, i1 0
+ %nop4766 = alloca i1, i1 0
+ %nop4767 = alloca i1, i1 0
+ %nop4768 = alloca i1, i1 0
+ %nop4769 = alloca i1, i1 0
+ %nop4770 = alloca i1, i1 0
+ %nop4771 = alloca i1, i1 0
+ %nop4772 = alloca i1, i1 0
+ %nop4773 = alloca i1, i1 0
+ %nop4774 = alloca i1, i1 0
+ %nop4775 = alloca i1, i1 0
+ %nop4776 = alloca i1, i1 0
+ %nop4777 = alloca i1, i1 0
+ %nop4778 = alloca i1, i1 0
+ %nop4779 = alloca i1, i1 0
+ %nop4780 = alloca i1, i1 0
+ %nop4781 = alloca i1, i1 0
+ %nop4782 = alloca i1, i1 0
+ %nop4783 = alloca i1, i1 0
+ %nop4784 = alloca i1, i1 0
+ %nop4785 = alloca i1, i1 0
+ %nop4786 = alloca i1, i1 0
+ %nop4787 = alloca i1, i1 0
+ %nop4788 = alloca i1, i1 0
+ %nop4789 = alloca i1, i1 0
+ %nop4790 = alloca i1, i1 0
+ %nop4791 = alloca i1, i1 0
+ %nop4792 = alloca i1, i1 0
+ %nop4793 = alloca i1, i1 0
+ %nop4794 = alloca i1, i1 0
+ %nop4795 = alloca i1, i1 0
+ %nop4796 = alloca i1, i1 0
+ %nop4797 = alloca i1, i1 0
+ %nop4798 = alloca i1, i1 0
+ %nop4799 = alloca i1, i1 0
+ %nop4800 = alloca i1, i1 0
+ %nop4801 = alloca i1, i1 0
+ %nop4802 = alloca i1, i1 0
+ %nop4803 = alloca i1, i1 0
+ %nop4804 = alloca i1, i1 0
+ %nop4805 = alloca i1, i1 0
+ %nop4806 = alloca i1, i1 0
+ %nop4807 = alloca i1, i1 0
+ %nop4808 = alloca i1, i1 0
+ %nop4809 = alloca i1, i1 0
+ %nop4810 = alloca i1, i1 0
+ %nop4811 = alloca i1, i1 0
+ %nop4812 = alloca i1, i1 0
+ %nop4813 = alloca i1, i1 0
+ %nop4814 = alloca i1, i1 0
+ %nop4815 = alloca i1, i1 0
+ %nop4816 = alloca i1, i1 0
+ %nop4817 = alloca i1, i1 0
+ %nop4818 = alloca i1, i1 0
+ %nop4819 = alloca i1, i1 0
+ %nop4820 = alloca i1, i1 0
+ %nop4821 = alloca i1, i1 0
+ %nop4822 = alloca i1, i1 0
+ %nop4823 = alloca i1, i1 0
+ %nop4824 = alloca i1, i1 0
+ %nop4825 = alloca i1, i1 0
+ %nop4826 = alloca i1, i1 0
+ %nop4827 = alloca i1, i1 0
+ %nop4828 = alloca i1, i1 0
+ %nop4829 = alloca i1, i1 0
+ %nop4830 = alloca i1, i1 0
+ %nop4831 = alloca i1, i1 0
+ %nop4832 = alloca i1, i1 0
+ %nop4833 = alloca i1, i1 0
+ %nop4834 = alloca i1, i1 0
+ %nop4835 = alloca i1, i1 0
+ %nop4836 = alloca i1, i1 0
+ %nop4837 = alloca i1, i1 0
+ %nop4838 = alloca i1, i1 0
+ %nop4839 = alloca i1, i1 0
+ %nop4840 = alloca i1, i1 0
+ %nop4841 = alloca i1, i1 0
+ %nop4842 = alloca i1, i1 0
+ %nop4843 = alloca i1, i1 0
+ %nop4844 = alloca i1, i1 0
+ %nop4845 = alloca i1, i1 0
+ %nop4846 = alloca i1, i1 0
+ %nop4847 = alloca i1, i1 0
+ %nop4848 = alloca i1, i1 0
+ %nop4849 = alloca i1, i1 0
+ %nop4850 = alloca i1, i1 0
+ %nop4851 = alloca i1, i1 0
+ %nop4852 = alloca i1, i1 0
+ %nop4853 = alloca i1, i1 0
+ %nop4854 = alloca i1, i1 0
+ %nop4855 = alloca i1, i1 0
+ %nop4856 = alloca i1, i1 0
+ %nop4857 = alloca i1, i1 0
+ %nop4858 = alloca i1, i1 0
+ %nop4859 = alloca i1, i1 0
+ %nop4860 = alloca i1, i1 0
+ %nop4861 = alloca i1, i1 0
+ %nop4862 = alloca i1, i1 0
+ %nop4863 = alloca i1, i1 0
+ %nop4864 = alloca i1, i1 0
+ %nop4865 = alloca i1, i1 0
+ %nop4866 = alloca i1, i1 0
+ %nop4867 = alloca i1, i1 0
+ %nop4868 = alloca i1, i1 0
+ %nop4869 = alloca i1, i1 0
+ %nop4870 = alloca i1, i1 0
+ %nop4871 = alloca i1, i1 0
+ %nop4872 = alloca i1, i1 0
+ %nop4873 = alloca i1, i1 0
+ %nop4874 = alloca i1, i1 0
+ %nop4875 = alloca i1, i1 0
+ %nop4876 = alloca i1, i1 0
+ %nop4877 = alloca i1, i1 0
+ %nop4878 = alloca i1, i1 0
+ %nop4879 = alloca i1, i1 0
+ %nop4880 = alloca i1, i1 0
+ %nop4881 = alloca i1, i1 0
+ %nop4882 = alloca i1, i1 0
+ %nop4883 = alloca i1, i1 0
+ %nop4884 = alloca i1, i1 0
+ %nop4885 = alloca i1, i1 0
+ %nop4886 = alloca i1, i1 0
+ %nop4887 = alloca i1, i1 0
+ %nop4888 = alloca i1, i1 0
+ %nop4889 = alloca i1, i1 0
+ %nop4890 = alloca i1, i1 0
+ %nop4891 = alloca i1, i1 0
+ %nop4892 = alloca i1, i1 0
+ %nop4893 = alloca i1, i1 0
+ %nop4894 = alloca i1, i1 0
+ %nop4895 = alloca i1, i1 0
+ %nop4896 = alloca i1, i1 0
+ %nop4897 = alloca i1, i1 0
+ %nop4898 = alloca i1, i1 0
+ %nop4899 = alloca i1, i1 0
+ %nop4900 = alloca i1, i1 0
+ %nop4901 = alloca i1, i1 0
+ %nop4902 = alloca i1, i1 0
+ %nop4903 = alloca i1, i1 0
+ %nop4904 = alloca i1, i1 0
+ %nop4905 = alloca i1, i1 0
+ %nop4906 = alloca i1, i1 0
+ %nop4907 = alloca i1, i1 0
+ %nop4908 = alloca i1, i1 0
+ %nop4909 = alloca i1, i1 0
+ %nop4910 = alloca i1, i1 0
+ %nop4911 = alloca i1, i1 0
+ %nop4912 = alloca i1, i1 0
+ %nop4913 = alloca i1, i1 0
+ %nop4914 = alloca i1, i1 0
+ %nop4915 = alloca i1, i1 0
+ %nop4916 = alloca i1, i1 0
+ %nop4917 = alloca i1, i1 0
+ %nop4918 = alloca i1, i1 0
+ %nop4919 = alloca i1, i1 0
+ %nop4920 = alloca i1, i1 0
+ %nop4921 = alloca i1, i1 0
+ %nop4922 = alloca i1, i1 0
+ %nop4923 = alloca i1, i1 0
+ %nop4924 = alloca i1, i1 0
+ %nop4925 = alloca i1, i1 0
+ %nop4926 = alloca i1, i1 0
+ %nop4927 = alloca i1, i1 0
+ %nop4928 = alloca i1, i1 0
+ %nop4929 = alloca i1, i1 0
+ %nop4930 = alloca i1, i1 0
+ %nop4931 = alloca i1, i1 0
+ %nop4932 = alloca i1, i1 0
+ %nop4933 = alloca i1, i1 0
+ %nop4934 = alloca i1, i1 0
+ %nop4935 = alloca i1, i1 0
+ %nop4936 = alloca i1, i1 0
+ %nop4937 = alloca i1, i1 0
+ %nop4938 = alloca i1, i1 0
+ %nop4939 = alloca i1, i1 0
+ %nop4940 = alloca i1, i1 0
+ %nop4941 = alloca i1, i1 0
+ %nop4942 = alloca i1, i1 0
+ %nop4943 = alloca i1, i1 0
+ %nop4944 = alloca i1, i1 0
+ %nop4945 = alloca i1, i1 0
+ %nop4946 = alloca i1, i1 0
+ %nop4947 = alloca i1, i1 0
+ %nop4948 = alloca i1, i1 0
+ %nop4949 = alloca i1, i1 0
+ %nop4950 = alloca i1, i1 0
+ %nop4951 = alloca i1, i1 0
+ %nop4952 = alloca i1, i1 0
+ %nop4953 = alloca i1, i1 0
+ %nop4954 = alloca i1, i1 0
+ %nop4955 = alloca i1, i1 0
+ %nop4956 = alloca i1, i1 0
+ %nop4957 = alloca i1, i1 0
+ %nop4958 = alloca i1, i1 0
+ %nop4959 = alloca i1, i1 0
+ %nop4960 = alloca i1, i1 0
+ %nop4961 = alloca i1, i1 0
+ %nop4962 = alloca i1, i1 0
+ %nop4963 = alloca i1, i1 0
+ %nop4964 = alloca i1, i1 0
+ %nop4965 = alloca i1, i1 0
+ %nop4966 = alloca i1, i1 0
+ %nop4967 = alloca i1, i1 0
+ %nop4968 = alloca i1, i1 0
+ %nop4969 = alloca i1, i1 0
+ %nop4970 = alloca i1, i1 0
+ %nop4971 = alloca i1, i1 0
+ %nop4972 = alloca i1, i1 0
+ %nop4973 = alloca i1, i1 0
+ %nop4974 = alloca i1, i1 0
+ %nop4975 = alloca i1, i1 0
+ %nop4976 = alloca i1, i1 0
+ %nop4977 = alloca i1, i1 0
+ %nop4978 = alloca i1, i1 0
+ %nop4979 = alloca i1, i1 0
+ %nop4980 = alloca i1, i1 0
+ %nop4981 = alloca i1, i1 0
+ %nop4982 = alloca i1, i1 0
+ %nop4983 = alloca i1, i1 0
+ %nop4984 = alloca i1, i1 0
+ %nop4985 = alloca i1, i1 0
+ %nop4986 = alloca i1, i1 0
+ %nop4987 = alloca i1, i1 0
+ %nop4988 = alloca i1, i1 0
+ %nop4989 = alloca i1, i1 0
+ %nop4990 = alloca i1, i1 0
+ %nop4991 = alloca i1, i1 0
+ %nop4992 = alloca i1, i1 0
+ %nop4993 = alloca i1, i1 0
+ %nop4994 = alloca i1, i1 0
+ %nop4995 = alloca i1, i1 0
+ %nop4996 = alloca i1, i1 0
+ %nop4997 = alloca i1, i1 0
+ %nop4998 = alloca i1, i1 0
+ %nop4999 = alloca i1, i1 0
+ %nop5000 = alloca i1, i1 0
+ %nop5001 = alloca i1, i1 0
+ %nop5002 = alloca i1, i1 0
+ %nop5003 = alloca i1, i1 0
+ %nop5004 = alloca i1, i1 0
+ %nop5005 = alloca i1, i1 0
+ %nop5006 = alloca i1, i1 0
+ %nop5007 = alloca i1, i1 0
+ %nop5008 = alloca i1, i1 0
+ %nop5009 = alloca i1, i1 0
+ %nop5010 = alloca i1, i1 0
+ %nop5011 = alloca i1, i1 0
+ %nop5012 = alloca i1, i1 0
+ %nop5013 = alloca i1, i1 0
+ %nop5014 = alloca i1, i1 0
+ %nop5015 = alloca i1, i1 0
+ %nop5016 = alloca i1, i1 0
+ %nop5017 = alloca i1, i1 0
+ %nop5018 = alloca i1, i1 0
+ %nop5019 = alloca i1, i1 0
+ %nop5020 = alloca i1, i1 0
+ %nop5021 = alloca i1, i1 0
+ %nop5022 = alloca i1, i1 0
+ %nop5023 = alloca i1, i1 0
+ %nop5024 = alloca i1, i1 0
+ %nop5025 = alloca i1, i1 0
+ %nop5026 = alloca i1, i1 0
+ %nop5027 = alloca i1, i1 0
+ %nop5028 = alloca i1, i1 0
+ %nop5029 = alloca i1, i1 0
+ %nop5030 = alloca i1, i1 0
+ %nop5031 = alloca i1, i1 0
+ %nop5032 = alloca i1, i1 0
+ %nop5033 = alloca i1, i1 0
+ %nop5034 = alloca i1, i1 0
+ %nop5035 = alloca i1, i1 0
+ %nop5036 = alloca i1, i1 0
+ %nop5037 = alloca i1, i1 0
+ %nop5038 = alloca i1, i1 0
+ %nop5039 = alloca i1, i1 0
+ %nop5040 = alloca i1, i1 0
+ %nop5041 = alloca i1, i1 0
+ %nop5042 = alloca i1, i1 0
+ %nop5043 = alloca i1, i1 0
+ %nop5044 = alloca i1, i1 0
+ %nop5045 = alloca i1, i1 0
+ %nop5046 = alloca i1, i1 0
+ %nop5047 = alloca i1, i1 0
+ %nop5048 = alloca i1, i1 0
+ %nop5049 = alloca i1, i1 0
+ %nop5050 = alloca i1, i1 0
+ %nop5051 = alloca i1, i1 0
+ %nop5052 = alloca i1, i1 0
+ %nop5053 = alloca i1, i1 0
+ %nop5054 = alloca i1, i1 0
+ %nop5055 = alloca i1, i1 0
+ %nop5056 = alloca i1, i1 0
+ %nop5057 = alloca i1, i1 0
+ %nop5058 = alloca i1, i1 0
+ %nop5059 = alloca i1, i1 0
+ %nop5060 = alloca i1, i1 0
+ %nop5061 = alloca i1, i1 0
+ %nop5062 = alloca i1, i1 0
+ %nop5063 = alloca i1, i1 0
+ %nop5064 = alloca i1, i1 0
+ %nop5065 = alloca i1, i1 0
+ %nop5066 = alloca i1, i1 0
+ %nop5067 = alloca i1, i1 0
+ %nop5068 = alloca i1, i1 0
+ %nop5069 = alloca i1, i1 0
+ %nop5070 = alloca i1, i1 0
+ %nop5071 = alloca i1, i1 0
+ %nop5072 = alloca i1, i1 0
+ %nop5073 = alloca i1, i1 0
+ %nop5074 = alloca i1, i1 0
+ %nop5075 = alloca i1, i1 0
+ %nop5076 = alloca i1, i1 0
+ %nop5077 = alloca i1, i1 0
+ %nop5078 = alloca i1, i1 0
+ %nop5079 = alloca i1, i1 0
+ %nop5080 = alloca i1, i1 0
+ %nop5081 = alloca i1, i1 0
+ %nop5082 = alloca i1, i1 0
+ %nop5083 = alloca i1, i1 0
+ %nop5084 = alloca i1, i1 0
+ %nop5085 = alloca i1, i1 0
+ %nop5086 = alloca i1, i1 0
+ %nop5087 = alloca i1, i1 0
+ %nop5088 = alloca i1, i1 0
+ %nop5089 = alloca i1, i1 0
+ %nop5090 = alloca i1, i1 0
+ %nop5091 = alloca i1, i1 0
+ %nop5092 = alloca i1, i1 0
+ %nop5093 = alloca i1, i1 0
+ %nop5094 = alloca i1, i1 0
+ %nop5095 = alloca i1, i1 0
+ %nop5096 = alloca i1, i1 0
+ %nop5097 = alloca i1, i1 0
+ %nop5098 = alloca i1, i1 0
+ %nop5099 = alloca i1, i1 0
+ %nop5100 = alloca i1, i1 0
+ %nop5101 = alloca i1, i1 0
+ %nop5102 = alloca i1, i1 0
+ %nop5103 = alloca i1, i1 0
+ %nop5104 = alloca i1, i1 0
+ %nop5105 = alloca i1, i1 0
+ %nop5106 = alloca i1, i1 0
+ %nop5107 = alloca i1, i1 0
+ %nop5108 = alloca i1, i1 0
+ %nop5109 = alloca i1, i1 0
+ %nop5110 = alloca i1, i1 0
+ %nop5111 = alloca i1, i1 0
+ %nop5112 = alloca i1, i1 0
+ %nop5113 = alloca i1, i1 0
+ %nop5114 = alloca i1, i1 0
+ %nop5115 = alloca i1, i1 0
+ %nop5116 = alloca i1, i1 0
+ %nop5117 = alloca i1, i1 0
+ %nop5118 = alloca i1, i1 0
+ %nop5119 = alloca i1, i1 0
+ %nop5120 = alloca i1, i1 0
+ %nop5121 = alloca i1, i1 0
+ %nop5122 = alloca i1, i1 0
+ %nop5123 = alloca i1, i1 0
+ %nop5124 = alloca i1, i1 0
+ %nop5125 = alloca i1, i1 0
+ %nop5126 = alloca i1, i1 0
+ %nop5127 = alloca i1, i1 0
+ %nop5128 = alloca i1, i1 0
+ %nop5129 = alloca i1, i1 0
+ %nop5130 = alloca i1, i1 0
+ %nop5131 = alloca i1, i1 0
+ %nop5132 = alloca i1, i1 0
+ %nop5133 = alloca i1, i1 0
+ %nop5134 = alloca i1, i1 0
+ %nop5135 = alloca i1, i1 0
+ %nop5136 = alloca i1, i1 0
+ %nop5137 = alloca i1, i1 0
+ %nop5138 = alloca i1, i1 0
+ %nop5139 = alloca i1, i1 0
+ %nop5140 = alloca i1, i1 0
+ %nop5141 = alloca i1, i1 0
+ %nop5142 = alloca i1, i1 0
+ %nop5143 = alloca i1, i1 0
+ %nop5144 = alloca i1, i1 0
+ %nop5145 = alloca i1, i1 0
+ %nop5146 = alloca i1, i1 0
+ %nop5147 = alloca i1, i1 0
+ %nop5148 = alloca i1, i1 0
+ %nop5149 = alloca i1, i1 0
+ %nop5150 = alloca i1, i1 0
+ %nop5151 = alloca i1, i1 0
+ %nop5152 = alloca i1, i1 0
+ %nop5153 = alloca i1, i1 0
+ %nop5154 = alloca i1, i1 0
+ %nop5155 = alloca i1, i1 0
+ %nop5156 = alloca i1, i1 0
+ %nop5157 = alloca i1, i1 0
+ %nop5158 = alloca i1, i1 0
+ %nop5159 = alloca i1, i1 0
+ %nop5160 = alloca i1, i1 0
+ %nop5161 = alloca i1, i1 0
+ %nop5162 = alloca i1, i1 0
+ %nop5163 = alloca i1, i1 0
+ %nop5164 = alloca i1, i1 0
+ %nop5165 = alloca i1, i1 0
+ %nop5166 = alloca i1, i1 0
+ %nop5167 = alloca i1, i1 0
+ %nop5168 = alloca i1, i1 0
+ %nop5169 = alloca i1, i1 0
+ %nop5170 = alloca i1, i1 0
+ %nop5171 = alloca i1, i1 0
+ %nop5172 = alloca i1, i1 0
+ %nop5173 = alloca i1, i1 0
+ %nop5174 = alloca i1, i1 0
+ %nop5175 = alloca i1, i1 0
+ %nop5176 = alloca i1, i1 0
+ %nop5177 = alloca i1, i1 0
+ %nop5178 = alloca i1, i1 0
+ %nop5179 = alloca i1, i1 0
+ %nop5180 = alloca i1, i1 0
+ %nop5181 = alloca i1, i1 0
+ %nop5182 = alloca i1, i1 0
+ %nop5183 = alloca i1, i1 0
+ %nop5184 = alloca i1, i1 0
+ %nop5185 = alloca i1, i1 0
+ %nop5186 = alloca i1, i1 0
+ %nop5187 = alloca i1, i1 0
+ %nop5188 = alloca i1, i1 0
+ %nop5189 = alloca i1, i1 0
+ %nop5190 = alloca i1, i1 0
+ %nop5191 = alloca i1, i1 0
+ %nop5192 = alloca i1, i1 0
+ %nop5193 = alloca i1, i1 0
+ %nop5194 = alloca i1, i1 0
+ %nop5195 = alloca i1, i1 0
+ %nop5196 = alloca i1, i1 0
+ %nop5197 = alloca i1, i1 0
+ %nop5198 = alloca i1, i1 0
+ %nop5199 = alloca i1, i1 0
+ %nop5200 = alloca i1, i1 0
+ %nop5201 = alloca i1, i1 0
+ %nop5202 = alloca i1, i1 0
+ %nop5203 = alloca i1, i1 0
+ %nop5204 = alloca i1, i1 0
+ %nop5205 = alloca i1, i1 0
+ %nop5206 = alloca i1, i1 0
+ %nop5207 = alloca i1, i1 0
+ %nop5208 = alloca i1, i1 0
+ %nop5209 = alloca i1, i1 0
+ %nop5210 = alloca i1, i1 0
+ %nop5211 = alloca i1, i1 0
+ %nop5212 = alloca i1, i1 0
+ %nop5213 = alloca i1, i1 0
+ %nop5214 = alloca i1, i1 0
+ %nop5215 = alloca i1, i1 0
+ %nop5216 = alloca i1, i1 0
+ %nop5217 = alloca i1, i1 0
+ %nop5218 = alloca i1, i1 0
+ %nop5219 = alloca i1, i1 0
+ %nop5220 = alloca i1, i1 0
+ %nop5221 = alloca i1, i1 0
+ %nop5222 = alloca i1, i1 0
+ %nop5223 = alloca i1, i1 0
+ %nop5224 = alloca i1, i1 0
+ %nop5225 = alloca i1, i1 0
+ %nop5226 = alloca i1, i1 0
+ %nop5227 = alloca i1, i1 0
+ %nop5228 = alloca i1, i1 0
+ %nop5229 = alloca i1, i1 0
+ %nop5230 = alloca i1, i1 0
+ %nop5231 = alloca i1, i1 0
+ %nop5232 = alloca i1, i1 0
+ %nop5233 = alloca i1, i1 0
+ %nop5234 = alloca i1, i1 0
+ %nop5235 = alloca i1, i1 0
+ %nop5236 = alloca i1, i1 0
+ %nop5237 = alloca i1, i1 0
+ %nop5238 = alloca i1, i1 0
+ %nop5239 = alloca i1, i1 0
+ %nop5240 = alloca i1, i1 0
+ %nop5241 = alloca i1, i1 0
+ %nop5242 = alloca i1, i1 0
+ %nop5243 = alloca i1, i1 0
+ %nop5244 = alloca i1, i1 0
+ %nop5245 = alloca i1, i1 0
+ %nop5246 = alloca i1, i1 0
+ %nop5247 = alloca i1, i1 0
+ %nop5248 = alloca i1, i1 0
+ %nop5249 = alloca i1, i1 0
+ %nop5250 = alloca i1, i1 0
+ %nop5251 = alloca i1, i1 0
+ %nop5252 = alloca i1, i1 0
+ %nop5253 = alloca i1, i1 0
+ %nop5254 = alloca i1, i1 0
+ %nop5255 = alloca i1, i1 0
+ %nop5256 = alloca i1, i1 0
+ %nop5257 = alloca i1, i1 0
+ %nop5258 = alloca i1, i1 0
+ %nop5259 = alloca i1, i1 0
+ %nop5260 = alloca i1, i1 0
+ %nop5261 = alloca i1, i1 0
+ %nop5262 = alloca i1, i1 0
+ %nop5263 = alloca i1, i1 0
+ %nop5264 = alloca i1, i1 0
+ %nop5265 = alloca i1, i1 0
+ %nop5266 = alloca i1, i1 0
+ %nop5267 = alloca i1, i1 0
+ %nop5268 = alloca i1, i1 0
+ %nop5269 = alloca i1, i1 0
+ %nop5270 = alloca i1, i1 0
+ %nop5271 = alloca i1, i1 0
+ %nop5272 = alloca i1, i1 0
+ %nop5273 = alloca i1, i1 0
+ %nop5274 = alloca i1, i1 0
+ %nop5275 = alloca i1, i1 0
+ %nop5276 = alloca i1, i1 0
+ %nop5277 = alloca i1, i1 0
+ %nop5278 = alloca i1, i1 0
+ %nop5279 = alloca i1, i1 0
+ %nop5280 = alloca i1, i1 0
+ %nop5281 = alloca i1, i1 0
+ %nop5282 = alloca i1, i1 0
+ %nop5283 = alloca i1, i1 0
+ %nop5284 = alloca i1, i1 0
+ %nop5285 = alloca i1, i1 0
+ %nop5286 = alloca i1, i1 0
+ %nop5287 = alloca i1, i1 0
+ %nop5288 = alloca i1, i1 0
+ %nop5289 = alloca i1, i1 0
+ %nop5290 = alloca i1, i1 0
+ %nop5291 = alloca i1, i1 0
+ %nop5292 = alloca i1, i1 0
+ %nop5293 = alloca i1, i1 0
+ %nop5294 = alloca i1, i1 0
+ %nop5295 = alloca i1, i1 0
+ %nop5296 = alloca i1, i1 0
+ %nop5297 = alloca i1, i1 0
+ %nop5298 = alloca i1, i1 0
+ %nop5299 = alloca i1, i1 0
+ %nop5300 = alloca i1, i1 0
+ %nop5301 = alloca i1, i1 0
+ %nop5302 = alloca i1, i1 0
+ %nop5303 = alloca i1, i1 0
+ %nop5304 = alloca i1, i1 0
+ %nop5305 = alloca i1, i1 0
+ %nop5306 = alloca i1, i1 0
+ %nop5307 = alloca i1, i1 0
+ %nop5308 = alloca i1, i1 0
+ %nop5309 = alloca i1, i1 0
+ %nop5310 = alloca i1, i1 0
+ %nop5311 = alloca i1, i1 0
+ %nop5312 = alloca i1, i1 0
+ %nop5313 = alloca i1, i1 0
+ %nop5314 = alloca i1, i1 0
+ %nop5315 = alloca i1, i1 0
+ %nop5316 = alloca i1, i1 0
+ %nop5317 = alloca i1, i1 0
+ %nop5318 = alloca i1, i1 0
+ %nop5319 = alloca i1, i1 0
+ %nop5320 = alloca i1, i1 0
+ %nop5321 = alloca i1, i1 0
+ %nop5322 = alloca i1, i1 0
+ %nop5323 = alloca i1, i1 0
+ %nop5324 = alloca i1, i1 0
+ %nop5325 = alloca i1, i1 0
+ %nop5326 = alloca i1, i1 0
+ %nop5327 = alloca i1, i1 0
+ %nop5328 = alloca i1, i1 0
+ %nop5329 = alloca i1, i1 0
+ %nop5330 = alloca i1, i1 0
+ %nop5331 = alloca i1, i1 0
+ %nop5332 = alloca i1, i1 0
+ %nop5333 = alloca i1, i1 0
+ %nop5334 = alloca i1, i1 0
+ %nop5335 = alloca i1, i1 0
+ %nop5336 = alloca i1, i1 0
+ %nop5337 = alloca i1, i1 0
+ %nop5338 = alloca i1, i1 0
+ %nop5339 = alloca i1, i1 0
+ %nop5340 = alloca i1, i1 0
+ %nop5341 = alloca i1, i1 0
+ %nop5342 = alloca i1, i1 0
+ %nop5343 = alloca i1, i1 0
+ %nop5344 = alloca i1, i1 0
+ %nop5345 = alloca i1, i1 0
+ %nop5346 = alloca i1, i1 0
+ %nop5347 = alloca i1, i1 0
+ %nop5348 = alloca i1, i1 0
+ %nop5349 = alloca i1, i1 0
+ %nop5350 = alloca i1, i1 0
+ %nop5351 = alloca i1, i1 0
+ %nop5352 = alloca i1, i1 0
+ %nop5353 = alloca i1, i1 0
+ %nop5354 = alloca i1, i1 0
+ %nop5355 = alloca i1, i1 0
+ %nop5356 = alloca i1, i1 0
+ %nop5357 = alloca i1, i1 0
+ %nop5358 = alloca i1, i1 0
+ %nop5359 = alloca i1, i1 0
+ %nop5360 = alloca i1, i1 0
+ %nop5361 = alloca i1, i1 0
+ %nop5362 = alloca i1, i1 0
+ %nop5363 = alloca i1, i1 0
+ %nop5364 = alloca i1, i1 0
+ %nop5365 = alloca i1, i1 0
+ %nop5366 = alloca i1, i1 0
+ %nop5367 = alloca i1, i1 0
+ %nop5368 = alloca i1, i1 0
+ %nop5369 = alloca i1, i1 0
+ %nop5370 = alloca i1, i1 0
+ %nop5371 = alloca i1, i1 0
+ %nop5372 = alloca i1, i1 0
+ %nop5373 = alloca i1, i1 0
+ %nop5374 = alloca i1, i1 0
+ %nop5375 = alloca i1, i1 0
+ %nop5376 = alloca i1, i1 0
+ %nop5377 = alloca i1, i1 0
+ %nop5378 = alloca i1, i1 0
+ %nop5379 = alloca i1, i1 0
+ %nop5380 = alloca i1, i1 0
+ %nop5381 = alloca i1, i1 0
+ %nop5382 = alloca i1, i1 0
+ %nop5383 = alloca i1, i1 0
+ %nop5384 = alloca i1, i1 0
+ %nop5385 = alloca i1, i1 0
+ %nop5386 = alloca i1, i1 0
+ %nop5387 = alloca i1, i1 0
+ %nop5388 = alloca i1, i1 0
+ %nop5389 = alloca i1, i1 0
+ %nop5390 = alloca i1, i1 0
+ %nop5391 = alloca i1, i1 0
+ %nop5392 = alloca i1, i1 0
+ %nop5393 = alloca i1, i1 0
+ %nop5394 = alloca i1, i1 0
+ %nop5395 = alloca i1, i1 0
+ %nop5396 = alloca i1, i1 0
+ %nop5397 = alloca i1, i1 0
+ %nop5398 = alloca i1, i1 0
+ %nop5399 = alloca i1, i1 0
+ %nop5400 = alloca i1, i1 0
+ %nop5401 = alloca i1, i1 0
+ %nop5402 = alloca i1, i1 0
+ %nop5403 = alloca i1, i1 0
+ %nop5404 = alloca i1, i1 0
+ %nop5405 = alloca i1, i1 0
+ %nop5406 = alloca i1, i1 0
+ %nop5407 = alloca i1, i1 0
+ %nop5408 = alloca i1, i1 0
+ %nop5409 = alloca i1, i1 0
+ %nop5410 = alloca i1, i1 0
+ %nop5411 = alloca i1, i1 0
+ %nop5412 = alloca i1, i1 0
+ %nop5413 = alloca i1, i1 0
+ %nop5414 = alloca i1, i1 0
+ %nop5415 = alloca i1, i1 0
+ %nop5416 = alloca i1, i1 0
+ %nop5417 = alloca i1, i1 0
+ %nop5418 = alloca i1, i1 0
+ %nop5419 = alloca i1, i1 0
+ %nop5420 = alloca i1, i1 0
+ %nop5421 = alloca i1, i1 0
+ %nop5422 = alloca i1, i1 0
+ %nop5423 = alloca i1, i1 0
+ %nop5424 = alloca i1, i1 0
+ %nop5425 = alloca i1, i1 0
+ %nop5426 = alloca i1, i1 0
+ %nop5427 = alloca i1, i1 0
+ %nop5428 = alloca i1, i1 0
+ %nop5429 = alloca i1, i1 0
+ %nop5430 = alloca i1, i1 0
+ %nop5431 = alloca i1, i1 0
+ %nop5432 = alloca i1, i1 0
+ %nop5433 = alloca i1, i1 0
+ %nop5434 = alloca i1, i1 0
+ %nop5435 = alloca i1, i1 0
+ %nop5436 = alloca i1, i1 0
+ %nop5437 = alloca i1, i1 0
+ %nop5438 = alloca i1, i1 0
+ %nop5439 = alloca i1, i1 0
+ %nop5440 = alloca i1, i1 0
+ %nop5441 = alloca i1, i1 0
+ %nop5442 = alloca i1, i1 0
+ %nop5443 = alloca i1, i1 0
+ %nop5444 = alloca i1, i1 0
+ %nop5445 = alloca i1, i1 0
+ %nop5446 = alloca i1, i1 0
+ %nop5447 = alloca i1, i1 0
+ %nop5448 = alloca i1, i1 0
+ %nop5449 = alloca i1, i1 0
+ %nop5450 = alloca i1, i1 0
+ %nop5451 = alloca i1, i1 0
+ %nop5452 = alloca i1, i1 0
+ %nop5453 = alloca i1, i1 0
+ %nop5454 = alloca i1, i1 0
+ %nop5455 = alloca i1, i1 0
+ %nop5456 = alloca i1, i1 0
+ %nop5457 = alloca i1, i1 0
+ %nop5458 = alloca i1, i1 0
+ %nop5459 = alloca i1, i1 0
+ %nop5460 = alloca i1, i1 0
+ %nop5461 = alloca i1, i1 0
+ %nop5462 = alloca i1, i1 0
+ %nop5463 = alloca i1, i1 0
+ %nop5464 = alloca i1, i1 0
+ %nop5465 = alloca i1, i1 0
+ %nop5466 = alloca i1, i1 0
+ %nop5467 = alloca i1, i1 0
+ %nop5468 = alloca i1, i1 0
+ %nop5469 = alloca i1, i1 0
+ %nop5470 = alloca i1, i1 0
+ %nop5471 = alloca i1, i1 0
+ %nop5472 = alloca i1, i1 0
+ %nop5473 = alloca i1, i1 0
+ %nop5474 = alloca i1, i1 0
+ %nop5475 = alloca i1, i1 0
+ %nop5476 = alloca i1, i1 0
+ %nop5477 = alloca i1, i1 0
+ %nop5478 = alloca i1, i1 0
+ %nop5479 = alloca i1, i1 0
+ %nop5480 = alloca i1, i1 0
+ %nop5481 = alloca i1, i1 0
+ %nop5482 = alloca i1, i1 0
+ %nop5483 = alloca i1, i1 0
+ %nop5484 = alloca i1, i1 0
+ %nop5485 = alloca i1, i1 0
+ %nop5486 = alloca i1, i1 0
+ %nop5487 = alloca i1, i1 0
+ %nop5488 = alloca i1, i1 0
+ %nop5489 = alloca i1, i1 0
+ %nop5490 = alloca i1, i1 0
+ %nop5491 = alloca i1, i1 0
+ %nop5492 = alloca i1, i1 0
+ %nop5493 = alloca i1, i1 0
+ %nop5494 = alloca i1, i1 0
+ %nop5495 = alloca i1, i1 0
+ %nop5496 = alloca i1, i1 0
+ %nop5497 = alloca i1, i1 0
+ %nop5498 = alloca i1, i1 0
+ %nop5499 = alloca i1, i1 0
+ %nop5500 = alloca i1, i1 0
+ %nop5501 = alloca i1, i1 0
+ %nop5502 = alloca i1, i1 0
+ %nop5503 = alloca i1, i1 0
+ %nop5504 = alloca i1, i1 0
+ %nop5505 = alloca i1, i1 0
+ %nop5506 = alloca i1, i1 0
+ %nop5507 = alloca i1, i1 0
+ %nop5508 = alloca i1, i1 0
+ %nop5509 = alloca i1, i1 0
+ %nop5510 = alloca i1, i1 0
+ %nop5511 = alloca i1, i1 0
+ %nop5512 = alloca i1, i1 0
+ %nop5513 = alloca i1, i1 0
+ %nop5514 = alloca i1, i1 0
+ %nop5515 = alloca i1, i1 0
+ %nop5516 = alloca i1, i1 0
+ %nop5517 = alloca i1, i1 0
+ %nop5518 = alloca i1, i1 0
+ %nop5519 = alloca i1, i1 0
+ %nop5520 = alloca i1, i1 0
+ %nop5521 = alloca i1, i1 0
+ %nop5522 = alloca i1, i1 0
+ %nop5523 = alloca i1, i1 0
+ %nop5524 = alloca i1, i1 0
+ %nop5525 = alloca i1, i1 0
+ %nop5526 = alloca i1, i1 0
+ %nop5527 = alloca i1, i1 0
+ %nop5528 = alloca i1, i1 0
+ %nop5529 = alloca i1, i1 0
+ %nop5530 = alloca i1, i1 0
+ %nop5531 = alloca i1, i1 0
+ %nop5532 = alloca i1, i1 0
+ %nop5533 = alloca i1, i1 0
+ %nop5534 = alloca i1, i1 0
+ %nop5535 = alloca i1, i1 0
+ %nop5536 = alloca i1, i1 0
+ %nop5537 = alloca i1, i1 0
+ %nop5538 = alloca i1, i1 0
+ %nop5539 = alloca i1, i1 0
+ %nop5540 = alloca i1, i1 0
+ %nop5541 = alloca i1, i1 0
+ %nop5542 = alloca i1, i1 0
+ %nop5543 = alloca i1, i1 0
+ %nop5544 = alloca i1, i1 0
+ %nop5545 = alloca i1, i1 0
+ %nop5546 = alloca i1, i1 0
+ %nop5547 = alloca i1, i1 0
+ %nop5548 = alloca i1, i1 0
+ %nop5549 = alloca i1, i1 0
+ %nop5550 = alloca i1, i1 0
+ %nop5551 = alloca i1, i1 0
+ %nop5552 = alloca i1, i1 0
+ %nop5553 = alloca i1, i1 0
+ %nop5554 = alloca i1, i1 0
+ %nop5555 = alloca i1, i1 0
+ %nop5556 = alloca i1, i1 0
+ %nop5557 = alloca i1, i1 0
+ %nop5558 = alloca i1, i1 0
+ %nop5559 = alloca i1, i1 0
+ %nop5560 = alloca i1, i1 0
+ %nop5561 = alloca i1, i1 0
+ %nop5562 = alloca i1, i1 0
+ %nop5563 = alloca i1, i1 0
+ %nop5564 = alloca i1, i1 0
+ %nop5565 = alloca i1, i1 0
+ %nop5566 = alloca i1, i1 0
+ %nop5567 = alloca i1, i1 0
+ %nop5568 = alloca i1, i1 0
+ %nop5569 = alloca i1, i1 0
+ %nop5570 = alloca i1, i1 0
+ %nop5571 = alloca i1, i1 0
+ %nop5572 = alloca i1, i1 0
+ %nop5573 = alloca i1, i1 0
+ %nop5574 = alloca i1, i1 0
+ %nop5575 = alloca i1, i1 0
+ %nop5576 = alloca i1, i1 0
+ %nop5577 = alloca i1, i1 0
+ %nop5578 = alloca i1, i1 0
+ %nop5579 = alloca i1, i1 0
+ %nop5580 = alloca i1, i1 0
+ %nop5581 = alloca i1, i1 0
+ %nop5582 = alloca i1, i1 0
+ %nop5583 = alloca i1, i1 0
+ %nop5584 = alloca i1, i1 0
+ %nop5585 = alloca i1, i1 0
+ %nop5586 = alloca i1, i1 0
+ %nop5587 = alloca i1, i1 0
+ %nop5588 = alloca i1, i1 0
+ %nop5589 = alloca i1, i1 0
+ %nop5590 = alloca i1, i1 0
+ %nop5591 = alloca i1, i1 0
+ %nop5592 = alloca i1, i1 0
+ %nop5593 = alloca i1, i1 0
+ %nop5594 = alloca i1, i1 0
+ %nop5595 = alloca i1, i1 0
+ %nop5596 = alloca i1, i1 0
+ %nop5597 = alloca i1, i1 0
+ %nop5598 = alloca i1, i1 0
+ %nop5599 = alloca i1, i1 0
+ %nop5600 = alloca i1, i1 0
+ %nop5601 = alloca i1, i1 0
+ %nop5602 = alloca i1, i1 0
+ %nop5603 = alloca i1, i1 0
+ %nop5604 = alloca i1, i1 0
+ %nop5605 = alloca i1, i1 0
+ %nop5606 = alloca i1, i1 0
+ %nop5607 = alloca i1, i1 0
+ %nop5608 = alloca i1, i1 0
+ %nop5609 = alloca i1, i1 0
+ %nop5610 = alloca i1, i1 0
+ %nop5611 = alloca i1, i1 0
+ %nop5612 = alloca i1, i1 0
+ %nop5613 = alloca i1, i1 0
+ %nop5614 = alloca i1, i1 0
+ %nop5615 = alloca i1, i1 0
+ %nop5616 = alloca i1, i1 0
+ %nop5617 = alloca i1, i1 0
+ %nop5618 = alloca i1, i1 0
+ %nop5619 = alloca i1, i1 0
+ %nop5620 = alloca i1, i1 0
+ %nop5621 = alloca i1, i1 0
+ %nop5622 = alloca i1, i1 0
+ %nop5623 = alloca i1, i1 0
+ %nop5624 = alloca i1, i1 0
+ %nop5625 = alloca i1, i1 0
+ %nop5626 = alloca i1, i1 0
+ %nop5627 = alloca i1, i1 0
+ %nop5628 = alloca i1, i1 0
+ %nop5629 = alloca i1, i1 0
+ %nop5630 = alloca i1, i1 0
+ %nop5631 = alloca i1, i1 0
+ %nop5632 = alloca i1, i1 0
+ %nop5633 = alloca i1, i1 0
+ %nop5634 = alloca i1, i1 0
+ %nop5635 = alloca i1, i1 0
+ %nop5636 = alloca i1, i1 0
+ %nop5637 = alloca i1, i1 0
+ %nop5638 = alloca i1, i1 0
+ %nop5639 = alloca i1, i1 0
+ %nop5640 = alloca i1, i1 0
+ %nop5641 = alloca i1, i1 0
+ %nop5642 = alloca i1, i1 0
+ %nop5643 = alloca i1, i1 0
+ %nop5644 = alloca i1, i1 0
+ %nop5645 = alloca i1, i1 0
+ %nop5646 = alloca i1, i1 0
+ %nop5647 = alloca i1, i1 0
+ %nop5648 = alloca i1, i1 0
+ %nop5649 = alloca i1, i1 0
+ %nop5650 = alloca i1, i1 0
+ %nop5651 = alloca i1, i1 0
+ %nop5652 = alloca i1, i1 0
+ %nop5653 = alloca i1, i1 0
+ %nop5654 = alloca i1, i1 0
+ %nop5655 = alloca i1, i1 0
+ %nop5656 = alloca i1, i1 0
+ %nop5657 = alloca i1, i1 0
+ %nop5658 = alloca i1, i1 0
+ %nop5659 = alloca i1, i1 0
+ %nop5660 = alloca i1, i1 0
+ %nop5661 = alloca i1, i1 0
+ %nop5662 = alloca i1, i1 0
+ %nop5663 = alloca i1, i1 0
+ %nop5664 = alloca i1, i1 0
+ %nop5665 = alloca i1, i1 0
+ %nop5666 = alloca i1, i1 0
+ %nop5667 = alloca i1, i1 0
+ %nop5668 = alloca i1, i1 0
+ %nop5669 = alloca i1, i1 0
+ %nop5670 = alloca i1, i1 0
+ %nop5671 = alloca i1, i1 0
+ %nop5672 = alloca i1, i1 0
+ %nop5673 = alloca i1, i1 0
+ %nop5674 = alloca i1, i1 0
+ %nop5675 = alloca i1, i1 0
+ %nop5676 = alloca i1, i1 0
+ %nop5677 = alloca i1, i1 0
+ %nop5678 = alloca i1, i1 0
+ %nop5679 = alloca i1, i1 0
+ %nop5680 = alloca i1, i1 0
+ %nop5681 = alloca i1, i1 0
+ %nop5682 = alloca i1, i1 0
+ %nop5683 = alloca i1, i1 0
+ %nop5684 = alloca i1, i1 0
+ %nop5685 = alloca i1, i1 0
+ %nop5686 = alloca i1, i1 0
+ %nop5687 = alloca i1, i1 0
+ %nop5688 = alloca i1, i1 0
+ %nop5689 = alloca i1, i1 0
+ %nop5690 = alloca i1, i1 0
+ %nop5691 = alloca i1, i1 0
+ %nop5692 = alloca i1, i1 0
+ %nop5693 = alloca i1, i1 0
+ %nop5694 = alloca i1, i1 0
+ %nop5695 = alloca i1, i1 0
+ %nop5696 = alloca i1, i1 0
+ %nop5697 = alloca i1, i1 0
+ %nop5698 = alloca i1, i1 0
+ %nop5699 = alloca i1, i1 0
+ %nop5700 = alloca i1, i1 0
+ %nop5701 = alloca i1, i1 0
+ %nop5702 = alloca i1, i1 0
+ %nop5703 = alloca i1, i1 0
+ %nop5704 = alloca i1, i1 0
+ %nop5705 = alloca i1, i1 0
+ %nop5706 = alloca i1, i1 0
+ %nop5707 = alloca i1, i1 0
+ %nop5708 = alloca i1, i1 0
+ %nop5709 = alloca i1, i1 0
+ %nop5710 = alloca i1, i1 0
+ %nop5711 = alloca i1, i1 0
+ %nop5712 = alloca i1, i1 0
+ %nop5713 = alloca i1, i1 0
+ %nop5714 = alloca i1, i1 0
+ %nop5715 = alloca i1, i1 0
+ %nop5716 = alloca i1, i1 0
+ %nop5717 = alloca i1, i1 0
+ %nop5718 = alloca i1, i1 0
+ %nop5719 = alloca i1, i1 0
+ %nop5720 = alloca i1, i1 0
+ %nop5721 = alloca i1, i1 0
+ %nop5722 = alloca i1, i1 0
+ %nop5723 = alloca i1, i1 0
+ %nop5724 = alloca i1, i1 0
+ %nop5725 = alloca i1, i1 0
+ %nop5726 = alloca i1, i1 0
+ %nop5727 = alloca i1, i1 0
+ %nop5728 = alloca i1, i1 0
+ %nop5729 = alloca i1, i1 0
+ %nop5730 = alloca i1, i1 0
+ %nop5731 = alloca i1, i1 0
+ %nop5732 = alloca i1, i1 0
+ %nop5733 = alloca i1, i1 0
+ %nop5734 = alloca i1, i1 0
+ %nop5735 = alloca i1, i1 0
+ %nop5736 = alloca i1, i1 0
+ %nop5737 = alloca i1, i1 0
+ %nop5738 = alloca i1, i1 0
+ %nop5739 = alloca i1, i1 0
+ %nop5740 = alloca i1, i1 0
+ %nop5741 = alloca i1, i1 0
+ %nop5742 = alloca i1, i1 0
+ %nop5743 = alloca i1, i1 0
+ %nop5744 = alloca i1, i1 0
+ %nop5745 = alloca i1, i1 0
+ %nop5746 = alloca i1, i1 0
+ %nop5747 = alloca i1, i1 0
+ %nop5748 = alloca i1, i1 0
+ %nop5749 = alloca i1, i1 0
+ %nop5750 = alloca i1, i1 0
+ %nop5751 = alloca i1, i1 0
+ %nop5752 = alloca i1, i1 0
+ %nop5753 = alloca i1, i1 0
+ %nop5754 = alloca i1, i1 0
+ %nop5755 = alloca i1, i1 0
+ %nop5756 = alloca i1, i1 0
+ %nop5757 = alloca i1, i1 0
+ %nop5758 = alloca i1, i1 0
+ %nop5759 = alloca i1, i1 0
+ %nop5760 = alloca i1, i1 0
+ %nop5761 = alloca i1, i1 0
+ %nop5762 = alloca i1, i1 0
+ %nop5763 = alloca i1, i1 0
+ %nop5764 = alloca i1, i1 0
+ %nop5765 = alloca i1, i1 0
+ %nop5766 = alloca i1, i1 0
+ %nop5767 = alloca i1, i1 0
+ %nop5768 = alloca i1, i1 0
+ %nop5769 = alloca i1, i1 0
+ %nop5770 = alloca i1, i1 0
+ %nop5771 = alloca i1, i1 0
+ %nop5772 = alloca i1, i1 0
+ %nop5773 = alloca i1, i1 0
+ %nop5774 = alloca i1, i1 0
+ %nop5775 = alloca i1, i1 0
+ %nop5776 = alloca i1, i1 0
+ %nop5777 = alloca i1, i1 0
+ %nop5778 = alloca i1, i1 0
+ %nop5779 = alloca i1, i1 0
+ %nop5780 = alloca i1, i1 0
+ %nop5781 = alloca i1, i1 0
+ %nop5782 = alloca i1, i1 0
+ %nop5783 = alloca i1, i1 0
+ %nop5784 = alloca i1, i1 0
+ %nop5785 = alloca i1, i1 0
+ %nop5786 = alloca i1, i1 0
+ %nop5787 = alloca i1, i1 0
+ %nop5788 = alloca i1, i1 0
+ %nop5789 = alloca i1, i1 0
+ %nop5790 = alloca i1, i1 0
+ %nop5791 = alloca i1, i1 0
+ %nop5792 = alloca i1, i1 0
+ %nop5793 = alloca i1, i1 0
+ %nop5794 = alloca i1, i1 0
+ %nop5795 = alloca i1, i1 0
+ %nop5796 = alloca i1, i1 0
+ %nop5797 = alloca i1, i1 0
+ %nop5798 = alloca i1, i1 0
+ %nop5799 = alloca i1, i1 0
+ %nop5800 = alloca i1, i1 0
+ %nop5801 = alloca i1, i1 0
+ %nop5802 = alloca i1, i1 0
+ %nop5803 = alloca i1, i1 0
+ %nop5804 = alloca i1, i1 0
+ %nop5805 = alloca i1, i1 0
+ %nop5806 = alloca i1, i1 0
+ %nop5807 = alloca i1, i1 0
+ %nop5808 = alloca i1, i1 0
+ %nop5809 = alloca i1, i1 0
+ %nop5810 = alloca i1, i1 0
+ %nop5811 = alloca i1, i1 0
+ %nop5812 = alloca i1, i1 0
+ %nop5813 = alloca i1, i1 0
+ %nop5814 = alloca i1, i1 0
+ %nop5815 = alloca i1, i1 0
+ %nop5816 = alloca i1, i1 0
+ %nop5817 = alloca i1, i1 0
+ %nop5818 = alloca i1, i1 0
+ %nop5819 = alloca i1, i1 0
+ %nop5820 = alloca i1, i1 0
+ %nop5821 = alloca i1, i1 0
+ %nop5822 = alloca i1, i1 0
+ %nop5823 = alloca i1, i1 0
+ %nop5824 = alloca i1, i1 0
+ %nop5825 = alloca i1, i1 0
+ %nop5826 = alloca i1, i1 0
+ %nop5827 = alloca i1, i1 0
+ %nop5828 = alloca i1, i1 0
+ %nop5829 = alloca i1, i1 0
+ %nop5830 = alloca i1, i1 0
+ %nop5831 = alloca i1, i1 0
+ %nop5832 = alloca i1, i1 0
+ %nop5833 = alloca i1, i1 0
+ %nop5834 = alloca i1, i1 0
+ %nop5835 = alloca i1, i1 0
+ %nop5836 = alloca i1, i1 0
+ %nop5837 = alloca i1, i1 0
+ %nop5838 = alloca i1, i1 0
+ %nop5839 = alloca i1, i1 0
+ %nop5840 = alloca i1, i1 0
+ %nop5841 = alloca i1, i1 0
+ %nop5842 = alloca i1, i1 0
+ %nop5843 = alloca i1, i1 0
+ %nop5844 = alloca i1, i1 0
+ %nop5845 = alloca i1, i1 0
+ %nop5846 = alloca i1, i1 0
+ %nop5847 = alloca i1, i1 0
+ %nop5848 = alloca i1, i1 0
+ %nop5849 = alloca i1, i1 0
+ %nop5850 = alloca i1, i1 0
+ %nop5851 = alloca i1, i1 0
+ %nop5852 = alloca i1, i1 0
+ %nop5853 = alloca i1, i1 0
+ %nop5854 = alloca i1, i1 0
+ %nop5855 = alloca i1, i1 0
+ %nop5856 = alloca i1, i1 0
+ %nop5857 = alloca i1, i1 0
+ %nop5858 = alloca i1, i1 0
+ %nop5859 = alloca i1, i1 0
+ %nop5860 = alloca i1, i1 0
+ %nop5861 = alloca i1, i1 0
+ %nop5862 = alloca i1, i1 0
+ %nop5863 = alloca i1, i1 0
+ %nop5864 = alloca i1, i1 0
+ %nop5865 = alloca i1, i1 0
+ %nop5866 = alloca i1, i1 0
+ %nop5867 = alloca i1, i1 0
+ %nop5868 = alloca i1, i1 0
+ %nop5869 = alloca i1, i1 0
+ %nop5870 = alloca i1, i1 0
+ %nop5871 = alloca i1, i1 0
+ %nop5872 = alloca i1, i1 0
+ %nop5873 = alloca i1, i1 0
+ %nop5874 = alloca i1, i1 0
+ %nop5875 = alloca i1, i1 0
+ %nop5876 = alloca i1, i1 0
+ %nop5877 = alloca i1, i1 0
+ %nop5878 = alloca i1, i1 0
+ %nop5879 = alloca i1, i1 0
+ %nop5880 = alloca i1, i1 0
+ %nop5881 = alloca i1, i1 0
+ %nop5882 = alloca i1, i1 0
+ %nop5883 = alloca i1, i1 0
+ %nop5884 = alloca i1, i1 0
+ %nop5885 = alloca i1, i1 0
+ %nop5886 = alloca i1, i1 0
+ %nop5887 = alloca i1, i1 0
+ %nop5888 = alloca i1, i1 0
+ %nop5889 = alloca i1, i1 0
+ %nop5890 = alloca i1, i1 0
+ %nop5891 = alloca i1, i1 0
+ %nop5892 = alloca i1, i1 0
+ %nop5893 = alloca i1, i1 0
+ %nop5894 = alloca i1, i1 0
+ %nop5895 = alloca i1, i1 0
+ %nop5896 = alloca i1, i1 0
+ %nop5897 = alloca i1, i1 0
+ %nop5898 = alloca i1, i1 0
+ %nop5899 = alloca i1, i1 0
+ %nop5900 = alloca i1, i1 0
+ %nop5901 = alloca i1, i1 0
+ %nop5902 = alloca i1, i1 0
+ %nop5903 = alloca i1, i1 0
+ %nop5904 = alloca i1, i1 0
+ %nop5905 = alloca i1, i1 0
+ %nop5906 = alloca i1, i1 0
+ %nop5907 = alloca i1, i1 0
+ %nop5908 = alloca i1, i1 0
+ %nop5909 = alloca i1, i1 0
+ %nop5910 = alloca i1, i1 0
+ %nop5911 = alloca i1, i1 0
+ %nop5912 = alloca i1, i1 0
+ %nop5913 = alloca i1, i1 0
+ %nop5914 = alloca i1, i1 0
+ %nop5915 = alloca i1, i1 0
+ %nop5916 = alloca i1, i1 0
+ %nop5917 = alloca i1, i1 0
+ %nop5918 = alloca i1, i1 0
+ %nop5919 = alloca i1, i1 0
+ %nop5920 = alloca i1, i1 0
+ %nop5921 = alloca i1, i1 0
+ %nop5922 = alloca i1, i1 0
+ %nop5923 = alloca i1, i1 0
+ %nop5924 = alloca i1, i1 0
+ %nop5925 = alloca i1, i1 0
+ %nop5926 = alloca i1, i1 0
+ %nop5927 = alloca i1, i1 0
+ %nop5928 = alloca i1, i1 0
+ %nop5929 = alloca i1, i1 0
+ %nop5930 = alloca i1, i1 0
+ %nop5931 = alloca i1, i1 0
+ %nop5932 = alloca i1, i1 0
+ %nop5933 = alloca i1, i1 0
+ %nop5934 = alloca i1, i1 0
+ %nop5935 = alloca i1, i1 0
+ %nop5936 = alloca i1, i1 0
+ %nop5937 = alloca i1, i1 0
+ %nop5938 = alloca i1, i1 0
+ %nop5939 = alloca i1, i1 0
+ %nop5940 = alloca i1, i1 0
+ %nop5941 = alloca i1, i1 0
+ %nop5942 = alloca i1, i1 0
+ %nop5943 = alloca i1, i1 0
+ %nop5944 = alloca i1, i1 0
+ %nop5945 = alloca i1, i1 0
+ %nop5946 = alloca i1, i1 0
+ %nop5947 = alloca i1, i1 0
+ %nop5948 = alloca i1, i1 0
+ %nop5949 = alloca i1, i1 0
+ %nop5950 = alloca i1, i1 0
+ %nop5951 = alloca i1, i1 0
+ %nop5952 = alloca i1, i1 0
+ %nop5953 = alloca i1, i1 0
+ %nop5954 = alloca i1, i1 0
+ %nop5955 = alloca i1, i1 0
+ %nop5956 = alloca i1, i1 0
+ %nop5957 = alloca i1, i1 0
+ %nop5958 = alloca i1, i1 0
+ %nop5959 = alloca i1, i1 0
+ %nop5960 = alloca i1, i1 0
+ %nop5961 = alloca i1, i1 0
+ %nop5962 = alloca i1, i1 0
+ %nop5963 = alloca i1, i1 0
+ %nop5964 = alloca i1, i1 0
+ %nop5965 = alloca i1, i1 0
+ %nop5966 = alloca i1, i1 0
+ %nop5967 = alloca i1, i1 0
+ %nop5968 = alloca i1, i1 0
+ %nop5969 = alloca i1, i1 0
+ %nop5970 = alloca i1, i1 0
+ %nop5971 = alloca i1, i1 0
+ %nop5972 = alloca i1, i1 0
+ %nop5973 = alloca i1, i1 0
+ %nop5974 = alloca i1, i1 0
+ %nop5975 = alloca i1, i1 0
+ %nop5976 = alloca i1, i1 0
+ %nop5977 = alloca i1, i1 0
+ %nop5978 = alloca i1, i1 0
+ %nop5979 = alloca i1, i1 0
+ %nop5980 = alloca i1, i1 0
+ %nop5981 = alloca i1, i1 0
+ %nop5982 = alloca i1, i1 0
+ %nop5983 = alloca i1, i1 0
+ %nop5984 = alloca i1, i1 0
+ %nop5985 = alloca i1, i1 0
+ %nop5986 = alloca i1, i1 0
+ %nop5987 = alloca i1, i1 0
+ %nop5988 = alloca i1, i1 0
+ %nop5989 = alloca i1, i1 0
+ %nop5990 = alloca i1, i1 0
+ %nop5991 = alloca i1, i1 0
+ %nop5992 = alloca i1, i1 0
+ %nop5993 = alloca i1, i1 0
+ %nop5994 = alloca i1, i1 0
+ %nop5995 = alloca i1, i1 0
+ %nop5996 = alloca i1, i1 0
+ %nop5997 = alloca i1, i1 0
+ %nop5998 = alloca i1, i1 0
+ %nop5999 = alloca i1, i1 0
+ %nop6000 = alloca i1, i1 0
+ %nop6001 = alloca i1, i1 0
+ %nop6002 = alloca i1, i1 0
+ %nop6003 = alloca i1, i1 0
+ %nop6004 = alloca i1, i1 0
+ %nop6005 = alloca i1, i1 0
+ %nop6006 = alloca i1, i1 0
+ %nop6007 = alloca i1, i1 0
+ %nop6008 = alloca i1, i1 0
+ %nop6009 = alloca i1, i1 0
+ %nop6010 = alloca i1, i1 0
+ %nop6011 = alloca i1, i1 0
+ %nop6012 = alloca i1, i1 0
+ %nop6013 = alloca i1, i1 0
+ %nop6014 = alloca i1, i1 0
+ %nop6015 = alloca i1, i1 0
+ %nop6016 = alloca i1, i1 0
+ %nop6017 = alloca i1, i1 0
+ %nop6018 = alloca i1, i1 0
+ %nop6019 = alloca i1, i1 0
+ %nop6020 = alloca i1, i1 0
+ %nop6021 = alloca i1, i1 0
+ %nop6022 = alloca i1, i1 0
+ %nop6023 = alloca i1, i1 0
+ %nop6024 = alloca i1, i1 0
+ %nop6025 = alloca i1, i1 0
+ %nop6026 = alloca i1, i1 0
+ %nop6027 = alloca i1, i1 0
+ %nop6028 = alloca i1, i1 0
+ %nop6029 = alloca i1, i1 0
+ %nop6030 = alloca i1, i1 0
+ %nop6031 = alloca i1, i1 0
+ %nop6032 = alloca i1, i1 0
+ %nop6033 = alloca i1, i1 0
+ %nop6034 = alloca i1, i1 0
+ %nop6035 = alloca i1, i1 0
+ %nop6036 = alloca i1, i1 0
+ %nop6037 = alloca i1, i1 0
+ %nop6038 = alloca i1, i1 0
+ %nop6039 = alloca i1, i1 0
+ %nop6040 = alloca i1, i1 0
+ %nop6041 = alloca i1, i1 0
+ %nop6042 = alloca i1, i1 0
+ %nop6043 = alloca i1, i1 0
+ %nop6044 = alloca i1, i1 0
+ %nop6045 = alloca i1, i1 0
+ %nop6046 = alloca i1, i1 0
+ %nop6047 = alloca i1, i1 0
+ %nop6048 = alloca i1, i1 0
+ %nop6049 = alloca i1, i1 0
+ %nop6050 = alloca i1, i1 0
+ %nop6051 = alloca i1, i1 0
+ %nop6052 = alloca i1, i1 0
+ %nop6053 = alloca i1, i1 0
+ %nop6054 = alloca i1, i1 0
+ %nop6055 = alloca i1, i1 0
+ %nop6056 = alloca i1, i1 0
+ %nop6057 = alloca i1, i1 0
+ %nop6058 = alloca i1, i1 0
+ %nop6059 = alloca i1, i1 0
+ %nop6060 = alloca i1, i1 0
+ %nop6061 = alloca i1, i1 0
+ %nop6062 = alloca i1, i1 0
+ %nop6063 = alloca i1, i1 0
+ %nop6064 = alloca i1, i1 0
+ %nop6065 = alloca i1, i1 0
+ %nop6066 = alloca i1, i1 0
+ %nop6067 = alloca i1, i1 0
+ %nop6068 = alloca i1, i1 0
+ %nop6069 = alloca i1, i1 0
+ %nop6070 = alloca i1, i1 0
+ %nop6071 = alloca i1, i1 0
+ %nop6072 = alloca i1, i1 0
+ %nop6073 = alloca i1, i1 0
+ %nop6074 = alloca i1, i1 0
+ %nop6075 = alloca i1, i1 0
+ %nop6076 = alloca i1, i1 0
+ %nop6077 = alloca i1, i1 0
+ %nop6078 = alloca i1, i1 0
+ %nop6079 = alloca i1, i1 0
+ %nop6080 = alloca i1, i1 0
+ %nop6081 = alloca i1, i1 0
+ %nop6082 = alloca i1, i1 0
+ %nop6083 = alloca i1, i1 0
+ %nop6084 = alloca i1, i1 0
+ %nop6085 = alloca i1, i1 0
+ %nop6086 = alloca i1, i1 0
+ %nop6087 = alloca i1, i1 0
+ %nop6088 = alloca i1, i1 0
+ %nop6089 = alloca i1, i1 0
+ %nop6090 = alloca i1, i1 0
+ %nop6091 = alloca i1, i1 0
+ %nop6092 = alloca i1, i1 0
+ %nop6093 = alloca i1, i1 0
+ %nop6094 = alloca i1, i1 0
+ %nop6095 = alloca i1, i1 0
+ %nop6096 = alloca i1, i1 0
+ %nop6097 = alloca i1, i1 0
+ %nop6098 = alloca i1, i1 0
+ %nop6099 = alloca i1, i1 0
+ %nop6100 = alloca i1, i1 0
+ %nop6101 = alloca i1, i1 0
+ %nop6102 = alloca i1, i1 0
+ %nop6103 = alloca i1, i1 0
+ %nop6104 = alloca i1, i1 0
+ %nop6105 = alloca i1, i1 0
+ %nop6106 = alloca i1, i1 0
+ %nop6107 = alloca i1, i1 0
+ %nop6108 = alloca i1, i1 0
+ %nop6109 = alloca i1, i1 0
+ %nop6110 = alloca i1, i1 0
+ %nop6111 = alloca i1, i1 0
+ %nop6112 = alloca i1, i1 0
+ %nop6113 = alloca i1, i1 0
+ %nop6114 = alloca i1, i1 0
+ %nop6115 = alloca i1, i1 0
+ %nop6116 = alloca i1, i1 0
+ %nop6117 = alloca i1, i1 0
+ %nop6118 = alloca i1, i1 0
+ %nop6119 = alloca i1, i1 0
+ %nop6120 = alloca i1, i1 0
+ %nop6121 = alloca i1, i1 0
+ %nop6122 = alloca i1, i1 0
+ %nop6123 = alloca i1, i1 0
+ %nop6124 = alloca i1, i1 0
+ %nop6125 = alloca i1, i1 0
+ %nop6126 = alloca i1, i1 0
+ %nop6127 = alloca i1, i1 0
+ %nop6128 = alloca i1, i1 0
+ %nop6129 = alloca i1, i1 0
+ %nop6130 = alloca i1, i1 0
+ %nop6131 = alloca i1, i1 0
+ %nop6132 = alloca i1, i1 0
+ %nop6133 = alloca i1, i1 0
+ %nop6134 = alloca i1, i1 0
+ %nop6135 = alloca i1, i1 0
+ %nop6136 = alloca i1, i1 0
+ %nop6137 = alloca i1, i1 0
+ %nop6138 = alloca i1, i1 0
+ %nop6139 = alloca i1, i1 0
+ %nop6140 = alloca i1, i1 0
+ %nop6141 = alloca i1, i1 0
+ %nop6142 = alloca i1, i1 0
+ %nop6143 = alloca i1, i1 0
+ %nop6144 = alloca i1, i1 0
+ %nop6145 = alloca i1, i1 0
+ %nop6146 = alloca i1, i1 0
+ %nop6147 = alloca i1, i1 0
+ %nop6148 = alloca i1, i1 0
+ %nop6149 = alloca i1, i1 0
+ %nop6150 = alloca i1, i1 0
+ %nop6151 = alloca i1, i1 0
+ %nop6152 = alloca i1, i1 0
+ %nop6153 = alloca i1, i1 0
+ %nop6154 = alloca i1, i1 0
+ %nop6155 = alloca i1, i1 0
+ %nop6156 = alloca i1, i1 0
+ %nop6157 = alloca i1, i1 0
+ %nop6158 = alloca i1, i1 0
+ %nop6159 = alloca i1, i1 0
+ %nop6160 = alloca i1, i1 0
+ %nop6161 = alloca i1, i1 0
+ %nop6162 = alloca i1, i1 0
+ %nop6163 = alloca i1, i1 0
+ %nop6164 = alloca i1, i1 0
+ %nop6165 = alloca i1, i1 0
+ %nop6166 = alloca i1, i1 0
+ %nop6167 = alloca i1, i1 0
+ %nop6168 = alloca i1, i1 0
+ %nop6169 = alloca i1, i1 0
+ %nop6170 = alloca i1, i1 0
+ %nop6171 = alloca i1, i1 0
+ %nop6172 = alloca i1, i1 0
+ %nop6173 = alloca i1, i1 0
+ %nop6174 = alloca i1, i1 0
+ %nop6175 = alloca i1, i1 0
+ %nop6176 = alloca i1, i1 0
+ %nop6177 = alloca i1, i1 0
+ %nop6178 = alloca i1, i1 0
+ %nop6179 = alloca i1, i1 0
+ %nop6180 = alloca i1, i1 0
+ %nop6181 = alloca i1, i1 0
+ %nop6182 = alloca i1, i1 0
+ %nop6183 = alloca i1, i1 0
+ %nop6184 = alloca i1, i1 0
+ %nop6185 = alloca i1, i1 0
+ %nop6186 = alloca i1, i1 0
+ %nop6187 = alloca i1, i1 0
+ %nop6188 = alloca i1, i1 0
+ %nop6189 = alloca i1, i1 0
+ %nop6190 = alloca i1, i1 0
+ %nop6191 = alloca i1, i1 0
+ %nop6192 = alloca i1, i1 0
+ %nop6193 = alloca i1, i1 0
+ %nop6194 = alloca i1, i1 0
+ %nop6195 = alloca i1, i1 0
+ %nop6196 = alloca i1, i1 0
+ %nop6197 = alloca i1, i1 0
+ %nop6198 = alloca i1, i1 0
+ %nop6199 = alloca i1, i1 0
+ %nop6200 = alloca i1, i1 0
+ %nop6201 = alloca i1, i1 0
+ %nop6202 = alloca i1, i1 0
+ %nop6203 = alloca i1, i1 0
+ %nop6204 = alloca i1, i1 0
+ %nop6205 = alloca i1, i1 0
+ %nop6206 = alloca i1, i1 0
+ %nop6207 = alloca i1, i1 0
+ %nop6208 = alloca i1, i1 0
+ %nop6209 = alloca i1, i1 0
+ %nop6210 = alloca i1, i1 0
+ %nop6211 = alloca i1, i1 0
+ %nop6212 = alloca i1, i1 0
+ %nop6213 = alloca i1, i1 0
+ %nop6214 = alloca i1, i1 0
+ %nop6215 = alloca i1, i1 0
+ %nop6216 = alloca i1, i1 0
+ %nop6217 = alloca i1, i1 0
+ %nop6218 = alloca i1, i1 0
+ %nop6219 = alloca i1, i1 0
+ %nop6220 = alloca i1, i1 0
+ %nop6221 = alloca i1, i1 0
+ %nop6222 = alloca i1, i1 0
+ %nop6223 = alloca i1, i1 0
+ %nop6224 = alloca i1, i1 0
+ %nop6225 = alloca i1, i1 0
+ %nop6226 = alloca i1, i1 0
+ %nop6227 = alloca i1, i1 0
+ %nop6228 = alloca i1, i1 0
+ %nop6229 = alloca i1, i1 0
+ %nop6230 = alloca i1, i1 0
+ %nop6231 = alloca i1, i1 0
+ %nop6232 = alloca i1, i1 0
+ %nop6233 = alloca i1, i1 0
+ %nop6234 = alloca i1, i1 0
+ %nop6235 = alloca i1, i1 0
+ %nop6236 = alloca i1, i1 0
+ %nop6237 = alloca i1, i1 0
+ %nop6238 = alloca i1, i1 0
+ %nop6239 = alloca i1, i1 0
+ %nop6240 = alloca i1, i1 0
+ %nop6241 = alloca i1, i1 0
+ %nop6242 = alloca i1, i1 0
+ %nop6243 = alloca i1, i1 0
+ %nop6244 = alloca i1, i1 0
+ %nop6245 = alloca i1, i1 0
+ %nop6246 = alloca i1, i1 0
+ %nop6247 = alloca i1, i1 0
+ %nop6248 = alloca i1, i1 0
+ %nop6249 = alloca i1, i1 0
+ %nop6250 = alloca i1, i1 0
+ %nop6251 = alloca i1, i1 0
+ %nop6252 = alloca i1, i1 0
+ %nop6253 = alloca i1, i1 0
+ %nop6254 = alloca i1, i1 0
+ %nop6255 = alloca i1, i1 0
+ %nop6256 = alloca i1, i1 0
+ %nop6257 = alloca i1, i1 0
+ %nop6258 = alloca i1, i1 0
+ %nop6259 = alloca i1, i1 0
+ %nop6260 = alloca i1, i1 0
+ %nop6261 = alloca i1, i1 0
+ %nop6262 = alloca i1, i1 0
+ %nop6263 = alloca i1, i1 0
+ %nop6264 = alloca i1, i1 0
+ %nop6265 = alloca i1, i1 0
+ %nop6266 = alloca i1, i1 0
+ %nop6267 = alloca i1, i1 0
+ %nop6268 = alloca i1, i1 0
+ %nop6269 = alloca i1, i1 0
+ %nop6270 = alloca i1, i1 0
+ %nop6271 = alloca i1, i1 0
+ %nop6272 = alloca i1, i1 0
+ %nop6273 = alloca i1, i1 0
+ %nop6274 = alloca i1, i1 0
+ %nop6275 = alloca i1, i1 0
+ %nop6276 = alloca i1, i1 0
+ %nop6277 = alloca i1, i1 0
+ %nop6278 = alloca i1, i1 0
+ %nop6279 = alloca i1, i1 0
+ %nop6280 = alloca i1, i1 0
+ %nop6281 = alloca i1, i1 0
+ %nop6282 = alloca i1, i1 0
+ %nop6283 = alloca i1, i1 0
+ %nop6284 = alloca i1, i1 0
+ %nop6285 = alloca i1, i1 0
+ %nop6286 = alloca i1, i1 0
+ %nop6287 = alloca i1, i1 0
+ %nop6288 = alloca i1, i1 0
+ %nop6289 = alloca i1, i1 0
+ %nop6290 = alloca i1, i1 0
+ %nop6291 = alloca i1, i1 0
+ %nop6292 = alloca i1, i1 0
+ %nop6293 = alloca i1, i1 0
+ %nop6294 = alloca i1, i1 0
+ %nop6295 = alloca i1, i1 0
+ %nop6296 = alloca i1, i1 0
+ %nop6297 = alloca i1, i1 0
+ %nop6298 = alloca i1, i1 0
+ %nop6299 = alloca i1, i1 0
+ %nop6300 = alloca i1, i1 0
+ %nop6301 = alloca i1, i1 0
+ %nop6302 = alloca i1, i1 0
+ %nop6303 = alloca i1, i1 0
+ %nop6304 = alloca i1, i1 0
+ %nop6305 = alloca i1, i1 0
+ %nop6306 = alloca i1, i1 0
+ %nop6307 = alloca i1, i1 0
+ %nop6308 = alloca i1, i1 0
+ %nop6309 = alloca i1, i1 0
+ %nop6310 = alloca i1, i1 0
+ %nop6311 = alloca i1, i1 0
+ %nop6312 = alloca i1, i1 0
+ %nop6313 = alloca i1, i1 0
+ %nop6314 = alloca i1, i1 0
+ %nop6315 = alloca i1, i1 0
+ %nop6316 = alloca i1, i1 0
+ %nop6317 = alloca i1, i1 0
+ %nop6318 = alloca i1, i1 0
+ %nop6319 = alloca i1, i1 0
+ %nop6320 = alloca i1, i1 0
+ %nop6321 = alloca i1, i1 0
+ %nop6322 = alloca i1, i1 0
+ %nop6323 = alloca i1, i1 0
+ %nop6324 = alloca i1, i1 0
+ %nop6325 = alloca i1, i1 0
+ %nop6326 = alloca i1, i1 0
+ %nop6327 = alloca i1, i1 0
+ %nop6328 = alloca i1, i1 0
+ %nop6329 = alloca i1, i1 0
+ %nop6330 = alloca i1, i1 0
+ %nop6331 = alloca i1, i1 0
+ %nop6332 = alloca i1, i1 0
+ %nop6333 = alloca i1, i1 0
+ %nop6334 = alloca i1, i1 0
+ %nop6335 = alloca i1, i1 0
+ %nop6336 = alloca i1, i1 0
+ %nop6337 = alloca i1, i1 0
+ %nop6338 = alloca i1, i1 0
+ %nop6339 = alloca i1, i1 0
+ %nop6340 = alloca i1, i1 0
+ %nop6341 = alloca i1, i1 0
+ %nop6342 = alloca i1, i1 0
+ %nop6343 = alloca i1, i1 0
+ %nop6344 = alloca i1, i1 0
+ %nop6345 = alloca i1, i1 0
+ %nop6346 = alloca i1, i1 0
+ %nop6347 = alloca i1, i1 0
+ %nop6348 = alloca i1, i1 0
+ %nop6349 = alloca i1, i1 0
+ %nop6350 = alloca i1, i1 0
+ %nop6351 = alloca i1, i1 0
+ %nop6352 = alloca i1, i1 0
+ %nop6353 = alloca i1, i1 0
+ %nop6354 = alloca i1, i1 0
+ %nop6355 = alloca i1, i1 0
+ %nop6356 = alloca i1, i1 0
+ %nop6357 = alloca i1, i1 0
+ %nop6358 = alloca i1, i1 0
+ %nop6359 = alloca i1, i1 0
+ %nop6360 = alloca i1, i1 0
+ %nop6361 = alloca i1, i1 0
+ %nop6362 = alloca i1, i1 0
+ %nop6363 = alloca i1, i1 0
+ %nop6364 = alloca i1, i1 0
+ %nop6365 = alloca i1, i1 0
+ %nop6366 = alloca i1, i1 0
+ %nop6367 = alloca i1, i1 0
+ %nop6368 = alloca i1, i1 0
+ %nop6369 = alloca i1, i1 0
+ %nop6370 = alloca i1, i1 0
+ %nop6371 = alloca i1, i1 0
+ %nop6372 = alloca i1, i1 0
+ %nop6373 = alloca i1, i1 0
+ %nop6374 = alloca i1, i1 0
+ %nop6375 = alloca i1, i1 0
+ %nop6376 = alloca i1, i1 0
+ %nop6377 = alloca i1, i1 0
+ %nop6378 = alloca i1, i1 0
+ %nop6379 = alloca i1, i1 0
+ %nop6380 = alloca i1, i1 0
+ %nop6381 = alloca i1, i1 0
+ %nop6382 = alloca i1, i1 0
+ %nop6383 = alloca i1, i1 0
+ %nop6384 = alloca i1, i1 0
+ %nop6385 = alloca i1, i1 0
+ %nop6386 = alloca i1, i1 0
+ %nop6387 = alloca i1, i1 0
+ %nop6388 = alloca i1, i1 0
+ %nop6389 = alloca i1, i1 0
+ %nop6390 = alloca i1, i1 0
+ %nop6391 = alloca i1, i1 0
+ %nop6392 = alloca i1, i1 0
+ %nop6393 = alloca i1, i1 0
+ %nop6394 = alloca i1, i1 0
+ %nop6395 = alloca i1, i1 0
+ %nop6396 = alloca i1, i1 0
+ %nop6397 = alloca i1, i1 0
+ %nop6398 = alloca i1, i1 0
+ %nop6399 = alloca i1, i1 0
+ %nop6400 = alloca i1, i1 0
+ %nop6401 = alloca i1, i1 0
+ %nop6402 = alloca i1, i1 0
+ %nop6403 = alloca i1, i1 0
+ %nop6404 = alloca i1, i1 0
+ %nop6405 = alloca i1, i1 0
+ %nop6406 = alloca i1, i1 0
+ %nop6407 = alloca i1, i1 0
+ %nop6408 = alloca i1, i1 0
+ %nop6409 = alloca i1, i1 0
+ %nop6410 = alloca i1, i1 0
+ %nop6411 = alloca i1, i1 0
+ %nop6412 = alloca i1, i1 0
+ %nop6413 = alloca i1, i1 0
+ %nop6414 = alloca i1, i1 0
+ %nop6415 = alloca i1, i1 0
+ %nop6416 = alloca i1, i1 0
+ %nop6417 = alloca i1, i1 0
+ %nop6418 = alloca i1, i1 0
+ %nop6419 = alloca i1, i1 0
+ %nop6420 = alloca i1, i1 0
+ %nop6421 = alloca i1, i1 0
+ %nop6422 = alloca i1, i1 0
+ %nop6423 = alloca i1, i1 0
+ %nop6424 = alloca i1, i1 0
+ %nop6425 = alloca i1, i1 0
+ %nop6426 = alloca i1, i1 0
+ %nop6427 = alloca i1, i1 0
+ %nop6428 = alloca i1, i1 0
+ %nop6429 = alloca i1, i1 0
+ %nop6430 = alloca i1, i1 0
+ %nop6431 = alloca i1, i1 0
+ %nop6432 = alloca i1, i1 0
+ %nop6433 = alloca i1, i1 0
+ %nop6434 = alloca i1, i1 0
+ %nop6435 = alloca i1, i1 0
+ %nop6436 = alloca i1, i1 0
+ %nop6437 = alloca i1, i1 0
+ %nop6438 = alloca i1, i1 0
+ %nop6439 = alloca i1, i1 0
+ %nop6440 = alloca i1, i1 0
+ %nop6441 = alloca i1, i1 0
+ %nop6442 = alloca i1, i1 0
+ %nop6443 = alloca i1, i1 0
+ %nop6444 = alloca i1, i1 0
+ %nop6445 = alloca i1, i1 0
+ %nop6446 = alloca i1, i1 0
+ %nop6447 = alloca i1, i1 0
+ %nop6448 = alloca i1, i1 0
+ %nop6449 = alloca i1, i1 0
+ %nop6450 = alloca i1, i1 0
+ %nop6451 = alloca i1, i1 0
+ %nop6452 = alloca i1, i1 0
+ %nop6453 = alloca i1, i1 0
+ %nop6454 = alloca i1, i1 0
+ %nop6455 = alloca i1, i1 0
+ %nop6456 = alloca i1, i1 0
+ %nop6457 = alloca i1, i1 0
+ %nop6458 = alloca i1, i1 0
+ %nop6459 = alloca i1, i1 0
+ %nop6460 = alloca i1, i1 0
+ %nop6461 = alloca i1, i1 0
+ %nop6462 = alloca i1, i1 0
+ %nop6463 = alloca i1, i1 0
+ %nop6464 = alloca i1, i1 0
+ %nop6465 = alloca i1, i1 0
+ %nop6466 = alloca i1, i1 0
+ %nop6467 = alloca i1, i1 0
+ %nop6468 = alloca i1, i1 0
+ %nop6469 = alloca i1, i1 0
+ %nop6470 = alloca i1, i1 0
+ %nop6471 = alloca i1, i1 0
+ %nop6472 = alloca i1, i1 0
+ %nop6473 = alloca i1, i1 0
+ %nop6474 = alloca i1, i1 0
+ %nop6475 = alloca i1, i1 0
+ %nop6476 = alloca i1, i1 0
+ %nop6477 = alloca i1, i1 0
+ %nop6478 = alloca i1, i1 0
+ %nop6479 = alloca i1, i1 0
+ %nop6480 = alloca i1, i1 0
+ %nop6481 = alloca i1, i1 0
+ %nop6482 = alloca i1, i1 0
+ %nop6483 = alloca i1, i1 0
+ %nop6484 = alloca i1, i1 0
+ %nop6485 = alloca i1, i1 0
+ %nop6486 = alloca i1, i1 0
+ %nop6487 = alloca i1, i1 0
+ %nop6488 = alloca i1, i1 0
+ %nop6489 = alloca i1, i1 0
+ %nop6490 = alloca i1, i1 0
+ %nop6491 = alloca i1, i1 0
+ %nop6492 = alloca i1, i1 0
+ %nop6493 = alloca i1, i1 0
+ %nop6494 = alloca i1, i1 0
+ %nop6495 = alloca i1, i1 0
+ %nop6496 = alloca i1, i1 0
+ %nop6497 = alloca i1, i1 0
+ %nop6498 = alloca i1, i1 0
+ %nop6499 = alloca i1, i1 0
+ %nop6500 = alloca i1, i1 0
+ %nop6501 = alloca i1, i1 0
+ %nop6502 = alloca i1, i1 0
+ %nop6503 = alloca i1, i1 0
+ %nop6504 = alloca i1, i1 0
+ %nop6505 = alloca i1, i1 0
+ %nop6506 = alloca i1, i1 0
+ %nop6507 = alloca i1, i1 0
+ %nop6508 = alloca i1, i1 0
+ %nop6509 = alloca i1, i1 0
+ %nop6510 = alloca i1, i1 0
+ %nop6511 = alloca i1, i1 0
+ %nop6512 = alloca i1, i1 0
+ %nop6513 = alloca i1, i1 0
+ %nop6514 = alloca i1, i1 0
+ %nop6515 = alloca i1, i1 0
+ %nop6516 = alloca i1, i1 0
+ %nop6517 = alloca i1, i1 0
+ %nop6518 = alloca i1, i1 0
+ %nop6519 = alloca i1, i1 0
+ %nop6520 = alloca i1, i1 0
+ %nop6521 = alloca i1, i1 0
+ %nop6522 = alloca i1, i1 0
+ %nop6523 = alloca i1, i1 0
+ %nop6524 = alloca i1, i1 0
+ %nop6525 = alloca i1, i1 0
+ %nop6526 = alloca i1, i1 0
+ %nop6527 = alloca i1, i1 0
+ %nop6528 = alloca i1, i1 0
+ %nop6529 = alloca i1, i1 0
+ %nop6530 = alloca i1, i1 0
+ %nop6531 = alloca i1, i1 0
+ %nop6532 = alloca i1, i1 0
+ %nop6533 = alloca i1, i1 0
+ %nop6534 = alloca i1, i1 0
+ %nop6535 = alloca i1, i1 0
+ %nop6536 = alloca i1, i1 0
+ %nop6537 = alloca i1, i1 0
+ %nop6538 = alloca i1, i1 0
+ %nop6539 = alloca i1, i1 0
+ %nop6540 = alloca i1, i1 0
+ %nop6541 = alloca i1, i1 0
+ %nop6542 = alloca i1, i1 0
+ %nop6543 = alloca i1, i1 0
+ %nop6544 = alloca i1, i1 0
+ %nop6545 = alloca i1, i1 0
+ %nop6546 = alloca i1, i1 0
+ %nop6547 = alloca i1, i1 0
+ %nop6548 = alloca i1, i1 0
+ %nop6549 = alloca i1, i1 0
+ %nop6550 = alloca i1, i1 0
+ %nop6551 = alloca i1, i1 0
+ %nop6552 = alloca i1, i1 0
+ %nop6553 = alloca i1, i1 0
+ %nop6554 = alloca i1, i1 0
+ %nop6555 = alloca i1, i1 0
+ %nop6556 = alloca i1, i1 0
+ %nop6557 = alloca i1, i1 0
+ %nop6558 = alloca i1, i1 0
+ %nop6559 = alloca i1, i1 0
+ %nop6560 = alloca i1, i1 0
+ %nop6561 = alloca i1, i1 0
+ %nop6562 = alloca i1, i1 0
+ %nop6563 = alloca i1, i1 0
+ %nop6564 = alloca i1, i1 0
+ %nop6565 = alloca i1, i1 0
+ %nop6566 = alloca i1, i1 0
+ %nop6567 = alloca i1, i1 0
+ %nop6568 = alloca i1, i1 0
+ %nop6569 = alloca i1, i1 0
+ %nop6570 = alloca i1, i1 0
+ %nop6571 = alloca i1, i1 0
+ %nop6572 = alloca i1, i1 0
+ %nop6573 = alloca i1, i1 0
+ %nop6574 = alloca i1, i1 0
+ %nop6575 = alloca i1, i1 0
+ %nop6576 = alloca i1, i1 0
+ %nop6577 = alloca i1, i1 0
+ %nop6578 = alloca i1, i1 0
+ %nop6579 = alloca i1, i1 0
+ %nop6580 = alloca i1, i1 0
+ %nop6581 = alloca i1, i1 0
+ %nop6582 = alloca i1, i1 0
+ %nop6583 = alloca i1, i1 0
+ %nop6584 = alloca i1, i1 0
+ %nop6585 = alloca i1, i1 0
+ %nop6586 = alloca i1, i1 0
+ %nop6587 = alloca i1, i1 0
+ %nop6588 = alloca i1, i1 0
+ %nop6589 = alloca i1, i1 0
+ %nop6590 = alloca i1, i1 0
+ %nop6591 = alloca i1, i1 0
+ %nop6592 = alloca i1, i1 0
+ %nop6593 = alloca i1, i1 0
+ %nop6594 = alloca i1, i1 0
+ %nop6595 = alloca i1, i1 0
+ %nop6596 = alloca i1, i1 0
+ %nop6597 = alloca i1, i1 0
+ %nop6598 = alloca i1, i1 0
+ %nop6599 = alloca i1, i1 0
+ %nop6600 = alloca i1, i1 0
+ %nop6601 = alloca i1, i1 0
+ %nop6602 = alloca i1, i1 0
+ %nop6603 = alloca i1, i1 0
+ %nop6604 = alloca i1, i1 0
+ %nop6605 = alloca i1, i1 0
+ %nop6606 = alloca i1, i1 0
+ %nop6607 = alloca i1, i1 0
+ %nop6608 = alloca i1, i1 0
+ %nop6609 = alloca i1, i1 0
+ %nop6610 = alloca i1, i1 0
+ %nop6611 = alloca i1, i1 0
+ %nop6612 = alloca i1, i1 0
+ %nop6613 = alloca i1, i1 0
+ %nop6614 = alloca i1, i1 0
+ %nop6615 = alloca i1, i1 0
+ %nop6616 = alloca i1, i1 0
+ %nop6617 = alloca i1, i1 0
+ %nop6618 = alloca i1, i1 0
+ %nop6619 = alloca i1, i1 0
+ %nop6620 = alloca i1, i1 0
+ %nop6621 = alloca i1, i1 0
+ %nop6622 = alloca i1, i1 0
+ %nop6623 = alloca i1, i1 0
+ %nop6624 = alloca i1, i1 0
+ %nop6625 = alloca i1, i1 0
+ %nop6626 = alloca i1, i1 0
+ %nop6627 = alloca i1, i1 0
+ %nop6628 = alloca i1, i1 0
+ %nop6629 = alloca i1, i1 0
+ %nop6630 = alloca i1, i1 0
+ %nop6631 = alloca i1, i1 0
+ %nop6632 = alloca i1, i1 0
+ %nop6633 = alloca i1, i1 0
+ %nop6634 = alloca i1, i1 0
+ %nop6635 = alloca i1, i1 0
+ %nop6636 = alloca i1, i1 0
+ %nop6637 = alloca i1, i1 0
+ %nop6638 = alloca i1, i1 0
+ %nop6639 = alloca i1, i1 0
+ %nop6640 = alloca i1, i1 0
+ %nop6641 = alloca i1, i1 0
+ %nop6642 = alloca i1, i1 0
+ %nop6643 = alloca i1, i1 0
+ %nop6644 = alloca i1, i1 0
+ %nop6645 = alloca i1, i1 0
+ %nop6646 = alloca i1, i1 0
+ %nop6647 = alloca i1, i1 0
+ %nop6648 = alloca i1, i1 0
+ %nop6649 = alloca i1, i1 0
+ %nop6650 = alloca i1, i1 0
+ %nop6651 = alloca i1, i1 0
+ %nop6652 = alloca i1, i1 0
+ %nop6653 = alloca i1, i1 0
+ %nop6654 = alloca i1, i1 0
+ %nop6655 = alloca i1, i1 0
+ %nop6656 = alloca i1, i1 0
+ %nop6657 = alloca i1, i1 0
+ %nop6658 = alloca i1, i1 0
+ %nop6659 = alloca i1, i1 0
+ %nop6660 = alloca i1, i1 0
+ %nop6661 = alloca i1, i1 0
+ %nop6662 = alloca i1, i1 0
+ %nop6663 = alloca i1, i1 0
+ %nop6664 = alloca i1, i1 0
+ %nop6665 = alloca i1, i1 0
+ %nop6666 = alloca i1, i1 0
+ %nop6667 = alloca i1, i1 0
+ %nop6668 = alloca i1, i1 0
+ %nop6669 = alloca i1, i1 0
+ %nop6670 = alloca i1, i1 0
+ %nop6671 = alloca i1, i1 0
+ %nop6672 = alloca i1, i1 0
+ %nop6673 = alloca i1, i1 0
+ %nop6674 = alloca i1, i1 0
+ %nop6675 = alloca i1, i1 0
+ %nop6676 = alloca i1, i1 0
+ %nop6677 = alloca i1, i1 0
+ %nop6678 = alloca i1, i1 0
+ %nop6679 = alloca i1, i1 0
+ %nop6680 = alloca i1, i1 0
+ %nop6681 = alloca i1, i1 0
+ %nop6682 = alloca i1, i1 0
+ %nop6683 = alloca i1, i1 0
+ %nop6684 = alloca i1, i1 0
+ %nop6685 = alloca i1, i1 0
+ %nop6686 = alloca i1, i1 0
+ %nop6687 = alloca i1, i1 0
+ %nop6688 = alloca i1, i1 0
+ %nop6689 = alloca i1, i1 0
+ %nop6690 = alloca i1, i1 0
+ %nop6691 = alloca i1, i1 0
+ %nop6692 = alloca i1, i1 0
+ %nop6693 = alloca i1, i1 0
+ %nop6694 = alloca i1, i1 0
+ %nop6695 = alloca i1, i1 0
+ %nop6696 = alloca i1, i1 0
+ %nop6697 = alloca i1, i1 0
+ %nop6698 = alloca i1, i1 0
+ %nop6699 = alloca i1, i1 0
+ %nop6700 = alloca i1, i1 0
+ %nop6701 = alloca i1, i1 0
+ %nop6702 = alloca i1, i1 0
+ %nop6703 = alloca i1, i1 0
+ %nop6704 = alloca i1, i1 0
+ %nop6705 = alloca i1, i1 0
+ %nop6706 = alloca i1, i1 0
+ %nop6707 = alloca i1, i1 0
+ %nop6708 = alloca i1, i1 0
+ %nop6709 = alloca i1, i1 0
+ %nop6710 = alloca i1, i1 0
+ %nop6711 = alloca i1, i1 0
+ %nop6712 = alloca i1, i1 0
+ %nop6713 = alloca i1, i1 0
+ %nop6714 = alloca i1, i1 0
+ %nop6715 = alloca i1, i1 0
+ %nop6716 = alloca i1, i1 0
+ %nop6717 = alloca i1, i1 0
+ %nop6718 = alloca i1, i1 0
+ %nop6719 = alloca i1, i1 0
+ %nop6720 = alloca i1, i1 0
+ %nop6721 = alloca i1, i1 0
+ %nop6722 = alloca i1, i1 0
+ %nop6723 = alloca i1, i1 0
+ %nop6724 = alloca i1, i1 0
+ %nop6725 = alloca i1, i1 0
+ %nop6726 = alloca i1, i1 0
+ %nop6727 = alloca i1, i1 0
+ %nop6728 = alloca i1, i1 0
+ %nop6729 = alloca i1, i1 0
+ %nop6730 = alloca i1, i1 0
+ %nop6731 = alloca i1, i1 0
+ %nop6732 = alloca i1, i1 0
+ %nop6733 = alloca i1, i1 0
+ %nop6734 = alloca i1, i1 0
+ %nop6735 = alloca i1, i1 0
+ %nop6736 = alloca i1, i1 0
+ %nop6737 = alloca i1, i1 0
+ %nop6738 = alloca i1, i1 0
+ %nop6739 = alloca i1, i1 0
+ %nop6740 = alloca i1, i1 0
+ %nop6741 = alloca i1, i1 0
+ %nop6742 = alloca i1, i1 0
+ %nop6743 = alloca i1, i1 0
+ %nop6744 = alloca i1, i1 0
+ %nop6745 = alloca i1, i1 0
+ %nop6746 = alloca i1, i1 0
+ %nop6747 = alloca i1, i1 0
+ %nop6748 = alloca i1, i1 0
+ %nop6749 = alloca i1, i1 0
+ %nop6750 = alloca i1, i1 0
+ %nop6751 = alloca i1, i1 0
+ %nop6752 = alloca i1, i1 0
+ %nop6753 = alloca i1, i1 0
+ %nop6754 = alloca i1, i1 0
+ %nop6755 = alloca i1, i1 0
+ %nop6756 = alloca i1, i1 0
+ %nop6757 = alloca i1, i1 0
+ %nop6758 = alloca i1, i1 0
+ %nop6759 = alloca i1, i1 0
+ %nop6760 = alloca i1, i1 0
+ %nop6761 = alloca i1, i1 0
+ %nop6762 = alloca i1, i1 0
+ %nop6763 = alloca i1, i1 0
+ %nop6764 = alloca i1, i1 0
+ %nop6765 = alloca i1, i1 0
+ %nop6766 = alloca i1, i1 0
+ %nop6767 = alloca i1, i1 0
+ %nop6768 = alloca i1, i1 0
+ %nop6769 = alloca i1, i1 0
+ %nop6770 = alloca i1, i1 0
+ %nop6771 = alloca i1, i1 0
+ %nop6772 = alloca i1, i1 0
+ %nop6773 = alloca i1, i1 0
+ %nop6774 = alloca i1, i1 0
+ %nop6775 = alloca i1, i1 0
+ %nop6776 = alloca i1, i1 0
+ %nop6777 = alloca i1, i1 0
+ %nop6778 = alloca i1, i1 0
+ %nop6779 = alloca i1, i1 0
+ %nop6780 = alloca i1, i1 0
+ %nop6781 = alloca i1, i1 0
+ %nop6782 = alloca i1, i1 0
+ %nop6783 = alloca i1, i1 0
+ %nop6784 = alloca i1, i1 0
+ %nop6785 = alloca i1, i1 0
+ %nop6786 = alloca i1, i1 0
+ %nop6787 = alloca i1, i1 0
+ %nop6788 = alloca i1, i1 0
+ %nop6789 = alloca i1, i1 0
+ %nop6790 = alloca i1, i1 0
+ %nop6791 = alloca i1, i1 0
+ %nop6792 = alloca i1, i1 0
+ %nop6793 = alloca i1, i1 0
+ %nop6794 = alloca i1, i1 0
+ %nop6795 = alloca i1, i1 0
+ %nop6796 = alloca i1, i1 0
+ %nop6797 = alloca i1, i1 0
+ %nop6798 = alloca i1, i1 0
+ %nop6799 = alloca i1, i1 0
+ %nop6800 = alloca i1, i1 0
+ %nop6801 = alloca i1, i1 0
+ %nop6802 = alloca i1, i1 0
+ %nop6803 = alloca i1, i1 0
+ %nop6804 = alloca i1, i1 0
+ %nop6805 = alloca i1, i1 0
+ %nop6806 = alloca i1, i1 0
+ %nop6807 = alloca i1, i1 0
+ %nop6808 = alloca i1, i1 0
+ %nop6809 = alloca i1, i1 0
+ %nop6810 = alloca i1, i1 0
+ %nop6811 = alloca i1, i1 0
+ %nop6812 = alloca i1, i1 0
+ %nop6813 = alloca i1, i1 0
+ %nop6814 = alloca i1, i1 0
+ %nop6815 = alloca i1, i1 0
+ %nop6816 = alloca i1, i1 0
+ %nop6817 = alloca i1, i1 0
+ %nop6818 = alloca i1, i1 0
+ %nop6819 = alloca i1, i1 0
+ %nop6820 = alloca i1, i1 0
+ %nop6821 = alloca i1, i1 0
+ %nop6822 = alloca i1, i1 0
+ %nop6823 = alloca i1, i1 0
+ %nop6824 = alloca i1, i1 0
+ %nop6825 = alloca i1, i1 0
+ %nop6826 = alloca i1, i1 0
+ %nop6827 = alloca i1, i1 0
+ %nop6828 = alloca i1, i1 0
+ %nop6829 = alloca i1, i1 0
+ %nop6830 = alloca i1, i1 0
+ %nop6831 = alloca i1, i1 0
+ %nop6832 = alloca i1, i1 0
+ %nop6833 = alloca i1, i1 0
+ %nop6834 = alloca i1, i1 0
+ %nop6835 = alloca i1, i1 0
+ %nop6836 = alloca i1, i1 0
+ %nop6837 = alloca i1, i1 0
+ %nop6838 = alloca i1, i1 0
+ %nop6839 = alloca i1, i1 0
+ %nop6840 = alloca i1, i1 0
+ %nop6841 = alloca i1, i1 0
+ %nop6842 = alloca i1, i1 0
+ %nop6843 = alloca i1, i1 0
+ %nop6844 = alloca i1, i1 0
+ %nop6845 = alloca i1, i1 0
+ %nop6846 = alloca i1, i1 0
+ %nop6847 = alloca i1, i1 0
+ %nop6848 = alloca i1, i1 0
+ %nop6849 = alloca i1, i1 0
+ %nop6850 = alloca i1, i1 0
+ %nop6851 = alloca i1, i1 0
+ %nop6852 = alloca i1, i1 0
+ %nop6853 = alloca i1, i1 0
+ %nop6854 = alloca i1, i1 0
+ %nop6855 = alloca i1, i1 0
+ %nop6856 = alloca i1, i1 0
+ %nop6857 = alloca i1, i1 0
+ %nop6858 = alloca i1, i1 0
+ %nop6859 = alloca i1, i1 0
+ %nop6860 = alloca i1, i1 0
+ %nop6861 = alloca i1, i1 0
+ %nop6862 = alloca i1, i1 0
+ %nop6863 = alloca i1, i1 0
+ %nop6864 = alloca i1, i1 0
+ %nop6865 = alloca i1, i1 0
+ %nop6866 = alloca i1, i1 0
+ %nop6867 = alloca i1, i1 0
+ %nop6868 = alloca i1, i1 0
+ %nop6869 = alloca i1, i1 0
+ %nop6870 = alloca i1, i1 0
+ %nop6871 = alloca i1, i1 0
+ %nop6872 = alloca i1, i1 0
+ %nop6873 = alloca i1, i1 0
+ %nop6874 = alloca i1, i1 0
+ %nop6875 = alloca i1, i1 0
+ %nop6876 = alloca i1, i1 0
+ %nop6877 = alloca i1, i1 0
+ %nop6878 = alloca i1, i1 0
+ %nop6879 = alloca i1, i1 0
+ %nop6880 = alloca i1, i1 0
+ %nop6881 = alloca i1, i1 0
+ %nop6882 = alloca i1, i1 0
+ %nop6883 = alloca i1, i1 0
+ %nop6884 = alloca i1, i1 0
+ %nop6885 = alloca i1, i1 0
+ %nop6886 = alloca i1, i1 0
+ %nop6887 = alloca i1, i1 0
+ %nop6888 = alloca i1, i1 0
+ %nop6889 = alloca i1, i1 0
+ %nop6890 = alloca i1, i1 0
+ %nop6891 = alloca i1, i1 0
+ %nop6892 = alloca i1, i1 0
+ %nop6893 = alloca i1, i1 0
+ %nop6894 = alloca i1, i1 0
+ %nop6895 = alloca i1, i1 0
+ %nop6896 = alloca i1, i1 0
+ %nop6897 = alloca i1, i1 0
+ %nop6898 = alloca i1, i1 0
+ %nop6899 = alloca i1, i1 0
+ %nop6900 = alloca i1, i1 0
+ %nop6901 = alloca i1, i1 0
+ %nop6902 = alloca i1, i1 0
+ %nop6903 = alloca i1, i1 0
+ %nop6904 = alloca i1, i1 0
+ %nop6905 = alloca i1, i1 0
+ %nop6906 = alloca i1, i1 0
+ %nop6907 = alloca i1, i1 0
+ %nop6908 = alloca i1, i1 0
+ %nop6909 = alloca i1, i1 0
+ %nop6910 = alloca i1, i1 0
+ %nop6911 = alloca i1, i1 0
+ %nop6912 = alloca i1, i1 0
+ %nop6913 = alloca i1, i1 0
+ %nop6914 = alloca i1, i1 0
+ %nop6915 = alloca i1, i1 0
+ %nop6916 = alloca i1, i1 0
+ %nop6917 = alloca i1, i1 0
+ %nop6918 = alloca i1, i1 0
+ %nop6919 = alloca i1, i1 0
+ %nop6920 = alloca i1, i1 0
+ %nop6921 = alloca i1, i1 0
+ %nop6922 = alloca i1, i1 0
+ %nop6923 = alloca i1, i1 0
+ %nop6924 = alloca i1, i1 0
+ %nop6925 = alloca i1, i1 0
+ %nop6926 = alloca i1, i1 0
+ %nop6927 = alloca i1, i1 0
+ %nop6928 = alloca i1, i1 0
+ %nop6929 = alloca i1, i1 0
+ %nop6930 = alloca i1, i1 0
+ %nop6931 = alloca i1, i1 0
+ %nop6932 = alloca i1, i1 0
+ %nop6933 = alloca i1, i1 0
+ %nop6934 = alloca i1, i1 0
+ %nop6935 = alloca i1, i1 0
+ %nop6936 = alloca i1, i1 0
+ %nop6937 = alloca i1, i1 0
+ %nop6938 = alloca i1, i1 0
+ %nop6939 = alloca i1, i1 0
+ %nop6940 = alloca i1, i1 0
+ %nop6941 = alloca i1, i1 0
+ %nop6942 = alloca i1, i1 0
+ %nop6943 = alloca i1, i1 0
+ %nop6944 = alloca i1, i1 0
+ %nop6945 = alloca i1, i1 0
+ %nop6946 = alloca i1, i1 0
+ %nop6947 = alloca i1, i1 0
+ %nop6948 = alloca i1, i1 0
+ %nop6949 = alloca i1, i1 0
+ %nop6950 = alloca i1, i1 0
+ %nop6951 = alloca i1, i1 0
+ %nop6952 = alloca i1, i1 0
+ %nop6953 = alloca i1, i1 0
+ %nop6954 = alloca i1, i1 0
+ %nop6955 = alloca i1, i1 0
+ %nop6956 = alloca i1, i1 0
+ %nop6957 = alloca i1, i1 0
+ %nop6958 = alloca i1, i1 0
+ %nop6959 = alloca i1, i1 0
+ %nop6960 = alloca i1, i1 0
+ %nop6961 = alloca i1, i1 0
+ %nop6962 = alloca i1, i1 0
+ %nop6963 = alloca i1, i1 0
+ %nop6964 = alloca i1, i1 0
+ %nop6965 = alloca i1, i1 0
+ %nop6966 = alloca i1, i1 0
+ %nop6967 = alloca i1, i1 0
+ %nop6968 = alloca i1, i1 0
+ %nop6969 = alloca i1, i1 0
+ %nop6970 = alloca i1, i1 0
+ %nop6971 = alloca i1, i1 0
+ %nop6972 = alloca i1, i1 0
+ %nop6973 = alloca i1, i1 0
+ %nop6974 = alloca i1, i1 0
+ %nop6975 = alloca i1, i1 0
+ %nop6976 = alloca i1, i1 0
+ %nop6977 = alloca i1, i1 0
+ %nop6978 = alloca i1, i1 0
+ %nop6979 = alloca i1, i1 0
+ %nop6980 = alloca i1, i1 0
+ %nop6981 = alloca i1, i1 0
+ %nop6982 = alloca i1, i1 0
+ %nop6983 = alloca i1, i1 0
+ %nop6984 = alloca i1, i1 0
+ %nop6985 = alloca i1, i1 0
+ %nop6986 = alloca i1, i1 0
+ %nop6987 = alloca i1, i1 0
+ %nop6988 = alloca i1, i1 0
+ %nop6989 = alloca i1, i1 0
+ %nop6990 = alloca i1, i1 0
+ %nop6991 = alloca i1, i1 0
+ %nop6992 = alloca i1, i1 0
+ %nop6993 = alloca i1, i1 0
+ %nop6994 = alloca i1, i1 0
+ %nop6995 = alloca i1, i1 0
+ %nop6996 = alloca i1, i1 0
+ %nop6997 = alloca i1, i1 0
+ %nop6998 = alloca i1, i1 0
+ %nop6999 = alloca i1, i1 0
+ %nop7000 = alloca i1, i1 0
+ %nop7001 = alloca i1, i1 0
+ %nop7002 = alloca i1, i1 0
+ %nop7003 = alloca i1, i1 0
+ %nop7004 = alloca i1, i1 0
+ %nop7005 = alloca i1, i1 0
+ %nop7006 = alloca i1, i1 0
+ %nop7007 = alloca i1, i1 0
+ %nop7008 = alloca i1, i1 0
+ %nop7009 = alloca i1, i1 0
+ %nop7010 = alloca i1, i1 0
+ %nop7011 = alloca i1, i1 0
+ %nop7012 = alloca i1, i1 0
+ %nop7013 = alloca i1, i1 0
+ %nop7014 = alloca i1, i1 0
+ %nop7015 = alloca i1, i1 0
+ %nop7016 = alloca i1, i1 0
+ %nop7017 = alloca i1, i1 0
+ %nop7018 = alloca i1, i1 0
+ %nop7019 = alloca i1, i1 0
+ %nop7020 = alloca i1, i1 0
+ %nop7021 = alloca i1, i1 0
+ %nop7022 = alloca i1, i1 0
+ %nop7023 = alloca i1, i1 0
+ %nop7024 = alloca i1, i1 0
+ %nop7025 = alloca i1, i1 0
+ %nop7026 = alloca i1, i1 0
+ %nop7027 = alloca i1, i1 0
+ %nop7028 = alloca i1, i1 0
+ %nop7029 = alloca i1, i1 0
+ %nop7030 = alloca i1, i1 0
+ %nop7031 = alloca i1, i1 0
+ %nop7032 = alloca i1, i1 0
+ %nop7033 = alloca i1, i1 0
+ %nop7034 = alloca i1, i1 0
+ %nop7035 = alloca i1, i1 0
+ %nop7036 = alloca i1, i1 0
+ %nop7037 = alloca i1, i1 0
+ %nop7038 = alloca i1, i1 0
+ %nop7039 = alloca i1, i1 0
+ %nop7040 = alloca i1, i1 0
+ %nop7041 = alloca i1, i1 0
+ %nop7042 = alloca i1, i1 0
+ %nop7043 = alloca i1, i1 0
+ %nop7044 = alloca i1, i1 0
+ %nop7045 = alloca i1, i1 0
+ %nop7046 = alloca i1, i1 0
+ %nop7047 = alloca i1, i1 0
+ %nop7048 = alloca i1, i1 0
+ %nop7049 = alloca i1, i1 0
+ %nop7050 = alloca i1, i1 0
+ %nop7051 = alloca i1, i1 0
+ %nop7052 = alloca i1, i1 0
+ %nop7053 = alloca i1, i1 0
+ %nop7054 = alloca i1, i1 0
+ %nop7055 = alloca i1, i1 0
+ %nop7056 = alloca i1, i1 0
+ %nop7057 = alloca i1, i1 0
+ %nop7058 = alloca i1, i1 0
+ %nop7059 = alloca i1, i1 0
+ %nop7060 = alloca i1, i1 0
+ %nop7061 = alloca i1, i1 0
+ %nop7062 = alloca i1, i1 0
+ %nop7063 = alloca i1, i1 0
+ %nop7064 = alloca i1, i1 0
+ %nop7065 = alloca i1, i1 0
+ %nop7066 = alloca i1, i1 0
+ %nop7067 = alloca i1, i1 0
+ %nop7068 = alloca i1, i1 0
+ %nop7069 = alloca i1, i1 0
+ %nop7070 = alloca i1, i1 0
+ %nop7071 = alloca i1, i1 0
+ %nop7072 = alloca i1, i1 0
+ %nop7073 = alloca i1, i1 0
+ %nop7074 = alloca i1, i1 0
+ %nop7075 = alloca i1, i1 0
+ %nop7076 = alloca i1, i1 0
+ %nop7077 = alloca i1, i1 0
+ %nop7078 = alloca i1, i1 0
+ %nop7079 = alloca i1, i1 0
+ %nop7080 = alloca i1, i1 0
+ %nop7081 = alloca i1, i1 0
+ %nop7082 = alloca i1, i1 0
+ %nop7083 = alloca i1, i1 0
+ %nop7084 = alloca i1, i1 0
+ %nop7085 = alloca i1, i1 0
+ %nop7086 = alloca i1, i1 0
+ %nop7087 = alloca i1, i1 0
+ %nop7088 = alloca i1, i1 0
+ %nop7089 = alloca i1, i1 0
+ %nop7090 = alloca i1, i1 0
+ %nop7091 = alloca i1, i1 0
+ %nop7092 = alloca i1, i1 0
+ %nop7093 = alloca i1, i1 0
+ %nop7094 = alloca i1, i1 0
+ %nop7095 = alloca i1, i1 0
+ %nop7096 = alloca i1, i1 0
+ %nop7097 = alloca i1, i1 0
+ %nop7098 = alloca i1, i1 0
+ %nop7099 = alloca i1, i1 0
+ %nop7100 = alloca i1, i1 0
+ %nop7101 = alloca i1, i1 0
+ %nop7102 = alloca i1, i1 0
+ %nop7103 = alloca i1, i1 0
+ %nop7104 = alloca i1, i1 0
+ %nop7105 = alloca i1, i1 0
+ %nop7106 = alloca i1, i1 0
+ %nop7107 = alloca i1, i1 0
+ %nop7108 = alloca i1, i1 0
+ %nop7109 = alloca i1, i1 0
+ %nop7110 = alloca i1, i1 0
+ %nop7111 = alloca i1, i1 0
+ %nop7112 = alloca i1, i1 0
+ %nop7113 = alloca i1, i1 0
+ %nop7114 = alloca i1, i1 0
+ %nop7115 = alloca i1, i1 0
+ %nop7116 = alloca i1, i1 0
+ %nop7117 = alloca i1, i1 0
+ %nop7118 = alloca i1, i1 0
+ %nop7119 = alloca i1, i1 0
+ %nop7120 = alloca i1, i1 0
+ %nop7121 = alloca i1, i1 0
+ %nop7122 = alloca i1, i1 0
+ %nop7123 = alloca i1, i1 0
+ %nop7124 = alloca i1, i1 0
+ %nop7125 = alloca i1, i1 0
+ %nop7126 = alloca i1, i1 0
+ %nop7127 = alloca i1, i1 0
+ %nop7128 = alloca i1, i1 0
+ %nop7129 = alloca i1, i1 0
+ %nop7130 = alloca i1, i1 0
+ %nop7131 = alloca i1, i1 0
+ %nop7132 = alloca i1, i1 0
+ %nop7133 = alloca i1, i1 0
+ %nop7134 = alloca i1, i1 0
+ %nop7135 = alloca i1, i1 0
+ %nop7136 = alloca i1, i1 0
+ %nop7137 = alloca i1, i1 0
+ %nop7138 = alloca i1, i1 0
+ %nop7139 = alloca i1, i1 0
+ %nop7140 = alloca i1, i1 0
+ %nop7141 = alloca i1, i1 0
+ %nop7142 = alloca i1, i1 0
+ %nop7143 = alloca i1, i1 0
+ %nop7144 = alloca i1, i1 0
+ %nop7145 = alloca i1, i1 0
+ %nop7146 = alloca i1, i1 0
+ %nop7147 = alloca i1, i1 0
+ %nop7148 = alloca i1, i1 0
+ %nop7149 = alloca i1, i1 0
+ %nop7150 = alloca i1, i1 0
+ %nop7151 = alloca i1, i1 0
+ %nop7152 = alloca i1, i1 0
+ %nop7153 = alloca i1, i1 0
+ %nop7154 = alloca i1, i1 0
+ %nop7155 = alloca i1, i1 0
+ %nop7156 = alloca i1, i1 0
+ %nop7157 = alloca i1, i1 0
+ %nop7158 = alloca i1, i1 0
+ %nop7159 = alloca i1, i1 0
+ %nop7160 = alloca i1, i1 0
+ %nop7161 = alloca i1, i1 0
+ %nop7162 = alloca i1, i1 0
+ %nop7163 = alloca i1, i1 0
+ %nop7164 = alloca i1, i1 0
+ %nop7165 = alloca i1, i1 0
+ %nop7166 = alloca i1, i1 0
+ %nop7167 = alloca i1, i1 0
+ %nop7168 = alloca i1, i1 0
+ %nop7169 = alloca i1, i1 0
+ %nop7170 = alloca i1, i1 0
+ %nop7171 = alloca i1, i1 0
+ %nop7172 = alloca i1, i1 0
+ %nop7173 = alloca i1, i1 0
+ %nop7174 = alloca i1, i1 0
+ %nop7175 = alloca i1, i1 0
+ %nop7176 = alloca i1, i1 0
+ %nop7177 = alloca i1, i1 0
+ %nop7178 = alloca i1, i1 0
+ %nop7179 = alloca i1, i1 0
+ %nop7180 = alloca i1, i1 0
+ %nop7181 = alloca i1, i1 0
+ %nop7182 = alloca i1, i1 0
+ %nop7183 = alloca i1, i1 0
+ %nop7184 = alloca i1, i1 0
+ %nop7185 = alloca i1, i1 0
+ %nop7186 = alloca i1, i1 0
+ %nop7187 = alloca i1, i1 0
+ %nop7188 = alloca i1, i1 0
+ %nop7189 = alloca i1, i1 0
+ %nop7190 = alloca i1, i1 0
+ %nop7191 = alloca i1, i1 0
+ %nop7192 = alloca i1, i1 0
+ %nop7193 = alloca i1, i1 0
+ %nop7194 = alloca i1, i1 0
+ %nop7195 = alloca i1, i1 0
+ %nop7196 = alloca i1, i1 0
+ %nop7197 = alloca i1, i1 0
+ %nop7198 = alloca i1, i1 0
+ %nop7199 = alloca i1, i1 0
+ %nop7200 = alloca i1, i1 0
+ %nop7201 = alloca i1, i1 0
+ %nop7202 = alloca i1, i1 0
+ %nop7203 = alloca i1, i1 0
+ %nop7204 = alloca i1, i1 0
+ %nop7205 = alloca i1, i1 0
+ %nop7206 = alloca i1, i1 0
+ %nop7207 = alloca i1, i1 0
+ %nop7208 = alloca i1, i1 0
+ %nop7209 = alloca i1, i1 0
+ %nop7210 = alloca i1, i1 0
+ %nop7211 = alloca i1, i1 0
+ %nop7212 = alloca i1, i1 0
+ %nop7213 = alloca i1, i1 0
+ %nop7214 = alloca i1, i1 0
+ %nop7215 = alloca i1, i1 0
+ %nop7216 = alloca i1, i1 0
+ %nop7217 = alloca i1, i1 0
+ %nop7218 = alloca i1, i1 0
+ %nop7219 = alloca i1, i1 0
+ %nop7220 = alloca i1, i1 0
+ %nop7221 = alloca i1, i1 0
+ %nop7222 = alloca i1, i1 0
+ %nop7223 = alloca i1, i1 0
+ %nop7224 = alloca i1, i1 0
+ %nop7225 = alloca i1, i1 0
+ %nop7226 = alloca i1, i1 0
+ %nop7227 = alloca i1, i1 0
+ %nop7228 = alloca i1, i1 0
+ %nop7229 = alloca i1, i1 0
+ %nop7230 = alloca i1, i1 0
+ %nop7231 = alloca i1, i1 0
+ %nop7232 = alloca i1, i1 0
+ %nop7233 = alloca i1, i1 0
+ %nop7234 = alloca i1, i1 0
+ %nop7235 = alloca i1, i1 0
+ %nop7236 = alloca i1, i1 0
+ %nop7237 = alloca i1, i1 0
+ %nop7238 = alloca i1, i1 0
+ %nop7239 = alloca i1, i1 0
+ %nop7240 = alloca i1, i1 0
+ %nop7241 = alloca i1, i1 0
+ %nop7242 = alloca i1, i1 0
+ %nop7243 = alloca i1, i1 0
+ %nop7244 = alloca i1, i1 0
+ %nop7245 = alloca i1, i1 0
+ %nop7246 = alloca i1, i1 0
+ %nop7247 = alloca i1, i1 0
+ %nop7248 = alloca i1, i1 0
+ %nop7249 = alloca i1, i1 0
+ %nop7250 = alloca i1, i1 0
+ %nop7251 = alloca i1, i1 0
+ %nop7252 = alloca i1, i1 0
+ %nop7253 = alloca i1, i1 0
+ %nop7254 = alloca i1, i1 0
+ %nop7255 = alloca i1, i1 0
+ %nop7256 = alloca i1, i1 0
+ %nop7257 = alloca i1, i1 0
+ %nop7258 = alloca i1, i1 0
+ %nop7259 = alloca i1, i1 0
+ %nop7260 = alloca i1, i1 0
+ %nop7261 = alloca i1, i1 0
+ %nop7262 = alloca i1, i1 0
+ %nop7263 = alloca i1, i1 0
+ %nop7264 = alloca i1, i1 0
+ %nop7265 = alloca i1, i1 0
+ %nop7266 = alloca i1, i1 0
+ %nop7267 = alloca i1, i1 0
+ %nop7268 = alloca i1, i1 0
+ %nop7269 = alloca i1, i1 0
+ %nop7270 = alloca i1, i1 0
+ %nop7271 = alloca i1, i1 0
+ %nop7272 = alloca i1, i1 0
+ %nop7273 = alloca i1, i1 0
+ %nop7274 = alloca i1, i1 0
+ %nop7275 = alloca i1, i1 0
+ %nop7276 = alloca i1, i1 0
+ %nop7277 = alloca i1, i1 0
+ %nop7278 = alloca i1, i1 0
+ %nop7279 = alloca i1, i1 0
+ %nop7280 = alloca i1, i1 0
+ %nop7281 = alloca i1, i1 0
+ %nop7282 = alloca i1, i1 0
+ %nop7283 = alloca i1, i1 0
+ %nop7284 = alloca i1, i1 0
+ %nop7285 = alloca i1, i1 0
+ %nop7286 = alloca i1, i1 0
+ %nop7287 = alloca i1, i1 0
+ %nop7288 = alloca i1, i1 0
+ %nop7289 = alloca i1, i1 0
+ %nop7290 = alloca i1, i1 0
+ %nop7291 = alloca i1, i1 0
+ %nop7292 = alloca i1, i1 0
+ %nop7293 = alloca i1, i1 0
+ %nop7294 = alloca i1, i1 0
+ %nop7295 = alloca i1, i1 0
+ %nop7296 = alloca i1, i1 0
+ %nop7297 = alloca i1, i1 0
+ %nop7298 = alloca i1, i1 0
+ %nop7299 = alloca i1, i1 0
+ %nop7300 = alloca i1, i1 0
+ %nop7301 = alloca i1, i1 0
+ %nop7302 = alloca i1, i1 0
+ %nop7303 = alloca i1, i1 0
+ %nop7304 = alloca i1, i1 0
+ %nop7305 = alloca i1, i1 0
+ %nop7306 = alloca i1, i1 0
+ %nop7307 = alloca i1, i1 0
+ %nop7308 = alloca i1, i1 0
+ %nop7309 = alloca i1, i1 0
+ %nop7310 = alloca i1, i1 0
+ %nop7311 = alloca i1, i1 0
+ %nop7312 = alloca i1, i1 0
+ %nop7313 = alloca i1, i1 0
+ %nop7314 = alloca i1, i1 0
+ %nop7315 = alloca i1, i1 0
+ %nop7316 = alloca i1, i1 0
+ %nop7317 = alloca i1, i1 0
+ %nop7318 = alloca i1, i1 0
+ %nop7319 = alloca i1, i1 0
+ %nop7320 = alloca i1, i1 0
+ %nop7321 = alloca i1, i1 0
+ %nop7322 = alloca i1, i1 0
+ %nop7323 = alloca i1, i1 0
+ %nop7324 = alloca i1, i1 0
+ %nop7325 = alloca i1, i1 0
+ %nop7326 = alloca i1, i1 0
+ %nop7327 = alloca i1, i1 0
+ %nop7328 = alloca i1, i1 0
+ %nop7329 = alloca i1, i1 0
+ %nop7330 = alloca i1, i1 0
+ %nop7331 = alloca i1, i1 0
+ %nop7332 = alloca i1, i1 0
+ %nop7333 = alloca i1, i1 0
+ %nop7334 = alloca i1, i1 0
+ %nop7335 = alloca i1, i1 0
+ %nop7336 = alloca i1, i1 0
+ %nop7337 = alloca i1, i1 0
+ %nop7338 = alloca i1, i1 0
+ %nop7339 = alloca i1, i1 0
+ %nop7340 = alloca i1, i1 0
+ %nop7341 = alloca i1, i1 0
+ %nop7342 = alloca i1, i1 0
+ %nop7343 = alloca i1, i1 0
+ %nop7344 = alloca i1, i1 0
+ %nop7345 = alloca i1, i1 0
+ %nop7346 = alloca i1, i1 0
+ %nop7347 = alloca i1, i1 0
+ %nop7348 = alloca i1, i1 0
+ %nop7349 = alloca i1, i1 0
+ %nop7350 = alloca i1, i1 0
+ %nop7351 = alloca i1, i1 0
+ %nop7352 = alloca i1, i1 0
+ %nop7353 = alloca i1, i1 0
+ %nop7354 = alloca i1, i1 0
+ %nop7355 = alloca i1, i1 0
+ %nop7356 = alloca i1, i1 0
+ %nop7357 = alloca i1, i1 0
+ %nop7358 = alloca i1, i1 0
+ %nop7359 = alloca i1, i1 0
+ %nop7360 = alloca i1, i1 0
+ %nop7361 = alloca i1, i1 0
+ %nop7362 = alloca i1, i1 0
+ %nop7363 = alloca i1, i1 0
+ %nop7364 = alloca i1, i1 0
+ %nop7365 = alloca i1, i1 0
+ %nop7366 = alloca i1, i1 0
+ %nop7367 = alloca i1, i1 0
+ %nop7368 = alloca i1, i1 0
+ %nop7369 = alloca i1, i1 0
+ %nop7370 = alloca i1, i1 0
+ %nop7371 = alloca i1, i1 0
+ %nop7372 = alloca i1, i1 0
+ %nop7373 = alloca i1, i1 0
+ %nop7374 = alloca i1, i1 0
+ %nop7375 = alloca i1, i1 0
+ %nop7376 = alloca i1, i1 0
+ %nop7377 = alloca i1, i1 0
+ %nop7378 = alloca i1, i1 0
+ %nop7379 = alloca i1, i1 0
+ %nop7380 = alloca i1, i1 0
+ %nop7381 = alloca i1, i1 0
+ %nop7382 = alloca i1, i1 0
+ %nop7383 = alloca i1, i1 0
+ %nop7384 = alloca i1, i1 0
+ %nop7385 = alloca i1, i1 0
+ %nop7386 = alloca i1, i1 0
+ %nop7387 = alloca i1, i1 0
+ %nop7388 = alloca i1, i1 0
+ %nop7389 = alloca i1, i1 0
+ %nop7390 = alloca i1, i1 0
+ %nop7391 = alloca i1, i1 0
+ %nop7392 = alloca i1, i1 0
+ %nop7393 = alloca i1, i1 0
+ %nop7394 = alloca i1, i1 0
+ %nop7395 = alloca i1, i1 0
+ %nop7396 = alloca i1, i1 0
+ %nop7397 = alloca i1, i1 0
+ %nop7398 = alloca i1, i1 0
+ %nop7399 = alloca i1, i1 0
+ %nop7400 = alloca i1, i1 0
+ %nop7401 = alloca i1, i1 0
+ %nop7402 = alloca i1, i1 0
+ %nop7403 = alloca i1, i1 0
+ %nop7404 = alloca i1, i1 0
+ %nop7405 = alloca i1, i1 0
+ %nop7406 = alloca i1, i1 0
+ %nop7407 = alloca i1, i1 0
+ %nop7408 = alloca i1, i1 0
+ %nop7409 = alloca i1, i1 0
+ %nop7410 = alloca i1, i1 0
+ %nop7411 = alloca i1, i1 0
+ %nop7412 = alloca i1, i1 0
+ %nop7413 = alloca i1, i1 0
+ %nop7414 = alloca i1, i1 0
+ %nop7415 = alloca i1, i1 0
+ %nop7416 = alloca i1, i1 0
+ %nop7417 = alloca i1, i1 0
+ %nop7418 = alloca i1, i1 0
+ %nop7419 = alloca i1, i1 0
+ %nop7420 = alloca i1, i1 0
+ %nop7421 = alloca i1, i1 0
+ %nop7422 = alloca i1, i1 0
+ %nop7423 = alloca i1, i1 0
+ %nop7424 = alloca i1, i1 0
+ %nop7425 = alloca i1, i1 0
+ %nop7426 = alloca i1, i1 0
+ %nop7427 = alloca i1, i1 0
+ %nop7428 = alloca i1, i1 0
+ %nop7429 = alloca i1, i1 0
+ %nop7430 = alloca i1, i1 0
+ %nop7431 = alloca i1, i1 0
+ %nop7432 = alloca i1, i1 0
+ %nop7433 = alloca i1, i1 0
+ %nop7434 = alloca i1, i1 0
+ %nop7435 = alloca i1, i1 0
+ %nop7436 = alloca i1, i1 0
+ %nop7437 = alloca i1, i1 0
+ %nop7438 = alloca i1, i1 0
+ %nop7439 = alloca i1, i1 0
+ %nop7440 = alloca i1, i1 0
+ %nop7441 = alloca i1, i1 0
+ %nop7442 = alloca i1, i1 0
+ %nop7443 = alloca i1, i1 0
+ %nop7444 = alloca i1, i1 0
+ %nop7445 = alloca i1, i1 0
+ %nop7446 = alloca i1, i1 0
+ %nop7447 = alloca i1, i1 0
+ %nop7448 = alloca i1, i1 0
+ %nop7449 = alloca i1, i1 0
+ %nop7450 = alloca i1, i1 0
+ %nop7451 = alloca i1, i1 0
+ %nop7452 = alloca i1, i1 0
+ %nop7453 = alloca i1, i1 0
+ %nop7454 = alloca i1, i1 0
+ %nop7455 = alloca i1, i1 0
+ %nop7456 = alloca i1, i1 0
+ %nop7457 = alloca i1, i1 0
+ %nop7458 = alloca i1, i1 0
+ %nop7459 = alloca i1, i1 0
+ %nop7460 = alloca i1, i1 0
+ %nop7461 = alloca i1, i1 0
+ %nop7462 = alloca i1, i1 0
+ %nop7463 = alloca i1, i1 0
+ %nop7464 = alloca i1, i1 0
+ %nop7465 = alloca i1, i1 0
+ %nop7466 = alloca i1, i1 0
+ %nop7467 = alloca i1, i1 0
+ %nop7468 = alloca i1, i1 0
+ %nop7469 = alloca i1, i1 0
+ %nop7470 = alloca i1, i1 0
+ %nop7471 = alloca i1, i1 0
+ %nop7472 = alloca i1, i1 0
+ %nop7473 = alloca i1, i1 0
+ %nop7474 = alloca i1, i1 0
+ %nop7475 = alloca i1, i1 0
+ %nop7476 = alloca i1, i1 0
+ %nop7477 = alloca i1, i1 0
+ %nop7478 = alloca i1, i1 0
+ %nop7479 = alloca i1, i1 0
+ %nop7480 = alloca i1, i1 0
+ %nop7481 = alloca i1, i1 0
+ %nop7482 = alloca i1, i1 0
+ %nop7483 = alloca i1, i1 0
+ %nop7484 = alloca i1, i1 0
+ %nop7485 = alloca i1, i1 0
+ %nop7486 = alloca i1, i1 0
+ %nop7487 = alloca i1, i1 0
+ %nop7488 = alloca i1, i1 0
+ %nop7489 = alloca i1, i1 0
+ %nop7490 = alloca i1, i1 0
+ %nop7491 = alloca i1, i1 0
+ %nop7492 = alloca i1, i1 0
+ %nop7493 = alloca i1, i1 0
+ %nop7494 = alloca i1, i1 0
+ %nop7495 = alloca i1, i1 0
+ %nop7496 = alloca i1, i1 0
+ %nop7497 = alloca i1, i1 0
+ %nop7498 = alloca i1, i1 0
+ %nop7499 = alloca i1, i1 0
+ %nop7500 = alloca i1, i1 0
+ %nop7501 = alloca i1, i1 0
+ %nop7502 = alloca i1, i1 0
+ %nop7503 = alloca i1, i1 0
+ %nop7504 = alloca i1, i1 0
+ %nop7505 = alloca i1, i1 0
+ %nop7506 = alloca i1, i1 0
+ %nop7507 = alloca i1, i1 0
+ %nop7508 = alloca i1, i1 0
+ %nop7509 = alloca i1, i1 0
+ %nop7510 = alloca i1, i1 0
+ %nop7511 = alloca i1, i1 0
+ %nop7512 = alloca i1, i1 0
+ %nop7513 = alloca i1, i1 0
+ %nop7514 = alloca i1, i1 0
+ %nop7515 = alloca i1, i1 0
+ %nop7516 = alloca i1, i1 0
+ %nop7517 = alloca i1, i1 0
+ %nop7518 = alloca i1, i1 0
+ %nop7519 = alloca i1, i1 0
+ %nop7520 = alloca i1, i1 0
+ %nop7521 = alloca i1, i1 0
+ %nop7522 = alloca i1, i1 0
+ %nop7523 = alloca i1, i1 0
+ %nop7524 = alloca i1, i1 0
+ %nop7525 = alloca i1, i1 0
+ %nop7526 = alloca i1, i1 0
+ %nop7527 = alloca i1, i1 0
+ %nop7528 = alloca i1, i1 0
+ %nop7529 = alloca i1, i1 0
+ %nop7530 = alloca i1, i1 0
+ %nop7531 = alloca i1, i1 0
+ %nop7532 = alloca i1, i1 0
+ %nop7533 = alloca i1, i1 0
+ %nop7534 = alloca i1, i1 0
+ %nop7535 = alloca i1, i1 0
+ %nop7536 = alloca i1, i1 0
+ %nop7537 = alloca i1, i1 0
+ %nop7538 = alloca i1, i1 0
+ %nop7539 = alloca i1, i1 0
+ %nop7540 = alloca i1, i1 0
+ %nop7541 = alloca i1, i1 0
+ %nop7542 = alloca i1, i1 0
+ %nop7543 = alloca i1, i1 0
+ %nop7544 = alloca i1, i1 0
+ %nop7545 = alloca i1, i1 0
+ %nop7546 = alloca i1, i1 0
+ %nop7547 = alloca i1, i1 0
+ %nop7548 = alloca i1, i1 0
+ %nop7549 = alloca i1, i1 0
+ %nop7550 = alloca i1, i1 0
+ %nop7551 = alloca i1, i1 0
+ %nop7552 = alloca i1, i1 0
+ %nop7553 = alloca i1, i1 0
+ %nop7554 = alloca i1, i1 0
+ %nop7555 = alloca i1, i1 0
+ %nop7556 = alloca i1, i1 0
+ %nop7557 = alloca i1, i1 0
+ %nop7558 = alloca i1, i1 0
+ %nop7559 = alloca i1, i1 0
+ %nop7560 = alloca i1, i1 0
+ %nop7561 = alloca i1, i1 0
+ %nop7562 = alloca i1, i1 0
+ %nop7563 = alloca i1, i1 0
+ %nop7564 = alloca i1, i1 0
+ %nop7565 = alloca i1, i1 0
+ %nop7566 = alloca i1, i1 0
+ %nop7567 = alloca i1, i1 0
+ %nop7568 = alloca i1, i1 0
+ %nop7569 = alloca i1, i1 0
+ %nop7570 = alloca i1, i1 0
+ %nop7571 = alloca i1, i1 0
+ %nop7572 = alloca i1, i1 0
+ %nop7573 = alloca i1, i1 0
+ %nop7574 = alloca i1, i1 0
+ %nop7575 = alloca i1, i1 0
+ %nop7576 = alloca i1, i1 0
+ %nop7577 = alloca i1, i1 0
+ %nop7578 = alloca i1, i1 0
+ %nop7579 = alloca i1, i1 0
+ %nop7580 = alloca i1, i1 0
+ %nop7581 = alloca i1, i1 0
+ %nop7582 = alloca i1, i1 0
+ %nop7583 = alloca i1, i1 0
+ %nop7584 = alloca i1, i1 0
+ %nop7585 = alloca i1, i1 0
+ %nop7586 = alloca i1, i1 0
+ %nop7587 = alloca i1, i1 0
+ %nop7588 = alloca i1, i1 0
+ %nop7589 = alloca i1, i1 0
+ %nop7590 = alloca i1, i1 0
+ %nop7591 = alloca i1, i1 0
+ %nop7592 = alloca i1, i1 0
+ %nop7593 = alloca i1, i1 0
+ %nop7594 = alloca i1, i1 0
+ %nop7595 = alloca i1, i1 0
+ %nop7596 = alloca i1, i1 0
+ %nop7597 = alloca i1, i1 0
+ %nop7598 = alloca i1, i1 0
+ %nop7599 = alloca i1, i1 0
+ %nop7600 = alloca i1, i1 0
+ %nop7601 = alloca i1, i1 0
+ %nop7602 = alloca i1, i1 0
+ %nop7603 = alloca i1, i1 0
+ %nop7604 = alloca i1, i1 0
+ %nop7605 = alloca i1, i1 0
+ %nop7606 = alloca i1, i1 0
+ %nop7607 = alloca i1, i1 0
+ %nop7608 = alloca i1, i1 0
+ %nop7609 = alloca i1, i1 0
+ %nop7610 = alloca i1, i1 0
+ %nop7611 = alloca i1, i1 0
+ %nop7612 = alloca i1, i1 0
+ %nop7613 = alloca i1, i1 0
+ %nop7614 = alloca i1, i1 0
+ %nop7615 = alloca i1, i1 0
+ %nop7616 = alloca i1, i1 0
+ %nop7617 = alloca i1, i1 0
+ %nop7618 = alloca i1, i1 0
+ %nop7619 = alloca i1, i1 0
+ %nop7620 = alloca i1, i1 0
+ %nop7621 = alloca i1, i1 0
+ %nop7622 = alloca i1, i1 0
+ %nop7623 = alloca i1, i1 0
+ %nop7624 = alloca i1, i1 0
+ %nop7625 = alloca i1, i1 0
+ %nop7626 = alloca i1, i1 0
+ %nop7627 = alloca i1, i1 0
+ %nop7628 = alloca i1, i1 0
+ %nop7629 = alloca i1, i1 0
+ %nop7630 = alloca i1, i1 0
+ %nop7631 = alloca i1, i1 0
+ %nop7632 = alloca i1, i1 0
+ %nop7633 = alloca i1, i1 0
+ %nop7634 = alloca i1, i1 0
+ %nop7635 = alloca i1, i1 0
+ %nop7636 = alloca i1, i1 0
+ %nop7637 = alloca i1, i1 0
+ %nop7638 = alloca i1, i1 0
+ %nop7639 = alloca i1, i1 0
+ %nop7640 = alloca i1, i1 0
+ %nop7641 = alloca i1, i1 0
+ %nop7642 = alloca i1, i1 0
+ %nop7643 = alloca i1, i1 0
+ %nop7644 = alloca i1, i1 0
+ %nop7645 = alloca i1, i1 0
+ %nop7646 = alloca i1, i1 0
+ %nop7647 = alloca i1, i1 0
+ %nop7648 = alloca i1, i1 0
+ %nop7649 = alloca i1, i1 0
+ %nop7650 = alloca i1, i1 0
+ %nop7651 = alloca i1, i1 0
+ %nop7652 = alloca i1, i1 0
+ %nop7653 = alloca i1, i1 0
+ %nop7654 = alloca i1, i1 0
+ %nop7655 = alloca i1, i1 0
+ %nop7656 = alloca i1, i1 0
+ %nop7657 = alloca i1, i1 0
+ %nop7658 = alloca i1, i1 0
+ %nop7659 = alloca i1, i1 0
+ %nop7660 = alloca i1, i1 0
+ %nop7661 = alloca i1, i1 0
+ %nop7662 = alloca i1, i1 0
+ %nop7663 = alloca i1, i1 0
+ %nop7664 = alloca i1, i1 0
+ %nop7665 = alloca i1, i1 0
+ %nop7666 = alloca i1, i1 0
+ %nop7667 = alloca i1, i1 0
+ %nop7668 = alloca i1, i1 0
+ %nop7669 = alloca i1, i1 0
+ %nop7670 = alloca i1, i1 0
+ %nop7671 = alloca i1, i1 0
+ %nop7672 = alloca i1, i1 0
+ %nop7673 = alloca i1, i1 0
+ %nop7674 = alloca i1, i1 0
+ %nop7675 = alloca i1, i1 0
+ %nop7676 = alloca i1, i1 0
+ %nop7677 = alloca i1, i1 0
+ %nop7678 = alloca i1, i1 0
+ %nop7679 = alloca i1, i1 0
+ %nop7680 = alloca i1, i1 0
+ %nop7681 = alloca i1, i1 0
+ %nop7682 = alloca i1, i1 0
+ %nop7683 = alloca i1, i1 0
+ %nop7684 = alloca i1, i1 0
+ %nop7685 = alloca i1, i1 0
+ %nop7686 = alloca i1, i1 0
+ %nop7687 = alloca i1, i1 0
+ %nop7688 = alloca i1, i1 0
+ %nop7689 = alloca i1, i1 0
+ %nop7690 = alloca i1, i1 0
+ %nop7691 = alloca i1, i1 0
+ %nop7692 = alloca i1, i1 0
+ %nop7693 = alloca i1, i1 0
+ %nop7694 = alloca i1, i1 0
+ %nop7695 = alloca i1, i1 0
+ %nop7696 = alloca i1, i1 0
+ %nop7697 = alloca i1, i1 0
+ %nop7698 = alloca i1, i1 0
+ %nop7699 = alloca i1, i1 0
+ %nop7700 = alloca i1, i1 0
+ %nop7701 = alloca i1, i1 0
+ %nop7702 = alloca i1, i1 0
+ %nop7703 = alloca i1, i1 0
+ %nop7704 = alloca i1, i1 0
+ %nop7705 = alloca i1, i1 0
+ %nop7706 = alloca i1, i1 0
+ %nop7707 = alloca i1, i1 0
+ %nop7708 = alloca i1, i1 0
+ %nop7709 = alloca i1, i1 0
+ %nop7710 = alloca i1, i1 0
+ %nop7711 = alloca i1, i1 0
+ %nop7712 = alloca i1, i1 0
+ %nop7713 = alloca i1, i1 0
+ %nop7714 = alloca i1, i1 0
+ %nop7715 = alloca i1, i1 0
+ %nop7716 = alloca i1, i1 0
+ %nop7717 = alloca i1, i1 0
+ %nop7718 = alloca i1, i1 0
+ %nop7719 = alloca i1, i1 0
+ %nop7720 = alloca i1, i1 0
+ %nop7721 = alloca i1, i1 0
+ %nop7722 = alloca i1, i1 0
+ %nop7723 = alloca i1, i1 0
+ %nop7724 = alloca i1, i1 0
+ %nop7725 = alloca i1, i1 0
+ %nop7726 = alloca i1, i1 0
+ %nop7727 = alloca i1, i1 0
+ %nop7728 = alloca i1, i1 0
+ %nop7729 = alloca i1, i1 0
+ %nop7730 = alloca i1, i1 0
+ %nop7731 = alloca i1, i1 0
+ %nop7732 = alloca i1, i1 0
+ %nop7733 = alloca i1, i1 0
+ %nop7734 = alloca i1, i1 0
+ %nop7735 = alloca i1, i1 0
+ %nop7736 = alloca i1, i1 0
+ %nop7737 = alloca i1, i1 0
+ %nop7738 = alloca i1, i1 0
+ %nop7739 = alloca i1, i1 0
+ %nop7740 = alloca i1, i1 0
+ %nop7741 = alloca i1, i1 0
+ %nop7742 = alloca i1, i1 0
+ %nop7743 = alloca i1, i1 0
+ %nop7744 = alloca i1, i1 0
+ %nop7745 = alloca i1, i1 0
+ %nop7746 = alloca i1, i1 0
+ %nop7747 = alloca i1, i1 0
+ %nop7748 = alloca i1, i1 0
+ %nop7749 = alloca i1, i1 0
+ %nop7750 = alloca i1, i1 0
+ %nop7751 = alloca i1, i1 0
+ %nop7752 = alloca i1, i1 0
+ %nop7753 = alloca i1, i1 0
+ %nop7754 = alloca i1, i1 0
+ %nop7755 = alloca i1, i1 0
+ %nop7756 = alloca i1, i1 0
+ %nop7757 = alloca i1, i1 0
+ %nop7758 = alloca i1, i1 0
+ %nop7759 = alloca i1, i1 0
+ %nop7760 = alloca i1, i1 0
+ %nop7761 = alloca i1, i1 0
+ %nop7762 = alloca i1, i1 0
+ %nop7763 = alloca i1, i1 0
+ %nop7764 = alloca i1, i1 0
+ %nop7765 = alloca i1, i1 0
+ %nop7766 = alloca i1, i1 0
+ %nop7767 = alloca i1, i1 0
+ %nop7768 = alloca i1, i1 0
+ %nop7769 = alloca i1, i1 0
+ %nop7770 = alloca i1, i1 0
+ %nop7771 = alloca i1, i1 0
+ %nop7772 = alloca i1, i1 0
+ %nop7773 = alloca i1, i1 0
+ %nop7774 = alloca i1, i1 0
+ %nop7775 = alloca i1, i1 0
+ %nop7776 = alloca i1, i1 0
+ %nop7777 = alloca i1, i1 0
+ %nop7778 = alloca i1, i1 0
+ %nop7779 = alloca i1, i1 0
+ %nop7780 = alloca i1, i1 0
+ %nop7781 = alloca i1, i1 0
+ %nop7782 = alloca i1, i1 0
+ %nop7783 = alloca i1, i1 0
+ %nop7784 = alloca i1, i1 0
+ %nop7785 = alloca i1, i1 0
+ %nop7786 = alloca i1, i1 0
+ %nop7787 = alloca i1, i1 0
+ %nop7788 = alloca i1, i1 0
+ %nop7789 = alloca i1, i1 0
+ %nop7790 = alloca i1, i1 0
+ %nop7791 = alloca i1, i1 0
+ %nop7792 = alloca i1, i1 0
+ %nop7793 = alloca i1, i1 0
+ %nop7794 = alloca i1, i1 0
+ %nop7795 = alloca i1, i1 0
+ %nop7796 = alloca i1, i1 0
+ %nop7797 = alloca i1, i1 0
+ %nop7798 = alloca i1, i1 0
+ %nop7799 = alloca i1, i1 0
+ %nop7800 = alloca i1, i1 0
+ %nop7801 = alloca i1, i1 0
+ %nop7802 = alloca i1, i1 0
+ %nop7803 = alloca i1, i1 0
+ %nop7804 = alloca i1, i1 0
+ %nop7805 = alloca i1, i1 0
+ %nop7806 = alloca i1, i1 0
+ %nop7807 = alloca i1, i1 0
+ %nop7808 = alloca i1, i1 0
+ %nop7809 = alloca i1, i1 0
+ %nop7810 = alloca i1, i1 0
+ %nop7811 = alloca i1, i1 0
+ %nop7812 = alloca i1, i1 0
+ %nop7813 = alloca i1, i1 0
+ %nop7814 = alloca i1, i1 0
+ %nop7815 = alloca i1, i1 0
+ %nop7816 = alloca i1, i1 0
+ %nop7817 = alloca i1, i1 0
+ %nop7818 = alloca i1, i1 0
+ %nop7819 = alloca i1, i1 0
+ %nop7820 = alloca i1, i1 0
+ %nop7821 = alloca i1, i1 0
+ %nop7822 = alloca i1, i1 0
+ %nop7823 = alloca i1, i1 0
+ %nop7824 = alloca i1, i1 0
+ %nop7825 = alloca i1, i1 0
+ %nop7826 = alloca i1, i1 0
+ %nop7827 = alloca i1, i1 0
+ %nop7828 = alloca i1, i1 0
+ %nop7829 = alloca i1, i1 0
+ %nop7830 = alloca i1, i1 0
+ %nop7831 = alloca i1, i1 0
+ %nop7832 = alloca i1, i1 0
+ %nop7833 = alloca i1, i1 0
+ %nop7834 = alloca i1, i1 0
+ %nop7835 = alloca i1, i1 0
+ %nop7836 = alloca i1, i1 0
+ %nop7837 = alloca i1, i1 0
+ %nop7838 = alloca i1, i1 0
+ %nop7839 = alloca i1, i1 0
+ %nop7840 = alloca i1, i1 0
+ %nop7841 = alloca i1, i1 0
+ %nop7842 = alloca i1, i1 0
+ %nop7843 = alloca i1, i1 0
+ %nop7844 = alloca i1, i1 0
+ %nop7845 = alloca i1, i1 0
+ %nop7846 = alloca i1, i1 0
+ %nop7847 = alloca i1, i1 0
+ %nop7848 = alloca i1, i1 0
+ %nop7849 = alloca i1, i1 0
+ %nop7850 = alloca i1, i1 0
+ %nop7851 = alloca i1, i1 0
+ %nop7852 = alloca i1, i1 0
+ %nop7853 = alloca i1, i1 0
+ %nop7854 = alloca i1, i1 0
+ %nop7855 = alloca i1, i1 0
+ %nop7856 = alloca i1, i1 0
+ %nop7857 = alloca i1, i1 0
+ %nop7858 = alloca i1, i1 0
+ %nop7859 = alloca i1, i1 0
+ %nop7860 = alloca i1, i1 0
+ %nop7861 = alloca i1, i1 0
+ %nop7862 = alloca i1, i1 0
+ %nop7863 = alloca i1, i1 0
+ %nop7864 = alloca i1, i1 0
+ %nop7865 = alloca i1, i1 0
+ %nop7866 = alloca i1, i1 0
+ %nop7867 = alloca i1, i1 0
+ %nop7868 = alloca i1, i1 0
+ %nop7869 = alloca i1, i1 0
+ %nop7870 = alloca i1, i1 0
+ %nop7871 = alloca i1, i1 0
+ %nop7872 = alloca i1, i1 0
+ %nop7873 = alloca i1, i1 0
+ %nop7874 = alloca i1, i1 0
+ %nop7875 = alloca i1, i1 0
+ %nop7876 = alloca i1, i1 0
+ %nop7877 = alloca i1, i1 0
+ %nop7878 = alloca i1, i1 0
+ %nop7879 = alloca i1, i1 0
+ %nop7880 = alloca i1, i1 0
+ %nop7881 = alloca i1, i1 0
+ %nop7882 = alloca i1, i1 0
+ %nop7883 = alloca i1, i1 0
+ %nop7884 = alloca i1, i1 0
+ %nop7885 = alloca i1, i1 0
+ %nop7886 = alloca i1, i1 0
+ %nop7887 = alloca i1, i1 0
+ %nop7888 = alloca i1, i1 0
+ %nop7889 = alloca i1, i1 0
+ %nop7890 = alloca i1, i1 0
+ %nop7891 = alloca i1, i1 0
+ %nop7892 = alloca i1, i1 0
+ %nop7893 = alloca i1, i1 0
+ %nop7894 = alloca i1, i1 0
+ %nop7895 = alloca i1, i1 0
+ %nop7896 = alloca i1, i1 0
+ %nop7897 = alloca i1, i1 0
+ %nop7898 = alloca i1, i1 0
+ %nop7899 = alloca i1, i1 0
+ %nop7900 = alloca i1, i1 0
+ %nop7901 = alloca i1, i1 0
+ %nop7902 = alloca i1, i1 0
+ %nop7903 = alloca i1, i1 0
+ %nop7904 = alloca i1, i1 0
+ %nop7905 = alloca i1, i1 0
+ %nop7906 = alloca i1, i1 0
+ %nop7907 = alloca i1, i1 0
+ %nop7908 = alloca i1, i1 0
+ %nop7909 = alloca i1, i1 0
+ %nop7910 = alloca i1, i1 0
+ %nop7911 = alloca i1, i1 0
+ %nop7912 = alloca i1, i1 0
+ %nop7913 = alloca i1, i1 0
+ %nop7914 = alloca i1, i1 0
+ %nop7915 = alloca i1, i1 0
+ %nop7916 = alloca i1, i1 0
+ %nop7917 = alloca i1, i1 0
+ %nop7918 = alloca i1, i1 0
+ %nop7919 = alloca i1, i1 0
+ %nop7920 = alloca i1, i1 0
+ %nop7921 = alloca i1, i1 0
+ %nop7922 = alloca i1, i1 0
+ %nop7923 = alloca i1, i1 0
+ %nop7924 = alloca i1, i1 0
+ %nop7925 = alloca i1, i1 0
+ %nop7926 = alloca i1, i1 0
+ %nop7927 = alloca i1, i1 0
+ %nop7928 = alloca i1, i1 0
+ %nop7929 = alloca i1, i1 0
+ %nop7930 = alloca i1, i1 0
+ %nop7931 = alloca i1, i1 0
+ %nop7932 = alloca i1, i1 0
+ %nop7933 = alloca i1, i1 0
+ %nop7934 = alloca i1, i1 0
+ %nop7935 = alloca i1, i1 0
+ %nop7936 = alloca i1, i1 0
+ %nop7937 = alloca i1, i1 0
+ %nop7938 = alloca i1, i1 0
+ %nop7939 = alloca i1, i1 0
+ %nop7940 = alloca i1, i1 0
+ %nop7941 = alloca i1, i1 0
+ %nop7942 = alloca i1, i1 0
+ %nop7943 = alloca i1, i1 0
+ %nop7944 = alloca i1, i1 0
+ %nop7945 = alloca i1, i1 0
+ %nop7946 = alloca i1, i1 0
+ %nop7947 = alloca i1, i1 0
+ %nop7948 = alloca i1, i1 0
+ %nop7949 = alloca i1, i1 0
+ %nop7950 = alloca i1, i1 0
+ %nop7951 = alloca i1, i1 0
+ %nop7952 = alloca i1, i1 0
+ %nop7953 = alloca i1, i1 0
+ %nop7954 = alloca i1, i1 0
+ %nop7955 = alloca i1, i1 0
+ %nop7956 = alloca i1, i1 0
+ %nop7957 = alloca i1, i1 0
+ %nop7958 = alloca i1, i1 0
+ %nop7959 = alloca i1, i1 0
+ %nop7960 = alloca i1, i1 0
+ %nop7961 = alloca i1, i1 0
+ %nop7962 = alloca i1, i1 0
+ %nop7963 = alloca i1, i1 0
+ %nop7964 = alloca i1, i1 0
+ %nop7965 = alloca i1, i1 0
+ %nop7966 = alloca i1, i1 0
+ %nop7967 = alloca i1, i1 0
+ %nop7968 = alloca i1, i1 0
+ %nop7969 = alloca i1, i1 0
+ %nop7970 = alloca i1, i1 0
+ %nop7971 = alloca i1, i1 0
+ %nop7972 = alloca i1, i1 0
+ %nop7973 = alloca i1, i1 0
+ %nop7974 = alloca i1, i1 0
+ %nop7975 = alloca i1, i1 0
+ %nop7976 = alloca i1, i1 0
+ %nop7977 = alloca i1, i1 0
+ %nop7978 = alloca i1, i1 0
+ %nop7979 = alloca i1, i1 0
+ %nop7980 = alloca i1, i1 0
+ %nop7981 = alloca i1, i1 0
+ %nop7982 = alloca i1, i1 0
+ %nop7983 = alloca i1, i1 0
+ %nop7984 = alloca i1, i1 0
+ %nop7985 = alloca i1, i1 0
+ %nop7986 = alloca i1, i1 0
+ %nop7987 = alloca i1, i1 0
+ %nop7988 = alloca i1, i1 0
+ %nop7989 = alloca i1, i1 0
+ %nop7990 = alloca i1, i1 0
+ %nop7991 = alloca i1, i1 0
+ %nop7992 = alloca i1, i1 0
+ %nop7993 = alloca i1, i1 0
+ %nop7994 = alloca i1, i1 0
+ %nop7995 = alloca i1, i1 0
+ %nop7996 = alloca i1, i1 0
+ %nop7997 = alloca i1, i1 0
+ %nop7998 = alloca i1, i1 0
+ %nop7999 = alloca i1, i1 0
+ %nop8000 = alloca i1, i1 0
+ %nop8001 = alloca i1, i1 0
+ %nop8002 = alloca i1, i1 0
+ %nop8003 = alloca i1, i1 0
+ %nop8004 = alloca i1, i1 0
+ %nop8005 = alloca i1, i1 0
+ %nop8006 = alloca i1, i1 0
+ %nop8007 = alloca i1, i1 0
+ %nop8008 = alloca i1, i1 0
+ %nop8009 = alloca i1, i1 0
+ %nop8010 = alloca i1, i1 0
+ %nop8011 = alloca i1, i1 0
+ %nop8012 = alloca i1, i1 0
+ %nop8013 = alloca i1, i1 0
+ %nop8014 = alloca i1, i1 0
+ %nop8015 = alloca i1, i1 0
+ %nop8016 = alloca i1, i1 0
+ %nop8017 = alloca i1, i1 0
+ %nop8018 = alloca i1, i1 0
+ %nop8019 = alloca i1, i1 0
+ %nop8020 = alloca i1, i1 0
+ %nop8021 = alloca i1, i1 0
+ %nop8022 = alloca i1, i1 0
+ %nop8023 = alloca i1, i1 0
+ %nop8024 = alloca i1, i1 0
+ %nop8025 = alloca i1, i1 0
+ %nop8026 = alloca i1, i1 0
+ %nop8027 = alloca i1, i1 0
+ %nop8028 = alloca i1, i1 0
+ %nop8029 = alloca i1, i1 0
+ %nop8030 = alloca i1, i1 0
+ %nop8031 = alloca i1, i1 0
+ %nop8032 = alloca i1, i1 0
+ %nop8033 = alloca i1, i1 0
+ %nop8034 = alloca i1, i1 0
+ %nop8035 = alloca i1, i1 0
+ %nop8036 = alloca i1, i1 0
+ %nop8037 = alloca i1, i1 0
+ %nop8038 = alloca i1, i1 0
+ %nop8039 = alloca i1, i1 0
+ %nop8040 = alloca i1, i1 0
+ %nop8041 = alloca i1, i1 0
+ %nop8042 = alloca i1, i1 0
+ %nop8043 = alloca i1, i1 0
+ %nop8044 = alloca i1, i1 0
+ %nop8045 = alloca i1, i1 0
+ %nop8046 = alloca i1, i1 0
+ %nop8047 = alloca i1, i1 0
+ %nop8048 = alloca i1, i1 0
+ %nop8049 = alloca i1, i1 0
+ %nop8050 = alloca i1, i1 0
+ %nop8051 = alloca i1, i1 0
+ %nop8052 = alloca i1, i1 0
+ %nop8053 = alloca i1, i1 0
+ %nop8054 = alloca i1, i1 0
+ %nop8055 = alloca i1, i1 0
+ %nop8056 = alloca i1, i1 0
+ %nop8057 = alloca i1, i1 0
+ %nop8058 = alloca i1, i1 0
+ %nop8059 = alloca i1, i1 0
+ %nop8060 = alloca i1, i1 0
+ %nop8061 = alloca i1, i1 0
+ %nop8062 = alloca i1, i1 0
+ %nop8063 = alloca i1, i1 0
+ %nop8064 = alloca i1, i1 0
+ %nop8065 = alloca i1, i1 0
+ %nop8066 = alloca i1, i1 0
+ %nop8067 = alloca i1, i1 0
+ %nop8068 = alloca i1, i1 0
+ %nop8069 = alloca i1, i1 0
+ %nop8070 = alloca i1, i1 0
+ %nop8071 = alloca i1, i1 0
+ %nop8072 = alloca i1, i1 0
+ %nop8073 = alloca i1, i1 0
+ %nop8074 = alloca i1, i1 0
+ %nop8075 = alloca i1, i1 0
+ %nop8076 = alloca i1, i1 0
+ %nop8077 = alloca i1, i1 0
+ %nop8078 = alloca i1, i1 0
+ %nop8079 = alloca i1, i1 0
+ %nop8080 = alloca i1, i1 0
+ %nop8081 = alloca i1, i1 0
+ %nop8082 = alloca i1, i1 0
+ %nop8083 = alloca i1, i1 0
+ %nop8084 = alloca i1, i1 0
+ %nop8085 = alloca i1, i1 0
+ %nop8086 = alloca i1, i1 0
+ %nop8087 = alloca i1, i1 0
+ %nop8088 = alloca i1, i1 0
+ %nop8089 = alloca i1, i1 0
+ %nop8090 = alloca i1, i1 0
+ %nop8091 = alloca i1, i1 0
+ %nop8092 = alloca i1, i1 0
+ %nop8093 = alloca i1, i1 0
+ %nop8094 = alloca i1, i1 0
+ %nop8095 = alloca i1, i1 0
+ %nop8096 = alloca i1, i1 0
+ %nop8097 = alloca i1, i1 0
+ %nop8098 = alloca i1, i1 0
+ %nop8099 = alloca i1, i1 0
+ %nop8100 = alloca i1, i1 0
+ %nop8101 = alloca i1, i1 0
+ %nop8102 = alloca i1, i1 0
+ %nop8103 = alloca i1, i1 0
+ %nop8104 = alloca i1, i1 0
+ %nop8105 = alloca i1, i1 0
+ %nop8106 = alloca i1, i1 0
+ %nop8107 = alloca i1, i1 0
+ %nop8108 = alloca i1, i1 0
+ %nop8109 = alloca i1, i1 0
+ %nop8110 = alloca i1, i1 0
+ %nop8111 = alloca i1, i1 0
+ %nop8112 = alloca i1, i1 0
+ %nop8113 = alloca i1, i1 0
+ %nop8114 = alloca i1, i1 0
+ %nop8115 = alloca i1, i1 0
+ %nop8116 = alloca i1, i1 0
+ %nop8117 = alloca i1, i1 0
+ %nop8118 = alloca i1, i1 0
+ %nop8119 = alloca i1, i1 0
+ %nop8120 = alloca i1, i1 0
+ %nop8121 = alloca i1, i1 0
+ %nop8122 = alloca i1, i1 0
+ %nop8123 = alloca i1, i1 0
+ %nop8124 = alloca i1, i1 0
+ %nop8125 = alloca i1, i1 0
+ %nop8126 = alloca i1, i1 0
+ %nop8127 = alloca i1, i1 0
+ %nop8128 = alloca i1, i1 0
+ %nop8129 = alloca i1, i1 0
+ %nop8130 = alloca i1, i1 0
+ %nop8131 = alloca i1, i1 0
+ %nop8132 = alloca i1, i1 0
+ %nop8133 = alloca i1, i1 0
+ %nop8134 = alloca i1, i1 0
+ %nop8135 = alloca i1, i1 0
+ %nop8136 = alloca i1, i1 0
+ %nop8137 = alloca i1, i1 0
+ %nop8138 = alloca i1, i1 0
+ %nop8139 = alloca i1, i1 0
+ %nop8140 = alloca i1, i1 0
+ %nop8141 = alloca i1, i1 0
+ %nop8142 = alloca i1, i1 0
+ %nop8143 = alloca i1, i1 0
+ %nop8144 = alloca i1, i1 0
+ %nop8145 = alloca i1, i1 0
+ %nop8146 = alloca i1, i1 0
+ %nop8147 = alloca i1, i1 0
+ %nop8148 = alloca i1, i1 0
+ %nop8149 = alloca i1, i1 0
+ %nop8150 = alloca i1, i1 0
+ %nop8151 = alloca i1, i1 0
+ %nop8152 = alloca i1, i1 0
+ %nop8153 = alloca i1, i1 0
+ %nop8154 = alloca i1, i1 0
+ %nop8155 = alloca i1, i1 0
+ %nop8156 = alloca i1, i1 0
+ %nop8157 = alloca i1, i1 0
+ %nop8158 = alloca i1, i1 0
+ %nop8159 = alloca i1, i1 0
+ %nop8160 = alloca i1, i1 0
+ %nop8161 = alloca i1, i1 0
+ %nop8162 = alloca i1, i1 0
+ %nop8163 = alloca i1, i1 0
+ %nop8164 = alloca i1, i1 0
+ %nop8165 = alloca i1, i1 0
+ %nop8166 = alloca i1, i1 0
+ %nop8167 = alloca i1, i1 0
+ %nop8168 = alloca i1, i1 0
+ %nop8169 = alloca i1, i1 0
+ %nop8170 = alloca i1, i1 0
+ %nop8171 = alloca i1, i1 0
+ %nop8172 = alloca i1, i1 0
+ %nop8173 = alloca i1, i1 0
+ %nop8174 = alloca i1, i1 0
+ %nop8175 = alloca i1, i1 0
+ %nop8176 = alloca i1, i1 0
+ %nop8177 = alloca i1, i1 0
+ %nop8178 = alloca i1, i1 0
+ %nop8179 = alloca i1, i1 0
+ %nop8180 = alloca i1, i1 0
+ %nop8181 = alloca i1, i1 0
+ %nop8182 = alloca i1, i1 0
+ %nop8183 = alloca i1, i1 0
+ %nop8184 = alloca i1, i1 0
+ %nop8185 = alloca i1, i1 0
+ %nop8186 = alloca i1, i1 0
+ %nop8187 = alloca i1, i1 0
+ %nop8188 = alloca i1, i1 0
+ %nop8189 = alloca i1, i1 0
+ %nop8190 = alloca i1, i1 0
+ %nop8191 = alloca i1, i1 0
+ %nop8192 = alloca i1, i1 0
+ %nop8193 = alloca i1, i1 0
+ %nop8194 = alloca i1, i1 0
+ %nop8195 = alloca i1, i1 0
+ %nop8196 = alloca i1, i1 0
+ %nop8197 = alloca i1, i1 0
+ %nop8198 = alloca i1, i1 0
+ %nop8199 = alloca i1, i1 0
+ %nop8200 = alloca i1, i1 0
+ %nop8201 = alloca i1, i1 0
+ %nop8202 = alloca i1, i1 0
+ %nop8203 = alloca i1, i1 0
+ %nop8204 = alloca i1, i1 0
+ %nop8205 = alloca i1, i1 0
+ %nop8206 = alloca i1, i1 0
+ %nop8207 = alloca i1, i1 0
+ %nop8208 = alloca i1, i1 0
+ %nop8209 = alloca i1, i1 0
+ %nop8210 = alloca i1, i1 0
+ %nop8211 = alloca i1, i1 0
+ %nop8212 = alloca i1, i1 0
+ %nop8213 = alloca i1, i1 0
+ %nop8214 = alloca i1, i1 0
+ %nop8215 = alloca i1, i1 0
+ %nop8216 = alloca i1, i1 0
+ %nop8217 = alloca i1, i1 0
+ %nop8218 = alloca i1, i1 0
+ %nop8219 = alloca i1, i1 0
+ %nop8220 = alloca i1, i1 0
+ %nop8221 = alloca i1, i1 0
+ %nop8222 = alloca i1, i1 0
+ %nop8223 = alloca i1, i1 0
+ %nop8224 = alloca i1, i1 0
+ %nop8225 = alloca i1, i1 0
+ %nop8226 = alloca i1, i1 0
+ %nop8227 = alloca i1, i1 0
+ %nop8228 = alloca i1, i1 0
+ %nop8229 = alloca i1, i1 0
+ %nop8230 = alloca i1, i1 0
+ %nop8231 = alloca i1, i1 0
+ %nop8232 = alloca i1, i1 0
+ %nop8233 = alloca i1, i1 0
+ %nop8234 = alloca i1, i1 0
+ %nop8235 = alloca i1, i1 0
+ %nop8236 = alloca i1, i1 0
+ %nop8237 = alloca i1, i1 0
+ %nop8238 = alloca i1, i1 0
+ %nop8239 = alloca i1, i1 0
+ %nop8240 = alloca i1, i1 0
+ %nop8241 = alloca i1, i1 0
+ %nop8242 = alloca i1, i1 0
+ %nop8243 = alloca i1, i1 0
+ %nop8244 = alloca i1, i1 0
+ %nop8245 = alloca i1, i1 0
+ %nop8246 = alloca i1, i1 0
+ %nop8247 = alloca i1, i1 0
+ %nop8248 = alloca i1, i1 0
+ %nop8249 = alloca i1, i1 0
+ %nop8250 = alloca i1, i1 0
+ %nop8251 = alloca i1, i1 0
+ %nop8252 = alloca i1, i1 0
+ %nop8253 = alloca i1, i1 0
+ %nop8254 = alloca i1, i1 0
+ %nop8255 = alloca i1, i1 0
+ %nop8256 = alloca i1, i1 0
+ %nop8257 = alloca i1, i1 0
+ %nop8258 = alloca i1, i1 0
+ %nop8259 = alloca i1, i1 0
+ %nop8260 = alloca i1, i1 0
+ %nop8261 = alloca i1, i1 0
+ %nop8262 = alloca i1, i1 0
+ %nop8263 = alloca i1, i1 0
+ %nop8264 = alloca i1, i1 0
+ %nop8265 = alloca i1, i1 0
+ %nop8266 = alloca i1, i1 0
+ %nop8267 = alloca i1, i1 0
+ %nop8268 = alloca i1, i1 0
+ %nop8269 = alloca i1, i1 0
+ %nop8270 = alloca i1, i1 0
+ %nop8271 = alloca i1, i1 0
+ %nop8272 = alloca i1, i1 0
+ %nop8273 = alloca i1, i1 0
+ %nop8274 = alloca i1, i1 0
+ %nop8275 = alloca i1, i1 0
+ %nop8276 = alloca i1, i1 0
+ %nop8277 = alloca i1, i1 0
+ %nop8278 = alloca i1, i1 0
+ %nop8279 = alloca i1, i1 0
+ %nop8280 = alloca i1, i1 0
+ %nop8281 = alloca i1, i1 0
+ %nop8282 = alloca i1, i1 0
+ %nop8283 = alloca i1, i1 0
+ %nop8284 = alloca i1, i1 0
+ %nop8285 = alloca i1, i1 0
+ %nop8286 = alloca i1, i1 0
+ %nop8287 = alloca i1, i1 0
+ %nop8288 = alloca i1, i1 0
+ %nop8289 = alloca i1, i1 0
+ %nop8290 = alloca i1, i1 0
+ %nop8291 = alloca i1, i1 0
+ %nop8292 = alloca i1, i1 0
+ %nop8293 = alloca i1, i1 0
+ %nop8294 = alloca i1, i1 0
+ %nop8295 = alloca i1, i1 0
+ %nop8296 = alloca i1, i1 0
+ %nop8297 = alloca i1, i1 0
+ %nop8298 = alloca i1, i1 0
+ %nop8299 = alloca i1, i1 0
+ %nop8300 = alloca i1, i1 0
+ %nop8301 = alloca i1, i1 0
+ %nop8302 = alloca i1, i1 0
+ %nop8303 = alloca i1, i1 0
+ %nop8304 = alloca i1, i1 0
+ %nop8305 = alloca i1, i1 0
+ %nop8306 = alloca i1, i1 0
+ %nop8307 = alloca i1, i1 0
+ %nop8308 = alloca i1, i1 0
+ %nop8309 = alloca i1, i1 0
+ %nop8310 = alloca i1, i1 0
+ %nop8311 = alloca i1, i1 0
+ %nop8312 = alloca i1, i1 0
+ %nop8313 = alloca i1, i1 0
+ %nop8314 = alloca i1, i1 0
+ %nop8315 = alloca i1, i1 0
+ %nop8316 = alloca i1, i1 0
+ %nop8317 = alloca i1, i1 0
+ %nop8318 = alloca i1, i1 0
+ %nop8319 = alloca i1, i1 0
+ %nop8320 = alloca i1, i1 0
+ %nop8321 = alloca i1, i1 0
+ %nop8322 = alloca i1, i1 0
+ %nop8323 = alloca i1, i1 0
+ %nop8324 = alloca i1, i1 0
+ %nop8325 = alloca i1, i1 0
+ %nop8326 = alloca i1, i1 0
+ %nop8327 = alloca i1, i1 0
+ %nop8328 = alloca i1, i1 0
+ %nop8329 = alloca i1, i1 0
+ %nop8330 = alloca i1, i1 0
+ %nop8331 = alloca i1, i1 0
+ %nop8332 = alloca i1, i1 0
+ %nop8333 = alloca i1, i1 0
+ %nop8334 = alloca i1, i1 0
+ %nop8335 = alloca i1, i1 0
+ %nop8336 = alloca i1, i1 0
+ %nop8337 = alloca i1, i1 0
+ %nop8338 = alloca i1, i1 0
+ %nop8339 = alloca i1, i1 0
+ %nop8340 = alloca i1, i1 0
+ %nop8341 = alloca i1, i1 0
+ %nop8342 = alloca i1, i1 0
+ %nop8343 = alloca i1, i1 0
+ %nop8344 = alloca i1, i1 0
+ %nop8345 = alloca i1, i1 0
+ %nop8346 = alloca i1, i1 0
+ %nop8347 = alloca i1, i1 0
+ %nop8348 = alloca i1, i1 0
+ %nop8349 = alloca i1, i1 0
+ %nop8350 = alloca i1, i1 0
+ %nop8351 = alloca i1, i1 0
+ %nop8352 = alloca i1, i1 0
+ %nop8353 = alloca i1, i1 0
+ %nop8354 = alloca i1, i1 0
+ %nop8355 = alloca i1, i1 0
+ %nop8356 = alloca i1, i1 0
+ %nop8357 = alloca i1, i1 0
+ %nop8358 = alloca i1, i1 0
+ %nop8359 = alloca i1, i1 0
+ %nop8360 = alloca i1, i1 0
+ %nop8361 = alloca i1, i1 0
+ %nop8362 = alloca i1, i1 0
+ %nop8363 = alloca i1, i1 0
+ %nop8364 = alloca i1, i1 0
+ %nop8365 = alloca i1, i1 0
+ %nop8366 = alloca i1, i1 0
+ %nop8367 = alloca i1, i1 0
+ %nop8368 = alloca i1, i1 0
+ %nop8369 = alloca i1, i1 0
+ %nop8370 = alloca i1, i1 0
+ %nop8371 = alloca i1, i1 0
+ %nop8372 = alloca i1, i1 0
+ %nop8373 = alloca i1, i1 0
+ %nop8374 = alloca i1, i1 0
+ %nop8375 = alloca i1, i1 0
+ %nop8376 = alloca i1, i1 0
+ %nop8377 = alloca i1, i1 0
+ %nop8378 = alloca i1, i1 0
+ %nop8379 = alloca i1, i1 0
+ %nop8380 = alloca i1, i1 0
+ %nop8381 = alloca i1, i1 0
+ %nop8382 = alloca i1, i1 0
+ %nop8383 = alloca i1, i1 0
+ %nop8384 = alloca i1, i1 0
+ %nop8385 = alloca i1, i1 0
+ %nop8386 = alloca i1, i1 0
+ %nop8387 = alloca i1, i1 0
+ %nop8388 = alloca i1, i1 0
+ %nop8389 = alloca i1, i1 0
+ %nop8390 = alloca i1, i1 0
+ %nop8391 = alloca i1, i1 0
+ %nop8392 = alloca i1, i1 0
+ %nop8393 = alloca i1, i1 0
+ %nop8394 = alloca i1, i1 0
+ %nop8395 = alloca i1, i1 0
+ %nop8396 = alloca i1, i1 0
+ %nop8397 = alloca i1, i1 0
+ %nop8398 = alloca i1, i1 0
+ %nop8399 = alloca i1, i1 0
+ %nop8400 = alloca i1, i1 0
+ %nop8401 = alloca i1, i1 0
+ %nop8402 = alloca i1, i1 0
+ %nop8403 = alloca i1, i1 0
+ %nop8404 = alloca i1, i1 0
+ %nop8405 = alloca i1, i1 0
+ %nop8406 = alloca i1, i1 0
+ %nop8407 = alloca i1, i1 0
+ %nop8408 = alloca i1, i1 0
+ %nop8409 = alloca i1, i1 0
+ %nop8410 = alloca i1, i1 0
+ %nop8411 = alloca i1, i1 0
+ %nop8412 = alloca i1, i1 0
+ %nop8413 = alloca i1, i1 0
+ %nop8414 = alloca i1, i1 0
+ %nop8415 = alloca i1, i1 0
+ %nop8416 = alloca i1, i1 0
+ %nop8417 = alloca i1, i1 0
+ %nop8418 = alloca i1, i1 0
+ %nop8419 = alloca i1, i1 0
+ %nop8420 = alloca i1, i1 0
+ %nop8421 = alloca i1, i1 0
+ %nop8422 = alloca i1, i1 0
+ %nop8423 = alloca i1, i1 0
+ %nop8424 = alloca i1, i1 0
+ %nop8425 = alloca i1, i1 0
+ %nop8426 = alloca i1, i1 0
+ %nop8427 = alloca i1, i1 0
+ %nop8428 = alloca i1, i1 0
+ %nop8429 = alloca i1, i1 0
+ %nop8430 = alloca i1, i1 0
+ %nop8431 = alloca i1, i1 0
+ %nop8432 = alloca i1, i1 0
+ %nop8433 = alloca i1, i1 0
+ %nop8434 = alloca i1, i1 0
+ %nop8435 = alloca i1, i1 0
+ %nop8436 = alloca i1, i1 0
+ %nop8437 = alloca i1, i1 0
+ %nop8438 = alloca i1, i1 0
+ %nop8439 = alloca i1, i1 0
+ %nop8440 = alloca i1, i1 0
+ %nop8441 = alloca i1, i1 0
+ %nop8442 = alloca i1, i1 0
+ %nop8443 = alloca i1, i1 0
+ %nop8444 = alloca i1, i1 0
+ %nop8445 = alloca i1, i1 0
+ %nop8446 = alloca i1, i1 0
+ %nop8447 = alloca i1, i1 0
+ %nop8448 = alloca i1, i1 0
+ %nop8449 = alloca i1, i1 0
+ %nop8450 = alloca i1, i1 0
+ %nop8451 = alloca i1, i1 0
+ %nop8452 = alloca i1, i1 0
+ %nop8453 = alloca i1, i1 0
+ %nop8454 = alloca i1, i1 0
+ %nop8455 = alloca i1, i1 0
+ %nop8456 = alloca i1, i1 0
+ %nop8457 = alloca i1, i1 0
+ %nop8458 = alloca i1, i1 0
+ %nop8459 = alloca i1, i1 0
+ %nop8460 = alloca i1, i1 0
+ %nop8461 = alloca i1, i1 0
+ %nop8462 = alloca i1, i1 0
+ %nop8463 = alloca i1, i1 0
+ %nop8464 = alloca i1, i1 0
+ %nop8465 = alloca i1, i1 0
+ %nop8466 = alloca i1, i1 0
+ %nop8467 = alloca i1, i1 0
+ %nop8468 = alloca i1, i1 0
+ %nop8469 = alloca i1, i1 0
+ %nop8470 = alloca i1, i1 0
+ %nop8471 = alloca i1, i1 0
+ %nop8472 = alloca i1, i1 0
+ %nop8473 = alloca i1, i1 0
+ %nop8474 = alloca i1, i1 0
+ %nop8475 = alloca i1, i1 0
+ %nop8476 = alloca i1, i1 0
+ %nop8477 = alloca i1, i1 0
+ %nop8478 = alloca i1, i1 0
+ %nop8479 = alloca i1, i1 0
+ %nop8480 = alloca i1, i1 0
+ %nop8481 = alloca i1, i1 0
+ %nop8482 = alloca i1, i1 0
+ %nop8483 = alloca i1, i1 0
+ %nop8484 = alloca i1, i1 0
+ %nop8485 = alloca i1, i1 0
+ %nop8486 = alloca i1, i1 0
+ %nop8487 = alloca i1, i1 0
+ %nop8488 = alloca i1, i1 0
+ %nop8489 = alloca i1, i1 0
+ %nop8490 = alloca i1, i1 0
+ %nop8491 = alloca i1, i1 0
+ %nop8492 = alloca i1, i1 0
+ %nop8493 = alloca i1, i1 0
+ %nop8494 = alloca i1, i1 0
+ %nop8495 = alloca i1, i1 0
+ %nop8496 = alloca i1, i1 0
+ %nop8497 = alloca i1, i1 0
+ %nop8498 = alloca i1, i1 0
+ %nop8499 = alloca i1, i1 0
+ %nop8500 = alloca i1, i1 0
+ %nop8501 = alloca i1, i1 0
+ %nop8502 = alloca i1, i1 0
+ %nop8503 = alloca i1, i1 0
+ %nop8504 = alloca i1, i1 0
+ %nop8505 = alloca i1, i1 0
+ %nop8506 = alloca i1, i1 0
+ %nop8507 = alloca i1, i1 0
+ %nop8508 = alloca i1, i1 0
+ %nop8509 = alloca i1, i1 0
+ %nop8510 = alloca i1, i1 0
+ %nop8511 = alloca i1, i1 0
+ %nop8512 = alloca i1, i1 0
+ %nop8513 = alloca i1, i1 0
+ %nop8514 = alloca i1, i1 0
+ %nop8515 = alloca i1, i1 0
+ %nop8516 = alloca i1, i1 0
+ %nop8517 = alloca i1, i1 0
+ %nop8518 = alloca i1, i1 0
+ %nop8519 = alloca i1, i1 0
+ %nop8520 = alloca i1, i1 0
+ %nop8521 = alloca i1, i1 0
+ %nop8522 = alloca i1, i1 0
+ %nop8523 = alloca i1, i1 0
+ %nop8524 = alloca i1, i1 0
+ %nop8525 = alloca i1, i1 0
+ %nop8526 = alloca i1, i1 0
+ %nop8527 = alloca i1, i1 0
+ %nop8528 = alloca i1, i1 0
+ %nop8529 = alloca i1, i1 0
+ %nop8530 = alloca i1, i1 0
+ %nop8531 = alloca i1, i1 0
+ %nop8532 = alloca i1, i1 0
+ %nop8533 = alloca i1, i1 0
+ %nop8534 = alloca i1, i1 0
+ %nop8535 = alloca i1, i1 0
+ %nop8536 = alloca i1, i1 0
+ %nop8537 = alloca i1, i1 0
+ %nop8538 = alloca i1, i1 0
+ %nop8539 = alloca i1, i1 0
+ %nop8540 = alloca i1, i1 0
+ %nop8541 = alloca i1, i1 0
+ %nop8542 = alloca i1, i1 0
+ %nop8543 = alloca i1, i1 0
+ %nop8544 = alloca i1, i1 0
+ %nop8545 = alloca i1, i1 0
+ %nop8546 = alloca i1, i1 0
+ %nop8547 = alloca i1, i1 0
+ %nop8548 = alloca i1, i1 0
+ %nop8549 = alloca i1, i1 0
+ %nop8550 = alloca i1, i1 0
+ %nop8551 = alloca i1, i1 0
+ %nop8552 = alloca i1, i1 0
+ %nop8553 = alloca i1, i1 0
+ %nop8554 = alloca i1, i1 0
+ %nop8555 = alloca i1, i1 0
+ %nop8556 = alloca i1, i1 0
+ %nop8557 = alloca i1, i1 0
+ %nop8558 = alloca i1, i1 0
+ %nop8559 = alloca i1, i1 0
+ %nop8560 = alloca i1, i1 0
+ %nop8561 = alloca i1, i1 0
+ %nop8562 = alloca i1, i1 0
+ %nop8563 = alloca i1, i1 0
+ %nop8564 = alloca i1, i1 0
+ %nop8565 = alloca i1, i1 0
+ %nop8566 = alloca i1, i1 0
+ %nop8567 = alloca i1, i1 0
+ %nop8568 = alloca i1, i1 0
+ %nop8569 = alloca i1, i1 0
+ %nop8570 = alloca i1, i1 0
+ %nop8571 = alloca i1, i1 0
+ %nop8572 = alloca i1, i1 0
+ %nop8573 = alloca i1, i1 0
+ %nop8574 = alloca i1, i1 0
+ %nop8575 = alloca i1, i1 0
+ %nop8576 = alloca i1, i1 0
+ %nop8577 = alloca i1, i1 0
+ %nop8578 = alloca i1, i1 0
+ %nop8579 = alloca i1, i1 0
+ %nop8580 = alloca i1, i1 0
+ %nop8581 = alloca i1, i1 0
+ %nop8582 = alloca i1, i1 0
+ %nop8583 = alloca i1, i1 0
+ %nop8584 = alloca i1, i1 0
+ %nop8585 = alloca i1, i1 0
+ %nop8586 = alloca i1, i1 0
+ %nop8587 = alloca i1, i1 0
+ %nop8588 = alloca i1, i1 0
+ %nop8589 = alloca i1, i1 0
+ %nop8590 = alloca i1, i1 0
+ %nop8591 = alloca i1, i1 0
+ %nop8592 = alloca i1, i1 0
+ %nop8593 = alloca i1, i1 0
+ %nop8594 = alloca i1, i1 0
+ %nop8595 = alloca i1, i1 0
+ %nop8596 = alloca i1, i1 0
+ %nop8597 = alloca i1, i1 0
+ %nop8598 = alloca i1, i1 0
+ %nop8599 = alloca i1, i1 0
+ %nop8600 = alloca i1, i1 0
+ %nop8601 = alloca i1, i1 0
+ %nop8602 = alloca i1, i1 0
+ %nop8603 = alloca i1, i1 0
+ %nop8604 = alloca i1, i1 0
+ %nop8605 = alloca i1, i1 0
+ %nop8606 = alloca i1, i1 0
+ %nop8607 = alloca i1, i1 0
+ %nop8608 = alloca i1, i1 0
+ %nop8609 = alloca i1, i1 0
+ %nop8610 = alloca i1, i1 0
+ %nop8611 = alloca i1, i1 0
+ %nop8612 = alloca i1, i1 0
+ %nop8613 = alloca i1, i1 0
+ %nop8614 = alloca i1, i1 0
+ %nop8615 = alloca i1, i1 0
+ %nop8616 = alloca i1, i1 0
+ %nop8617 = alloca i1, i1 0
+ %nop8618 = alloca i1, i1 0
+ %nop8619 = alloca i1, i1 0
+ %nop8620 = alloca i1, i1 0
+ %nop8621 = alloca i1, i1 0
+ %nop8622 = alloca i1, i1 0
+ %nop8623 = alloca i1, i1 0
+ %nop8624 = alloca i1, i1 0
+ %nop8625 = alloca i1, i1 0
+ %nop8626 = alloca i1, i1 0
+ %nop8627 = alloca i1, i1 0
+ %nop8628 = alloca i1, i1 0
+ %nop8629 = alloca i1, i1 0
+ %nop8630 = alloca i1, i1 0
+ %nop8631 = alloca i1, i1 0
+ %nop8632 = alloca i1, i1 0
+ %nop8633 = alloca i1, i1 0
+ %nop8634 = alloca i1, i1 0
+ %nop8635 = alloca i1, i1 0
+ %nop8636 = alloca i1, i1 0
+ %nop8637 = alloca i1, i1 0
+ %nop8638 = alloca i1, i1 0
+ %nop8639 = alloca i1, i1 0
+ %nop8640 = alloca i1, i1 0
+ %nop8641 = alloca i1, i1 0
+ %nop8642 = alloca i1, i1 0
+ %nop8643 = alloca i1, i1 0
+ %nop8644 = alloca i1, i1 0
+ %nop8645 = alloca i1, i1 0
+ %nop8646 = alloca i1, i1 0
+ %nop8647 = alloca i1, i1 0
+ %nop8648 = alloca i1, i1 0
+ %nop8649 = alloca i1, i1 0
+ %nop8650 = alloca i1, i1 0
+ %nop8651 = alloca i1, i1 0
+ %nop8652 = alloca i1, i1 0
+ %nop8653 = alloca i1, i1 0
+ %nop8654 = alloca i1, i1 0
+ %nop8655 = alloca i1, i1 0
+ %nop8656 = alloca i1, i1 0
+ %nop8657 = alloca i1, i1 0
+ %nop8658 = alloca i1, i1 0
+ %nop8659 = alloca i1, i1 0
+ %nop8660 = alloca i1, i1 0
+ %nop8661 = alloca i1, i1 0
+ %nop8662 = alloca i1, i1 0
+ %nop8663 = alloca i1, i1 0
+ %nop8664 = alloca i1, i1 0
+ %nop8665 = alloca i1, i1 0
+ %nop8666 = alloca i1, i1 0
+ %nop8667 = alloca i1, i1 0
+ %nop8668 = alloca i1, i1 0
+ %nop8669 = alloca i1, i1 0
+ %nop8670 = alloca i1, i1 0
+ %nop8671 = alloca i1, i1 0
+ %nop8672 = alloca i1, i1 0
+ %nop8673 = alloca i1, i1 0
+ %nop8674 = alloca i1, i1 0
+ %nop8675 = alloca i1, i1 0
+ %nop8676 = alloca i1, i1 0
+ %nop8677 = alloca i1, i1 0
+ %nop8678 = alloca i1, i1 0
+ %nop8679 = alloca i1, i1 0
+ %nop8680 = alloca i1, i1 0
+ %nop8681 = alloca i1, i1 0
+ %nop8682 = alloca i1, i1 0
+ %nop8683 = alloca i1, i1 0
+ %nop8684 = alloca i1, i1 0
+ %nop8685 = alloca i1, i1 0
+ %nop8686 = alloca i1, i1 0
+ %nop8687 = alloca i1, i1 0
+ %nop8688 = alloca i1, i1 0
+ %nop8689 = alloca i1, i1 0
+ %nop8690 = alloca i1, i1 0
+ %nop8691 = alloca i1, i1 0
+ %nop8692 = alloca i1, i1 0
+ %nop8693 = alloca i1, i1 0
+ %nop8694 = alloca i1, i1 0
+ %nop8695 = alloca i1, i1 0
+ %nop8696 = alloca i1, i1 0
+ %nop8697 = alloca i1, i1 0
+ %nop8698 = alloca i1, i1 0
+ %nop8699 = alloca i1, i1 0
+ %nop8700 = alloca i1, i1 0
+ %nop8701 = alloca i1, i1 0
+ %nop8702 = alloca i1, i1 0
+ %nop8703 = alloca i1, i1 0
+ %nop8704 = alloca i1, i1 0
+ %nop8705 = alloca i1, i1 0
+ %nop8706 = alloca i1, i1 0
+ %nop8707 = alloca i1, i1 0
+ %nop8708 = alloca i1, i1 0
+ %nop8709 = alloca i1, i1 0
+ %nop8710 = alloca i1, i1 0
+ %nop8711 = alloca i1, i1 0
+ %nop8712 = alloca i1, i1 0
+ %nop8713 = alloca i1, i1 0
+ %nop8714 = alloca i1, i1 0
+ %nop8715 = alloca i1, i1 0
+ %nop8716 = alloca i1, i1 0
+ %nop8717 = alloca i1, i1 0
+ %nop8718 = alloca i1, i1 0
+ %nop8719 = alloca i1, i1 0
+ %nop8720 = alloca i1, i1 0
+ %nop8721 = alloca i1, i1 0
+ %nop8722 = alloca i1, i1 0
+ %nop8723 = alloca i1, i1 0
+ %nop8724 = alloca i1, i1 0
+ %nop8725 = alloca i1, i1 0
+ %nop8726 = alloca i1, i1 0
+ %nop8727 = alloca i1, i1 0
+ %nop8728 = alloca i1, i1 0
+ %nop8729 = alloca i1, i1 0
+ %nop8730 = alloca i1, i1 0
+ %nop8731 = alloca i1, i1 0
+ %nop8732 = alloca i1, i1 0
+ %nop8733 = alloca i1, i1 0
+ %nop8734 = alloca i1, i1 0
+ %nop8735 = alloca i1, i1 0
+ %nop8736 = alloca i1, i1 0
+ %nop8737 = alloca i1, i1 0
+ %nop8738 = alloca i1, i1 0
+ %nop8739 = alloca i1, i1 0
+ %nop8740 = alloca i1, i1 0
+ %nop8741 = alloca i1, i1 0
+ %nop8742 = alloca i1, i1 0
+ %nop8743 = alloca i1, i1 0
+ %nop8744 = alloca i1, i1 0
+ %nop8745 = alloca i1, i1 0
+ %nop8746 = alloca i1, i1 0
+ %nop8747 = alloca i1, i1 0
+ %nop8748 = alloca i1, i1 0
+ %nop8749 = alloca i1, i1 0
+ %nop8750 = alloca i1, i1 0
+ %nop8751 = alloca i1, i1 0
+ %nop8752 = alloca i1, i1 0
+ %nop8753 = alloca i1, i1 0
+ %nop8754 = alloca i1, i1 0
+ %nop8755 = alloca i1, i1 0
+ %nop8756 = alloca i1, i1 0
+ %nop8757 = alloca i1, i1 0
+ %nop8758 = alloca i1, i1 0
+ %nop8759 = alloca i1, i1 0
+ %nop8760 = alloca i1, i1 0
+ %nop8761 = alloca i1, i1 0
+ %nop8762 = alloca i1, i1 0
+ %nop8763 = alloca i1, i1 0
+ %nop8764 = alloca i1, i1 0
+ %nop8765 = alloca i1, i1 0
+ %nop8766 = alloca i1, i1 0
+ %nop8767 = alloca i1, i1 0
+ %nop8768 = alloca i1, i1 0
+ %nop8769 = alloca i1, i1 0
+ %nop8770 = alloca i1, i1 0
+ %nop8771 = alloca i1, i1 0
+ %nop8772 = alloca i1, i1 0
+ %nop8773 = alloca i1, i1 0
+ %nop8774 = alloca i1, i1 0
+ %nop8775 = alloca i1, i1 0
+ %nop8776 = alloca i1, i1 0
+ %nop8777 = alloca i1, i1 0
+ %nop8778 = alloca i1, i1 0
+ %nop8779 = alloca i1, i1 0
+ %nop8780 = alloca i1, i1 0
+ %nop8781 = alloca i1, i1 0
+ %nop8782 = alloca i1, i1 0
+ %nop8783 = alloca i1, i1 0
+ %nop8784 = alloca i1, i1 0
+ %nop8785 = alloca i1, i1 0
+ %nop8786 = alloca i1, i1 0
+ %nop8787 = alloca i1, i1 0
+ %nop8788 = alloca i1, i1 0
+ %nop8789 = alloca i1, i1 0
+ %nop8790 = alloca i1, i1 0
+ %nop8791 = alloca i1, i1 0
+ %nop8792 = alloca i1, i1 0
+ %nop8793 = alloca i1, i1 0
+ %nop8794 = alloca i1, i1 0
+ %nop8795 = alloca i1, i1 0
+ %nop8796 = alloca i1, i1 0
+ %nop8797 = alloca i1, i1 0
+ %nop8798 = alloca i1, i1 0
+ %nop8799 = alloca i1, i1 0
+ %nop8800 = alloca i1, i1 0
+ %nop8801 = alloca i1, i1 0
+ %nop8802 = alloca i1, i1 0
+ %nop8803 = alloca i1, i1 0
+ %nop8804 = alloca i1, i1 0
+ %nop8805 = alloca i1, i1 0
+ %nop8806 = alloca i1, i1 0
+ %nop8807 = alloca i1, i1 0
+ %nop8808 = alloca i1, i1 0
+ %nop8809 = alloca i1, i1 0
+ %nop8810 = alloca i1, i1 0
+ %nop8811 = alloca i1, i1 0
+ %nop8812 = alloca i1, i1 0
+ %nop8813 = alloca i1, i1 0
+ %nop8814 = alloca i1, i1 0
+ %nop8815 = alloca i1, i1 0
+ %nop8816 = alloca i1, i1 0
+ %nop8817 = alloca i1, i1 0
+ %nop8818 = alloca i1, i1 0
+ %nop8819 = alloca i1, i1 0
+ %nop8820 = alloca i1, i1 0
+ %nop8821 = alloca i1, i1 0
+ %nop8822 = alloca i1, i1 0
+ %nop8823 = alloca i1, i1 0
+ %nop8824 = alloca i1, i1 0
+ %nop8825 = alloca i1, i1 0
+ %nop8826 = alloca i1, i1 0
+ %nop8827 = alloca i1, i1 0
+ %nop8828 = alloca i1, i1 0
+ %nop8829 = alloca i1, i1 0
+ %nop8830 = alloca i1, i1 0
+ %nop8831 = alloca i1, i1 0
+ %nop8832 = alloca i1, i1 0
+ %nop8833 = alloca i1, i1 0
+ %nop8834 = alloca i1, i1 0
+ %nop8835 = alloca i1, i1 0
+ %nop8836 = alloca i1, i1 0
+ %nop8837 = alloca i1, i1 0
+ %nop8838 = alloca i1, i1 0
+ %nop8839 = alloca i1, i1 0
+ %nop8840 = alloca i1, i1 0
+ %nop8841 = alloca i1, i1 0
+ %nop8842 = alloca i1, i1 0
+ %nop8843 = alloca i1, i1 0
+ %nop8844 = alloca i1, i1 0
+ %nop8845 = alloca i1, i1 0
+ %nop8846 = alloca i1, i1 0
+ %nop8847 = alloca i1, i1 0
+ %nop8848 = alloca i1, i1 0
+ %nop8849 = alloca i1, i1 0
+ %nop8850 = alloca i1, i1 0
+ %nop8851 = alloca i1, i1 0
+ %nop8852 = alloca i1, i1 0
+ %nop8853 = alloca i1, i1 0
+ %nop8854 = alloca i1, i1 0
+ %nop8855 = alloca i1, i1 0
+ %nop8856 = alloca i1, i1 0
+ %nop8857 = alloca i1, i1 0
+ %nop8858 = alloca i1, i1 0
+ %nop8859 = alloca i1, i1 0
+ %nop8860 = alloca i1, i1 0
+ %nop8861 = alloca i1, i1 0
+ %nop8862 = alloca i1, i1 0
+ %nop8863 = alloca i1, i1 0
+ %nop8864 = alloca i1, i1 0
+ %nop8865 = alloca i1, i1 0
+ %nop8866 = alloca i1, i1 0
+ %nop8867 = alloca i1, i1 0
+ %nop8868 = alloca i1, i1 0
+ %nop8869 = alloca i1, i1 0
+ %nop8870 = alloca i1, i1 0
+ %nop8871 = alloca i1, i1 0
+ %nop8872 = alloca i1, i1 0
+ %nop8873 = alloca i1, i1 0
+ %nop8874 = alloca i1, i1 0
+ %nop8875 = alloca i1, i1 0
+ %nop8876 = alloca i1, i1 0
+ %nop8877 = alloca i1, i1 0
+ %nop8878 = alloca i1, i1 0
+ %nop8879 = alloca i1, i1 0
+ %nop8880 = alloca i1, i1 0
+ %nop8881 = alloca i1, i1 0
+ %nop8882 = alloca i1, i1 0
+ %nop8883 = alloca i1, i1 0
+ %nop8884 = alloca i1, i1 0
+ %nop8885 = alloca i1, i1 0
+ %nop8886 = alloca i1, i1 0
+ %nop8887 = alloca i1, i1 0
+ %nop8888 = alloca i1, i1 0
+ %nop8889 = alloca i1, i1 0
+ %nop8890 = alloca i1, i1 0
+ %nop8891 = alloca i1, i1 0
+ %nop8892 = alloca i1, i1 0
+ %nop8893 = alloca i1, i1 0
+ %nop8894 = alloca i1, i1 0
+ %nop8895 = alloca i1, i1 0
+ %nop8896 = alloca i1, i1 0
+ %nop8897 = alloca i1, i1 0
+ %nop8898 = alloca i1, i1 0
+ %nop8899 = alloca i1, i1 0
+ %nop8900 = alloca i1, i1 0
+ %nop8901 = alloca i1, i1 0
+ %nop8902 = alloca i1, i1 0
+ %nop8903 = alloca i1, i1 0
+ %nop8904 = alloca i1, i1 0
+ %nop8905 = alloca i1, i1 0
+ %nop8906 = alloca i1, i1 0
+ %nop8907 = alloca i1, i1 0
+ %nop8908 = alloca i1, i1 0
+ %nop8909 = alloca i1, i1 0
+ %nop8910 = alloca i1, i1 0
+ %nop8911 = alloca i1, i1 0
+ %nop8912 = alloca i1, i1 0
+ %nop8913 = alloca i1, i1 0
+ %nop8914 = alloca i1, i1 0
+ %nop8915 = alloca i1, i1 0
+ %nop8916 = alloca i1, i1 0
+ %nop8917 = alloca i1, i1 0
+ %nop8918 = alloca i1, i1 0
+ %nop8919 = alloca i1, i1 0
+ %nop8920 = alloca i1, i1 0
+ %nop8921 = alloca i1, i1 0
+ %nop8922 = alloca i1, i1 0
+ %nop8923 = alloca i1, i1 0
+ %nop8924 = alloca i1, i1 0
+ %nop8925 = alloca i1, i1 0
+ %nop8926 = alloca i1, i1 0
+ %nop8927 = alloca i1, i1 0
+ %nop8928 = alloca i1, i1 0
+ %nop8929 = alloca i1, i1 0
+ %nop8930 = alloca i1, i1 0
+ %nop8931 = alloca i1, i1 0
+ %nop8932 = alloca i1, i1 0
+ %nop8933 = alloca i1, i1 0
+ %nop8934 = alloca i1, i1 0
+ %nop8935 = alloca i1, i1 0
+ %nop8936 = alloca i1, i1 0
+ %nop8937 = alloca i1, i1 0
+ %nop8938 = alloca i1, i1 0
+ %nop8939 = alloca i1, i1 0
+ %nop8940 = alloca i1, i1 0
+ %nop8941 = alloca i1, i1 0
+ %nop8942 = alloca i1, i1 0
+ %nop8943 = alloca i1, i1 0
+ %nop8944 = alloca i1, i1 0
+ %nop8945 = alloca i1, i1 0
+ %nop8946 = alloca i1, i1 0
+ %nop8947 = alloca i1, i1 0
+ %nop8948 = alloca i1, i1 0
+ %nop8949 = alloca i1, i1 0
+ %nop8950 = alloca i1, i1 0
+ %nop8951 = alloca i1, i1 0
+ %nop8952 = alloca i1, i1 0
+ %nop8953 = alloca i1, i1 0
+ %nop8954 = alloca i1, i1 0
+ %nop8955 = alloca i1, i1 0
+ %nop8956 = alloca i1, i1 0
+ %nop8957 = alloca i1, i1 0
+ %nop8958 = alloca i1, i1 0
+ %nop8959 = alloca i1, i1 0
+ %nop8960 = alloca i1, i1 0
+ %nop8961 = alloca i1, i1 0
+ %nop8962 = alloca i1, i1 0
+ %nop8963 = alloca i1, i1 0
+ %nop8964 = alloca i1, i1 0
+ %nop8965 = alloca i1, i1 0
+ %nop8966 = alloca i1, i1 0
+ %nop8967 = alloca i1, i1 0
+ %nop8968 = alloca i1, i1 0
+ %nop8969 = alloca i1, i1 0
+ %nop8970 = alloca i1, i1 0
+ %nop8971 = alloca i1, i1 0
+ %nop8972 = alloca i1, i1 0
+ %nop8973 = alloca i1, i1 0
+ %nop8974 = alloca i1, i1 0
+ %nop8975 = alloca i1, i1 0
+ %nop8976 = alloca i1, i1 0
+ %nop8977 = alloca i1, i1 0
+ %nop8978 = alloca i1, i1 0
+ %nop8979 = alloca i1, i1 0
+ %nop8980 = alloca i1, i1 0
+ %nop8981 = alloca i1, i1 0
+ %nop8982 = alloca i1, i1 0
+ %nop8983 = alloca i1, i1 0
+ %nop8984 = alloca i1, i1 0
+ %nop8985 = alloca i1, i1 0
+ %nop8986 = alloca i1, i1 0
+ %nop8987 = alloca i1, i1 0
+ %nop8988 = alloca i1, i1 0
+ %nop8989 = alloca i1, i1 0
+ %nop8990 = alloca i1, i1 0
+ %nop8991 = alloca i1, i1 0
+ %nop8992 = alloca i1, i1 0
+ %nop8993 = alloca i1, i1 0
+ %nop8994 = alloca i1, i1 0
+ %nop8995 = alloca i1, i1 0
+ %nop8996 = alloca i1, i1 0
+ %nop8997 = alloca i1, i1 0
+ %nop8998 = alloca i1, i1 0
+ %nop8999 = alloca i1, i1 0
+ %nop9000 = alloca i1, i1 0
+ %nop9001 = alloca i1, i1 0
+ %nop9002 = alloca i1, i1 0
+ %nop9003 = alloca i1, i1 0
+ %nop9004 = alloca i1, i1 0
+ %nop9005 = alloca i1, i1 0
+ %nop9006 = alloca i1, i1 0
+ %nop9007 = alloca i1, i1 0
+ %nop9008 = alloca i1, i1 0
+ %nop9009 = alloca i1, i1 0
+ %nop9010 = alloca i1, i1 0
+ %nop9011 = alloca i1, i1 0
+ %nop9012 = alloca i1, i1 0
+ %nop9013 = alloca i1, i1 0
+ %nop9014 = alloca i1, i1 0
+ %nop9015 = alloca i1, i1 0
+ %nop9016 = alloca i1, i1 0
+ %nop9017 = alloca i1, i1 0
+ %nop9018 = alloca i1, i1 0
+ %nop9019 = alloca i1, i1 0
+ %nop9020 = alloca i1, i1 0
+ %nop9021 = alloca i1, i1 0
+ %nop9022 = alloca i1, i1 0
+ %nop9023 = alloca i1, i1 0
+ %nop9024 = alloca i1, i1 0
+ %nop9025 = alloca i1, i1 0
+ %nop9026 = alloca i1, i1 0
+ %nop9027 = alloca i1, i1 0
+ %nop9028 = alloca i1, i1 0
+ %nop9029 = alloca i1, i1 0
+ %nop9030 = alloca i1, i1 0
+ %nop9031 = alloca i1, i1 0
+ %nop9032 = alloca i1, i1 0
+ %nop9033 = alloca i1, i1 0
+ %nop9034 = alloca i1, i1 0
+ %nop9035 = alloca i1, i1 0
+ %nop9036 = alloca i1, i1 0
+ %nop9037 = alloca i1, i1 0
+ %nop9038 = alloca i1, i1 0
+ %nop9039 = alloca i1, i1 0
+ %nop9040 = alloca i1, i1 0
+ %nop9041 = alloca i1, i1 0
+ %nop9042 = alloca i1, i1 0
+ %nop9043 = alloca i1, i1 0
+ %nop9044 = alloca i1, i1 0
+ %nop9045 = alloca i1, i1 0
+ %nop9046 = alloca i1, i1 0
+ %nop9047 = alloca i1, i1 0
+ %nop9048 = alloca i1, i1 0
+ %nop9049 = alloca i1, i1 0
+ %nop9050 = alloca i1, i1 0
+ %nop9051 = alloca i1, i1 0
+ %nop9052 = alloca i1, i1 0
+ %nop9053 = alloca i1, i1 0
+ %nop9054 = alloca i1, i1 0
+ %nop9055 = alloca i1, i1 0
+ %nop9056 = alloca i1, i1 0
+ %nop9057 = alloca i1, i1 0
+ %nop9058 = alloca i1, i1 0
+ %nop9059 = alloca i1, i1 0
+ %nop9060 = alloca i1, i1 0
+ %nop9061 = alloca i1, i1 0
+ %nop9062 = alloca i1, i1 0
+ %nop9063 = alloca i1, i1 0
+ %nop9064 = alloca i1, i1 0
+ %nop9065 = alloca i1, i1 0
+ %nop9066 = alloca i1, i1 0
+ %nop9067 = alloca i1, i1 0
+ %nop9068 = alloca i1, i1 0
+ %nop9069 = alloca i1, i1 0
+ %nop9070 = alloca i1, i1 0
+ %nop9071 = alloca i1, i1 0
+ %nop9072 = alloca i1, i1 0
+ %nop9073 = alloca i1, i1 0
+ %nop9074 = alloca i1, i1 0
+ %nop9075 = alloca i1, i1 0
+ %nop9076 = alloca i1, i1 0
+ %nop9077 = alloca i1, i1 0
+ %nop9078 = alloca i1, i1 0
+ %nop9079 = alloca i1, i1 0
+ %nop9080 = alloca i1, i1 0
+ %nop9081 = alloca i1, i1 0
+ %nop9082 = alloca i1, i1 0
+ %nop9083 = alloca i1, i1 0
+ %nop9084 = alloca i1, i1 0
+ %nop9085 = alloca i1, i1 0
+ %nop9086 = alloca i1, i1 0
+ %nop9087 = alloca i1, i1 0
+ %nop9088 = alloca i1, i1 0
+ %nop9089 = alloca i1, i1 0
+ %nop9090 = alloca i1, i1 0
+ %nop9091 = alloca i1, i1 0
+ %nop9092 = alloca i1, i1 0
+ %nop9093 = alloca i1, i1 0
+ %nop9094 = alloca i1, i1 0
+ %nop9095 = alloca i1, i1 0
+ %nop9096 = alloca i1, i1 0
+ %nop9097 = alloca i1, i1 0
+ %nop9098 = alloca i1, i1 0
+ %nop9099 = alloca i1, i1 0
+ %nop9100 = alloca i1, i1 0
+ %nop9101 = alloca i1, i1 0
+ %nop9102 = alloca i1, i1 0
+ %nop9103 = alloca i1, i1 0
+ %nop9104 = alloca i1, i1 0
+ %nop9105 = alloca i1, i1 0
+ %nop9106 = alloca i1, i1 0
+ %nop9107 = alloca i1, i1 0
+ %nop9108 = alloca i1, i1 0
+ %nop9109 = alloca i1, i1 0
+ %nop9110 = alloca i1, i1 0
+ %nop9111 = alloca i1, i1 0
+ %nop9112 = alloca i1, i1 0
+ %nop9113 = alloca i1, i1 0
+ %nop9114 = alloca i1, i1 0
+ %nop9115 = alloca i1, i1 0
+ %nop9116 = alloca i1, i1 0
+ %nop9117 = alloca i1, i1 0
+ %nop9118 = alloca i1, i1 0
+ %nop9119 = alloca i1, i1 0
+ %nop9120 = alloca i1, i1 0
+ %nop9121 = alloca i1, i1 0
+ %nop9122 = alloca i1, i1 0
+ %nop9123 = alloca i1, i1 0
+ %nop9124 = alloca i1, i1 0
+ %nop9125 = alloca i1, i1 0
+ %nop9126 = alloca i1, i1 0
+ %nop9127 = alloca i1, i1 0
+ %nop9128 = alloca i1, i1 0
+ %nop9129 = alloca i1, i1 0
+ %nop9130 = alloca i1, i1 0
+ %nop9131 = alloca i1, i1 0
+ %nop9132 = alloca i1, i1 0
+ %nop9133 = alloca i1, i1 0
+ %nop9134 = alloca i1, i1 0
+ %nop9135 = alloca i1, i1 0
+ %nop9136 = alloca i1, i1 0
+ %nop9137 = alloca i1, i1 0
+ %nop9138 = alloca i1, i1 0
+ %nop9139 = alloca i1, i1 0
+ %nop9140 = alloca i1, i1 0
+ %nop9141 = alloca i1, i1 0
+ %nop9142 = alloca i1, i1 0
+ %nop9143 = alloca i1, i1 0
+ %nop9144 = alloca i1, i1 0
+ %nop9145 = alloca i1, i1 0
+ %nop9146 = alloca i1, i1 0
+ %nop9147 = alloca i1, i1 0
+ %nop9148 = alloca i1, i1 0
+ %nop9149 = alloca i1, i1 0
+ %nop9150 = alloca i1, i1 0
+ %nop9151 = alloca i1, i1 0
+ %nop9152 = alloca i1, i1 0
+ %nop9153 = alloca i1, i1 0
+ %nop9154 = alloca i1, i1 0
+ %nop9155 = alloca i1, i1 0
+ %nop9156 = alloca i1, i1 0
+ %nop9157 = alloca i1, i1 0
+ %nop9158 = alloca i1, i1 0
+ %nop9159 = alloca i1, i1 0
+ %nop9160 = alloca i1, i1 0
+ %nop9161 = alloca i1, i1 0
+ %nop9162 = alloca i1, i1 0
+ %nop9163 = alloca i1, i1 0
+ %nop9164 = alloca i1, i1 0
+ %nop9165 = alloca i1, i1 0
+ %nop9166 = alloca i1, i1 0
+ %nop9167 = alloca i1, i1 0
+ %nop9168 = alloca i1, i1 0
+ %nop9169 = alloca i1, i1 0
+ %nop9170 = alloca i1, i1 0
+ %nop9171 = alloca i1, i1 0
+ %nop9172 = alloca i1, i1 0
+ %nop9173 = alloca i1, i1 0
+ %nop9174 = alloca i1, i1 0
+ %nop9175 = alloca i1, i1 0
+ %nop9176 = alloca i1, i1 0
+ %nop9177 = alloca i1, i1 0
+ %nop9178 = alloca i1, i1 0
+ %nop9179 = alloca i1, i1 0
+ %nop9180 = alloca i1, i1 0
+ %nop9181 = alloca i1, i1 0
+ %nop9182 = alloca i1, i1 0
+ %nop9183 = alloca i1, i1 0
+ %nop9184 = alloca i1, i1 0
+ %nop9185 = alloca i1, i1 0
+ %nop9186 = alloca i1, i1 0
+ %nop9187 = alloca i1, i1 0
+ %nop9188 = alloca i1, i1 0
+ %nop9189 = alloca i1, i1 0
+ %nop9190 = alloca i1, i1 0
+ %nop9191 = alloca i1, i1 0
+ %nop9192 = alloca i1, i1 0
+ %nop9193 = alloca i1, i1 0
+ %nop9194 = alloca i1, i1 0
+ %nop9195 = alloca i1, i1 0
+ %nop9196 = alloca i1, i1 0
+ %nop9197 = alloca i1, i1 0
+ %nop9198 = alloca i1, i1 0
+ %nop9199 = alloca i1, i1 0
+ %nop9200 = alloca i1, i1 0
+ %nop9201 = alloca i1, i1 0
+ %nop9202 = alloca i1, i1 0
+ %nop9203 = alloca i1, i1 0
+ %nop9204 = alloca i1, i1 0
+ %nop9205 = alloca i1, i1 0
+ %nop9206 = alloca i1, i1 0
+ %nop9207 = alloca i1, i1 0
+ %nop9208 = alloca i1, i1 0
+ %nop9209 = alloca i1, i1 0
+ %nop9210 = alloca i1, i1 0
+ %nop9211 = alloca i1, i1 0
+ %nop9212 = alloca i1, i1 0
+ %nop9213 = alloca i1, i1 0
+ %nop9214 = alloca i1, i1 0
+ %nop9215 = alloca i1, i1 0
+ %nop9216 = alloca i1, i1 0
+ %nop9217 = alloca i1, i1 0
+ %nop9218 = alloca i1, i1 0
+ %nop9219 = alloca i1, i1 0
+ %nop9220 = alloca i1, i1 0
+ %nop9221 = alloca i1, i1 0
+ %nop9222 = alloca i1, i1 0
+ %nop9223 = alloca i1, i1 0
+ %nop9224 = alloca i1, i1 0
+ %nop9225 = alloca i1, i1 0
+ %nop9226 = alloca i1, i1 0
+ %nop9227 = alloca i1, i1 0
+ %nop9228 = alloca i1, i1 0
+ %nop9229 = alloca i1, i1 0
+ %nop9230 = alloca i1, i1 0
+ %nop9231 = alloca i1, i1 0
+ %nop9232 = alloca i1, i1 0
+ %nop9233 = alloca i1, i1 0
+ %nop9234 = alloca i1, i1 0
+ %nop9235 = alloca i1, i1 0
+ %nop9236 = alloca i1, i1 0
+ %nop9237 = alloca i1, i1 0
+ %nop9238 = alloca i1, i1 0
+ %nop9239 = alloca i1, i1 0
+ %nop9240 = alloca i1, i1 0
+ %nop9241 = alloca i1, i1 0
+ %nop9242 = alloca i1, i1 0
+ %nop9243 = alloca i1, i1 0
+ %nop9244 = alloca i1, i1 0
+ %nop9245 = alloca i1, i1 0
+ %nop9246 = alloca i1, i1 0
+ %nop9247 = alloca i1, i1 0
+ %nop9248 = alloca i1, i1 0
+ %nop9249 = alloca i1, i1 0
+ %nop9250 = alloca i1, i1 0
+ %nop9251 = alloca i1, i1 0
+ %nop9252 = alloca i1, i1 0
+ %nop9253 = alloca i1, i1 0
+ %nop9254 = alloca i1, i1 0
+ %nop9255 = alloca i1, i1 0
+ %nop9256 = alloca i1, i1 0
+ %nop9257 = alloca i1, i1 0
+ %nop9258 = alloca i1, i1 0
+ %nop9259 = alloca i1, i1 0
+ %nop9260 = alloca i1, i1 0
+ %nop9261 = alloca i1, i1 0
+ %nop9262 = alloca i1, i1 0
+ %nop9263 = alloca i1, i1 0
+ %nop9264 = alloca i1, i1 0
+ %nop9265 = alloca i1, i1 0
+ %nop9266 = alloca i1, i1 0
+ %nop9267 = alloca i1, i1 0
+ %nop9268 = alloca i1, i1 0
+ %nop9269 = alloca i1, i1 0
+ %nop9270 = alloca i1, i1 0
+ %nop9271 = alloca i1, i1 0
+ %nop9272 = alloca i1, i1 0
+ %nop9273 = alloca i1, i1 0
+ %nop9274 = alloca i1, i1 0
+ %nop9275 = alloca i1, i1 0
+ %nop9276 = alloca i1, i1 0
+ %nop9277 = alloca i1, i1 0
+ %nop9278 = alloca i1, i1 0
+ %nop9279 = alloca i1, i1 0
+ %nop9280 = alloca i1, i1 0
+ %nop9281 = alloca i1, i1 0
+ %nop9282 = alloca i1, i1 0
+ %nop9283 = alloca i1, i1 0
+ %nop9284 = alloca i1, i1 0
+ %nop9285 = alloca i1, i1 0
+ %nop9286 = alloca i1, i1 0
+ %nop9287 = alloca i1, i1 0
+ %nop9288 = alloca i1, i1 0
+ %nop9289 = alloca i1, i1 0
+ %nop9290 = alloca i1, i1 0
+ %nop9291 = alloca i1, i1 0
+ %nop9292 = alloca i1, i1 0
+ %nop9293 = alloca i1, i1 0
+ %nop9294 = alloca i1, i1 0
+ %nop9295 = alloca i1, i1 0
+ %nop9296 = alloca i1, i1 0
+ %nop9297 = alloca i1, i1 0
+ %nop9298 = alloca i1, i1 0
+ %nop9299 = alloca i1, i1 0
+ %nop9300 = alloca i1, i1 0
+ %nop9301 = alloca i1, i1 0
+ %nop9302 = alloca i1, i1 0
+ %nop9303 = alloca i1, i1 0
+ %nop9304 = alloca i1, i1 0
+ %nop9305 = alloca i1, i1 0
+ %nop9306 = alloca i1, i1 0
+ %nop9307 = alloca i1, i1 0
+ %nop9308 = alloca i1, i1 0
+ %nop9309 = alloca i1, i1 0
+ %nop9310 = alloca i1, i1 0
+ %nop9311 = alloca i1, i1 0
+ %nop9312 = alloca i1, i1 0
+ %nop9313 = alloca i1, i1 0
+ %nop9314 = alloca i1, i1 0
+ %nop9315 = alloca i1, i1 0
+ %nop9316 = alloca i1, i1 0
+ %nop9317 = alloca i1, i1 0
+ %nop9318 = alloca i1, i1 0
+ %nop9319 = alloca i1, i1 0
+ %nop9320 = alloca i1, i1 0
+ %nop9321 = alloca i1, i1 0
+ %nop9322 = alloca i1, i1 0
+ %nop9323 = alloca i1, i1 0
+ %nop9324 = alloca i1, i1 0
+ %nop9325 = alloca i1, i1 0
+ %nop9326 = alloca i1, i1 0
+ %nop9327 = alloca i1, i1 0
+ %nop9328 = alloca i1, i1 0
+ %nop9329 = alloca i1, i1 0
+ %nop9330 = alloca i1, i1 0
+ %nop9331 = alloca i1, i1 0
+ %nop9332 = alloca i1, i1 0
+ %nop9333 = alloca i1, i1 0
+ %nop9334 = alloca i1, i1 0
+ %nop9335 = alloca i1, i1 0
+ %nop9336 = alloca i1, i1 0
+ %nop9337 = alloca i1, i1 0
+ %nop9338 = alloca i1, i1 0
+ %nop9339 = alloca i1, i1 0
+ %nop9340 = alloca i1, i1 0
+ %nop9341 = alloca i1, i1 0
+ %nop9342 = alloca i1, i1 0
+ %nop9343 = alloca i1, i1 0
+ %nop9344 = alloca i1, i1 0
+ %nop9345 = alloca i1, i1 0
+ %nop9346 = alloca i1, i1 0
+ %nop9347 = alloca i1, i1 0
+ %nop9348 = alloca i1, i1 0
+ %nop9349 = alloca i1, i1 0
+ %nop9350 = alloca i1, i1 0
+ %nop9351 = alloca i1, i1 0
+ %nop9352 = alloca i1, i1 0
+ %nop9353 = alloca i1, i1 0
+ %nop9354 = alloca i1, i1 0
+ %nop9355 = alloca i1, i1 0
+ %nop9356 = alloca i1, i1 0
+ %nop9357 = alloca i1, i1 0
+ %nop9358 = alloca i1, i1 0
+ %nop9359 = alloca i1, i1 0
+ %nop9360 = alloca i1, i1 0
+ %nop9361 = alloca i1, i1 0
+ %nop9362 = alloca i1, i1 0
+ %nop9363 = alloca i1, i1 0
+ %nop9364 = alloca i1, i1 0
+ %nop9365 = alloca i1, i1 0
+ %nop9366 = alloca i1, i1 0
+ %nop9367 = alloca i1, i1 0
+ %nop9368 = alloca i1, i1 0
+ %nop9369 = alloca i1, i1 0
+ %nop9370 = alloca i1, i1 0
+ %nop9371 = alloca i1, i1 0
+ %nop9372 = alloca i1, i1 0
+ %nop9373 = alloca i1, i1 0
+ %nop9374 = alloca i1, i1 0
+ %nop9375 = alloca i1, i1 0
+ %nop9376 = alloca i1, i1 0
+ %nop9377 = alloca i1, i1 0
+ %nop9378 = alloca i1, i1 0
+ %nop9379 = alloca i1, i1 0
+ %nop9380 = alloca i1, i1 0
+ %nop9381 = alloca i1, i1 0
+ %nop9382 = alloca i1, i1 0
+ %nop9383 = alloca i1, i1 0
+ %nop9384 = alloca i1, i1 0
+ %nop9385 = alloca i1, i1 0
+ %nop9386 = alloca i1, i1 0
+ %nop9387 = alloca i1, i1 0
+ %nop9388 = alloca i1, i1 0
+ %nop9389 = alloca i1, i1 0
+ %nop9390 = alloca i1, i1 0
+ %nop9391 = alloca i1, i1 0
+ %nop9392 = alloca i1, i1 0
+ %nop9393 = alloca i1, i1 0
+ %nop9394 = alloca i1, i1 0
+ %nop9395 = alloca i1, i1 0
+ %nop9396 = alloca i1, i1 0
+ %nop9397 = alloca i1, i1 0
+ %nop9398 = alloca i1, i1 0
+ %nop9399 = alloca i1, i1 0
+ %nop9400 = alloca i1, i1 0
+ %nop9401 = alloca i1, i1 0
+ %nop9402 = alloca i1, i1 0
+ %nop9403 = alloca i1, i1 0
+ %nop9404 = alloca i1, i1 0
+ %nop9405 = alloca i1, i1 0
+ %nop9406 = alloca i1, i1 0
+ %nop9407 = alloca i1, i1 0
+ %nop9408 = alloca i1, i1 0
+ %nop9409 = alloca i1, i1 0
+ %nop9410 = alloca i1, i1 0
+ %nop9411 = alloca i1, i1 0
+ %nop9412 = alloca i1, i1 0
+ %nop9413 = alloca i1, i1 0
+ %nop9414 = alloca i1, i1 0
+ %nop9415 = alloca i1, i1 0
+ %nop9416 = alloca i1, i1 0
+ %nop9417 = alloca i1, i1 0
+ %nop9418 = alloca i1, i1 0
+ %nop9419 = alloca i1, i1 0
+ %nop9420 = alloca i1, i1 0
+ %nop9421 = alloca i1, i1 0
+ %nop9422 = alloca i1, i1 0
+ %nop9423 = alloca i1, i1 0
+ %nop9424 = alloca i1, i1 0
+ %nop9425 = alloca i1, i1 0
+ %nop9426 = alloca i1, i1 0
+ %nop9427 = alloca i1, i1 0
+ %nop9428 = alloca i1, i1 0
+ %nop9429 = alloca i1, i1 0
+ %nop9430 = alloca i1, i1 0
+ %nop9431 = alloca i1, i1 0
+ %nop9432 = alloca i1, i1 0
+ %nop9433 = alloca i1, i1 0
+ %nop9434 = alloca i1, i1 0
+ %nop9435 = alloca i1, i1 0
+ %nop9436 = alloca i1, i1 0
+ %nop9437 = alloca i1, i1 0
+ %nop9438 = alloca i1, i1 0
+ %nop9439 = alloca i1, i1 0
+ %nop9440 = alloca i1, i1 0
+ %nop9441 = alloca i1, i1 0
+ %nop9442 = alloca i1, i1 0
+ %nop9443 = alloca i1, i1 0
+ %nop9444 = alloca i1, i1 0
+ %nop9445 = alloca i1, i1 0
+ %nop9446 = alloca i1, i1 0
+ %nop9447 = alloca i1, i1 0
+ %nop9448 = alloca i1, i1 0
+ %nop9449 = alloca i1, i1 0
+ %nop9450 = alloca i1, i1 0
+ %nop9451 = alloca i1, i1 0
+ %nop9452 = alloca i1, i1 0
+ %nop9453 = alloca i1, i1 0
+ %nop9454 = alloca i1, i1 0
+ %nop9455 = alloca i1, i1 0
+ %nop9456 = alloca i1, i1 0
+ %nop9457 = alloca i1, i1 0
+ %nop9458 = alloca i1, i1 0
+ %nop9459 = alloca i1, i1 0
+ %nop9460 = alloca i1, i1 0
+ %nop9461 = alloca i1, i1 0
+ %nop9462 = alloca i1, i1 0
+ %nop9463 = alloca i1, i1 0
+ %nop9464 = alloca i1, i1 0
+ %nop9465 = alloca i1, i1 0
+ %nop9466 = alloca i1, i1 0
+ %nop9467 = alloca i1, i1 0
+ %nop9468 = alloca i1, i1 0
+ %nop9469 = alloca i1, i1 0
+ %nop9470 = alloca i1, i1 0
+ %nop9471 = alloca i1, i1 0
+ %nop9472 = alloca i1, i1 0
+ %nop9473 = alloca i1, i1 0
+ %nop9474 = alloca i1, i1 0
+ %nop9475 = alloca i1, i1 0
+ %nop9476 = alloca i1, i1 0
+ %nop9477 = alloca i1, i1 0
+ %nop9478 = alloca i1, i1 0
+ %nop9479 = alloca i1, i1 0
+ %nop9480 = alloca i1, i1 0
+ %nop9481 = alloca i1, i1 0
+ %nop9482 = alloca i1, i1 0
+ %nop9483 = alloca i1, i1 0
+ %nop9484 = alloca i1, i1 0
+ %nop9485 = alloca i1, i1 0
+ %nop9486 = alloca i1, i1 0
+ %nop9487 = alloca i1, i1 0
+ %nop9488 = alloca i1, i1 0
+ %nop9489 = alloca i1, i1 0
+ %nop9490 = alloca i1, i1 0
+ %nop9491 = alloca i1, i1 0
+ %nop9492 = alloca i1, i1 0
+ %nop9493 = alloca i1, i1 0
+ %nop9494 = alloca i1, i1 0
+ %nop9495 = alloca i1, i1 0
+ %nop9496 = alloca i1, i1 0
+ %nop9497 = alloca i1, i1 0
+ %nop9498 = alloca i1, i1 0
+ %nop9499 = alloca i1, i1 0
+ %nop9500 = alloca i1, i1 0
+ %nop9501 = alloca i1, i1 0
+ %nop9502 = alloca i1, i1 0
+ %nop9503 = alloca i1, i1 0
+ %nop9504 = alloca i1, i1 0
+ %nop9505 = alloca i1, i1 0
+ %nop9506 = alloca i1, i1 0
+ %nop9507 = alloca i1, i1 0
+ %nop9508 = alloca i1, i1 0
+ %nop9509 = alloca i1, i1 0
+ %nop9510 = alloca i1, i1 0
+ %nop9511 = alloca i1, i1 0
+ %nop9512 = alloca i1, i1 0
+ %nop9513 = alloca i1, i1 0
+ %nop9514 = alloca i1, i1 0
+ %nop9515 = alloca i1, i1 0
+ %nop9516 = alloca i1, i1 0
+ %nop9517 = alloca i1, i1 0
+ %nop9518 = alloca i1, i1 0
+ %nop9519 = alloca i1, i1 0
+ %nop9520 = alloca i1, i1 0
+ %nop9521 = alloca i1, i1 0
+ %nop9522 = alloca i1, i1 0
+ %nop9523 = alloca i1, i1 0
+ %nop9524 = alloca i1, i1 0
+ %nop9525 = alloca i1, i1 0
+ %nop9526 = alloca i1, i1 0
+ %nop9527 = alloca i1, i1 0
+ %nop9528 = alloca i1, i1 0
+ %nop9529 = alloca i1, i1 0
+ %nop9530 = alloca i1, i1 0
+ %nop9531 = alloca i1, i1 0
+ %nop9532 = alloca i1, i1 0
+ %nop9533 = alloca i1, i1 0
+ %nop9534 = alloca i1, i1 0
+ %nop9535 = alloca i1, i1 0
+ %nop9536 = alloca i1, i1 0
+ %nop9537 = alloca i1, i1 0
+ %nop9538 = alloca i1, i1 0
+ %nop9539 = alloca i1, i1 0
+ %nop9540 = alloca i1, i1 0
+ %nop9541 = alloca i1, i1 0
+ %nop9542 = alloca i1, i1 0
+ %nop9543 = alloca i1, i1 0
+ %nop9544 = alloca i1, i1 0
+ %nop9545 = alloca i1, i1 0
+ %nop9546 = alloca i1, i1 0
+ %nop9547 = alloca i1, i1 0
+ %nop9548 = alloca i1, i1 0
+ %nop9549 = alloca i1, i1 0
+ %nop9550 = alloca i1, i1 0
+ %nop9551 = alloca i1, i1 0
+ %nop9552 = alloca i1, i1 0
+ %nop9553 = alloca i1, i1 0
+ %nop9554 = alloca i1, i1 0
+ %nop9555 = alloca i1, i1 0
+ %nop9556 = alloca i1, i1 0
+ %nop9557 = alloca i1, i1 0
+ %nop9558 = alloca i1, i1 0
+ %nop9559 = alloca i1, i1 0
+ %nop9560 = alloca i1, i1 0
+ %nop9561 = alloca i1, i1 0
+ %nop9562 = alloca i1, i1 0
+ %nop9563 = alloca i1, i1 0
+ %nop9564 = alloca i1, i1 0
+ %nop9565 = alloca i1, i1 0
+ %nop9566 = alloca i1, i1 0
+ %nop9567 = alloca i1, i1 0
+ %nop9568 = alloca i1, i1 0
+ %nop9569 = alloca i1, i1 0
+ %nop9570 = alloca i1, i1 0
+ %nop9571 = alloca i1, i1 0
+ %nop9572 = alloca i1, i1 0
+ %nop9573 = alloca i1, i1 0
+ %nop9574 = alloca i1, i1 0
+ %nop9575 = alloca i1, i1 0
+ %nop9576 = alloca i1, i1 0
+ %nop9577 = alloca i1, i1 0
+ %nop9578 = alloca i1, i1 0
+ %nop9579 = alloca i1, i1 0
+ %nop9580 = alloca i1, i1 0
+ %nop9581 = alloca i1, i1 0
+ %nop9582 = alloca i1, i1 0
+ %nop9583 = alloca i1, i1 0
+ %nop9584 = alloca i1, i1 0
+ %nop9585 = alloca i1, i1 0
+ %nop9586 = alloca i1, i1 0
+ %nop9587 = alloca i1, i1 0
+ %nop9588 = alloca i1, i1 0
+ %nop9589 = alloca i1, i1 0
+ %nop9590 = alloca i1, i1 0
+ %nop9591 = alloca i1, i1 0
+ %nop9592 = alloca i1, i1 0
+ %nop9593 = alloca i1, i1 0
+ %nop9594 = alloca i1, i1 0
+ %nop9595 = alloca i1, i1 0
+ %nop9596 = alloca i1, i1 0
+ %nop9597 = alloca i1, i1 0
+ %nop9598 = alloca i1, i1 0
+ %nop9599 = alloca i1, i1 0
+ %nop9600 = alloca i1, i1 0
+ %nop9601 = alloca i1, i1 0
+ %nop9602 = alloca i1, i1 0
+ %nop9603 = alloca i1, i1 0
+ %nop9604 = alloca i1, i1 0
+ %nop9605 = alloca i1, i1 0
+ %nop9606 = alloca i1, i1 0
+ %nop9607 = alloca i1, i1 0
+ %nop9608 = alloca i1, i1 0
+ %nop9609 = alloca i1, i1 0
+ %nop9610 = alloca i1, i1 0
+ %nop9611 = alloca i1, i1 0
+ %nop9612 = alloca i1, i1 0
+ %nop9613 = alloca i1, i1 0
+ %nop9614 = alloca i1, i1 0
+ %nop9615 = alloca i1, i1 0
+ %nop9616 = alloca i1, i1 0
+ %nop9617 = alloca i1, i1 0
+ %nop9618 = alloca i1, i1 0
+ %nop9619 = alloca i1, i1 0
+ %nop9620 = alloca i1, i1 0
+ %nop9621 = alloca i1, i1 0
+ %nop9622 = alloca i1, i1 0
+ %nop9623 = alloca i1, i1 0
+ %nop9624 = alloca i1, i1 0
+ %nop9625 = alloca i1, i1 0
+ %nop9626 = alloca i1, i1 0
+ %nop9627 = alloca i1, i1 0
+ %nop9628 = alloca i1, i1 0
+ %nop9629 = alloca i1, i1 0
+ %nop9630 = alloca i1, i1 0
+ %nop9631 = alloca i1, i1 0
+ %nop9632 = alloca i1, i1 0
+ %nop9633 = alloca i1, i1 0
+ %nop9634 = alloca i1, i1 0
+ %nop9635 = alloca i1, i1 0
+ %nop9636 = alloca i1, i1 0
+ %nop9637 = alloca i1, i1 0
+ %nop9638 = alloca i1, i1 0
+ %nop9639 = alloca i1, i1 0
+ %nop9640 = alloca i1, i1 0
+ %nop9641 = alloca i1, i1 0
+ %nop9642 = alloca i1, i1 0
+ %nop9643 = alloca i1, i1 0
+ %nop9644 = alloca i1, i1 0
+ %nop9645 = alloca i1, i1 0
+ %nop9646 = alloca i1, i1 0
+ %nop9647 = alloca i1, i1 0
+ %nop9648 = alloca i1, i1 0
+ %nop9649 = alloca i1, i1 0
+ %nop9650 = alloca i1, i1 0
+ %nop9651 = alloca i1, i1 0
+ %nop9652 = alloca i1, i1 0
+ %nop9653 = alloca i1, i1 0
+ %nop9654 = alloca i1, i1 0
+ %nop9655 = alloca i1, i1 0
+ %nop9656 = alloca i1, i1 0
+ %nop9657 = alloca i1, i1 0
+ %nop9658 = alloca i1, i1 0
+ %nop9659 = alloca i1, i1 0
+ %nop9660 = alloca i1, i1 0
+ %nop9661 = alloca i1, i1 0
+ %nop9662 = alloca i1, i1 0
+ %nop9663 = alloca i1, i1 0
+ %nop9664 = alloca i1, i1 0
+ %nop9665 = alloca i1, i1 0
+ %nop9666 = alloca i1, i1 0
+ %nop9667 = alloca i1, i1 0
+ %nop9668 = alloca i1, i1 0
+ %nop9669 = alloca i1, i1 0
+ %nop9670 = alloca i1, i1 0
+ %nop9671 = alloca i1, i1 0
+ %nop9672 = alloca i1, i1 0
+ %nop9673 = alloca i1, i1 0
+ %nop9674 = alloca i1, i1 0
+ %nop9675 = alloca i1, i1 0
+ %nop9676 = alloca i1, i1 0
+ %nop9677 = alloca i1, i1 0
+ %nop9678 = alloca i1, i1 0
+ %nop9679 = alloca i1, i1 0
+ %nop9680 = alloca i1, i1 0
+ %nop9681 = alloca i1, i1 0
+ %nop9682 = alloca i1, i1 0
+ %nop9683 = alloca i1, i1 0
+ %nop9684 = alloca i1, i1 0
+ %nop9685 = alloca i1, i1 0
+ %nop9686 = alloca i1, i1 0
+ %nop9687 = alloca i1, i1 0
+ %nop9688 = alloca i1, i1 0
+ %nop9689 = alloca i1, i1 0
+ %nop9690 = alloca i1, i1 0
+ %nop9691 = alloca i1, i1 0
+ %nop9692 = alloca i1, i1 0
+ %nop9693 = alloca i1, i1 0
+ %nop9694 = alloca i1, i1 0
+ %nop9695 = alloca i1, i1 0
+ %nop9696 = alloca i1, i1 0
+ %nop9697 = alloca i1, i1 0
+ %nop9698 = alloca i1, i1 0
+ %nop9699 = alloca i1, i1 0
+ %nop9700 = alloca i1, i1 0
+ %nop9701 = alloca i1, i1 0
+ %nop9702 = alloca i1, i1 0
+ %nop9703 = alloca i1, i1 0
+ %nop9704 = alloca i1, i1 0
+ %nop9705 = alloca i1, i1 0
+ %nop9706 = alloca i1, i1 0
+ %nop9707 = alloca i1, i1 0
+ %nop9708 = alloca i1, i1 0
+ %nop9709 = alloca i1, i1 0
+ %nop9710 = alloca i1, i1 0
+ %nop9711 = alloca i1, i1 0
+ %nop9712 = alloca i1, i1 0
+ %nop9713 = alloca i1, i1 0
+ %nop9714 = alloca i1, i1 0
+ %nop9715 = alloca i1, i1 0
+ %nop9716 = alloca i1, i1 0
+ %nop9717 = alloca i1, i1 0
+ %nop9718 = alloca i1, i1 0
+ %nop9719 = alloca i1, i1 0
+ %nop9720 = alloca i1, i1 0
+ %nop9721 = alloca i1, i1 0
+ %nop9722 = alloca i1, i1 0
+ %nop9723 = alloca i1, i1 0
+ %nop9724 = alloca i1, i1 0
+ %nop9725 = alloca i1, i1 0
+ %nop9726 = alloca i1, i1 0
+ %nop9727 = alloca i1, i1 0
+ %nop9728 = alloca i1, i1 0
+ %nop9729 = alloca i1, i1 0
+ %nop9730 = alloca i1, i1 0
+ %nop9731 = alloca i1, i1 0
+ %nop9732 = alloca i1, i1 0
+ %nop9733 = alloca i1, i1 0
+ %nop9734 = alloca i1, i1 0
+ %nop9735 = alloca i1, i1 0
+ %nop9736 = alloca i1, i1 0
+ %nop9737 = alloca i1, i1 0
+ %nop9738 = alloca i1, i1 0
+ %nop9739 = alloca i1, i1 0
+ %nop9740 = alloca i1, i1 0
+ %nop9741 = alloca i1, i1 0
+ %nop9742 = alloca i1, i1 0
+ %nop9743 = alloca i1, i1 0
+ %nop9744 = alloca i1, i1 0
+ %nop9745 = alloca i1, i1 0
+ %nop9746 = alloca i1, i1 0
+ %nop9747 = alloca i1, i1 0
+ %nop9748 = alloca i1, i1 0
+ %nop9749 = alloca i1, i1 0
+ %nop9750 = alloca i1, i1 0
+ %nop9751 = alloca i1, i1 0
+ %nop9752 = alloca i1, i1 0
+ %nop9753 = alloca i1, i1 0
+ %nop9754 = alloca i1, i1 0
+ %nop9755 = alloca i1, i1 0
+ %nop9756 = alloca i1, i1 0
+ %nop9757 = alloca i1, i1 0
+ %nop9758 = alloca i1, i1 0
+ %nop9759 = alloca i1, i1 0
+ %nop9760 = alloca i1, i1 0
+ %nop9761 = alloca i1, i1 0
+ %nop9762 = alloca i1, i1 0
+ %nop9763 = alloca i1, i1 0
+ %nop9764 = alloca i1, i1 0
+ %nop9765 = alloca i1, i1 0
+ %nop9766 = alloca i1, i1 0
+ %nop9767 = alloca i1, i1 0
+ %nop9768 = alloca i1, i1 0
+ %nop9769 = alloca i1, i1 0
+ %nop9770 = alloca i1, i1 0
+ %nop9771 = alloca i1, i1 0
+ %nop9772 = alloca i1, i1 0
+ %nop9773 = alloca i1, i1 0
+ %nop9774 = alloca i1, i1 0
+ %nop9775 = alloca i1, i1 0
+ %nop9776 = alloca i1, i1 0
+ %nop9777 = alloca i1, i1 0
+ %nop9778 = alloca i1, i1 0
+ %nop9779 = alloca i1, i1 0
+ %nop9780 = alloca i1, i1 0
+ %nop9781 = alloca i1, i1 0
+ %nop9782 = alloca i1, i1 0
+ %nop9783 = alloca i1, i1 0
+ %nop9784 = alloca i1, i1 0
+ %nop9785 = alloca i1, i1 0
+ %nop9786 = alloca i1, i1 0
+ %nop9787 = alloca i1, i1 0
+ %nop9788 = alloca i1, i1 0
+ %nop9789 = alloca i1, i1 0
+ %nop9790 = alloca i1, i1 0
+ %nop9791 = alloca i1, i1 0
+ %nop9792 = alloca i1, i1 0
+ %nop9793 = alloca i1, i1 0
+ %nop9794 = alloca i1, i1 0
+ %nop9795 = alloca i1, i1 0
+ %nop9796 = alloca i1, i1 0
+ %nop9797 = alloca i1, i1 0
+ %nop9798 = alloca i1, i1 0
+ %nop9799 = alloca i1, i1 0
+ %nop9800 = alloca i1, i1 0
+ %nop9801 = alloca i1, i1 0
+ %nop9802 = alloca i1, i1 0
+ %nop9803 = alloca i1, i1 0
+ %nop9804 = alloca i1, i1 0
+ %nop9805 = alloca i1, i1 0
+ %nop9806 = alloca i1, i1 0
+ %nop9807 = alloca i1, i1 0
+ %nop9808 = alloca i1, i1 0
+ %nop9809 = alloca i1, i1 0
+ %nop9810 = alloca i1, i1 0
+ %nop9811 = alloca i1, i1 0
+ %nop9812 = alloca i1, i1 0
+ %nop9813 = alloca i1, i1 0
+ %nop9814 = alloca i1, i1 0
+ %nop9815 = alloca i1, i1 0
+ %nop9816 = alloca i1, i1 0
+ %nop9817 = alloca i1, i1 0
+ %nop9818 = alloca i1, i1 0
+ %nop9819 = alloca i1, i1 0
+ %nop9820 = alloca i1, i1 0
+ %nop9821 = alloca i1, i1 0
+ %nop9822 = alloca i1, i1 0
+ %nop9823 = alloca i1, i1 0
+ %nop9824 = alloca i1, i1 0
+ %nop9825 = alloca i1, i1 0
+ %nop9826 = alloca i1, i1 0
+ %nop9827 = alloca i1, i1 0
+ %nop9828 = alloca i1, i1 0
+ %nop9829 = alloca i1, i1 0
+ %nop9830 = alloca i1, i1 0
+ %nop9831 = alloca i1, i1 0
+ %nop9832 = alloca i1, i1 0
+ %nop9833 = alloca i1, i1 0
+ %nop9834 = alloca i1, i1 0
+ %nop9835 = alloca i1, i1 0
+ %nop9836 = alloca i1, i1 0
+ %nop9837 = alloca i1, i1 0
+ %nop9838 = alloca i1, i1 0
+ %nop9839 = alloca i1, i1 0
+ %nop9840 = alloca i1, i1 0
+ %nop9841 = alloca i1, i1 0
+ %nop9842 = alloca i1, i1 0
+ %nop9843 = alloca i1, i1 0
+ %nop9844 = alloca i1, i1 0
+ %nop9845 = alloca i1, i1 0
+ %nop9846 = alloca i1, i1 0
+ %nop9847 = alloca i1, i1 0
+ %nop9848 = alloca i1, i1 0
+ %nop9849 = alloca i1, i1 0
+ %nop9850 = alloca i1, i1 0
+ %nop9851 = alloca i1, i1 0
+ %nop9852 = alloca i1, i1 0
+ %nop9853 = alloca i1, i1 0
+ %nop9854 = alloca i1, i1 0
+ %nop9855 = alloca i1, i1 0
+ %nop9856 = alloca i1, i1 0
+ %nop9857 = alloca i1, i1 0
+ %nop9858 = alloca i1, i1 0
+ %nop9859 = alloca i1, i1 0
+ %nop9860 = alloca i1, i1 0
+ %nop9861 = alloca i1, i1 0
+ %nop9862 = alloca i1, i1 0
+ %nop9863 = alloca i1, i1 0
+ %nop9864 = alloca i1, i1 0
+ %nop9865 = alloca i1, i1 0
+ %nop9866 = alloca i1, i1 0
+ %nop9867 = alloca i1, i1 0
+ %nop9868 = alloca i1, i1 0
+ %nop9869 = alloca i1, i1 0
+ %nop9870 = alloca i1, i1 0
+ %nop9871 = alloca i1, i1 0
+ %nop9872 = alloca i1, i1 0
+ %nop9873 = alloca i1, i1 0
+ %nop9874 = alloca i1, i1 0
+ %nop9875 = alloca i1, i1 0
+ %nop9876 = alloca i1, i1 0
+ %nop9877 = alloca i1, i1 0
+ %nop9878 = alloca i1, i1 0
+ %nop9879 = alloca i1, i1 0
+ %nop9880 = alloca i1, i1 0
+ %nop9881 = alloca i1, i1 0
+ %nop9882 = alloca i1, i1 0
+ %nop9883 = alloca i1, i1 0
+ %nop9884 = alloca i1, i1 0
+ %nop9885 = alloca i1, i1 0
+ %nop9886 = alloca i1, i1 0
+ %nop9887 = alloca i1, i1 0
+ %nop9888 = alloca i1, i1 0
+ %nop9889 = alloca i1, i1 0
+ %nop9890 = alloca i1, i1 0
+ %nop9891 = alloca i1, i1 0
+ %nop9892 = alloca i1, i1 0
+ %nop9893 = alloca i1, i1 0
+ %nop9894 = alloca i1, i1 0
+ %nop9895 = alloca i1, i1 0
+ %nop9896 = alloca i1, i1 0
+ %nop9897 = alloca i1, i1 0
+ %nop9898 = alloca i1, i1 0
+ %nop9899 = alloca i1, i1 0
+ %nop9900 = alloca i1, i1 0
+ %nop9901 = alloca i1, i1 0
+ %nop9902 = alloca i1, i1 0
+ %nop9903 = alloca i1, i1 0
+ %nop9904 = alloca i1, i1 0
+ %nop9905 = alloca i1, i1 0
+ %nop9906 = alloca i1, i1 0
+ %nop9907 = alloca i1, i1 0
+ %nop9908 = alloca i1, i1 0
+ %nop9909 = alloca i1, i1 0
+ %nop9910 = alloca i1, i1 0
+ %nop9911 = alloca i1, i1 0
+ %nop9912 = alloca i1, i1 0
+ %nop9913 = alloca i1, i1 0
+ %nop9914 = alloca i1, i1 0
+ %nop9915 = alloca i1, i1 0
+ %nop9916 = alloca i1, i1 0
+ %nop9917 = alloca i1, i1 0
+ %nop9918 = alloca i1, i1 0
+ %nop9919 = alloca i1, i1 0
+ %nop9920 = alloca i1, i1 0
+ %nop9921 = alloca i1, i1 0
+ %nop9922 = alloca i1, i1 0
+ %nop9923 = alloca i1, i1 0
+ %nop9924 = alloca i1, i1 0
+ %nop9925 = alloca i1, i1 0
+ %nop9926 = alloca i1, i1 0
+ %nop9927 = alloca i1, i1 0
+ %nop9928 = alloca i1, i1 0
+ %nop9929 = alloca i1, i1 0
+ %nop9930 = alloca i1, i1 0
+ %nop9931 = alloca i1, i1 0
+ %nop9932 = alloca i1, i1 0
+ %nop9933 = alloca i1, i1 0
+ %nop9934 = alloca i1, i1 0
+ %nop9935 = alloca i1, i1 0
+ %nop9936 = alloca i1, i1 0
+ %nop9937 = alloca i1, i1 0
+ %nop9938 = alloca i1, i1 0
+ %nop9939 = alloca i1, i1 0
+ %nop9940 = alloca i1, i1 0
+ %nop9941 = alloca i1, i1 0
+ %nop9942 = alloca i1, i1 0
+ %nop9943 = alloca i1, i1 0
+ %nop9944 = alloca i1, i1 0
+ %nop9945 = alloca i1, i1 0
+ %nop9946 = alloca i1, i1 0
+ %nop9947 = alloca i1, i1 0
+ %nop9948 = alloca i1, i1 0
+ %nop9949 = alloca i1, i1 0
+ %nop9950 = alloca i1, i1 0
+ %nop9951 = alloca i1, i1 0
+ %nop9952 = alloca i1, i1 0
+ %nop9953 = alloca i1, i1 0
+ %nop9954 = alloca i1, i1 0
+ %nop9955 = alloca i1, i1 0
+ %nop9956 = alloca i1, i1 0
+ %nop9957 = alloca i1, i1 0
+ %nop9958 = alloca i1, i1 0
+ %nop9959 = alloca i1, i1 0
+ %nop9960 = alloca i1, i1 0
+ %nop9961 = alloca i1, i1 0
+ %nop9962 = alloca i1, i1 0
+ %nop9963 = alloca i1, i1 0
+ %nop9964 = alloca i1, i1 0
+ %nop9965 = alloca i1, i1 0
+ %nop9966 = alloca i1, i1 0
+ %nop9967 = alloca i1, i1 0
+ %nop9968 = alloca i1, i1 0
+ %nop9969 = alloca i1, i1 0
+ %nop9970 = alloca i1, i1 0
+ %nop9971 = alloca i1, i1 0
+ %nop9972 = alloca i1, i1 0
+ %nop9973 = alloca i1, i1 0
+ %nop9974 = alloca i1, i1 0
+ %nop9975 = alloca i1, i1 0
+ %nop9976 = alloca i1, i1 0
+ %nop9977 = alloca i1, i1 0
+ %nop9978 = alloca i1, i1 0
+ %nop9979 = alloca i1, i1 0
+ %nop9980 = alloca i1, i1 0
+ %nop9981 = alloca i1, i1 0
+ %nop9982 = alloca i1, i1 0
+ %nop9983 = alloca i1, i1 0
+ %nop9984 = alloca i1, i1 0
+ %nop9985 = alloca i1, i1 0
+ %nop9986 = alloca i1, i1 0
+ %nop9987 = alloca i1, i1 0
+ %nop9988 = alloca i1, i1 0
+ %nop9989 = alloca i1, i1 0
+ %nop9990 = alloca i1, i1 0
+ %nop9991 = alloca i1, i1 0
+ %nop9992 = alloca i1, i1 0
+ %nop9993 = alloca i1, i1 0
+ %nop9994 = alloca i1, i1 0
+ %nop9995 = alloca i1, i1 0
+ %nop9996 = alloca i1, i1 0
+ %nop9997 = alloca i1, i1 0
+ %nop9998 = alloca i1, i1 0
+ %nop9999 = alloca i1, i1 0
+ %nop10000 = alloca i1, i1 0
+ %nop10001 = alloca i1, i1 0
+ %nop10002 = alloca i1, i1 0
+ %nop10003 = alloca i1, i1 0
+ %nop10004 = alloca i1, i1 0
+ %nop10005 = alloca i1, i1 0
+ %nop10006 = alloca i1, i1 0
+ %nop10007 = alloca i1, i1 0
+ %nop10008 = alloca i1, i1 0
+ %nop10009 = alloca i1, i1 0
+ %nop10010 = alloca i1, i1 0
+ %nop10011 = alloca i1, i1 0
+ %nop10012 = alloca i1, i1 0
+ %nop10013 = alloca i1, i1 0
+ %nop10014 = alloca i1, i1 0
+ %nop10015 = alloca i1, i1 0
+ %nop10016 = alloca i1, i1 0
+ %nop10017 = alloca i1, i1 0
+ %nop10018 = alloca i1, i1 0
+ %nop10019 = alloca i1, i1 0
+ %nop10020 = alloca i1, i1 0
+ %nop10021 = alloca i1, i1 0
+ %nop10022 = alloca i1, i1 0
+ %nop10023 = alloca i1, i1 0
+ %nop10024 = alloca i1, i1 0
+ %nop10025 = alloca i1, i1 0
+ %nop10026 = alloca i1, i1 0
+ %nop10027 = alloca i1, i1 0
+ %nop10028 = alloca i1, i1 0
+ %nop10029 = alloca i1, i1 0
+ %nop10030 = alloca i1, i1 0
+ %nop10031 = alloca i1, i1 0
+ %nop10032 = alloca i1, i1 0
+ %nop10033 = alloca i1, i1 0
+ %nop10034 = alloca i1, i1 0
+ %nop10035 = alloca i1, i1 0
+ %nop10036 = alloca i1, i1 0
+ %nop10037 = alloca i1, i1 0
+ %nop10038 = alloca i1, i1 0
+ %nop10039 = alloca i1, i1 0
+ %nop10040 = alloca i1, i1 0
+ %nop10041 = alloca i1, i1 0
+ %nop10042 = alloca i1, i1 0
+ %nop10043 = alloca i1, i1 0
+ %nop10044 = alloca i1, i1 0
+ %nop10045 = alloca i1, i1 0
+ %nop10046 = alloca i1, i1 0
+ %nop10047 = alloca i1, i1 0
+ %nop10048 = alloca i1, i1 0
+ %nop10049 = alloca i1, i1 0
+ %nop10050 = alloca i1, i1 0
+ %nop10051 = alloca i1, i1 0
+ %nop10052 = alloca i1, i1 0
+ %nop10053 = alloca i1, i1 0
+ %nop10054 = alloca i1, i1 0
+ %nop10055 = alloca i1, i1 0
+ %nop10056 = alloca i1, i1 0
+ %nop10057 = alloca i1, i1 0
+ %nop10058 = alloca i1, i1 0
+ %nop10059 = alloca i1, i1 0
+ %nop10060 = alloca i1, i1 0
+ %nop10061 = alloca i1, i1 0
+ %nop10062 = alloca i1, i1 0
+ %nop10063 = alloca i1, i1 0
+ %nop10064 = alloca i1, i1 0
+ %nop10065 = alloca i1, i1 0
+ %nop10066 = alloca i1, i1 0
+ %nop10067 = alloca i1, i1 0
+ %nop10068 = alloca i1, i1 0
+ %nop10069 = alloca i1, i1 0
+ %nop10070 = alloca i1, i1 0
+ %nop10071 = alloca i1, i1 0
+ %nop10072 = alloca i1, i1 0
+ %nop10073 = alloca i1, i1 0
+ %nop10074 = alloca i1, i1 0
+ %nop10075 = alloca i1, i1 0
+ %nop10076 = alloca i1, i1 0
+ %nop10077 = alloca i1, i1 0
+ %nop10078 = alloca i1, i1 0
+ %nop10079 = alloca i1, i1 0
+ %nop10080 = alloca i1, i1 0
+ %nop10081 = alloca i1, i1 0
+ %nop10082 = alloca i1, i1 0
+ %nop10083 = alloca i1, i1 0
+ %nop10084 = alloca i1, i1 0
+ %nop10085 = alloca i1, i1 0
+ %nop10086 = alloca i1, i1 0
+ %nop10087 = alloca i1, i1 0
+ %nop10088 = alloca i1, i1 0
+ %nop10089 = alloca i1, i1 0
+ %nop10090 = alloca i1, i1 0
+ %nop10091 = alloca i1, i1 0
+ %nop10092 = alloca i1, i1 0
+ %nop10093 = alloca i1, i1 0
+ %nop10094 = alloca i1, i1 0
+ %nop10095 = alloca i1, i1 0
+ %nop10096 = alloca i1, i1 0
+ %nop10097 = alloca i1, i1 0
+ %nop10098 = alloca i1, i1 0
+ %nop10099 = alloca i1, i1 0
+ %nop10100 = alloca i1, i1 0
+ %nop10101 = alloca i1, i1 0
+ %nop10102 = alloca i1, i1 0
+ %nop10103 = alloca i1, i1 0
+ %nop10104 = alloca i1, i1 0
+ %nop10105 = alloca i1, i1 0
+ %nop10106 = alloca i1, i1 0
+ %nop10107 = alloca i1, i1 0
+ %nop10108 = alloca i1, i1 0
+ %nop10109 = alloca i1, i1 0
+ %nop10110 = alloca i1, i1 0
+ %nop10111 = alloca i1, i1 0
+ %nop10112 = alloca i1, i1 0
+ %nop10113 = alloca i1, i1 0
+ %nop10114 = alloca i1, i1 0
+ %nop10115 = alloca i1, i1 0
+ %nop10116 = alloca i1, i1 0
+ %nop10117 = alloca i1, i1 0
+ %nop10118 = alloca i1, i1 0
+ %nop10119 = alloca i1, i1 0
+ %nop10120 = alloca i1, i1 0
+ %nop10121 = alloca i1, i1 0
+ %nop10122 = alloca i1, i1 0
+ %nop10123 = alloca i1, i1 0
+ %nop10124 = alloca i1, i1 0
+ %nop10125 = alloca i1, i1 0
+ %nop10126 = alloca i1, i1 0
+ %nop10127 = alloca i1, i1 0
+ %nop10128 = alloca i1, i1 0
+ %nop10129 = alloca i1, i1 0
+ %nop10130 = alloca i1, i1 0
+ %nop10131 = alloca i1, i1 0
+ %nop10132 = alloca i1, i1 0
+ %nop10133 = alloca i1, i1 0
+ %nop10134 = alloca i1, i1 0
+ %nop10135 = alloca i1, i1 0
+ %nop10136 = alloca i1, i1 0
+ %nop10137 = alloca i1, i1 0
+ %nop10138 = alloca i1, i1 0
+ %nop10139 = alloca i1, i1 0
+ %nop10140 = alloca i1, i1 0
+ %nop10141 = alloca i1, i1 0
+ %nop10142 = alloca i1, i1 0
+ %nop10143 = alloca i1, i1 0
+ %nop10144 = alloca i1, i1 0
+ %nop10145 = alloca i1, i1 0
+ %nop10146 = alloca i1, i1 0
+ %nop10147 = alloca i1, i1 0
+ %nop10148 = alloca i1, i1 0
+ %nop10149 = alloca i1, i1 0
+ %nop10150 = alloca i1, i1 0
+ %nop10151 = alloca i1, i1 0
+ %nop10152 = alloca i1, i1 0
+ %nop10153 = alloca i1, i1 0
+ %nop10154 = alloca i1, i1 0
+ %nop10155 = alloca i1, i1 0
+ %nop10156 = alloca i1, i1 0
+ %nop10157 = alloca i1, i1 0
+ %nop10158 = alloca i1, i1 0
+ %nop10159 = alloca i1, i1 0
+ %nop10160 = alloca i1, i1 0
+ %nop10161 = alloca i1, i1 0
+ %nop10162 = alloca i1, i1 0
+ %nop10163 = alloca i1, i1 0
+ %nop10164 = alloca i1, i1 0
+ %nop10165 = alloca i1, i1 0
+ %nop10166 = alloca i1, i1 0
+ %nop10167 = alloca i1, i1 0
+ %nop10168 = alloca i1, i1 0
+ %nop10169 = alloca i1, i1 0
+ %nop10170 = alloca i1, i1 0
+ %nop10171 = alloca i1, i1 0
+ %nop10172 = alloca i1, i1 0
+ %nop10173 = alloca i1, i1 0
+ %nop10174 = alloca i1, i1 0
+ %nop10175 = alloca i1, i1 0
+ %nop10176 = alloca i1, i1 0
+ %nop10177 = alloca i1, i1 0
+ %nop10178 = alloca i1, i1 0
+ %nop10179 = alloca i1, i1 0
+ %nop10180 = alloca i1, i1 0
+ %nop10181 = alloca i1, i1 0
+ %nop10182 = alloca i1, i1 0
+ %nop10183 = alloca i1, i1 0
+ %nop10184 = alloca i1, i1 0
+ %nop10185 = alloca i1, i1 0
+ %nop10186 = alloca i1, i1 0
+ %nop10187 = alloca i1, i1 0
+ %nop10188 = alloca i1, i1 0
+ %nop10189 = alloca i1, i1 0
+ %nop10190 = alloca i1, i1 0
+ %nop10191 = alloca i1, i1 0
+ %nop10192 = alloca i1, i1 0
+ %nop10193 = alloca i1, i1 0
+ %nop10194 = alloca i1, i1 0
+ %nop10195 = alloca i1, i1 0
+ %nop10196 = alloca i1, i1 0
+ %nop10197 = alloca i1, i1 0
+ %nop10198 = alloca i1, i1 0
+ %nop10199 = alloca i1, i1 0
+ %nop10200 = alloca i1, i1 0
+ %nop10201 = alloca i1, i1 0
+ %nop10202 = alloca i1, i1 0
+ %nop10203 = alloca i1, i1 0
+ %nop10204 = alloca i1, i1 0
+ %nop10205 = alloca i1, i1 0
+ %nop10206 = alloca i1, i1 0
+ %nop10207 = alloca i1, i1 0
+ %nop10208 = alloca i1, i1 0
+ %nop10209 = alloca i1, i1 0
+ %nop10210 = alloca i1, i1 0
+ %nop10211 = alloca i1, i1 0
+ %nop10212 = alloca i1, i1 0
+ %nop10213 = alloca i1, i1 0
+ %nop10214 = alloca i1, i1 0
+ %nop10215 = alloca i1, i1 0
+ %nop10216 = alloca i1, i1 0
+ %nop10217 = alloca i1, i1 0
+ %nop10218 = alloca i1, i1 0
+ %nop10219 = alloca i1, i1 0
+ %nop10220 = alloca i1, i1 0
+ %nop10221 = alloca i1, i1 0
+ %nop10222 = alloca i1, i1 0
+ %nop10223 = alloca i1, i1 0
+ %nop10224 = alloca i1, i1 0
+ %nop10225 = alloca i1, i1 0
+ %nop10226 = alloca i1, i1 0
+ %nop10227 = alloca i1, i1 0
+ %nop10228 = alloca i1, i1 0
+ %nop10229 = alloca i1, i1 0
+ %nop10230 = alloca i1, i1 0
+ %nop10231 = alloca i1, i1 0
+ %nop10232 = alloca i1, i1 0
+ %nop10233 = alloca i1, i1 0
+ %nop10234 = alloca i1, i1 0
+ %nop10235 = alloca i1, i1 0
+ %nop10236 = alloca i1, i1 0
+ %nop10237 = alloca i1, i1 0
+ %nop10238 = alloca i1, i1 0
+ %nop10239 = alloca i1, i1 0
+ %nop10240 = alloca i1, i1 0
+ %nop10241 = alloca i1, i1 0
+ %nop10242 = alloca i1, i1 0
+ %nop10243 = alloca i1, i1 0
+ %nop10244 = alloca i1, i1 0
+ %nop10245 = alloca i1, i1 0
+ %nop10246 = alloca i1, i1 0
+ %nop10247 = alloca i1, i1 0
+ %nop10248 = alloca i1, i1 0
+ %nop10249 = alloca i1, i1 0
+ %nop10250 = alloca i1, i1 0
+ %nop10251 = alloca i1, i1 0
+ %nop10252 = alloca i1, i1 0
+ %nop10253 = alloca i1, i1 0
+ %nop10254 = alloca i1, i1 0
+ %nop10255 = alloca i1, i1 0
+ %nop10256 = alloca i1, i1 0
+ %nop10257 = alloca i1, i1 0
+ %nop10258 = alloca i1, i1 0
+ %nop10259 = alloca i1, i1 0
+ %nop10260 = alloca i1, i1 0
+ %nop10261 = alloca i1, i1 0
+ %nop10262 = alloca i1, i1 0
+ %nop10263 = alloca i1, i1 0
+ %nop10264 = alloca i1, i1 0
+ %nop10265 = alloca i1, i1 0
+ %nop10266 = alloca i1, i1 0
+ %nop10267 = alloca i1, i1 0
+ %nop10268 = alloca i1, i1 0
+ %nop10269 = alloca i1, i1 0
+ %nop10270 = alloca i1, i1 0
+ %nop10271 = alloca i1, i1 0
+ %nop10272 = alloca i1, i1 0
+ %nop10273 = alloca i1, i1 0
+ %nop10274 = alloca i1, i1 0
+ %nop10275 = alloca i1, i1 0
+ %nop10276 = alloca i1, i1 0
+ %nop10277 = alloca i1, i1 0
+ %nop10278 = alloca i1, i1 0
+ %nop10279 = alloca i1, i1 0
+ %nop10280 = alloca i1, i1 0
+ %nop10281 = alloca i1, i1 0
+ %nop10282 = alloca i1, i1 0
+ %nop10283 = alloca i1, i1 0
+ %nop10284 = alloca i1, i1 0
+ %nop10285 = alloca i1, i1 0
+ %nop10286 = alloca i1, i1 0
+ %nop10287 = alloca i1, i1 0
+ %nop10288 = alloca i1, i1 0
+ %nop10289 = alloca i1, i1 0
+ %nop10290 = alloca i1, i1 0
+ %nop10291 = alloca i1, i1 0
+ %nop10292 = alloca i1, i1 0
+ %nop10293 = alloca i1, i1 0
+ %nop10294 = alloca i1, i1 0
+ %nop10295 = alloca i1, i1 0
+ %nop10296 = alloca i1, i1 0
+ %nop10297 = alloca i1, i1 0
+ %nop10298 = alloca i1, i1 0
+ %nop10299 = alloca i1, i1 0
+ %nop10300 = alloca i1, i1 0
+ %nop10301 = alloca i1, i1 0
+ %nop10302 = alloca i1, i1 0
+ %nop10303 = alloca i1, i1 0
+ %nop10304 = alloca i1, i1 0
+ %nop10305 = alloca i1, i1 0
+ %nop10306 = alloca i1, i1 0
+ %nop10307 = alloca i1, i1 0
+ %nop10308 = alloca i1, i1 0
+ %nop10309 = alloca i1, i1 0
+ %nop10310 = alloca i1, i1 0
+ %nop10311 = alloca i1, i1 0
+ %nop10312 = alloca i1, i1 0
+ %nop10313 = alloca i1, i1 0
+ %nop10314 = alloca i1, i1 0
+ %nop10315 = alloca i1, i1 0
+ %nop10316 = alloca i1, i1 0
+ %nop10317 = alloca i1, i1 0
+ %nop10318 = alloca i1, i1 0
+ %nop10319 = alloca i1, i1 0
+ %nop10320 = alloca i1, i1 0
+ %nop10321 = alloca i1, i1 0
+ %nop10322 = alloca i1, i1 0
+ %nop10323 = alloca i1, i1 0
+ %nop10324 = alloca i1, i1 0
+ %nop10325 = alloca i1, i1 0
+ %nop10326 = alloca i1, i1 0
+ %nop10327 = alloca i1, i1 0
+ %nop10328 = alloca i1, i1 0
+ %nop10329 = alloca i1, i1 0
+ %nop10330 = alloca i1, i1 0
+ %nop10331 = alloca i1, i1 0
+ %nop10332 = alloca i1, i1 0
+ %nop10333 = alloca i1, i1 0
+ %nop10334 = alloca i1, i1 0
+ %nop10335 = alloca i1, i1 0
+ %nop10336 = alloca i1, i1 0
+ %nop10337 = alloca i1, i1 0
+ %nop10338 = alloca i1, i1 0
+ %nop10339 = alloca i1, i1 0
+ %nop10340 = alloca i1, i1 0
+ %nop10341 = alloca i1, i1 0
+ %nop10342 = alloca i1, i1 0
+ %nop10343 = alloca i1, i1 0
+ %nop10344 = alloca i1, i1 0
+ %nop10345 = alloca i1, i1 0
+ %nop10346 = alloca i1, i1 0
+ %nop10347 = alloca i1, i1 0
+ %nop10348 = alloca i1, i1 0
+ %nop10349 = alloca i1, i1 0
+ %nop10350 = alloca i1, i1 0
+ %nop10351 = alloca i1, i1 0
+ %nop10352 = alloca i1, i1 0
+ %nop10353 = alloca i1, i1 0
+ %nop10354 = alloca i1, i1 0
+ %nop10355 = alloca i1, i1 0
+ %nop10356 = alloca i1, i1 0
+ %nop10357 = alloca i1, i1 0
+ %nop10358 = alloca i1, i1 0
+ %nop10359 = alloca i1, i1 0
+ %nop10360 = alloca i1, i1 0
+ %nop10361 = alloca i1, i1 0
+ %nop10362 = alloca i1, i1 0
+ %nop10363 = alloca i1, i1 0
+ %nop10364 = alloca i1, i1 0
+ %nop10365 = alloca i1, i1 0
+ %nop10366 = alloca i1, i1 0
+ %nop10367 = alloca i1, i1 0
+ %nop10368 = alloca i1, i1 0
+ %nop10369 = alloca i1, i1 0
+ %nop10370 = alloca i1, i1 0
+ %nop10371 = alloca i1, i1 0
+ %nop10372 = alloca i1, i1 0
+ %nop10373 = alloca i1, i1 0
+ %nop10374 = alloca i1, i1 0
+ %nop10375 = alloca i1, i1 0
+ %nop10376 = alloca i1, i1 0
+ %nop10377 = alloca i1, i1 0
+ %nop10378 = alloca i1, i1 0
+ %nop10379 = alloca i1, i1 0
+ %nop10380 = alloca i1, i1 0
+ %nop10381 = alloca i1, i1 0
+ %nop10382 = alloca i1, i1 0
+ %nop10383 = alloca i1, i1 0
+ %nop10384 = alloca i1, i1 0
+ %nop10385 = alloca i1, i1 0
+ %nop10386 = alloca i1, i1 0
+ %nop10387 = alloca i1, i1 0
+ %nop10388 = alloca i1, i1 0
+ %nop10389 = alloca i1, i1 0
+ %nop10390 = alloca i1, i1 0
+ %nop10391 = alloca i1, i1 0
+ %nop10392 = alloca i1, i1 0
+ %nop10393 = alloca i1, i1 0
+ %nop10394 = alloca i1, i1 0
+ %nop10395 = alloca i1, i1 0
+ %nop10396 = alloca i1, i1 0
+ %nop10397 = alloca i1, i1 0
+ %nop10398 = alloca i1, i1 0
+ %nop10399 = alloca i1, i1 0
+ %nop10400 = alloca i1, i1 0
+ %nop10401 = alloca i1, i1 0
+ %nop10402 = alloca i1, i1 0
+ %nop10403 = alloca i1, i1 0
+ %nop10404 = alloca i1, i1 0
+ %nop10405 = alloca i1, i1 0
+ %nop10406 = alloca i1, i1 0
+ %nop10407 = alloca i1, i1 0
+ %nop10408 = alloca i1, i1 0
+ %nop10409 = alloca i1, i1 0
+ %nop10410 = alloca i1, i1 0
+ %nop10411 = alloca i1, i1 0
+ %nop10412 = alloca i1, i1 0
+ %nop10413 = alloca i1, i1 0
+ %nop10414 = alloca i1, i1 0
+ %nop10415 = alloca i1, i1 0
+ %nop10416 = alloca i1, i1 0
+ %nop10417 = alloca i1, i1 0
+ %nop10418 = alloca i1, i1 0
+ %nop10419 = alloca i1, i1 0
+ %nop10420 = alloca i1, i1 0
+ %nop10421 = alloca i1, i1 0
+ %nop10422 = alloca i1, i1 0
+ %nop10423 = alloca i1, i1 0
+ %nop10424 = alloca i1, i1 0
+ %nop10425 = alloca i1, i1 0
+ %nop10426 = alloca i1, i1 0
+ %nop10427 = alloca i1, i1 0
+ %nop10428 = alloca i1, i1 0
+ %nop10429 = alloca i1, i1 0
+ %nop10430 = alloca i1, i1 0
+ %nop10431 = alloca i1, i1 0
+ %nop10432 = alloca i1, i1 0
+ %nop10433 = alloca i1, i1 0
+ %nop10434 = alloca i1, i1 0
+ %nop10435 = alloca i1, i1 0
+ %nop10436 = alloca i1, i1 0
+ %nop10437 = alloca i1, i1 0
+ %nop10438 = alloca i1, i1 0
+ %nop10439 = alloca i1, i1 0
+ %nop10440 = alloca i1, i1 0
+ %nop10441 = alloca i1, i1 0
+ %nop10442 = alloca i1, i1 0
+ %nop10443 = alloca i1, i1 0
+ %nop10444 = alloca i1, i1 0
+ %nop10445 = alloca i1, i1 0
+ %nop10446 = alloca i1, i1 0
+ %nop10447 = alloca i1, i1 0
+ %nop10448 = alloca i1, i1 0
+ %nop10449 = alloca i1, i1 0
+ %nop10450 = alloca i1, i1 0
+ %nop10451 = alloca i1, i1 0
+ %nop10452 = alloca i1, i1 0
+ %nop10453 = alloca i1, i1 0
+ %nop10454 = alloca i1, i1 0
+ %nop10455 = alloca i1, i1 0
+ %nop10456 = alloca i1, i1 0
+ %nop10457 = alloca i1, i1 0
+ %nop10458 = alloca i1, i1 0
+ %nop10459 = alloca i1, i1 0
+ %nop10460 = alloca i1, i1 0
+ %nop10461 = alloca i1, i1 0
+ %nop10462 = alloca i1, i1 0
+ %nop10463 = alloca i1, i1 0
+ %nop10464 = alloca i1, i1 0
+ %nop10465 = alloca i1, i1 0
+ %nop10466 = alloca i1, i1 0
+ %nop10467 = alloca i1, i1 0
+ %nop10468 = alloca i1, i1 0
+ %nop10469 = alloca i1, i1 0
+ %nop10470 = alloca i1, i1 0
+ %nop10471 = alloca i1, i1 0
+ %nop10472 = alloca i1, i1 0
+ %nop10473 = alloca i1, i1 0
+ %nop10474 = alloca i1, i1 0
+ %nop10475 = alloca i1, i1 0
+ %nop10476 = alloca i1, i1 0
+ %nop10477 = alloca i1, i1 0
+ %nop10478 = alloca i1, i1 0
+ %nop10479 = alloca i1, i1 0
+ %nop10480 = alloca i1, i1 0
+ %nop10481 = alloca i1, i1 0
+ %nop10482 = alloca i1, i1 0
+ %nop10483 = alloca i1, i1 0
+ %nop10484 = alloca i1, i1 0
+ %nop10485 = alloca i1, i1 0
+ %nop10486 = alloca i1, i1 0
+ %nop10487 = alloca i1, i1 0
+ %nop10488 = alloca i1, i1 0
+ %nop10489 = alloca i1, i1 0
+ %nop10490 = alloca i1, i1 0
+ %nop10491 = alloca i1, i1 0
+ %nop10492 = alloca i1, i1 0
+ %nop10493 = alloca i1, i1 0
+ %nop10494 = alloca i1, i1 0
+ %nop10495 = alloca i1, i1 0
+ %nop10496 = alloca i1, i1 0
+ %nop10497 = alloca i1, i1 0
+ %nop10498 = alloca i1, i1 0
+ %nop10499 = alloca i1, i1 0
+ %nop10500 = alloca i1, i1 0
+ %nop10501 = alloca i1, i1 0
+ %nop10502 = alloca i1, i1 0
+ %nop10503 = alloca i1, i1 0
+ %nop10504 = alloca i1, i1 0
+ %nop10505 = alloca i1, i1 0
+ %nop10506 = alloca i1, i1 0
+ %nop10507 = alloca i1, i1 0
+ %nop10508 = alloca i1, i1 0
+ %nop10509 = alloca i1, i1 0
+ %nop10510 = alloca i1, i1 0
+ %nop10511 = alloca i1, i1 0
+ %nop10512 = alloca i1, i1 0
+ %nop10513 = alloca i1, i1 0
+ %nop10514 = alloca i1, i1 0
+ %nop10515 = alloca i1, i1 0
+ %nop10516 = alloca i1, i1 0
+ %nop10517 = alloca i1, i1 0
+ %nop10518 = alloca i1, i1 0
+ %nop10519 = alloca i1, i1 0
+ %nop10520 = alloca i1, i1 0
+ %nop10521 = alloca i1, i1 0
+ %nop10522 = alloca i1, i1 0
+ %nop10523 = alloca i1, i1 0
+ %nop10524 = alloca i1, i1 0
+ %nop10525 = alloca i1, i1 0
+ %nop10526 = alloca i1, i1 0
+ %nop10527 = alloca i1, i1 0
+ %nop10528 = alloca i1, i1 0
+ %nop10529 = alloca i1, i1 0
+ %nop10530 = alloca i1, i1 0
+ %nop10531 = alloca i1, i1 0
+ %nop10532 = alloca i1, i1 0
+ %nop10533 = alloca i1, i1 0
+ %nop10534 = alloca i1, i1 0
+ %nop10535 = alloca i1, i1 0
+ %nop10536 = alloca i1, i1 0
+ %nop10537 = alloca i1, i1 0
+ %nop10538 = alloca i1, i1 0
+ %nop10539 = alloca i1, i1 0
+ %nop10540 = alloca i1, i1 0
+ %nop10541 = alloca i1, i1 0
+ %nop10542 = alloca i1, i1 0
+ %nop10543 = alloca i1, i1 0
+ %nop10544 = alloca i1, i1 0
+ %nop10545 = alloca i1, i1 0
+ %nop10546 = alloca i1, i1 0
+ %nop10547 = alloca i1, i1 0
+ %nop10548 = alloca i1, i1 0
+ %nop10549 = alloca i1, i1 0
+ %nop10550 = alloca i1, i1 0
+ %nop10551 = alloca i1, i1 0
+ %nop10552 = alloca i1, i1 0
+ %nop10553 = alloca i1, i1 0
+ %nop10554 = alloca i1, i1 0
+ %nop10555 = alloca i1, i1 0
+ %nop10556 = alloca i1, i1 0
+ %nop10557 = alloca i1, i1 0
+ %nop10558 = alloca i1, i1 0
+ %nop10559 = alloca i1, i1 0
+ %nop10560 = alloca i1, i1 0
+ %nop10561 = alloca i1, i1 0
+ %nop10562 = alloca i1, i1 0
+ %nop10563 = alloca i1, i1 0
+ %nop10564 = alloca i1, i1 0
+ %nop10565 = alloca i1, i1 0
+ %nop10566 = alloca i1, i1 0
+ %nop10567 = alloca i1, i1 0
+ %nop10568 = alloca i1, i1 0
+ %nop10569 = alloca i1, i1 0
+ %nop10570 = alloca i1, i1 0
+ %nop10571 = alloca i1, i1 0
+ %nop10572 = alloca i1, i1 0
+ %nop10573 = alloca i1, i1 0
+ %nop10574 = alloca i1, i1 0
+ %nop10575 = alloca i1, i1 0
+ %nop10576 = alloca i1, i1 0
+ %nop10577 = alloca i1, i1 0
+ %nop10578 = alloca i1, i1 0
+ %nop10579 = alloca i1, i1 0
+ %nop10580 = alloca i1, i1 0
+ %nop10581 = alloca i1, i1 0
+ %nop10582 = alloca i1, i1 0
+ %nop10583 = alloca i1, i1 0
+ %nop10584 = alloca i1, i1 0
+ %nop10585 = alloca i1, i1 0
+ %nop10586 = alloca i1, i1 0
+ %nop10587 = alloca i1, i1 0
+ %nop10588 = alloca i1, i1 0
+ %nop10589 = alloca i1, i1 0
+ %nop10590 = alloca i1, i1 0
+ %nop10591 = alloca i1, i1 0
+ %nop10592 = alloca i1, i1 0
+ %nop10593 = alloca i1, i1 0
+ %nop10594 = alloca i1, i1 0
+ %nop10595 = alloca i1, i1 0
+ %nop10596 = alloca i1, i1 0
+ %nop10597 = alloca i1, i1 0
+ %nop10598 = alloca i1, i1 0
+ %nop10599 = alloca i1, i1 0
+ %nop10600 = alloca i1, i1 0
+ %nop10601 = alloca i1, i1 0
+ %nop10602 = alloca i1, i1 0
+ %nop10603 = alloca i1, i1 0
+ %nop10604 = alloca i1, i1 0
+ %nop10605 = alloca i1, i1 0
+ %nop10606 = alloca i1, i1 0
+ %nop10607 = alloca i1, i1 0
+ %nop10608 = alloca i1, i1 0
+ %nop10609 = alloca i1, i1 0
+ %nop10610 = alloca i1, i1 0
+ %nop10611 = alloca i1, i1 0
+ %nop10612 = alloca i1, i1 0
+ %nop10613 = alloca i1, i1 0
+ %nop10614 = alloca i1, i1 0
+ %nop10615 = alloca i1, i1 0
+ %nop10616 = alloca i1, i1 0
+ %nop10617 = alloca i1, i1 0
+ %nop10618 = alloca i1, i1 0
+ %nop10619 = alloca i1, i1 0
+ %nop10620 = alloca i1, i1 0
+ %nop10621 = alloca i1, i1 0
+ %nop10622 = alloca i1, i1 0
+ %nop10623 = alloca i1, i1 0
+ %nop10624 = alloca i1, i1 0
+ %nop10625 = alloca i1, i1 0
+ %nop10626 = alloca i1, i1 0
+ %nop10627 = alloca i1, i1 0
+ %nop10628 = alloca i1, i1 0
+ %nop10629 = alloca i1, i1 0
+ %nop10630 = alloca i1, i1 0
+ %nop10631 = alloca i1, i1 0
+ %nop10632 = alloca i1, i1 0
+ %nop10633 = alloca i1, i1 0
+ %nop10634 = alloca i1, i1 0
+ %nop10635 = alloca i1, i1 0
+ %nop10636 = alloca i1, i1 0
+ %nop10637 = alloca i1, i1 0
+ %nop10638 = alloca i1, i1 0
+ %nop10639 = alloca i1, i1 0
+ %nop10640 = alloca i1, i1 0
+ %nop10641 = alloca i1, i1 0
+ %nop10642 = alloca i1, i1 0
+ %nop10643 = alloca i1, i1 0
+ %nop10644 = alloca i1, i1 0
+ %nop10645 = alloca i1, i1 0
+ %nop10646 = alloca i1, i1 0
+ %nop10647 = alloca i1, i1 0
+ %nop10648 = alloca i1, i1 0
+ %nop10649 = alloca i1, i1 0
+ %nop10650 = alloca i1, i1 0
+ %nop10651 = alloca i1, i1 0
+ %nop10652 = alloca i1, i1 0
+ %nop10653 = alloca i1, i1 0
+ %nop10654 = alloca i1, i1 0
+ %nop10655 = alloca i1, i1 0
+ %nop10656 = alloca i1, i1 0
+ %nop10657 = alloca i1, i1 0
+ %nop10658 = alloca i1, i1 0
+ %nop10659 = alloca i1, i1 0
+ %nop10660 = alloca i1, i1 0
+ %nop10661 = alloca i1, i1 0
+ %nop10662 = alloca i1, i1 0
+ %nop10663 = alloca i1, i1 0
+ %nop10664 = alloca i1, i1 0
+ %nop10665 = alloca i1, i1 0
+ %nop10666 = alloca i1, i1 0
+ %nop10667 = alloca i1, i1 0
+ %nop10668 = alloca i1, i1 0
+ %nop10669 = alloca i1, i1 0
+ %nop10670 = alloca i1, i1 0
+ %nop10671 = alloca i1, i1 0
+ %nop10672 = alloca i1, i1 0
+ %nop10673 = alloca i1, i1 0
+ %nop10674 = alloca i1, i1 0
+ %nop10675 = alloca i1, i1 0
+ %nop10676 = alloca i1, i1 0
+ %nop10677 = alloca i1, i1 0
+ %nop10678 = alloca i1, i1 0
+ %nop10679 = alloca i1, i1 0
+ %nop10680 = alloca i1, i1 0
+ %nop10681 = alloca i1, i1 0
+ %nop10682 = alloca i1, i1 0
+ %nop10683 = alloca i1, i1 0
+ %nop10684 = alloca i1, i1 0
+ %nop10685 = alloca i1, i1 0
+ %nop10686 = alloca i1, i1 0
+ %nop10687 = alloca i1, i1 0
+ %nop10688 = alloca i1, i1 0
+ %nop10689 = alloca i1, i1 0
+ %nop10690 = alloca i1, i1 0
+ %nop10691 = alloca i1, i1 0
+ %nop10692 = alloca i1, i1 0
+ %nop10693 = alloca i1, i1 0
+ %nop10694 = alloca i1, i1 0
+ %nop10695 = alloca i1, i1 0
+ %nop10696 = alloca i1, i1 0
+ %nop10697 = alloca i1, i1 0
+ %nop10698 = alloca i1, i1 0
+ %nop10699 = alloca i1, i1 0
+ %nop10700 = alloca i1, i1 0
+ %nop10701 = alloca i1, i1 0
+ %nop10702 = alloca i1, i1 0
+ %nop10703 = alloca i1, i1 0
+ %nop10704 = alloca i1, i1 0
+ %nop10705 = alloca i1, i1 0
+ %nop10706 = alloca i1, i1 0
+ %nop10707 = alloca i1, i1 0
+ %nop10708 = alloca i1, i1 0
+ %nop10709 = alloca i1, i1 0
+ %nop10710 = alloca i1, i1 0
+ %nop10711 = alloca i1, i1 0
+ %nop10712 = alloca i1, i1 0
+ %nop10713 = alloca i1, i1 0
+ %nop10714 = alloca i1, i1 0
+ %nop10715 = alloca i1, i1 0
+ %nop10716 = alloca i1, i1 0
+ %nop10717 = alloca i1, i1 0
+ %nop10718 = alloca i1, i1 0
+ %nop10719 = alloca i1, i1 0
+ %nop10720 = alloca i1, i1 0
+ %nop10721 = alloca i1, i1 0
+ %nop10722 = alloca i1, i1 0
+ %nop10723 = alloca i1, i1 0
+ %nop10724 = alloca i1, i1 0
+ %nop10725 = alloca i1, i1 0
+ %nop10726 = alloca i1, i1 0
+ %nop10727 = alloca i1, i1 0
+ %nop10728 = alloca i1, i1 0
+ %nop10729 = alloca i1, i1 0
+ %nop10730 = alloca i1, i1 0
+ %nop10731 = alloca i1, i1 0
+ %nop10732 = alloca i1, i1 0
+ %nop10733 = alloca i1, i1 0
+ %nop10734 = alloca i1, i1 0
+ %nop10735 = alloca i1, i1 0
+ %nop10736 = alloca i1, i1 0
+ %nop10737 = alloca i1, i1 0
+ %nop10738 = alloca i1, i1 0
+ %nop10739 = alloca i1, i1 0
+ %nop10740 = alloca i1, i1 0
+ %nop10741 = alloca i1, i1 0
+ %nop10742 = alloca i1, i1 0
+ %nop10743 = alloca i1, i1 0
+ %nop10744 = alloca i1, i1 0
+ %nop10745 = alloca i1, i1 0
+ %nop10746 = alloca i1, i1 0
+ %nop10747 = alloca i1, i1 0
+ %nop10748 = alloca i1, i1 0
+ %nop10749 = alloca i1, i1 0
+ %nop10750 = alloca i1, i1 0
+ %nop10751 = alloca i1, i1 0
+ %nop10752 = alloca i1, i1 0
+ %nop10753 = alloca i1, i1 0
+ %nop10754 = alloca i1, i1 0
+ %nop10755 = alloca i1, i1 0
+ %nop10756 = alloca i1, i1 0
+ %nop10757 = alloca i1, i1 0
+ %nop10758 = alloca i1, i1 0
+ %nop10759 = alloca i1, i1 0
+ %nop10760 = alloca i1, i1 0
+ %nop10761 = alloca i1, i1 0
+ %nop10762 = alloca i1, i1 0
+ %nop10763 = alloca i1, i1 0
+ %nop10764 = alloca i1, i1 0
+ %nop10765 = alloca i1, i1 0
+ %nop10766 = alloca i1, i1 0
+ %nop10767 = alloca i1, i1 0
+ %nop10768 = alloca i1, i1 0
+ %nop10769 = alloca i1, i1 0
+ %nop10770 = alloca i1, i1 0
+ %nop10771 = alloca i1, i1 0
+ %nop10772 = alloca i1, i1 0
+ %nop10773 = alloca i1, i1 0
+ %nop10774 = alloca i1, i1 0
+ %nop10775 = alloca i1, i1 0
+ %nop10776 = alloca i1, i1 0
+ %nop10777 = alloca i1, i1 0
+ %nop10778 = alloca i1, i1 0
+ %nop10779 = alloca i1, i1 0
+ %nop10780 = alloca i1, i1 0
+ %nop10781 = alloca i1, i1 0
+ %nop10782 = alloca i1, i1 0
+ %nop10783 = alloca i1, i1 0
+ %nop10784 = alloca i1, i1 0
+ %nop10785 = alloca i1, i1 0
+ %nop10786 = alloca i1, i1 0
+ %nop10787 = alloca i1, i1 0
+ %nop10788 = alloca i1, i1 0
+ %nop10789 = alloca i1, i1 0
+ %nop10790 = alloca i1, i1 0
+ %nop10791 = alloca i1, i1 0
+ %nop10792 = alloca i1, i1 0
+ %nop10793 = alloca i1, i1 0
+ %nop10794 = alloca i1, i1 0
+ %nop10795 = alloca i1, i1 0
+ %nop10796 = alloca i1, i1 0
+ %nop10797 = alloca i1, i1 0
+ %nop10798 = alloca i1, i1 0
+ %nop10799 = alloca i1, i1 0
+ %nop10800 = alloca i1, i1 0
+ %nop10801 = alloca i1, i1 0
+ %nop10802 = alloca i1, i1 0
+ %nop10803 = alloca i1, i1 0
+ %nop10804 = alloca i1, i1 0
+ %nop10805 = alloca i1, i1 0
+ %nop10806 = alloca i1, i1 0
+ %nop10807 = alloca i1, i1 0
+ %nop10808 = alloca i1, i1 0
+ %nop10809 = alloca i1, i1 0
+ %nop10810 = alloca i1, i1 0
+ %nop10811 = alloca i1, i1 0
+ %nop10812 = alloca i1, i1 0
+ %nop10813 = alloca i1, i1 0
+ %nop10814 = alloca i1, i1 0
+ %nop10815 = alloca i1, i1 0
+ %nop10816 = alloca i1, i1 0
+ %nop10817 = alloca i1, i1 0
+ %nop10818 = alloca i1, i1 0
+ %nop10819 = alloca i1, i1 0
+ %nop10820 = alloca i1, i1 0
+ %nop10821 = alloca i1, i1 0
+ %nop10822 = alloca i1, i1 0
+ %nop10823 = alloca i1, i1 0
+ %nop10824 = alloca i1, i1 0
+ %nop10825 = alloca i1, i1 0
+ %nop10826 = alloca i1, i1 0
+ %nop10827 = alloca i1, i1 0
+ %nop10828 = alloca i1, i1 0
+ %nop10829 = alloca i1, i1 0
+ %nop10830 = alloca i1, i1 0
+ %nop10831 = alloca i1, i1 0
+ %nop10832 = alloca i1, i1 0
+ %nop10833 = alloca i1, i1 0
+ %nop10834 = alloca i1, i1 0
+ %nop10835 = alloca i1, i1 0
+ %nop10836 = alloca i1, i1 0
+ %nop10837 = alloca i1, i1 0
+ %nop10838 = alloca i1, i1 0
+ %nop10839 = alloca i1, i1 0
+ %nop10840 = alloca i1, i1 0
+ %nop10841 = alloca i1, i1 0
+ %nop10842 = alloca i1, i1 0
+ %nop10843 = alloca i1, i1 0
+ %nop10844 = alloca i1, i1 0
+ %nop10845 = alloca i1, i1 0
+ %nop10846 = alloca i1, i1 0
+ %nop10847 = alloca i1, i1 0
+ %nop10848 = alloca i1, i1 0
+ %nop10849 = alloca i1, i1 0
+ %nop10850 = alloca i1, i1 0
+ %nop10851 = alloca i1, i1 0
+ %nop10852 = alloca i1, i1 0
+ %nop10853 = alloca i1, i1 0
+ %nop10854 = alloca i1, i1 0
+ %nop10855 = alloca i1, i1 0
+ %nop10856 = alloca i1, i1 0
+ %nop10857 = alloca i1, i1 0
+ %nop10858 = alloca i1, i1 0
+ %nop10859 = alloca i1, i1 0
+ %nop10860 = alloca i1, i1 0
+ %nop10861 = alloca i1, i1 0
+ %nop10862 = alloca i1, i1 0
+ %nop10863 = alloca i1, i1 0
+ %nop10864 = alloca i1, i1 0
+ %nop10865 = alloca i1, i1 0
+ %nop10866 = alloca i1, i1 0
+ %nop10867 = alloca i1, i1 0
+ %nop10868 = alloca i1, i1 0
+ %nop10869 = alloca i1, i1 0
+ %nop10870 = alloca i1, i1 0
+ %nop10871 = alloca i1, i1 0
+ %nop10872 = alloca i1, i1 0
+ %nop10873 = alloca i1, i1 0
+ %nop10874 = alloca i1, i1 0
+ %nop10875 = alloca i1, i1 0
+ %nop10876 = alloca i1, i1 0
+ %nop10877 = alloca i1, i1 0
+ %nop10878 = alloca i1, i1 0
+ %nop10879 = alloca i1, i1 0
+ %nop10880 = alloca i1, i1 0
+ %nop10881 = alloca i1, i1 0
+ %nop10882 = alloca i1, i1 0
+ %nop10883 = alloca i1, i1 0
+ %nop10884 = alloca i1, i1 0
+ %nop10885 = alloca i1, i1 0
+ %nop10886 = alloca i1, i1 0
+ %nop10887 = alloca i1, i1 0
+ %nop10888 = alloca i1, i1 0
+ %nop10889 = alloca i1, i1 0
+ %nop10890 = alloca i1, i1 0
+ %nop10891 = alloca i1, i1 0
+ %nop10892 = alloca i1, i1 0
+ %nop10893 = alloca i1, i1 0
+ %nop10894 = alloca i1, i1 0
+ %nop10895 = alloca i1, i1 0
+ %nop10896 = alloca i1, i1 0
+ %nop10897 = alloca i1, i1 0
+ %nop10898 = alloca i1, i1 0
+ %nop10899 = alloca i1, i1 0
+ %nop10900 = alloca i1, i1 0
+ %nop10901 = alloca i1, i1 0
+ %nop10902 = alloca i1, i1 0
+ %nop10903 = alloca i1, i1 0
+ %nop10904 = alloca i1, i1 0
+ %nop10905 = alloca i1, i1 0
+ %nop10906 = alloca i1, i1 0
+ %nop10907 = alloca i1, i1 0
+ %nop10908 = alloca i1, i1 0
+ %nop10909 = alloca i1, i1 0
+ %nop10910 = alloca i1, i1 0
+ %nop10911 = alloca i1, i1 0
+ %nop10912 = alloca i1, i1 0
+ %nop10913 = alloca i1, i1 0
+ %nop10914 = alloca i1, i1 0
+ %nop10915 = alloca i1, i1 0
+ %nop10916 = alloca i1, i1 0
+ %nop10917 = alloca i1, i1 0
+ %nop10918 = alloca i1, i1 0
+ %nop10919 = alloca i1, i1 0
+ %nop10920 = alloca i1, i1 0
+ %nop10921 = alloca i1, i1 0
+ %nop10922 = alloca i1, i1 0
+ %nop10923 = alloca i1, i1 0
+ %nop10924 = alloca i1, i1 0
+ %nop10925 = alloca i1, i1 0
+ %nop10926 = alloca i1, i1 0
+ %nop10927 = alloca i1, i1 0
+ %nop10928 = alloca i1, i1 0
+ %nop10929 = alloca i1, i1 0
+ %nop10930 = alloca i1, i1 0
+ %nop10931 = alloca i1, i1 0
+ %nop10932 = alloca i1, i1 0
+ %nop10933 = alloca i1, i1 0
+ %nop10934 = alloca i1, i1 0
+ %nop10935 = alloca i1, i1 0
+ %nop10936 = alloca i1, i1 0
+ %nop10937 = alloca i1, i1 0
+ %nop10938 = alloca i1, i1 0
+ %nop10939 = alloca i1, i1 0
+ %nop10940 = alloca i1, i1 0
+ %nop10941 = alloca i1, i1 0
+ %nop10942 = alloca i1, i1 0
+ %nop10943 = alloca i1, i1 0
+ %nop10944 = alloca i1, i1 0
+ %nop10945 = alloca i1, i1 0
+ %nop10946 = alloca i1, i1 0
+ %nop10947 = alloca i1, i1 0
+ %nop10948 = alloca i1, i1 0
+ %nop10949 = alloca i1, i1 0
+ %nop10950 = alloca i1, i1 0
+ %nop10951 = alloca i1, i1 0
+ %nop10952 = alloca i1, i1 0
+ %nop10953 = alloca i1, i1 0
+ %nop10954 = alloca i1, i1 0
+ %nop10955 = alloca i1, i1 0
+ %nop10956 = alloca i1, i1 0
+ %nop10957 = alloca i1, i1 0
+ %nop10958 = alloca i1, i1 0
+ %nop10959 = alloca i1, i1 0
+ %nop10960 = alloca i1, i1 0
+ %nop10961 = alloca i1, i1 0
+ %nop10962 = alloca i1, i1 0
+ %nop10963 = alloca i1, i1 0
+ %nop10964 = alloca i1, i1 0
+ %nop10965 = alloca i1, i1 0
+ %nop10966 = alloca i1, i1 0
+ %nop10967 = alloca i1, i1 0
+ %nop10968 = alloca i1, i1 0
+ %nop10969 = alloca i1, i1 0
+ %nop10970 = alloca i1, i1 0
+ %nop10971 = alloca i1, i1 0
+ %nop10972 = alloca i1, i1 0
+ %nop10973 = alloca i1, i1 0
+ %nop10974 = alloca i1, i1 0
+ %nop10975 = alloca i1, i1 0
+ %nop10976 = alloca i1, i1 0
+ %nop10977 = alloca i1, i1 0
+ %nop10978 = alloca i1, i1 0
+ %nop10979 = alloca i1, i1 0
+ %nop10980 = alloca i1, i1 0
+ %nop10981 = alloca i1, i1 0
+ %nop10982 = alloca i1, i1 0
+ %nop10983 = alloca i1, i1 0
+ %nop10984 = alloca i1, i1 0
+ %nop10985 = alloca i1, i1 0
+ %nop10986 = alloca i1, i1 0
+ %nop10987 = alloca i1, i1 0
+ %nop10988 = alloca i1, i1 0
+ %nop10989 = alloca i1, i1 0
+ %nop10990 = alloca i1, i1 0
+ %nop10991 = alloca i1, i1 0
+ %nop10992 = alloca i1, i1 0
+ %nop10993 = alloca i1, i1 0
+ %nop10994 = alloca i1, i1 0
+ %nop10995 = alloca i1, i1 0
+ %nop10996 = alloca i1, i1 0
+ %nop10997 = alloca i1, i1 0
+ %nop10998 = alloca i1, i1 0
+ %nop10999 = alloca i1, i1 0
+ %nop11000 = alloca i1, i1 0
+ %nop11001 = alloca i1, i1 0
+ %nop11002 = alloca i1, i1 0
+ %nop11003 = alloca i1, i1 0
+ %nop11004 = alloca i1, i1 0
+ %nop11005 = alloca i1, i1 0
+ %nop11006 = alloca i1, i1 0
+ %nop11007 = alloca i1, i1 0
+ %nop11008 = alloca i1, i1 0
+ %nop11009 = alloca i1, i1 0
+ %nop11010 = alloca i1, i1 0
+ %nop11011 = alloca i1, i1 0
+ %nop11012 = alloca i1, i1 0
+ %nop11013 = alloca i1, i1 0
+ %nop11014 = alloca i1, i1 0
+ %nop11015 = alloca i1, i1 0
+ %nop11016 = alloca i1, i1 0
+ %nop11017 = alloca i1, i1 0
+ %nop11018 = alloca i1, i1 0
+ %nop11019 = alloca i1, i1 0
+ %nop11020 = alloca i1, i1 0
+ %nop11021 = alloca i1, i1 0
+ %nop11022 = alloca i1, i1 0
+ %nop11023 = alloca i1, i1 0
+ %nop11024 = alloca i1, i1 0
+ %nop11025 = alloca i1, i1 0
+ %nop11026 = alloca i1, i1 0
+ %nop11027 = alloca i1, i1 0
+ %nop11028 = alloca i1, i1 0
+ %nop11029 = alloca i1, i1 0
+ %nop11030 = alloca i1, i1 0
+ %nop11031 = alloca i1, i1 0
+ %nop11032 = alloca i1, i1 0
+ %nop11033 = alloca i1, i1 0
+ %nop11034 = alloca i1, i1 0
+ %nop11035 = alloca i1, i1 0
+ %nop11036 = alloca i1, i1 0
+ %nop11037 = alloca i1, i1 0
+ %nop11038 = alloca i1, i1 0
+ %nop11039 = alloca i1, i1 0
+ %nop11040 = alloca i1, i1 0
+ %nop11041 = alloca i1, i1 0
+ %nop11042 = alloca i1, i1 0
+ %nop11043 = alloca i1, i1 0
+ %nop11044 = alloca i1, i1 0
+ %nop11045 = alloca i1, i1 0
+ %nop11046 = alloca i1, i1 0
+ %nop11047 = alloca i1, i1 0
+ %nop11048 = alloca i1, i1 0
+ %nop11049 = alloca i1, i1 0
+ %nop11050 = alloca i1, i1 0
+ %nop11051 = alloca i1, i1 0
+ %nop11052 = alloca i1, i1 0
+ %nop11053 = alloca i1, i1 0
+ %nop11054 = alloca i1, i1 0
+ %nop11055 = alloca i1, i1 0
+ %nop11056 = alloca i1, i1 0
+ %nop11057 = alloca i1, i1 0
+ %nop11058 = alloca i1, i1 0
+ %nop11059 = alloca i1, i1 0
+ %nop11060 = alloca i1, i1 0
+ %nop11061 = alloca i1, i1 0
+ %nop11062 = alloca i1, i1 0
+ %nop11063 = alloca i1, i1 0
+ %nop11064 = alloca i1, i1 0
+ %nop11065 = alloca i1, i1 0
+ %nop11066 = alloca i1, i1 0
+ %nop11067 = alloca i1, i1 0
+ %nop11068 = alloca i1, i1 0
+ %nop11069 = alloca i1, i1 0
+ %nop11070 = alloca i1, i1 0
+ %nop11071 = alloca i1, i1 0
+ %nop11072 = alloca i1, i1 0
+ %nop11073 = alloca i1, i1 0
+ %nop11074 = alloca i1, i1 0
+ %nop11075 = alloca i1, i1 0
+ %nop11076 = alloca i1, i1 0
+ %nop11077 = alloca i1, i1 0
+ %nop11078 = alloca i1, i1 0
+ %nop11079 = alloca i1, i1 0
+ %nop11080 = alloca i1, i1 0
+ %nop11081 = alloca i1, i1 0
+ %nop11082 = alloca i1, i1 0
+ %nop11083 = alloca i1, i1 0
+ %nop11084 = alloca i1, i1 0
+ %nop11085 = alloca i1, i1 0
+ %nop11086 = alloca i1, i1 0
+ %nop11087 = alloca i1, i1 0
+ %nop11088 = alloca i1, i1 0
+ %nop11089 = alloca i1, i1 0
+ %nop11090 = alloca i1, i1 0
+ %nop11091 = alloca i1, i1 0
+ %nop11092 = alloca i1, i1 0
+ %nop11093 = alloca i1, i1 0
+ %nop11094 = alloca i1, i1 0
+ %nop11095 = alloca i1, i1 0
+ %nop11096 = alloca i1, i1 0
+ %nop11097 = alloca i1, i1 0
+ %nop11098 = alloca i1, i1 0
+ %nop11099 = alloca i1, i1 0
+ %nop11100 = alloca i1, i1 0
+ %nop11101 = alloca i1, i1 0
+ %nop11102 = alloca i1, i1 0
+ %nop11103 = alloca i1, i1 0
+ %nop11104 = alloca i1, i1 0
+ %nop11105 = alloca i1, i1 0
+ %nop11106 = alloca i1, i1 0
+ %nop11107 = alloca i1, i1 0
+ %nop11108 = alloca i1, i1 0
+ %nop11109 = alloca i1, i1 0
+ %nop11110 = alloca i1, i1 0
+ %nop11111 = alloca i1, i1 0
+ %nop11112 = alloca i1, i1 0
+ %nop11113 = alloca i1, i1 0
+ %nop11114 = alloca i1, i1 0
+ %nop11115 = alloca i1, i1 0
+ %nop11116 = alloca i1, i1 0
+ %nop11117 = alloca i1, i1 0
+ %nop11118 = alloca i1, i1 0
+ %nop11119 = alloca i1, i1 0
+ %nop11120 = alloca i1, i1 0
+ %nop11121 = alloca i1, i1 0
+ %nop11122 = alloca i1, i1 0
+ %nop11123 = alloca i1, i1 0
+ %nop11124 = alloca i1, i1 0
+ %nop11125 = alloca i1, i1 0
+ %nop11126 = alloca i1, i1 0
+ %nop11127 = alloca i1, i1 0
+ %nop11128 = alloca i1, i1 0
+ %nop11129 = alloca i1, i1 0
+ %nop11130 = alloca i1, i1 0
+ %nop11131 = alloca i1, i1 0
+ %nop11132 = alloca i1, i1 0
+ %nop11133 = alloca i1, i1 0
+ %nop11134 = alloca i1, i1 0
+ %nop11135 = alloca i1, i1 0
+ %nop11136 = alloca i1, i1 0
+ %nop11137 = alloca i1, i1 0
+ %nop11138 = alloca i1, i1 0
+ %nop11139 = alloca i1, i1 0
+ %nop11140 = alloca i1, i1 0
+ %nop11141 = alloca i1, i1 0
+ %nop11142 = alloca i1, i1 0
+ %nop11143 = alloca i1, i1 0
+ %nop11144 = alloca i1, i1 0
+ %nop11145 = alloca i1, i1 0
+ %nop11146 = alloca i1, i1 0
+ %nop11147 = alloca i1, i1 0
+ %nop11148 = alloca i1, i1 0
+ %nop11149 = alloca i1, i1 0
+ %nop11150 = alloca i1, i1 0
+ %nop11151 = alloca i1, i1 0
+ %nop11152 = alloca i1, i1 0
+ %nop11153 = alloca i1, i1 0
+ %nop11154 = alloca i1, i1 0
+ %nop11155 = alloca i1, i1 0
+ %nop11156 = alloca i1, i1 0
+ %nop11157 = alloca i1, i1 0
+ %nop11158 = alloca i1, i1 0
+ %nop11159 = alloca i1, i1 0
+ %nop11160 = alloca i1, i1 0
+ %nop11161 = alloca i1, i1 0
+ %nop11162 = alloca i1, i1 0
+ %nop11163 = alloca i1, i1 0
+ %nop11164 = alloca i1, i1 0
+ %nop11165 = alloca i1, i1 0
+ %nop11166 = alloca i1, i1 0
+ %nop11167 = alloca i1, i1 0
+ %nop11168 = alloca i1, i1 0
+ %nop11169 = alloca i1, i1 0
+ %nop11170 = alloca i1, i1 0
+ %nop11171 = alloca i1, i1 0
+ %nop11172 = alloca i1, i1 0
+ %nop11173 = alloca i1, i1 0
+ %nop11174 = alloca i1, i1 0
+ %nop11175 = alloca i1, i1 0
+ %nop11176 = alloca i1, i1 0
+ %nop11177 = alloca i1, i1 0
+ %nop11178 = alloca i1, i1 0
+ %nop11179 = alloca i1, i1 0
+ %nop11180 = alloca i1, i1 0
+ %nop11181 = alloca i1, i1 0
+ %nop11182 = alloca i1, i1 0
+ %nop11183 = alloca i1, i1 0
+ %nop11184 = alloca i1, i1 0
+ %nop11185 = alloca i1, i1 0
+ %nop11186 = alloca i1, i1 0
+ %nop11187 = alloca i1, i1 0
+ %nop11188 = alloca i1, i1 0
+ %nop11189 = alloca i1, i1 0
+ %nop11190 = alloca i1, i1 0
+ %nop11191 = alloca i1, i1 0
+ %nop11192 = alloca i1, i1 0
+ %nop11193 = alloca i1, i1 0
+ %nop11194 = alloca i1, i1 0
+ %nop11195 = alloca i1, i1 0
+ %nop11196 = alloca i1, i1 0
+ %nop11197 = alloca i1, i1 0
+ %nop11198 = alloca i1, i1 0
+ %nop11199 = alloca i1, i1 0
+ %nop11200 = alloca i1, i1 0
+ %nop11201 = alloca i1, i1 0
+ %nop11202 = alloca i1, i1 0
+ %nop11203 = alloca i1, i1 0
+ %nop11204 = alloca i1, i1 0
+ %nop11205 = alloca i1, i1 0
+ %nop11206 = alloca i1, i1 0
+ %nop11207 = alloca i1, i1 0
+ %nop11208 = alloca i1, i1 0
+ %nop11209 = alloca i1, i1 0
+ %nop11210 = alloca i1, i1 0
+ %nop11211 = alloca i1, i1 0
+ %nop11212 = alloca i1, i1 0
+ %nop11213 = alloca i1, i1 0
+ %nop11214 = alloca i1, i1 0
+ %nop11215 = alloca i1, i1 0
+ %nop11216 = alloca i1, i1 0
+ %nop11217 = alloca i1, i1 0
+ %nop11218 = alloca i1, i1 0
+ %nop11219 = alloca i1, i1 0
+ %nop11220 = alloca i1, i1 0
+ %nop11221 = alloca i1, i1 0
+ %nop11222 = alloca i1, i1 0
+ %nop11223 = alloca i1, i1 0
+ %nop11224 = alloca i1, i1 0
+ %nop11225 = alloca i1, i1 0
+ %nop11226 = alloca i1, i1 0
+ %nop11227 = alloca i1, i1 0
+ %nop11228 = alloca i1, i1 0
+ %nop11229 = alloca i1, i1 0
+ %nop11230 = alloca i1, i1 0
+ %nop11231 = alloca i1, i1 0
+ %nop11232 = alloca i1, i1 0
+ %nop11233 = alloca i1, i1 0
+ %nop11234 = alloca i1, i1 0
+ %nop11235 = alloca i1, i1 0
+ %nop11236 = alloca i1, i1 0
+ %nop11237 = alloca i1, i1 0
+ %nop11238 = alloca i1, i1 0
+ %nop11239 = alloca i1, i1 0
+ %nop11240 = alloca i1, i1 0
+ %nop11241 = alloca i1, i1 0
+ %nop11242 = alloca i1, i1 0
+ %nop11243 = alloca i1, i1 0
+ %nop11244 = alloca i1, i1 0
+ %nop11245 = alloca i1, i1 0
+ %nop11246 = alloca i1, i1 0
+ %nop11247 = alloca i1, i1 0
+ %nop11248 = alloca i1, i1 0
+ %nop11249 = alloca i1, i1 0
+ %nop11250 = alloca i1, i1 0
+ %nop11251 = alloca i1, i1 0
+ %nop11252 = alloca i1, i1 0
+ %nop11253 = alloca i1, i1 0
+ %nop11254 = alloca i1, i1 0
+ %nop11255 = alloca i1, i1 0
+ %nop11256 = alloca i1, i1 0
+ %nop11257 = alloca i1, i1 0
+ %nop11258 = alloca i1, i1 0
+ %nop11259 = alloca i1, i1 0
+ %nop11260 = alloca i1, i1 0
+ %nop11261 = alloca i1, i1 0
+ %nop11262 = alloca i1, i1 0
+ %nop11263 = alloca i1, i1 0
+ %nop11264 = alloca i1, i1 0
+ %nop11265 = alloca i1, i1 0
+ %nop11266 = alloca i1, i1 0
+ %nop11267 = alloca i1, i1 0
+ %nop11268 = alloca i1, i1 0
+ %nop11269 = alloca i1, i1 0
+ %nop11270 = alloca i1, i1 0
+ %nop11271 = alloca i1, i1 0
+ %nop11272 = alloca i1, i1 0
+ %nop11273 = alloca i1, i1 0
+ %nop11274 = alloca i1, i1 0
+ %nop11275 = alloca i1, i1 0
+ %nop11276 = alloca i1, i1 0
+ %nop11277 = alloca i1, i1 0
+ %nop11278 = alloca i1, i1 0
+ %nop11279 = alloca i1, i1 0
+ %nop11280 = alloca i1, i1 0
+ %nop11281 = alloca i1, i1 0
+ %nop11282 = alloca i1, i1 0
+ %nop11283 = alloca i1, i1 0
+ %nop11284 = alloca i1, i1 0
+ %nop11285 = alloca i1, i1 0
+ %nop11286 = alloca i1, i1 0
+ %nop11287 = alloca i1, i1 0
+ %nop11288 = alloca i1, i1 0
+ %nop11289 = alloca i1, i1 0
+ %nop11290 = alloca i1, i1 0
+ %nop11291 = alloca i1, i1 0
+ %nop11292 = alloca i1, i1 0
+ %nop11293 = alloca i1, i1 0
+ %nop11294 = alloca i1, i1 0
+ %nop11295 = alloca i1, i1 0
+ %nop11296 = alloca i1, i1 0
+ %nop11297 = alloca i1, i1 0
+ %nop11298 = alloca i1, i1 0
+ %nop11299 = alloca i1, i1 0
+ %nop11300 = alloca i1, i1 0
+ %nop11301 = alloca i1, i1 0
+ %nop11302 = alloca i1, i1 0
+ %nop11303 = alloca i1, i1 0
+ %nop11304 = alloca i1, i1 0
+ %nop11305 = alloca i1, i1 0
+ %nop11306 = alloca i1, i1 0
+ %nop11307 = alloca i1, i1 0
+ %nop11308 = alloca i1, i1 0
+ %nop11309 = alloca i1, i1 0
+ %nop11310 = alloca i1, i1 0
+ %nop11311 = alloca i1, i1 0
+ %nop11312 = alloca i1, i1 0
+ %nop11313 = alloca i1, i1 0
+ %nop11314 = alloca i1, i1 0
+ %nop11315 = alloca i1, i1 0
+ %nop11316 = alloca i1, i1 0
+ %nop11317 = alloca i1, i1 0
+ %nop11318 = alloca i1, i1 0
+ %nop11319 = alloca i1, i1 0
+ %nop11320 = alloca i1, i1 0
+ %nop11321 = alloca i1, i1 0
+ %nop11322 = alloca i1, i1 0
+ %nop11323 = alloca i1, i1 0
+ %nop11324 = alloca i1, i1 0
+ %nop11325 = alloca i1, i1 0
+ %nop11326 = alloca i1, i1 0
+ %nop11327 = alloca i1, i1 0
+ %nop11328 = alloca i1, i1 0
+ %nop11329 = alloca i1, i1 0
+ %nop11330 = alloca i1, i1 0
+ %nop11331 = alloca i1, i1 0
+ %nop11332 = alloca i1, i1 0
+ %nop11333 = alloca i1, i1 0
+ %nop11334 = alloca i1, i1 0
+ %nop11335 = alloca i1, i1 0
+ %nop11336 = alloca i1, i1 0
+ %nop11337 = alloca i1, i1 0
+ %nop11338 = alloca i1, i1 0
+ %nop11339 = alloca i1, i1 0
+ %nop11340 = alloca i1, i1 0
+ %nop11341 = alloca i1, i1 0
+ %nop11342 = alloca i1, i1 0
+ %nop11343 = alloca i1, i1 0
+ %nop11344 = alloca i1, i1 0
+ %nop11345 = alloca i1, i1 0
+ %nop11346 = alloca i1, i1 0
+ %nop11347 = alloca i1, i1 0
+ %nop11348 = alloca i1, i1 0
+ %nop11349 = alloca i1, i1 0
+ %nop11350 = alloca i1, i1 0
+ %nop11351 = alloca i1, i1 0
+ %nop11352 = alloca i1, i1 0
+ %nop11353 = alloca i1, i1 0
+ %nop11354 = alloca i1, i1 0
+ %nop11355 = alloca i1, i1 0
+ %nop11356 = alloca i1, i1 0
+ %nop11357 = alloca i1, i1 0
+ %nop11358 = alloca i1, i1 0
+ %nop11359 = alloca i1, i1 0
+ %nop11360 = alloca i1, i1 0
+ %nop11361 = alloca i1, i1 0
+ %nop11362 = alloca i1, i1 0
+ %nop11363 = alloca i1, i1 0
+ %nop11364 = alloca i1, i1 0
+ %nop11365 = alloca i1, i1 0
+ %nop11366 = alloca i1, i1 0
+ %nop11367 = alloca i1, i1 0
+ %nop11368 = alloca i1, i1 0
+ %nop11369 = alloca i1, i1 0
+ %nop11370 = alloca i1, i1 0
+ %nop11371 = alloca i1, i1 0
+ %nop11372 = alloca i1, i1 0
+ %nop11373 = alloca i1, i1 0
+ %nop11374 = alloca i1, i1 0
+ %nop11375 = alloca i1, i1 0
+ %nop11376 = alloca i1, i1 0
+ %nop11377 = alloca i1, i1 0
+ %nop11378 = alloca i1, i1 0
+ %nop11379 = alloca i1, i1 0
+ %nop11380 = alloca i1, i1 0
+ %nop11381 = alloca i1, i1 0
+ %nop11382 = alloca i1, i1 0
+ %nop11383 = alloca i1, i1 0
+ %nop11384 = alloca i1, i1 0
+ %nop11385 = alloca i1, i1 0
+ %nop11386 = alloca i1, i1 0
+ %nop11387 = alloca i1, i1 0
+ %nop11388 = alloca i1, i1 0
+ %nop11389 = alloca i1, i1 0
+ %nop11390 = alloca i1, i1 0
+ %nop11391 = alloca i1, i1 0
+ %nop11392 = alloca i1, i1 0
+ %nop11393 = alloca i1, i1 0
+ %nop11394 = alloca i1, i1 0
+ %nop11395 = alloca i1, i1 0
+ %nop11396 = alloca i1, i1 0
+ %nop11397 = alloca i1, i1 0
+ %nop11398 = alloca i1, i1 0
+ %nop11399 = alloca i1, i1 0
+ %nop11400 = alloca i1, i1 0
+ %nop11401 = alloca i1, i1 0
+ %nop11402 = alloca i1, i1 0
+ %nop11403 = alloca i1, i1 0
+ %nop11404 = alloca i1, i1 0
+ %nop11405 = alloca i1, i1 0
+ %nop11406 = alloca i1, i1 0
+ %nop11407 = alloca i1, i1 0
+ %nop11408 = alloca i1, i1 0
+ %nop11409 = alloca i1, i1 0
+ %nop11410 = alloca i1, i1 0
+ %nop11411 = alloca i1, i1 0
+ %nop11412 = alloca i1, i1 0
+ %nop11413 = alloca i1, i1 0
+ %nop11414 = alloca i1, i1 0
+ %nop11415 = alloca i1, i1 0
+ %nop11416 = alloca i1, i1 0
+ %nop11417 = alloca i1, i1 0
+ %nop11418 = alloca i1, i1 0
+ %nop11419 = alloca i1, i1 0
+ %nop11420 = alloca i1, i1 0
+ %nop11421 = alloca i1, i1 0
+ %nop11422 = alloca i1, i1 0
+ %nop11423 = alloca i1, i1 0
+ %nop11424 = alloca i1, i1 0
+ %nop11425 = alloca i1, i1 0
+ %nop11426 = alloca i1, i1 0
+ %nop11427 = alloca i1, i1 0
+ %nop11428 = alloca i1, i1 0
+ %nop11429 = alloca i1, i1 0
+ %nop11430 = alloca i1, i1 0
+ %nop11431 = alloca i1, i1 0
+ %nop11432 = alloca i1, i1 0
+ %nop11433 = alloca i1, i1 0
+ %nop11434 = alloca i1, i1 0
+ %nop11435 = alloca i1, i1 0
+ %nop11436 = alloca i1, i1 0
+ %nop11437 = alloca i1, i1 0
+ %nop11438 = alloca i1, i1 0
+ %nop11439 = alloca i1, i1 0
+ %nop11440 = alloca i1, i1 0
+ %nop11441 = alloca i1, i1 0
+ %nop11442 = alloca i1, i1 0
+ %nop11443 = alloca i1, i1 0
+ %nop11444 = alloca i1, i1 0
+ %nop11445 = alloca i1, i1 0
+ %nop11446 = alloca i1, i1 0
+ %nop11447 = alloca i1, i1 0
+ %nop11448 = alloca i1, i1 0
+ %nop11449 = alloca i1, i1 0
+ %nop11450 = alloca i1, i1 0
+ %nop11451 = alloca i1, i1 0
+ %nop11452 = alloca i1, i1 0
+ %nop11453 = alloca i1, i1 0
+ %nop11454 = alloca i1, i1 0
+ %nop11455 = alloca i1, i1 0
+ %nop11456 = alloca i1, i1 0
+ %nop11457 = alloca i1, i1 0
+ %nop11458 = alloca i1, i1 0
+ %nop11459 = alloca i1, i1 0
+ %nop11460 = alloca i1, i1 0
+ %nop11461 = alloca i1, i1 0
+ %nop11462 = alloca i1, i1 0
+ %nop11463 = alloca i1, i1 0
+ %nop11464 = alloca i1, i1 0
+ %nop11465 = alloca i1, i1 0
+ %nop11466 = alloca i1, i1 0
+ %nop11467 = alloca i1, i1 0
+ %nop11468 = alloca i1, i1 0
+ %nop11469 = alloca i1, i1 0
+ %nop11470 = alloca i1, i1 0
+ %nop11471 = alloca i1, i1 0
+ %nop11472 = alloca i1, i1 0
+ %nop11473 = alloca i1, i1 0
+ %nop11474 = alloca i1, i1 0
+ %nop11475 = alloca i1, i1 0
+ %nop11476 = alloca i1, i1 0
+ %nop11477 = alloca i1, i1 0
+ %nop11478 = alloca i1, i1 0
+ %nop11479 = alloca i1, i1 0
+ %nop11480 = alloca i1, i1 0
+ %nop11481 = alloca i1, i1 0
+ %nop11482 = alloca i1, i1 0
+ %nop11483 = alloca i1, i1 0
+ %nop11484 = alloca i1, i1 0
+ %nop11485 = alloca i1, i1 0
+ %nop11486 = alloca i1, i1 0
+ %nop11487 = alloca i1, i1 0
+ %nop11488 = alloca i1, i1 0
+ %nop11489 = alloca i1, i1 0
+ %nop11490 = alloca i1, i1 0
+ %nop11491 = alloca i1, i1 0
+ %nop11492 = alloca i1, i1 0
+ %nop11493 = alloca i1, i1 0
+ %nop11494 = alloca i1, i1 0
+ %nop11495 = alloca i1, i1 0
+ %nop11496 = alloca i1, i1 0
+ %nop11497 = alloca i1, i1 0
+ %nop11498 = alloca i1, i1 0
+ %nop11499 = alloca i1, i1 0
+ %nop11500 = alloca i1, i1 0
+ %nop11501 = alloca i1, i1 0
+ %nop11502 = alloca i1, i1 0
+ %nop11503 = alloca i1, i1 0
+ %nop11504 = alloca i1, i1 0
+ %nop11505 = alloca i1, i1 0
+ %nop11506 = alloca i1, i1 0
+ %nop11507 = alloca i1, i1 0
+ %nop11508 = alloca i1, i1 0
+ %nop11509 = alloca i1, i1 0
+ %nop11510 = alloca i1, i1 0
+ %nop11511 = alloca i1, i1 0
+ %nop11512 = alloca i1, i1 0
+ %nop11513 = alloca i1, i1 0
+ %nop11514 = alloca i1, i1 0
+ %nop11515 = alloca i1, i1 0
+ %nop11516 = alloca i1, i1 0
+ %nop11517 = alloca i1, i1 0
+ %nop11518 = alloca i1, i1 0
+ %nop11519 = alloca i1, i1 0
+ %nop11520 = alloca i1, i1 0
+ %nop11521 = alloca i1, i1 0
+ %nop11522 = alloca i1, i1 0
+ %nop11523 = alloca i1, i1 0
+ %nop11524 = alloca i1, i1 0
+ %nop11525 = alloca i1, i1 0
+ %nop11526 = alloca i1, i1 0
+ %nop11527 = alloca i1, i1 0
+ %nop11528 = alloca i1, i1 0
+ %nop11529 = alloca i1, i1 0
+ %nop11530 = alloca i1, i1 0
+ %nop11531 = alloca i1, i1 0
+ %nop11532 = alloca i1, i1 0
+ %nop11533 = alloca i1, i1 0
+ %nop11534 = alloca i1, i1 0
+ %nop11535 = alloca i1, i1 0
+ %nop11536 = alloca i1, i1 0
+ %nop11537 = alloca i1, i1 0
+ %nop11538 = alloca i1, i1 0
+ %nop11539 = alloca i1, i1 0
+ %nop11540 = alloca i1, i1 0
+ %nop11541 = alloca i1, i1 0
+ %nop11542 = alloca i1, i1 0
+ %nop11543 = alloca i1, i1 0
+ %nop11544 = alloca i1, i1 0
+ %nop11545 = alloca i1, i1 0
+ %nop11546 = alloca i1, i1 0
+ %nop11547 = alloca i1, i1 0
+ %nop11548 = alloca i1, i1 0
+ %nop11549 = alloca i1, i1 0
+ %nop11550 = alloca i1, i1 0
+ %nop11551 = alloca i1, i1 0
+ %nop11552 = alloca i1, i1 0
+ %nop11553 = alloca i1, i1 0
+ %nop11554 = alloca i1, i1 0
+ %nop11555 = alloca i1, i1 0
+ %nop11556 = alloca i1, i1 0
+ %nop11557 = alloca i1, i1 0
+ %nop11558 = alloca i1, i1 0
+ %nop11559 = alloca i1, i1 0
+ %nop11560 = alloca i1, i1 0
+ %nop11561 = alloca i1, i1 0
+ %nop11562 = alloca i1, i1 0
+ %nop11563 = alloca i1, i1 0
+ %nop11564 = alloca i1, i1 0
+ %nop11565 = alloca i1, i1 0
+ %nop11566 = alloca i1, i1 0
+ %nop11567 = alloca i1, i1 0
+ %nop11568 = alloca i1, i1 0
+ %nop11569 = alloca i1, i1 0
+ %nop11570 = alloca i1, i1 0
+ %nop11571 = alloca i1, i1 0
+ %nop11572 = alloca i1, i1 0
+ %nop11573 = alloca i1, i1 0
+ %nop11574 = alloca i1, i1 0
+ %nop11575 = alloca i1, i1 0
+ %nop11576 = alloca i1, i1 0
+ %nop11577 = alloca i1, i1 0
+ %nop11578 = alloca i1, i1 0
+ %nop11579 = alloca i1, i1 0
+ %nop11580 = alloca i1, i1 0
+ %nop11581 = alloca i1, i1 0
+ %nop11582 = alloca i1, i1 0
+ %nop11583 = alloca i1, i1 0
+ %nop11584 = alloca i1, i1 0
+ %nop11585 = alloca i1, i1 0
+ %nop11586 = alloca i1, i1 0
+ %nop11587 = alloca i1, i1 0
+ %nop11588 = alloca i1, i1 0
+ %nop11589 = alloca i1, i1 0
+ %nop11590 = alloca i1, i1 0
+ %nop11591 = alloca i1, i1 0
+ %nop11592 = alloca i1, i1 0
+ %nop11593 = alloca i1, i1 0
+ %nop11594 = alloca i1, i1 0
+ %nop11595 = alloca i1, i1 0
+ %nop11596 = alloca i1, i1 0
+ %nop11597 = alloca i1, i1 0
+ %nop11598 = alloca i1, i1 0
+ %nop11599 = alloca i1, i1 0
+ %nop11600 = alloca i1, i1 0
+ %nop11601 = alloca i1, i1 0
+ %nop11602 = alloca i1, i1 0
+ %nop11603 = alloca i1, i1 0
+ %nop11604 = alloca i1, i1 0
+ %nop11605 = alloca i1, i1 0
+ %nop11606 = alloca i1, i1 0
+ %nop11607 = alloca i1, i1 0
+ %nop11608 = alloca i1, i1 0
+ %nop11609 = alloca i1, i1 0
+ %nop11610 = alloca i1, i1 0
+ %nop11611 = alloca i1, i1 0
+ %nop11612 = alloca i1, i1 0
+ %nop11613 = alloca i1, i1 0
+ %nop11614 = alloca i1, i1 0
+ %nop11615 = alloca i1, i1 0
+ %nop11616 = alloca i1, i1 0
+ %nop11617 = alloca i1, i1 0
+ %nop11618 = alloca i1, i1 0
+ %nop11619 = alloca i1, i1 0
+ %nop11620 = alloca i1, i1 0
+ %nop11621 = alloca i1, i1 0
+ %nop11622 = alloca i1, i1 0
+ %nop11623 = alloca i1, i1 0
+ %nop11624 = alloca i1, i1 0
+ %nop11625 = alloca i1, i1 0
+ %nop11626 = alloca i1, i1 0
+ %nop11627 = alloca i1, i1 0
+ %nop11628 = alloca i1, i1 0
+ %nop11629 = alloca i1, i1 0
+ %nop11630 = alloca i1, i1 0
+ %nop11631 = alloca i1, i1 0
+ %nop11632 = alloca i1, i1 0
+ %nop11633 = alloca i1, i1 0
+ %nop11634 = alloca i1, i1 0
+ %nop11635 = alloca i1, i1 0
+ %nop11636 = alloca i1, i1 0
+ %nop11637 = alloca i1, i1 0
+ %nop11638 = alloca i1, i1 0
+ %nop11639 = alloca i1, i1 0
+ %nop11640 = alloca i1, i1 0
+ %nop11641 = alloca i1, i1 0
+ %nop11642 = alloca i1, i1 0
+ %nop11643 = alloca i1, i1 0
+ %nop11644 = alloca i1, i1 0
+ %nop11645 = alloca i1, i1 0
+ %nop11646 = alloca i1, i1 0
+ %nop11647 = alloca i1, i1 0
+ %nop11648 = alloca i1, i1 0
+ %nop11649 = alloca i1, i1 0
+ %nop11650 = alloca i1, i1 0
+ %nop11651 = alloca i1, i1 0
+ %nop11652 = alloca i1, i1 0
+ %nop11653 = alloca i1, i1 0
+ %nop11654 = alloca i1, i1 0
+ %nop11655 = alloca i1, i1 0
+ %nop11656 = alloca i1, i1 0
+ %nop11657 = alloca i1, i1 0
+ %nop11658 = alloca i1, i1 0
+ %nop11659 = alloca i1, i1 0
+ %nop11660 = alloca i1, i1 0
+ %nop11661 = alloca i1, i1 0
+ %nop11662 = alloca i1, i1 0
+ %nop11663 = alloca i1, i1 0
+ %nop11664 = alloca i1, i1 0
+ %nop11665 = alloca i1, i1 0
+ %nop11666 = alloca i1, i1 0
+ %nop11667 = alloca i1, i1 0
+ %nop11668 = alloca i1, i1 0
+ %nop11669 = alloca i1, i1 0
+ %nop11670 = alloca i1, i1 0
+ %nop11671 = alloca i1, i1 0
+ %nop11672 = alloca i1, i1 0
+ %nop11673 = alloca i1, i1 0
+ %nop11674 = alloca i1, i1 0
+ %nop11675 = alloca i1, i1 0
+ %nop11676 = alloca i1, i1 0
+ %nop11677 = alloca i1, i1 0
+ %nop11678 = alloca i1, i1 0
+ %nop11679 = alloca i1, i1 0
+ %nop11680 = alloca i1, i1 0
+ %nop11681 = alloca i1, i1 0
+ %nop11682 = alloca i1, i1 0
+ %nop11683 = alloca i1, i1 0
+ %nop11684 = alloca i1, i1 0
+ %nop11685 = alloca i1, i1 0
+ %nop11686 = alloca i1, i1 0
+ %nop11687 = alloca i1, i1 0
+ %nop11688 = alloca i1, i1 0
+ %nop11689 = alloca i1, i1 0
+ %nop11690 = alloca i1, i1 0
+ %nop11691 = alloca i1, i1 0
+ %nop11692 = alloca i1, i1 0
+ %nop11693 = alloca i1, i1 0
+ %nop11694 = alloca i1, i1 0
+ %nop11695 = alloca i1, i1 0
+ %nop11696 = alloca i1, i1 0
+ %nop11697 = alloca i1, i1 0
+ %nop11698 = alloca i1, i1 0
+ %nop11699 = alloca i1, i1 0
+ %nop11700 = alloca i1, i1 0
+ %nop11701 = alloca i1, i1 0
+ %nop11702 = alloca i1, i1 0
+ %nop11703 = alloca i1, i1 0
+ %nop11704 = alloca i1, i1 0
+ %nop11705 = alloca i1, i1 0
+ %nop11706 = alloca i1, i1 0
+ %nop11707 = alloca i1, i1 0
+ %nop11708 = alloca i1, i1 0
+ %nop11709 = alloca i1, i1 0
+ %nop11710 = alloca i1, i1 0
+ %nop11711 = alloca i1, i1 0
+ %nop11712 = alloca i1, i1 0
+ %nop11713 = alloca i1, i1 0
+ %nop11714 = alloca i1, i1 0
+ %nop11715 = alloca i1, i1 0
+ %nop11716 = alloca i1, i1 0
+ %nop11717 = alloca i1, i1 0
+ %nop11718 = alloca i1, i1 0
+ %nop11719 = alloca i1, i1 0
+ %nop11720 = alloca i1, i1 0
+ %nop11721 = alloca i1, i1 0
+ %nop11722 = alloca i1, i1 0
+ %nop11723 = alloca i1, i1 0
+ %nop11724 = alloca i1, i1 0
+ %nop11725 = alloca i1, i1 0
+ %nop11726 = alloca i1, i1 0
+ %nop11727 = alloca i1, i1 0
+ %nop11728 = alloca i1, i1 0
+ %nop11729 = alloca i1, i1 0
+ %nop11730 = alloca i1, i1 0
+ %nop11731 = alloca i1, i1 0
+ %nop11732 = alloca i1, i1 0
+ %nop11733 = alloca i1, i1 0
+ %nop11734 = alloca i1, i1 0
+ %nop11735 = alloca i1, i1 0
+ %nop11736 = alloca i1, i1 0
+ %nop11737 = alloca i1, i1 0
+ %nop11738 = alloca i1, i1 0
+ %nop11739 = alloca i1, i1 0
+ %nop11740 = alloca i1, i1 0
+ %nop11741 = alloca i1, i1 0
+ %nop11742 = alloca i1, i1 0
+ %nop11743 = alloca i1, i1 0
+ %nop11744 = alloca i1, i1 0
+ %nop11745 = alloca i1, i1 0
+ %nop11746 = alloca i1, i1 0
+ %nop11747 = alloca i1, i1 0
+ %nop11748 = alloca i1, i1 0
+ %nop11749 = alloca i1, i1 0
+ %nop11750 = alloca i1, i1 0
+ %nop11751 = alloca i1, i1 0
+ %nop11752 = alloca i1, i1 0
+ %nop11753 = alloca i1, i1 0
+ %nop11754 = alloca i1, i1 0
+ %nop11755 = alloca i1, i1 0
+ %nop11756 = alloca i1, i1 0
+ %nop11757 = alloca i1, i1 0
+ %nop11758 = alloca i1, i1 0
+ %nop11759 = alloca i1, i1 0
+ %nop11760 = alloca i1, i1 0
+ %nop11761 = alloca i1, i1 0
+ %nop11762 = alloca i1, i1 0
+ %nop11763 = alloca i1, i1 0
+ %nop11764 = alloca i1, i1 0
+ %nop11765 = alloca i1, i1 0
+ %nop11766 = alloca i1, i1 0
+ %nop11767 = alloca i1, i1 0
+ %nop11768 = alloca i1, i1 0
+ %nop11769 = alloca i1, i1 0
+ %nop11770 = alloca i1, i1 0
+ %nop11771 = alloca i1, i1 0
+ %nop11772 = alloca i1, i1 0
+ %nop11773 = alloca i1, i1 0
+ %nop11774 = alloca i1, i1 0
+ %nop11775 = alloca i1, i1 0
+ %nop11776 = alloca i1, i1 0
+ %nop11777 = alloca i1, i1 0
+ %nop11778 = alloca i1, i1 0
+ %nop11779 = alloca i1, i1 0
+ %nop11780 = alloca i1, i1 0
+ %nop11781 = alloca i1, i1 0
+ %nop11782 = alloca i1, i1 0
+ %nop11783 = alloca i1, i1 0
+ %nop11784 = alloca i1, i1 0
+ %nop11785 = alloca i1, i1 0
+ %nop11786 = alloca i1, i1 0
+ %nop11787 = alloca i1, i1 0
+ %nop11788 = alloca i1, i1 0
+ %nop11789 = alloca i1, i1 0
+ %nop11790 = alloca i1, i1 0
+ %nop11791 = alloca i1, i1 0
+ %nop11792 = alloca i1, i1 0
+ %nop11793 = alloca i1, i1 0
+ %nop11794 = alloca i1, i1 0
+ %nop11795 = alloca i1, i1 0
+ %nop11796 = alloca i1, i1 0
+ %nop11797 = alloca i1, i1 0
+ %nop11798 = alloca i1, i1 0
+ %nop11799 = alloca i1, i1 0
+ %nop11800 = alloca i1, i1 0
+ %nop11801 = alloca i1, i1 0
+ %nop11802 = alloca i1, i1 0
+ %nop11803 = alloca i1, i1 0
+ %nop11804 = alloca i1, i1 0
+ %nop11805 = alloca i1, i1 0
+ %nop11806 = alloca i1, i1 0
+ %nop11807 = alloca i1, i1 0
+ %nop11808 = alloca i1, i1 0
+ %nop11809 = alloca i1, i1 0
+ %nop11810 = alloca i1, i1 0
+ %nop11811 = alloca i1, i1 0
+ %nop11812 = alloca i1, i1 0
+ %nop11813 = alloca i1, i1 0
+ %nop11814 = alloca i1, i1 0
+ %nop11815 = alloca i1, i1 0
+ %nop11816 = alloca i1, i1 0
+ %nop11817 = alloca i1, i1 0
+ %nop11818 = alloca i1, i1 0
+ %nop11819 = alloca i1, i1 0
+ %nop11820 = alloca i1, i1 0
+ %nop11821 = alloca i1, i1 0
+ %nop11822 = alloca i1, i1 0
+ %nop11823 = alloca i1, i1 0
+ %nop11824 = alloca i1, i1 0
+ %nop11825 = alloca i1, i1 0
+ %nop11826 = alloca i1, i1 0
+ %nop11827 = alloca i1, i1 0
+ %nop11828 = alloca i1, i1 0
+ %nop11829 = alloca i1, i1 0
+ %nop11830 = alloca i1, i1 0
+ %nop11831 = alloca i1, i1 0
+ %nop11832 = alloca i1, i1 0
+ %nop11833 = alloca i1, i1 0
+ %nop11834 = alloca i1, i1 0
+ %nop11835 = alloca i1, i1 0
+ %nop11836 = alloca i1, i1 0
+ %nop11837 = alloca i1, i1 0
+ %nop11838 = alloca i1, i1 0
+ %nop11839 = alloca i1, i1 0
+ %nop11840 = alloca i1, i1 0
+ %nop11841 = alloca i1, i1 0
+ %nop11842 = alloca i1, i1 0
+ %nop11843 = alloca i1, i1 0
+ %nop11844 = alloca i1, i1 0
+ %nop11845 = alloca i1, i1 0
+ %nop11846 = alloca i1, i1 0
+ %nop11847 = alloca i1, i1 0
+ %nop11848 = alloca i1, i1 0
+ %nop11849 = alloca i1, i1 0
+ %nop11850 = alloca i1, i1 0
+ %nop11851 = alloca i1, i1 0
+ %nop11852 = alloca i1, i1 0
+ %nop11853 = alloca i1, i1 0
+ %nop11854 = alloca i1, i1 0
+ %nop11855 = alloca i1, i1 0
+ %nop11856 = alloca i1, i1 0
+ %nop11857 = alloca i1, i1 0
+ %nop11858 = alloca i1, i1 0
+ %nop11859 = alloca i1, i1 0
+ %nop11860 = alloca i1, i1 0
+ %nop11861 = alloca i1, i1 0
+ %nop11862 = alloca i1, i1 0
+ %nop11863 = alloca i1, i1 0
+ %nop11864 = alloca i1, i1 0
+ %nop11865 = alloca i1, i1 0
+ %nop11866 = alloca i1, i1 0
+ %nop11867 = alloca i1, i1 0
+ %nop11868 = alloca i1, i1 0
+ %nop11869 = alloca i1, i1 0
+ %nop11870 = alloca i1, i1 0
+ %nop11871 = alloca i1, i1 0
+ %nop11872 = alloca i1, i1 0
+ %nop11873 = alloca i1, i1 0
+ %nop11874 = alloca i1, i1 0
+ %nop11875 = alloca i1, i1 0
+ %nop11876 = alloca i1, i1 0
+ %nop11877 = alloca i1, i1 0
+ %nop11878 = alloca i1, i1 0
+ %nop11879 = alloca i1, i1 0
+ %nop11880 = alloca i1, i1 0
+ %nop11881 = alloca i1, i1 0
+ %nop11882 = alloca i1, i1 0
+ %nop11883 = alloca i1, i1 0
+ %nop11884 = alloca i1, i1 0
+ %nop11885 = alloca i1, i1 0
+ %nop11886 = alloca i1, i1 0
+ %nop11887 = alloca i1, i1 0
+ %nop11888 = alloca i1, i1 0
+ %nop11889 = alloca i1, i1 0
+ %nop11890 = alloca i1, i1 0
+ %nop11891 = alloca i1, i1 0
+ %nop11892 = alloca i1, i1 0
+ %nop11893 = alloca i1, i1 0
+ %nop11894 = alloca i1, i1 0
+ %nop11895 = alloca i1, i1 0
+ %nop11896 = alloca i1, i1 0
+ %nop11897 = alloca i1, i1 0
+ %nop11898 = alloca i1, i1 0
+ %nop11899 = alloca i1, i1 0
+ %nop11900 = alloca i1, i1 0
+ %nop11901 = alloca i1, i1 0
+ %nop11902 = alloca i1, i1 0
+ %nop11903 = alloca i1, i1 0
+ %nop11904 = alloca i1, i1 0
+ %nop11905 = alloca i1, i1 0
+ %nop11906 = alloca i1, i1 0
+ %nop11907 = alloca i1, i1 0
+ %nop11908 = alloca i1, i1 0
+ %nop11909 = alloca i1, i1 0
+ %nop11910 = alloca i1, i1 0
+ %nop11911 = alloca i1, i1 0
+ %nop11912 = alloca i1, i1 0
+ %nop11913 = alloca i1, i1 0
+ %nop11914 = alloca i1, i1 0
+ %nop11915 = alloca i1, i1 0
+ %nop11916 = alloca i1, i1 0
+ %nop11917 = alloca i1, i1 0
+ %nop11918 = alloca i1, i1 0
+ %nop11919 = alloca i1, i1 0
+ %nop11920 = alloca i1, i1 0
+ %nop11921 = alloca i1, i1 0
+ %nop11922 = alloca i1, i1 0
+ %nop11923 = alloca i1, i1 0
+ %nop11924 = alloca i1, i1 0
+ %nop11925 = alloca i1, i1 0
+ %nop11926 = alloca i1, i1 0
+ %nop11927 = alloca i1, i1 0
+ %nop11928 = alloca i1, i1 0
+ %nop11929 = alloca i1, i1 0
+ %nop11930 = alloca i1, i1 0
+ %nop11931 = alloca i1, i1 0
+ %nop11932 = alloca i1, i1 0
+ %nop11933 = alloca i1, i1 0
+ %nop11934 = alloca i1, i1 0
+ %nop11935 = alloca i1, i1 0
+ %nop11936 = alloca i1, i1 0
+ %nop11937 = alloca i1, i1 0
+ %nop11938 = alloca i1, i1 0
+ %nop11939 = alloca i1, i1 0
+ %nop11940 = alloca i1, i1 0
+ %nop11941 = alloca i1, i1 0
+ %nop11942 = alloca i1, i1 0
+ %nop11943 = alloca i1, i1 0
+ %nop11944 = alloca i1, i1 0
+ %nop11945 = alloca i1, i1 0
+ %nop11946 = alloca i1, i1 0
+ %nop11947 = alloca i1, i1 0
+ %nop11948 = alloca i1, i1 0
+ %nop11949 = alloca i1, i1 0
+ %nop11950 = alloca i1, i1 0
+ %nop11951 = alloca i1, i1 0
+ %nop11952 = alloca i1, i1 0
+ %nop11953 = alloca i1, i1 0
+ %nop11954 = alloca i1, i1 0
+ %nop11955 = alloca i1, i1 0
+ %nop11956 = alloca i1, i1 0
+ %nop11957 = alloca i1, i1 0
+ %nop11958 = alloca i1, i1 0
+ %nop11959 = alloca i1, i1 0
+ %nop11960 = alloca i1, i1 0
+ %nop11961 = alloca i1, i1 0
+ %nop11962 = alloca i1, i1 0
+ %nop11963 = alloca i1, i1 0
+ %nop11964 = alloca i1, i1 0
+ %nop11965 = alloca i1, i1 0
+ %nop11966 = alloca i1, i1 0
+ %nop11967 = alloca i1, i1 0
+ %nop11968 = alloca i1, i1 0
+ %nop11969 = alloca i1, i1 0
+ %nop11970 = alloca i1, i1 0
+ %nop11971 = alloca i1, i1 0
+ %nop11972 = alloca i1, i1 0
+ %nop11973 = alloca i1, i1 0
+ %nop11974 = alloca i1, i1 0
+ %nop11975 = alloca i1, i1 0
+ %nop11976 = alloca i1, i1 0
+ %nop11977 = alloca i1, i1 0
+ %nop11978 = alloca i1, i1 0
+ %nop11979 = alloca i1, i1 0
+ %nop11980 = alloca i1, i1 0
+ %nop11981 = alloca i1, i1 0
+ %nop11982 = alloca i1, i1 0
+ %nop11983 = alloca i1, i1 0
+ %nop11984 = alloca i1, i1 0
+ %nop11985 = alloca i1, i1 0
+ %nop11986 = alloca i1, i1 0
+ %nop11987 = alloca i1, i1 0
+ %nop11988 = alloca i1, i1 0
+ %nop11989 = alloca i1, i1 0
+ %nop11990 = alloca i1, i1 0
+ %nop11991 = alloca i1, i1 0
+ %nop11992 = alloca i1, i1 0
+ %nop11993 = alloca i1, i1 0
+ %nop11994 = alloca i1, i1 0
+ %nop11995 = alloca i1, i1 0
+ %nop11996 = alloca i1, i1 0
+ %nop11997 = alloca i1, i1 0
+ %nop11998 = alloca i1, i1 0
+ %nop11999 = alloca i1, i1 0
+ %nop12000 = alloca i1, i1 0
+ %nop12001 = alloca i1, i1 0
+ %nop12002 = alloca i1, i1 0
+ %nop12003 = alloca i1, i1 0
+ %nop12004 = alloca i1, i1 0
+ %nop12005 = alloca i1, i1 0
+ %nop12006 = alloca i1, i1 0
+ %nop12007 = alloca i1, i1 0
+ %nop12008 = alloca i1, i1 0
+ %nop12009 = alloca i1, i1 0
+ %nop12010 = alloca i1, i1 0
+ %nop12011 = alloca i1, i1 0
+ %nop12012 = alloca i1, i1 0
+ %nop12013 = alloca i1, i1 0
+ %nop12014 = alloca i1, i1 0
+ %nop12015 = alloca i1, i1 0
+ %nop12016 = alloca i1, i1 0
+ %nop12017 = alloca i1, i1 0
+ %nop12018 = alloca i1, i1 0
+ %nop12019 = alloca i1, i1 0
+ %nop12020 = alloca i1, i1 0
+ %nop12021 = alloca i1, i1 0
+ %nop12022 = alloca i1, i1 0
+ %nop12023 = alloca i1, i1 0
+ %nop12024 = alloca i1, i1 0
+ %nop12025 = alloca i1, i1 0
+ %nop12026 = alloca i1, i1 0
+ %nop12027 = alloca i1, i1 0
+ %nop12028 = alloca i1, i1 0
+ %nop12029 = alloca i1, i1 0
+ %nop12030 = alloca i1, i1 0
+ %nop12031 = alloca i1, i1 0
+ %nop12032 = alloca i1, i1 0
+ %nop12033 = alloca i1, i1 0
+ %nop12034 = alloca i1, i1 0
+ %nop12035 = alloca i1, i1 0
+ %nop12036 = alloca i1, i1 0
+ %nop12037 = alloca i1, i1 0
+ %nop12038 = alloca i1, i1 0
+ %nop12039 = alloca i1, i1 0
+ %nop12040 = alloca i1, i1 0
+ %nop12041 = alloca i1, i1 0
+ %nop12042 = alloca i1, i1 0
+ %nop12043 = alloca i1, i1 0
+ %nop12044 = alloca i1, i1 0
+ %nop12045 = alloca i1, i1 0
+ %nop12046 = alloca i1, i1 0
+ %nop12047 = alloca i1, i1 0
+ %nop12048 = alloca i1, i1 0
+ %nop12049 = alloca i1, i1 0
+ %nop12050 = alloca i1, i1 0
+ %nop12051 = alloca i1, i1 0
+ %nop12052 = alloca i1, i1 0
+ %nop12053 = alloca i1, i1 0
+ %nop12054 = alloca i1, i1 0
+ %nop12055 = alloca i1, i1 0
+ %nop12056 = alloca i1, i1 0
+ %nop12057 = alloca i1, i1 0
+ %nop12058 = alloca i1, i1 0
+ %nop12059 = alloca i1, i1 0
+ %nop12060 = alloca i1, i1 0
+ %nop12061 = alloca i1, i1 0
+ %nop12062 = alloca i1, i1 0
+ %nop12063 = alloca i1, i1 0
+ %nop12064 = alloca i1, i1 0
+ %nop12065 = alloca i1, i1 0
+ %nop12066 = alloca i1, i1 0
+ %nop12067 = alloca i1, i1 0
+ %nop12068 = alloca i1, i1 0
+ %nop12069 = alloca i1, i1 0
+ %nop12070 = alloca i1, i1 0
+ %nop12071 = alloca i1, i1 0
+ %nop12072 = alloca i1, i1 0
+ %nop12073 = alloca i1, i1 0
+ %nop12074 = alloca i1, i1 0
+ %nop12075 = alloca i1, i1 0
+ %nop12076 = alloca i1, i1 0
+ %nop12077 = alloca i1, i1 0
+ %nop12078 = alloca i1, i1 0
+ %nop12079 = alloca i1, i1 0
+ %nop12080 = alloca i1, i1 0
+ %nop12081 = alloca i1, i1 0
+ %nop12082 = alloca i1, i1 0
+ %nop12083 = alloca i1, i1 0
+ %nop12084 = alloca i1, i1 0
+ %nop12085 = alloca i1, i1 0
+ %nop12086 = alloca i1, i1 0
+ %nop12087 = alloca i1, i1 0
+ %nop12088 = alloca i1, i1 0
+ %nop12089 = alloca i1, i1 0
+ %nop12090 = alloca i1, i1 0
+ %nop12091 = alloca i1, i1 0
+ %nop12092 = alloca i1, i1 0
+ %nop12093 = alloca i1, i1 0
+ %nop12094 = alloca i1, i1 0
+ %nop12095 = alloca i1, i1 0
+ %nop12096 = alloca i1, i1 0
+ %nop12097 = alloca i1, i1 0
+ %nop12098 = alloca i1, i1 0
+ %nop12099 = alloca i1, i1 0
+ %nop12100 = alloca i1, i1 0
+ %nop12101 = alloca i1, i1 0
+ %nop12102 = alloca i1, i1 0
+ %nop12103 = alloca i1, i1 0
+ %nop12104 = alloca i1, i1 0
+ %nop12105 = alloca i1, i1 0
+ %nop12106 = alloca i1, i1 0
+ %nop12107 = alloca i1, i1 0
+ %nop12108 = alloca i1, i1 0
+ %nop12109 = alloca i1, i1 0
+ %nop12110 = alloca i1, i1 0
+ %nop12111 = alloca i1, i1 0
+ %nop12112 = alloca i1, i1 0
+ %nop12113 = alloca i1, i1 0
+ %nop12114 = alloca i1, i1 0
+ %nop12115 = alloca i1, i1 0
+ %nop12116 = alloca i1, i1 0
+ %nop12117 = alloca i1, i1 0
+ %nop12118 = alloca i1, i1 0
+ %nop12119 = alloca i1, i1 0
+ %nop12120 = alloca i1, i1 0
+ %nop12121 = alloca i1, i1 0
+ %nop12122 = alloca i1, i1 0
+ %nop12123 = alloca i1, i1 0
+ %nop12124 = alloca i1, i1 0
+ %nop12125 = alloca i1, i1 0
+ %nop12126 = alloca i1, i1 0
+ %nop12127 = alloca i1, i1 0
+ %nop12128 = alloca i1, i1 0
+ %nop12129 = alloca i1, i1 0
+ %nop12130 = alloca i1, i1 0
+ %nop12131 = alloca i1, i1 0
+ %nop12132 = alloca i1, i1 0
+ %nop12133 = alloca i1, i1 0
+ %nop12134 = alloca i1, i1 0
+ %nop12135 = alloca i1, i1 0
+ %nop12136 = alloca i1, i1 0
+ %nop12137 = alloca i1, i1 0
+ %nop12138 = alloca i1, i1 0
+ %nop12139 = alloca i1, i1 0
+ %nop12140 = alloca i1, i1 0
+ %nop12141 = alloca i1, i1 0
+ %nop12142 = alloca i1, i1 0
+ %nop12143 = alloca i1, i1 0
+ %nop12144 = alloca i1, i1 0
+ %nop12145 = alloca i1, i1 0
+ %nop12146 = alloca i1, i1 0
+ %nop12147 = alloca i1, i1 0
+ %nop12148 = alloca i1, i1 0
+ %nop12149 = alloca i1, i1 0
+ %nop12150 = alloca i1, i1 0
+ %nop12151 = alloca i1, i1 0
+ %nop12152 = alloca i1, i1 0
+ %nop12153 = alloca i1, i1 0
+ %nop12154 = alloca i1, i1 0
+ %nop12155 = alloca i1, i1 0
+ %nop12156 = alloca i1, i1 0
+ %nop12157 = alloca i1, i1 0
+ %nop12158 = alloca i1, i1 0
+ %nop12159 = alloca i1, i1 0
+ %nop12160 = alloca i1, i1 0
+ %nop12161 = alloca i1, i1 0
+ %nop12162 = alloca i1, i1 0
+ %nop12163 = alloca i1, i1 0
+ %nop12164 = alloca i1, i1 0
+ %nop12165 = alloca i1, i1 0
+ %nop12166 = alloca i1, i1 0
+ %nop12167 = alloca i1, i1 0
+ %nop12168 = alloca i1, i1 0
+ %nop12169 = alloca i1, i1 0
+ %nop12170 = alloca i1, i1 0
+ %nop12171 = alloca i1, i1 0
+ %nop12172 = alloca i1, i1 0
+ %nop12173 = alloca i1, i1 0
+ %nop12174 = alloca i1, i1 0
+ %nop12175 = alloca i1, i1 0
+ %nop12176 = alloca i1, i1 0
+ %nop12177 = alloca i1, i1 0
+ %nop12178 = alloca i1, i1 0
+ %nop12179 = alloca i1, i1 0
+ %nop12180 = alloca i1, i1 0
+ %nop12181 = alloca i1, i1 0
+ %nop12182 = alloca i1, i1 0
+ %nop12183 = alloca i1, i1 0
+ %nop12184 = alloca i1, i1 0
+ %nop12185 = alloca i1, i1 0
+ %nop12186 = alloca i1, i1 0
+ %nop12187 = alloca i1, i1 0
+ %nop12188 = alloca i1, i1 0
+ %nop12189 = alloca i1, i1 0
+ %nop12190 = alloca i1, i1 0
+ %nop12191 = alloca i1, i1 0
+ %nop12192 = alloca i1, i1 0
+ %nop12193 = alloca i1, i1 0
+ %nop12194 = alloca i1, i1 0
+ %nop12195 = alloca i1, i1 0
+ %nop12196 = alloca i1, i1 0
+ %nop12197 = alloca i1, i1 0
+ %nop12198 = alloca i1, i1 0
+ %nop12199 = alloca i1, i1 0
+ %nop12200 = alloca i1, i1 0
+ %nop12201 = alloca i1, i1 0
+ %nop12202 = alloca i1, i1 0
+ %nop12203 = alloca i1, i1 0
+ %nop12204 = alloca i1, i1 0
+ %nop12205 = alloca i1, i1 0
+ %nop12206 = alloca i1, i1 0
+ %nop12207 = alloca i1, i1 0
+ %nop12208 = alloca i1, i1 0
+ %nop12209 = alloca i1, i1 0
+ %nop12210 = alloca i1, i1 0
+ %nop12211 = alloca i1, i1 0
+ %nop12212 = alloca i1, i1 0
+ %nop12213 = alloca i1, i1 0
+ %nop12214 = alloca i1, i1 0
+ %nop12215 = alloca i1, i1 0
+ %nop12216 = alloca i1, i1 0
+ %nop12217 = alloca i1, i1 0
+ %nop12218 = alloca i1, i1 0
+ %nop12219 = alloca i1, i1 0
+ %nop12220 = alloca i1, i1 0
+ %nop12221 = alloca i1, i1 0
+ %nop12222 = alloca i1, i1 0
+ %nop12223 = alloca i1, i1 0
+ %nop12224 = alloca i1, i1 0
+ %nop12225 = alloca i1, i1 0
+ %nop12226 = alloca i1, i1 0
+ %nop12227 = alloca i1, i1 0
+ %nop12228 = alloca i1, i1 0
+ %nop12229 = alloca i1, i1 0
+ %nop12230 = alloca i1, i1 0
+ %nop12231 = alloca i1, i1 0
+ %nop12232 = alloca i1, i1 0
+ %nop12233 = alloca i1, i1 0
+ %nop12234 = alloca i1, i1 0
+ %nop12235 = alloca i1, i1 0
+ %nop12236 = alloca i1, i1 0
+ %nop12237 = alloca i1, i1 0
+ %nop12238 = alloca i1, i1 0
+ %nop12239 = alloca i1, i1 0
+ %nop12240 = alloca i1, i1 0
+ %nop12241 = alloca i1, i1 0
+ %nop12242 = alloca i1, i1 0
+ %nop12243 = alloca i1, i1 0
+ %nop12244 = alloca i1, i1 0
+ %nop12245 = alloca i1, i1 0
+ %nop12246 = alloca i1, i1 0
+ %nop12247 = alloca i1, i1 0
+ %nop12248 = alloca i1, i1 0
+ %nop12249 = alloca i1, i1 0
+ %nop12250 = alloca i1, i1 0
+ %nop12251 = alloca i1, i1 0
+ %nop12252 = alloca i1, i1 0
+ %nop12253 = alloca i1, i1 0
+ %nop12254 = alloca i1, i1 0
+ %nop12255 = alloca i1, i1 0
+ %nop12256 = alloca i1, i1 0
+ %nop12257 = alloca i1, i1 0
+ %nop12258 = alloca i1, i1 0
+ %nop12259 = alloca i1, i1 0
+ %nop12260 = alloca i1, i1 0
+ %nop12261 = alloca i1, i1 0
+ %nop12262 = alloca i1, i1 0
+ %nop12263 = alloca i1, i1 0
+ %nop12264 = alloca i1, i1 0
+ %nop12265 = alloca i1, i1 0
+ %nop12266 = alloca i1, i1 0
+ %nop12267 = alloca i1, i1 0
+ %nop12268 = alloca i1, i1 0
+ %nop12269 = alloca i1, i1 0
+ %nop12270 = alloca i1, i1 0
+ %nop12271 = alloca i1, i1 0
+ %nop12272 = alloca i1, i1 0
+ %nop12273 = alloca i1, i1 0
+ %nop12274 = alloca i1, i1 0
+ %nop12275 = alloca i1, i1 0
+ %nop12276 = alloca i1, i1 0
+ %nop12277 = alloca i1, i1 0
+ %nop12278 = alloca i1, i1 0
+ %nop12279 = alloca i1, i1 0
+ %nop12280 = alloca i1, i1 0
+ %nop12281 = alloca i1, i1 0
+ %nop12282 = alloca i1, i1 0
+ %nop12283 = alloca i1, i1 0
+ %nop12284 = alloca i1, i1 0
+ %nop12285 = alloca i1, i1 0
+ %nop12286 = alloca i1, i1 0
+ %nop12287 = alloca i1, i1 0
+ %nop12288 = alloca i1, i1 0
+ %nop12289 = alloca i1, i1 0
+ %nop12290 = alloca i1, i1 0
+ %nop12291 = alloca i1, i1 0
+ %nop12292 = alloca i1, i1 0
+ %nop12293 = alloca i1, i1 0
+ %nop12294 = alloca i1, i1 0
+ %nop12295 = alloca i1, i1 0
+ %nop12296 = alloca i1, i1 0
+ %nop12297 = alloca i1, i1 0
+ %nop12298 = alloca i1, i1 0
+ %nop12299 = alloca i1, i1 0
+ %nop12300 = alloca i1, i1 0
+ %nop12301 = alloca i1, i1 0
+ %nop12302 = alloca i1, i1 0
+ %nop12303 = alloca i1, i1 0
+ %nop12304 = alloca i1, i1 0
+ %nop12305 = alloca i1, i1 0
+ %nop12306 = alloca i1, i1 0
+ %nop12307 = alloca i1, i1 0
+ %nop12308 = alloca i1, i1 0
+ %nop12309 = alloca i1, i1 0
+ %nop12310 = alloca i1, i1 0
+ %nop12311 = alloca i1, i1 0
+ %nop12312 = alloca i1, i1 0
+ %nop12313 = alloca i1, i1 0
+ %nop12314 = alloca i1, i1 0
+ %nop12315 = alloca i1, i1 0
+ %nop12316 = alloca i1, i1 0
+ %nop12317 = alloca i1, i1 0
+ %nop12318 = alloca i1, i1 0
+ %nop12319 = alloca i1, i1 0
+ %nop12320 = alloca i1, i1 0
+ %nop12321 = alloca i1, i1 0
+ %nop12322 = alloca i1, i1 0
+ %nop12323 = alloca i1, i1 0
+ %nop12324 = alloca i1, i1 0
+ %nop12325 = alloca i1, i1 0
+ %nop12326 = alloca i1, i1 0
+ %nop12327 = alloca i1, i1 0
+ %nop12328 = alloca i1, i1 0
+ %nop12329 = alloca i1, i1 0
+ %nop12330 = alloca i1, i1 0
+ %nop12331 = alloca i1, i1 0
+ %nop12332 = alloca i1, i1 0
+ %nop12333 = alloca i1, i1 0
+ %nop12334 = alloca i1, i1 0
+ %nop12335 = alloca i1, i1 0
+ %nop12336 = alloca i1, i1 0
+ %nop12337 = alloca i1, i1 0
+ %nop12338 = alloca i1, i1 0
+ %nop12339 = alloca i1, i1 0
+ %nop12340 = alloca i1, i1 0
+ %nop12341 = alloca i1, i1 0
+ %nop12342 = alloca i1, i1 0
+ %nop12343 = alloca i1, i1 0
+ %nop12344 = alloca i1, i1 0
+ %nop12345 = alloca i1, i1 0
+ %nop12346 = alloca i1, i1 0
+ %nop12347 = alloca i1, i1 0
+ %nop12348 = alloca i1, i1 0
+ %nop12349 = alloca i1, i1 0
+ %nop12350 = alloca i1, i1 0
+ %nop12351 = alloca i1, i1 0
+ %nop12352 = alloca i1, i1 0
+ %nop12353 = alloca i1, i1 0
+ %nop12354 = alloca i1, i1 0
+ %nop12355 = alloca i1, i1 0
+ %nop12356 = alloca i1, i1 0
+ %nop12357 = alloca i1, i1 0
+ %nop12358 = alloca i1, i1 0
+ %nop12359 = alloca i1, i1 0
+ %nop12360 = alloca i1, i1 0
+ %nop12361 = alloca i1, i1 0
+ %nop12362 = alloca i1, i1 0
+ %nop12363 = alloca i1, i1 0
+ %nop12364 = alloca i1, i1 0
+ %nop12365 = alloca i1, i1 0
+ %nop12366 = alloca i1, i1 0
+ %nop12367 = alloca i1, i1 0
+ %nop12368 = alloca i1, i1 0
+ %nop12369 = alloca i1, i1 0
+ %nop12370 = alloca i1, i1 0
+ %nop12371 = alloca i1, i1 0
+ %nop12372 = alloca i1, i1 0
+ %nop12373 = alloca i1, i1 0
+ %nop12374 = alloca i1, i1 0
+ %nop12375 = alloca i1, i1 0
+ %nop12376 = alloca i1, i1 0
+ %nop12377 = alloca i1, i1 0
+ %nop12378 = alloca i1, i1 0
+ %nop12379 = alloca i1, i1 0
+ %nop12380 = alloca i1, i1 0
+ %nop12381 = alloca i1, i1 0
+ %nop12382 = alloca i1, i1 0
+ %nop12383 = alloca i1, i1 0
+ %nop12384 = alloca i1, i1 0
+ %nop12385 = alloca i1, i1 0
+ %nop12386 = alloca i1, i1 0
+ %nop12387 = alloca i1, i1 0
+ %nop12388 = alloca i1, i1 0
+ %nop12389 = alloca i1, i1 0
+ %nop12390 = alloca i1, i1 0
+ %nop12391 = alloca i1, i1 0
+ %nop12392 = alloca i1, i1 0
+ %nop12393 = alloca i1, i1 0
+ %nop12394 = alloca i1, i1 0
+ %nop12395 = alloca i1, i1 0
+ %nop12396 = alloca i1, i1 0
+ %nop12397 = alloca i1, i1 0
+ %nop12398 = alloca i1, i1 0
+ %nop12399 = alloca i1, i1 0
+ %nop12400 = alloca i1, i1 0
+ %nop12401 = alloca i1, i1 0
+ %nop12402 = alloca i1, i1 0
+ %nop12403 = alloca i1, i1 0
+ %nop12404 = alloca i1, i1 0
+ %nop12405 = alloca i1, i1 0
+ %nop12406 = alloca i1, i1 0
+ %nop12407 = alloca i1, i1 0
+ %nop12408 = alloca i1, i1 0
+ %nop12409 = alloca i1, i1 0
+ %nop12410 = alloca i1, i1 0
+ %nop12411 = alloca i1, i1 0
+ %nop12412 = alloca i1, i1 0
+ %nop12413 = alloca i1, i1 0
+ %nop12414 = alloca i1, i1 0
+ %nop12415 = alloca i1, i1 0
+ %nop12416 = alloca i1, i1 0
+ %nop12417 = alloca i1, i1 0
+ %nop12418 = alloca i1, i1 0
+ %nop12419 = alloca i1, i1 0
+ %nop12420 = alloca i1, i1 0
+ %nop12421 = alloca i1, i1 0
+ %nop12422 = alloca i1, i1 0
+ %nop12423 = alloca i1, i1 0
+ %nop12424 = alloca i1, i1 0
+ %nop12425 = alloca i1, i1 0
+ %nop12426 = alloca i1, i1 0
+ %nop12427 = alloca i1, i1 0
+ %nop12428 = alloca i1, i1 0
+ %nop12429 = alloca i1, i1 0
+ %nop12430 = alloca i1, i1 0
+ %nop12431 = alloca i1, i1 0
+ %nop12432 = alloca i1, i1 0
+ %nop12433 = alloca i1, i1 0
+ %nop12434 = alloca i1, i1 0
+ %nop12435 = alloca i1, i1 0
+ %nop12436 = alloca i1, i1 0
+ %nop12437 = alloca i1, i1 0
+ %nop12438 = alloca i1, i1 0
+ %nop12439 = alloca i1, i1 0
+ %nop12440 = alloca i1, i1 0
+ %nop12441 = alloca i1, i1 0
+ %nop12442 = alloca i1, i1 0
+ %nop12443 = alloca i1, i1 0
+ %nop12444 = alloca i1, i1 0
+ %nop12445 = alloca i1, i1 0
+ %nop12446 = alloca i1, i1 0
+ %nop12447 = alloca i1, i1 0
+ %nop12448 = alloca i1, i1 0
+ %nop12449 = alloca i1, i1 0
+ %nop12450 = alloca i1, i1 0
+ %nop12451 = alloca i1, i1 0
+ %nop12452 = alloca i1, i1 0
+ %nop12453 = alloca i1, i1 0
+ %nop12454 = alloca i1, i1 0
+ %nop12455 = alloca i1, i1 0
+ %nop12456 = alloca i1, i1 0
+ %nop12457 = alloca i1, i1 0
+ %nop12458 = alloca i1, i1 0
+ %nop12459 = alloca i1, i1 0
+ %nop12460 = alloca i1, i1 0
+ %nop12461 = alloca i1, i1 0
+ %nop12462 = alloca i1, i1 0
+ %nop12463 = alloca i1, i1 0
+ %nop12464 = alloca i1, i1 0
+ %nop12465 = alloca i1, i1 0
+ %nop12466 = alloca i1, i1 0
+ %nop12467 = alloca i1, i1 0
+ %nop12468 = alloca i1, i1 0
+ %nop12469 = alloca i1, i1 0
+ %nop12470 = alloca i1, i1 0
+ %nop12471 = alloca i1, i1 0
+ %nop12472 = alloca i1, i1 0
+ %nop12473 = alloca i1, i1 0
+ %nop12474 = alloca i1, i1 0
+ %nop12475 = alloca i1, i1 0
+ %nop12476 = alloca i1, i1 0
+ %nop12477 = alloca i1, i1 0
+ %nop12478 = alloca i1, i1 0
+ %nop12479 = alloca i1, i1 0
+ %nop12480 = alloca i1, i1 0
+ %nop12481 = alloca i1, i1 0
+ %nop12482 = alloca i1, i1 0
+ %nop12483 = alloca i1, i1 0
+ %nop12484 = alloca i1, i1 0
+ %nop12485 = alloca i1, i1 0
+ %nop12486 = alloca i1, i1 0
+ %nop12487 = alloca i1, i1 0
+ %nop12488 = alloca i1, i1 0
+ %nop12489 = alloca i1, i1 0
+ %nop12490 = alloca i1, i1 0
+ %nop12491 = alloca i1, i1 0
+ %nop12492 = alloca i1, i1 0
+ %nop12493 = alloca i1, i1 0
+ %nop12494 = alloca i1, i1 0
+ %nop12495 = alloca i1, i1 0
+ %nop12496 = alloca i1, i1 0
+ %nop12497 = alloca i1, i1 0
+ %nop12498 = alloca i1, i1 0
+ %nop12499 = alloca i1, i1 0
+ %nop12500 = alloca i1, i1 0
+ %nop12501 = alloca i1, i1 0
+ %nop12502 = alloca i1, i1 0
+ %nop12503 = alloca i1, i1 0
+ %nop12504 = alloca i1, i1 0
+ %nop12505 = alloca i1, i1 0
+ %nop12506 = alloca i1, i1 0
+ %nop12507 = alloca i1, i1 0
+ %nop12508 = alloca i1, i1 0
+ %nop12509 = alloca i1, i1 0
+ %nop12510 = alloca i1, i1 0
+ %nop12511 = alloca i1, i1 0
+ %nop12512 = alloca i1, i1 0
+ %nop12513 = alloca i1, i1 0
+ %nop12514 = alloca i1, i1 0
+ %nop12515 = alloca i1, i1 0
+ %nop12516 = alloca i1, i1 0
+ %nop12517 = alloca i1, i1 0
+ %nop12518 = alloca i1, i1 0
+ %nop12519 = alloca i1, i1 0
+ %nop12520 = alloca i1, i1 0
+ %nop12521 = alloca i1, i1 0
+ %nop12522 = alloca i1, i1 0
+ %nop12523 = alloca i1, i1 0
+ %nop12524 = alloca i1, i1 0
+ %nop12525 = alloca i1, i1 0
+ %nop12526 = alloca i1, i1 0
+ %nop12527 = alloca i1, i1 0
+ %nop12528 = alloca i1, i1 0
+ %nop12529 = alloca i1, i1 0
+ %nop12530 = alloca i1, i1 0
+ %nop12531 = alloca i1, i1 0
+ %nop12532 = alloca i1, i1 0
+ %nop12533 = alloca i1, i1 0
+ %nop12534 = alloca i1, i1 0
+ %nop12535 = alloca i1, i1 0
+ %nop12536 = alloca i1, i1 0
+ %nop12537 = alloca i1, i1 0
+ %nop12538 = alloca i1, i1 0
+ %nop12539 = alloca i1, i1 0
+ %nop12540 = alloca i1, i1 0
+ %nop12541 = alloca i1, i1 0
+ %nop12542 = alloca i1, i1 0
+ %nop12543 = alloca i1, i1 0
+ %nop12544 = alloca i1, i1 0
+ %nop12545 = alloca i1, i1 0
+ %nop12546 = alloca i1, i1 0
+ %nop12547 = alloca i1, i1 0
+ %nop12548 = alloca i1, i1 0
+ %nop12549 = alloca i1, i1 0
+ %nop12550 = alloca i1, i1 0
+ %nop12551 = alloca i1, i1 0
+ %nop12552 = alloca i1, i1 0
+ %nop12553 = alloca i1, i1 0
+ %nop12554 = alloca i1, i1 0
+ %nop12555 = alloca i1, i1 0
+ %nop12556 = alloca i1, i1 0
+ %nop12557 = alloca i1, i1 0
+ %nop12558 = alloca i1, i1 0
+ %nop12559 = alloca i1, i1 0
+ %nop12560 = alloca i1, i1 0
+ %nop12561 = alloca i1, i1 0
+ %nop12562 = alloca i1, i1 0
+ %nop12563 = alloca i1, i1 0
+ %nop12564 = alloca i1, i1 0
+ %nop12565 = alloca i1, i1 0
+ %nop12566 = alloca i1, i1 0
+ %nop12567 = alloca i1, i1 0
+ %nop12568 = alloca i1, i1 0
+ %nop12569 = alloca i1, i1 0
+ %nop12570 = alloca i1, i1 0
+ %nop12571 = alloca i1, i1 0
+ %nop12572 = alloca i1, i1 0
+ %nop12573 = alloca i1, i1 0
+ %nop12574 = alloca i1, i1 0
+ %nop12575 = alloca i1, i1 0
+ %nop12576 = alloca i1, i1 0
+ %nop12577 = alloca i1, i1 0
+ %nop12578 = alloca i1, i1 0
+ %nop12579 = alloca i1, i1 0
+ %nop12580 = alloca i1, i1 0
+ %nop12581 = alloca i1, i1 0
+ %nop12582 = alloca i1, i1 0
+ %nop12583 = alloca i1, i1 0
+ %nop12584 = alloca i1, i1 0
+ %nop12585 = alloca i1, i1 0
+ %nop12586 = alloca i1, i1 0
+ %nop12587 = alloca i1, i1 0
+ %nop12588 = alloca i1, i1 0
+ %nop12589 = alloca i1, i1 0
+ %nop12590 = alloca i1, i1 0
+ %nop12591 = alloca i1, i1 0
+ %nop12592 = alloca i1, i1 0
+ %nop12593 = alloca i1, i1 0
+ %nop12594 = alloca i1, i1 0
+ %nop12595 = alloca i1, i1 0
+ %nop12596 = alloca i1, i1 0
+ %nop12597 = alloca i1, i1 0
+ %nop12598 = alloca i1, i1 0
+ %nop12599 = alloca i1, i1 0
+ %nop12600 = alloca i1, i1 0
+ %nop12601 = alloca i1, i1 0
+ %nop12602 = alloca i1, i1 0
+ %nop12603 = alloca i1, i1 0
+ %nop12604 = alloca i1, i1 0
+ %nop12605 = alloca i1, i1 0
+ %nop12606 = alloca i1, i1 0
+ %nop12607 = alloca i1, i1 0
+ %nop12608 = alloca i1, i1 0
+ %nop12609 = alloca i1, i1 0
+ %nop12610 = alloca i1, i1 0
+ %nop12611 = alloca i1, i1 0
+ %nop12612 = alloca i1, i1 0
+ %nop12613 = alloca i1, i1 0
+ %nop12614 = alloca i1, i1 0
+ %nop12615 = alloca i1, i1 0
+ %nop12616 = alloca i1, i1 0
+ %nop12617 = alloca i1, i1 0
+ %nop12618 = alloca i1, i1 0
+ %nop12619 = alloca i1, i1 0
+ %nop12620 = alloca i1, i1 0
+ %nop12621 = alloca i1, i1 0
+ %nop12622 = alloca i1, i1 0
+ %nop12623 = alloca i1, i1 0
+ %nop12624 = alloca i1, i1 0
+ %nop12625 = alloca i1, i1 0
+ %nop12626 = alloca i1, i1 0
+ %nop12627 = alloca i1, i1 0
+ %nop12628 = alloca i1, i1 0
+ %nop12629 = alloca i1, i1 0
+ %nop12630 = alloca i1, i1 0
+ %nop12631 = alloca i1, i1 0
+ %nop12632 = alloca i1, i1 0
+ %nop12633 = alloca i1, i1 0
+ %nop12634 = alloca i1, i1 0
+ %nop12635 = alloca i1, i1 0
+ %nop12636 = alloca i1, i1 0
+ %nop12637 = alloca i1, i1 0
+ %nop12638 = alloca i1, i1 0
+ %nop12639 = alloca i1, i1 0
+ %nop12640 = alloca i1, i1 0
+ %nop12641 = alloca i1, i1 0
+ %nop12642 = alloca i1, i1 0
+ %nop12643 = alloca i1, i1 0
+ %nop12644 = alloca i1, i1 0
+ %nop12645 = alloca i1, i1 0
+ %nop12646 = alloca i1, i1 0
+ %nop12647 = alloca i1, i1 0
+ %nop12648 = alloca i1, i1 0
+ %nop12649 = alloca i1, i1 0
+ %nop12650 = alloca i1, i1 0
+ %nop12651 = alloca i1, i1 0
+ %nop12652 = alloca i1, i1 0
+ %nop12653 = alloca i1, i1 0
+ %nop12654 = alloca i1, i1 0
+ %nop12655 = alloca i1, i1 0
+ %nop12656 = alloca i1, i1 0
+ %nop12657 = alloca i1, i1 0
+ %nop12658 = alloca i1, i1 0
+ %nop12659 = alloca i1, i1 0
+ %nop12660 = alloca i1, i1 0
+ %nop12661 = alloca i1, i1 0
+ %nop12662 = alloca i1, i1 0
+ %nop12663 = alloca i1, i1 0
+ %nop12664 = alloca i1, i1 0
+ %nop12665 = alloca i1, i1 0
+ %nop12666 = alloca i1, i1 0
+ %nop12667 = alloca i1, i1 0
+ %nop12668 = alloca i1, i1 0
+ %nop12669 = alloca i1, i1 0
+ %nop12670 = alloca i1, i1 0
+ %nop12671 = alloca i1, i1 0
+ %nop12672 = alloca i1, i1 0
+ %nop12673 = alloca i1, i1 0
+ %nop12674 = alloca i1, i1 0
+ %nop12675 = alloca i1, i1 0
+ %nop12676 = alloca i1, i1 0
+ %nop12677 = alloca i1, i1 0
+ %nop12678 = alloca i1, i1 0
+ %nop12679 = alloca i1, i1 0
+ %nop12680 = alloca i1, i1 0
+ %nop12681 = alloca i1, i1 0
+ %nop12682 = alloca i1, i1 0
+ %nop12683 = alloca i1, i1 0
+ %nop12684 = alloca i1, i1 0
+ %nop12685 = alloca i1, i1 0
+ %nop12686 = alloca i1, i1 0
+ %nop12687 = alloca i1, i1 0
+ %nop12688 = alloca i1, i1 0
+ %nop12689 = alloca i1, i1 0
+ %nop12690 = alloca i1, i1 0
+ %nop12691 = alloca i1, i1 0
+ %nop12692 = alloca i1, i1 0
+ %nop12693 = alloca i1, i1 0
+ %nop12694 = alloca i1, i1 0
+ %nop12695 = alloca i1, i1 0
+ %nop12696 = alloca i1, i1 0
+ %nop12697 = alloca i1, i1 0
+ %nop12698 = alloca i1, i1 0
+ %nop12699 = alloca i1, i1 0
+ %nop12700 = alloca i1, i1 0
+ %nop12701 = alloca i1, i1 0
+ %nop12702 = alloca i1, i1 0
+ %nop12703 = alloca i1, i1 0
+ %nop12704 = alloca i1, i1 0
+ %nop12705 = alloca i1, i1 0
+ %nop12706 = alloca i1, i1 0
+ %nop12707 = alloca i1, i1 0
+ %nop12708 = alloca i1, i1 0
+ %nop12709 = alloca i1, i1 0
+ %nop12710 = alloca i1, i1 0
+ %nop12711 = alloca i1, i1 0
+ %nop12712 = alloca i1, i1 0
+ %nop12713 = alloca i1, i1 0
+ %nop12714 = alloca i1, i1 0
+ %nop12715 = alloca i1, i1 0
+ %nop12716 = alloca i1, i1 0
+ %nop12717 = alloca i1, i1 0
+ %nop12718 = alloca i1, i1 0
+ %nop12719 = alloca i1, i1 0
+ %nop12720 = alloca i1, i1 0
+ %nop12721 = alloca i1, i1 0
+ %nop12722 = alloca i1, i1 0
+ %nop12723 = alloca i1, i1 0
+ %nop12724 = alloca i1, i1 0
+ %nop12725 = alloca i1, i1 0
+ %nop12726 = alloca i1, i1 0
+ %nop12727 = alloca i1, i1 0
+ %nop12728 = alloca i1, i1 0
+ %nop12729 = alloca i1, i1 0
+ %nop12730 = alloca i1, i1 0
+ %nop12731 = alloca i1, i1 0
+ %nop12732 = alloca i1, i1 0
+ %nop12733 = alloca i1, i1 0
+ %nop12734 = alloca i1, i1 0
+ %nop12735 = alloca i1, i1 0
+ %nop12736 = alloca i1, i1 0
+ %nop12737 = alloca i1, i1 0
+ %nop12738 = alloca i1, i1 0
+ %nop12739 = alloca i1, i1 0
+ %nop12740 = alloca i1, i1 0
+ %nop12741 = alloca i1, i1 0
+ %nop12742 = alloca i1, i1 0
+ %nop12743 = alloca i1, i1 0
+ %nop12744 = alloca i1, i1 0
+ %nop12745 = alloca i1, i1 0
+ %nop12746 = alloca i1, i1 0
+ %nop12747 = alloca i1, i1 0
+ %nop12748 = alloca i1, i1 0
+ %nop12749 = alloca i1, i1 0
+ %nop12750 = alloca i1, i1 0
+ %nop12751 = alloca i1, i1 0
+ %nop12752 = alloca i1, i1 0
+ %nop12753 = alloca i1, i1 0
+ %nop12754 = alloca i1, i1 0
+ %nop12755 = alloca i1, i1 0
+ %nop12756 = alloca i1, i1 0
+ %nop12757 = alloca i1, i1 0
+ %nop12758 = alloca i1, i1 0
+ %nop12759 = alloca i1, i1 0
+ %nop12760 = alloca i1, i1 0
+ %nop12761 = alloca i1, i1 0
+ %nop12762 = alloca i1, i1 0
+ %nop12763 = alloca i1, i1 0
+ %nop12764 = alloca i1, i1 0
+ %nop12765 = alloca i1, i1 0
+ %nop12766 = alloca i1, i1 0
+ %nop12767 = alloca i1, i1 0
+ %nop12768 = alloca i1, i1 0
+ %nop12769 = alloca i1, i1 0
+ %nop12770 = alloca i1, i1 0
+ %nop12771 = alloca i1, i1 0
+ %nop12772 = alloca i1, i1 0
+ %nop12773 = alloca i1, i1 0
+ %nop12774 = alloca i1, i1 0
+ %nop12775 = alloca i1, i1 0
+ %nop12776 = alloca i1, i1 0
+ %nop12777 = alloca i1, i1 0
+ %nop12778 = alloca i1, i1 0
+ %nop12779 = alloca i1, i1 0
+ %nop12780 = alloca i1, i1 0
+ %nop12781 = alloca i1, i1 0
+ %nop12782 = alloca i1, i1 0
+ %nop12783 = alloca i1, i1 0
+ %nop12784 = alloca i1, i1 0
+ %nop12785 = alloca i1, i1 0
+ %nop12786 = alloca i1, i1 0
+ %nop12787 = alloca i1, i1 0
+ %nop12788 = alloca i1, i1 0
+ %nop12789 = alloca i1, i1 0
+ %nop12790 = alloca i1, i1 0
+ %nop12791 = alloca i1, i1 0
+ %nop12792 = alloca i1, i1 0
+ %nop12793 = alloca i1, i1 0
+ %nop12794 = alloca i1, i1 0
+ %nop12795 = alloca i1, i1 0
+ %nop12796 = alloca i1, i1 0
+ %nop12797 = alloca i1, i1 0
+ %nop12798 = alloca i1, i1 0
+ %nop12799 = alloca i1, i1 0
+ %nop12800 = alloca i1, i1 0
+ %nop12801 = alloca i1, i1 0
+ %nop12802 = alloca i1, i1 0
+ %nop12803 = alloca i1, i1 0
+ %nop12804 = alloca i1, i1 0
+ %nop12805 = alloca i1, i1 0
+ %nop12806 = alloca i1, i1 0
+ %nop12807 = alloca i1, i1 0
+ %nop12808 = alloca i1, i1 0
+ %nop12809 = alloca i1, i1 0
+ %nop12810 = alloca i1, i1 0
+ %nop12811 = alloca i1, i1 0
+ %nop12812 = alloca i1, i1 0
+ %nop12813 = alloca i1, i1 0
+ %nop12814 = alloca i1, i1 0
+ %nop12815 = alloca i1, i1 0
+ %nop12816 = alloca i1, i1 0
+ %nop12817 = alloca i1, i1 0
+ %nop12818 = alloca i1, i1 0
+ %nop12819 = alloca i1, i1 0
+ %nop12820 = alloca i1, i1 0
+ %nop12821 = alloca i1, i1 0
+ %nop12822 = alloca i1, i1 0
+ %nop12823 = alloca i1, i1 0
+ %nop12824 = alloca i1, i1 0
+ %nop12825 = alloca i1, i1 0
+ %nop12826 = alloca i1, i1 0
+ %nop12827 = alloca i1, i1 0
+ %nop12828 = alloca i1, i1 0
+ %nop12829 = alloca i1, i1 0
+ %nop12830 = alloca i1, i1 0
+ %nop12831 = alloca i1, i1 0
+ %nop12832 = alloca i1, i1 0
+ %nop12833 = alloca i1, i1 0
+ %nop12834 = alloca i1, i1 0
+ %nop12835 = alloca i1, i1 0
+ %nop12836 = alloca i1, i1 0
+ %nop12837 = alloca i1, i1 0
+ %nop12838 = alloca i1, i1 0
+ %nop12839 = alloca i1, i1 0
+ %nop12840 = alloca i1, i1 0
+ %nop12841 = alloca i1, i1 0
+ %nop12842 = alloca i1, i1 0
+ %nop12843 = alloca i1, i1 0
+ %nop12844 = alloca i1, i1 0
+ %nop12845 = alloca i1, i1 0
+ %nop12846 = alloca i1, i1 0
+ %nop12847 = alloca i1, i1 0
+ %nop12848 = alloca i1, i1 0
+ %nop12849 = alloca i1, i1 0
+ %nop12850 = alloca i1, i1 0
+ %nop12851 = alloca i1, i1 0
+ %nop12852 = alloca i1, i1 0
+ %nop12853 = alloca i1, i1 0
+ %nop12854 = alloca i1, i1 0
+ %nop12855 = alloca i1, i1 0
+ %nop12856 = alloca i1, i1 0
+ %nop12857 = alloca i1, i1 0
+ %nop12858 = alloca i1, i1 0
+ %nop12859 = alloca i1, i1 0
+ %nop12860 = alloca i1, i1 0
+ %nop12861 = alloca i1, i1 0
+ %nop12862 = alloca i1, i1 0
+ %nop12863 = alloca i1, i1 0
+ %nop12864 = alloca i1, i1 0
+ %nop12865 = alloca i1, i1 0
+ %nop12866 = alloca i1, i1 0
+ %nop12867 = alloca i1, i1 0
+ %nop12868 = alloca i1, i1 0
+ %nop12869 = alloca i1, i1 0
+ %nop12870 = alloca i1, i1 0
+ %nop12871 = alloca i1, i1 0
+ %nop12872 = alloca i1, i1 0
+ %nop12873 = alloca i1, i1 0
+ %nop12874 = alloca i1, i1 0
+ %nop12875 = alloca i1, i1 0
+ %nop12876 = alloca i1, i1 0
+ %nop12877 = alloca i1, i1 0
+ %nop12878 = alloca i1, i1 0
+ %nop12879 = alloca i1, i1 0
+ %nop12880 = alloca i1, i1 0
+ %nop12881 = alloca i1, i1 0
+ %nop12882 = alloca i1, i1 0
+ %nop12883 = alloca i1, i1 0
+ %nop12884 = alloca i1, i1 0
+ %nop12885 = alloca i1, i1 0
+ %nop12886 = alloca i1, i1 0
+ %nop12887 = alloca i1, i1 0
+ %nop12888 = alloca i1, i1 0
+ %nop12889 = alloca i1, i1 0
+ %nop12890 = alloca i1, i1 0
+ %nop12891 = alloca i1, i1 0
+ %nop12892 = alloca i1, i1 0
+ %nop12893 = alloca i1, i1 0
+ %nop12894 = alloca i1, i1 0
+ %nop12895 = alloca i1, i1 0
+ %nop12896 = alloca i1, i1 0
+ %nop12897 = alloca i1, i1 0
+ %nop12898 = alloca i1, i1 0
+ %nop12899 = alloca i1, i1 0
+ %nop12900 = alloca i1, i1 0
+ %nop12901 = alloca i1, i1 0
+ %nop12902 = alloca i1, i1 0
+ %nop12903 = alloca i1, i1 0
+ %nop12904 = alloca i1, i1 0
+ %nop12905 = alloca i1, i1 0
+ %nop12906 = alloca i1, i1 0
+ %nop12907 = alloca i1, i1 0
+ %nop12908 = alloca i1, i1 0
+ %nop12909 = alloca i1, i1 0
+ %nop12910 = alloca i1, i1 0
+ %nop12911 = alloca i1, i1 0
+ %nop12912 = alloca i1, i1 0
+ %nop12913 = alloca i1, i1 0
+ %nop12914 = alloca i1, i1 0
+ %nop12915 = alloca i1, i1 0
+ %nop12916 = alloca i1, i1 0
+ %nop12917 = alloca i1, i1 0
+ %nop12918 = alloca i1, i1 0
+ %nop12919 = alloca i1, i1 0
+ %nop12920 = alloca i1, i1 0
+ %nop12921 = alloca i1, i1 0
+ %nop12922 = alloca i1, i1 0
+ %nop12923 = alloca i1, i1 0
+ %nop12924 = alloca i1, i1 0
+ %nop12925 = alloca i1, i1 0
+ %nop12926 = alloca i1, i1 0
+ %nop12927 = alloca i1, i1 0
+ %nop12928 = alloca i1, i1 0
+ %nop12929 = alloca i1, i1 0
+ %nop12930 = alloca i1, i1 0
+ %nop12931 = alloca i1, i1 0
+ %nop12932 = alloca i1, i1 0
+ %nop12933 = alloca i1, i1 0
+ %nop12934 = alloca i1, i1 0
+ %nop12935 = alloca i1, i1 0
+ %nop12936 = alloca i1, i1 0
+ %nop12937 = alloca i1, i1 0
+ %nop12938 = alloca i1, i1 0
+ %nop12939 = alloca i1, i1 0
+ %nop12940 = alloca i1, i1 0
+ %nop12941 = alloca i1, i1 0
+ %nop12942 = alloca i1, i1 0
+ %nop12943 = alloca i1, i1 0
+ %nop12944 = alloca i1, i1 0
+ %nop12945 = alloca i1, i1 0
+ %nop12946 = alloca i1, i1 0
+ %nop12947 = alloca i1, i1 0
+ %nop12948 = alloca i1, i1 0
+ %nop12949 = alloca i1, i1 0
+ %nop12950 = alloca i1, i1 0
+ %nop12951 = alloca i1, i1 0
+ %nop12952 = alloca i1, i1 0
+ %nop12953 = alloca i1, i1 0
+ %nop12954 = alloca i1, i1 0
+ %nop12955 = alloca i1, i1 0
+ %nop12956 = alloca i1, i1 0
+ %nop12957 = alloca i1, i1 0
+ %nop12958 = alloca i1, i1 0
+ %nop12959 = alloca i1, i1 0
+ %nop12960 = alloca i1, i1 0
+ %nop12961 = alloca i1, i1 0
+ %nop12962 = alloca i1, i1 0
+ %nop12963 = alloca i1, i1 0
+ %nop12964 = alloca i1, i1 0
+ %nop12965 = alloca i1, i1 0
+ %nop12966 = alloca i1, i1 0
+ %nop12967 = alloca i1, i1 0
+ %nop12968 = alloca i1, i1 0
+ %nop12969 = alloca i1, i1 0
+ %nop12970 = alloca i1, i1 0
+ %nop12971 = alloca i1, i1 0
+ %nop12972 = alloca i1, i1 0
+ %nop12973 = alloca i1, i1 0
+ %nop12974 = alloca i1, i1 0
+ %nop12975 = alloca i1, i1 0
+ %nop12976 = alloca i1, i1 0
+ %nop12977 = alloca i1, i1 0
+ %nop12978 = alloca i1, i1 0
+ %nop12979 = alloca i1, i1 0
+ %nop12980 = alloca i1, i1 0
+ %nop12981 = alloca i1, i1 0
+ %nop12982 = alloca i1, i1 0
+ %nop12983 = alloca i1, i1 0
+ %nop12984 = alloca i1, i1 0
+ %nop12985 = alloca i1, i1 0
+ %nop12986 = alloca i1, i1 0
+ %nop12987 = alloca i1, i1 0
+ %nop12988 = alloca i1, i1 0
+ %nop12989 = alloca i1, i1 0
+ %nop12990 = alloca i1, i1 0
+ %nop12991 = alloca i1, i1 0
+ %nop12992 = alloca i1, i1 0
+ %nop12993 = alloca i1, i1 0
+ %nop12994 = alloca i1, i1 0
+ %nop12995 = alloca i1, i1 0
+ %nop12996 = alloca i1, i1 0
+ %nop12997 = alloca i1, i1 0
+ %nop12998 = alloca i1, i1 0
+ %nop12999 = alloca i1, i1 0
+ %nop13000 = alloca i1, i1 0
+ %nop13001 = alloca i1, i1 0
+ %nop13002 = alloca i1, i1 0
+ %nop13003 = alloca i1, i1 0
+ %nop13004 = alloca i1, i1 0
+ %nop13005 = alloca i1, i1 0
+ %nop13006 = alloca i1, i1 0
+ %nop13007 = alloca i1, i1 0
+ %nop13008 = alloca i1, i1 0
+ %nop13009 = alloca i1, i1 0
+ %nop13010 = alloca i1, i1 0
+ %nop13011 = alloca i1, i1 0
+ %nop13012 = alloca i1, i1 0
+ %nop13013 = alloca i1, i1 0
+ %nop13014 = alloca i1, i1 0
+ %nop13015 = alloca i1, i1 0
+ %nop13016 = alloca i1, i1 0
+ %nop13017 = alloca i1, i1 0
+ %nop13018 = alloca i1, i1 0
+ %nop13019 = alloca i1, i1 0
+ %nop13020 = alloca i1, i1 0
+ %nop13021 = alloca i1, i1 0
+ %nop13022 = alloca i1, i1 0
+ %nop13023 = alloca i1, i1 0
+ %nop13024 = alloca i1, i1 0
+ %nop13025 = alloca i1, i1 0
+ %nop13026 = alloca i1, i1 0
+ %nop13027 = alloca i1, i1 0
+ %nop13028 = alloca i1, i1 0
+ %nop13029 = alloca i1, i1 0
+ %nop13030 = alloca i1, i1 0
+ %nop13031 = alloca i1, i1 0
+ %nop13032 = alloca i1, i1 0
+ %nop13033 = alloca i1, i1 0
+ %nop13034 = alloca i1, i1 0
+ %nop13035 = alloca i1, i1 0
+ %nop13036 = alloca i1, i1 0
+ %nop13037 = alloca i1, i1 0
+ %nop13038 = alloca i1, i1 0
+ %nop13039 = alloca i1, i1 0
+ %nop13040 = alloca i1, i1 0
+ %nop13041 = alloca i1, i1 0
+ %nop13042 = alloca i1, i1 0
+ %nop13043 = alloca i1, i1 0
+ %nop13044 = alloca i1, i1 0
+ %nop13045 = alloca i1, i1 0
+ %nop13046 = alloca i1, i1 0
+ %nop13047 = alloca i1, i1 0
+ %nop13048 = alloca i1, i1 0
+ %nop13049 = alloca i1, i1 0
+ %nop13050 = alloca i1, i1 0
+ %nop13051 = alloca i1, i1 0
+ %nop13052 = alloca i1, i1 0
+ %nop13053 = alloca i1, i1 0
+ %nop13054 = alloca i1, i1 0
+ %nop13055 = alloca i1, i1 0
+ %nop13056 = alloca i1, i1 0
+ %nop13057 = alloca i1, i1 0
+ %nop13058 = alloca i1, i1 0
+ %nop13059 = alloca i1, i1 0
+ %nop13060 = alloca i1, i1 0
+ %nop13061 = alloca i1, i1 0
+ %nop13062 = alloca i1, i1 0
+ %nop13063 = alloca i1, i1 0
+ %nop13064 = alloca i1, i1 0
+ %nop13065 = alloca i1, i1 0
+ %nop13066 = alloca i1, i1 0
+ %nop13067 = alloca i1, i1 0
+ %nop13068 = alloca i1, i1 0
+ %nop13069 = alloca i1, i1 0
+ %nop13070 = alloca i1, i1 0
+ %nop13071 = alloca i1, i1 0
+ %nop13072 = alloca i1, i1 0
+ %nop13073 = alloca i1, i1 0
+ %nop13074 = alloca i1, i1 0
+ %nop13075 = alloca i1, i1 0
+ %nop13076 = alloca i1, i1 0
+ %nop13077 = alloca i1, i1 0
+ %nop13078 = alloca i1, i1 0
+ %nop13079 = alloca i1, i1 0
+ %nop13080 = alloca i1, i1 0
+ %nop13081 = alloca i1, i1 0
+ %nop13082 = alloca i1, i1 0
+ %nop13083 = alloca i1, i1 0
+ %nop13084 = alloca i1, i1 0
+ %nop13085 = alloca i1, i1 0
+ %nop13086 = alloca i1, i1 0
+ %nop13087 = alloca i1, i1 0
+ %nop13088 = alloca i1, i1 0
+ %nop13089 = alloca i1, i1 0
+ %nop13090 = alloca i1, i1 0
+ %nop13091 = alloca i1, i1 0
+ %nop13092 = alloca i1, i1 0
+ %nop13093 = alloca i1, i1 0
+ %nop13094 = alloca i1, i1 0
+ %nop13095 = alloca i1, i1 0
+ %nop13096 = alloca i1, i1 0
+ %nop13097 = alloca i1, i1 0
+ %nop13098 = alloca i1, i1 0
+ %nop13099 = alloca i1, i1 0
+ %nop13100 = alloca i1, i1 0
+ %nop13101 = alloca i1, i1 0
+ %nop13102 = alloca i1, i1 0
+ %nop13103 = alloca i1, i1 0
+ %nop13104 = alloca i1, i1 0
+ %nop13105 = alloca i1, i1 0
+ %nop13106 = alloca i1, i1 0
+ %nop13107 = alloca i1, i1 0
+ %nop13108 = alloca i1, i1 0
+ %nop13109 = alloca i1, i1 0
+ %nop13110 = alloca i1, i1 0
+ %nop13111 = alloca i1, i1 0
+ %nop13112 = alloca i1, i1 0
+ %nop13113 = alloca i1, i1 0
+ %nop13114 = alloca i1, i1 0
+ %nop13115 = alloca i1, i1 0
+ %nop13116 = alloca i1, i1 0
+ %nop13117 = alloca i1, i1 0
+ %nop13118 = alloca i1, i1 0
+ %nop13119 = alloca i1, i1 0
+ %nop13120 = alloca i1, i1 0
+ %nop13121 = alloca i1, i1 0
+ %nop13122 = alloca i1, i1 0
+ %nop13123 = alloca i1, i1 0
+ %nop13124 = alloca i1, i1 0
+ %nop13125 = alloca i1, i1 0
+ %nop13126 = alloca i1, i1 0
+ %nop13127 = alloca i1, i1 0
+ %nop13128 = alloca i1, i1 0
+ %nop13129 = alloca i1, i1 0
+ %nop13130 = alloca i1, i1 0
+ %nop13131 = alloca i1, i1 0
+ %nop13132 = alloca i1, i1 0
+ %nop13133 = alloca i1, i1 0
+ %nop13134 = alloca i1, i1 0
+ %nop13135 = alloca i1, i1 0
+ %nop13136 = alloca i1, i1 0
+ %nop13137 = alloca i1, i1 0
+ %nop13138 = alloca i1, i1 0
+ %nop13139 = alloca i1, i1 0
+ %nop13140 = alloca i1, i1 0
+ %nop13141 = alloca i1, i1 0
+ %nop13142 = alloca i1, i1 0
+ %nop13143 = alloca i1, i1 0
+ %nop13144 = alloca i1, i1 0
+ %nop13145 = alloca i1, i1 0
+ %nop13146 = alloca i1, i1 0
+ %nop13147 = alloca i1, i1 0
+ %nop13148 = alloca i1, i1 0
+ %nop13149 = alloca i1, i1 0
+ %nop13150 = alloca i1, i1 0
+ %nop13151 = alloca i1, i1 0
+ %nop13152 = alloca i1, i1 0
+ %nop13153 = alloca i1, i1 0
+ %nop13154 = alloca i1, i1 0
+ %nop13155 = alloca i1, i1 0
+ %nop13156 = alloca i1, i1 0
+ %nop13157 = alloca i1, i1 0
+ %nop13158 = alloca i1, i1 0
+ %nop13159 = alloca i1, i1 0
+ %nop13160 = alloca i1, i1 0
+ %nop13161 = alloca i1, i1 0
+ %nop13162 = alloca i1, i1 0
+ %nop13163 = alloca i1, i1 0
+ %nop13164 = alloca i1, i1 0
+ %nop13165 = alloca i1, i1 0
+ %nop13166 = alloca i1, i1 0
+ %nop13167 = alloca i1, i1 0
+ %nop13168 = alloca i1, i1 0
+ %nop13169 = alloca i1, i1 0
+ %nop13170 = alloca i1, i1 0
+ %nop13171 = alloca i1, i1 0
+ %nop13172 = alloca i1, i1 0
+ %nop13173 = alloca i1, i1 0
+ %nop13174 = alloca i1, i1 0
+ %nop13175 = alloca i1, i1 0
+ %nop13176 = alloca i1, i1 0
+ %nop13177 = alloca i1, i1 0
+ %nop13178 = alloca i1, i1 0
+ %nop13179 = alloca i1, i1 0
+ %nop13180 = alloca i1, i1 0
+ %nop13181 = alloca i1, i1 0
+ %nop13182 = alloca i1, i1 0
+ %nop13183 = alloca i1, i1 0
+ %nop13184 = alloca i1, i1 0
+ %nop13185 = alloca i1, i1 0
+ %nop13186 = alloca i1, i1 0
+ %nop13187 = alloca i1, i1 0
+ %nop13188 = alloca i1, i1 0
+ %nop13189 = alloca i1, i1 0
+ %nop13190 = alloca i1, i1 0
+ %nop13191 = alloca i1, i1 0
+ %nop13192 = alloca i1, i1 0
+ %nop13193 = alloca i1, i1 0
+ %nop13194 = alloca i1, i1 0
+ %nop13195 = alloca i1, i1 0
+ %nop13196 = alloca i1, i1 0
+ %nop13197 = alloca i1, i1 0
+ %nop13198 = alloca i1, i1 0
+ %nop13199 = alloca i1, i1 0
+ %nop13200 = alloca i1, i1 0
+ %nop13201 = alloca i1, i1 0
+ %nop13202 = alloca i1, i1 0
+ %nop13203 = alloca i1, i1 0
+ %nop13204 = alloca i1, i1 0
+ %nop13205 = alloca i1, i1 0
+ %nop13206 = alloca i1, i1 0
+ %nop13207 = alloca i1, i1 0
+ %nop13208 = alloca i1, i1 0
+ %nop13209 = alloca i1, i1 0
+ %nop13210 = alloca i1, i1 0
+ %nop13211 = alloca i1, i1 0
+ %nop13212 = alloca i1, i1 0
+ %nop13213 = alloca i1, i1 0
+ %nop13214 = alloca i1, i1 0
+ %nop13215 = alloca i1, i1 0
+ %nop13216 = alloca i1, i1 0
+ %nop13217 = alloca i1, i1 0
+ %nop13218 = alloca i1, i1 0
+ %nop13219 = alloca i1, i1 0
+ %nop13220 = alloca i1, i1 0
+ %nop13221 = alloca i1, i1 0
+ %nop13222 = alloca i1, i1 0
+ %nop13223 = alloca i1, i1 0
+ %nop13224 = alloca i1, i1 0
+ %nop13225 = alloca i1, i1 0
+ %nop13226 = alloca i1, i1 0
+ %nop13227 = alloca i1, i1 0
+ %nop13228 = alloca i1, i1 0
+ %nop13229 = alloca i1, i1 0
+ %nop13230 = alloca i1, i1 0
+ %nop13231 = alloca i1, i1 0
+ %nop13232 = alloca i1, i1 0
+ %nop13233 = alloca i1, i1 0
+ %nop13234 = alloca i1, i1 0
+ %nop13235 = alloca i1, i1 0
+ %nop13236 = alloca i1, i1 0
+ %nop13237 = alloca i1, i1 0
+ %nop13238 = alloca i1, i1 0
+ %nop13239 = alloca i1, i1 0
+ %nop13240 = alloca i1, i1 0
+ %nop13241 = alloca i1, i1 0
+ %nop13242 = alloca i1, i1 0
+ %nop13243 = alloca i1, i1 0
+ %nop13244 = alloca i1, i1 0
+ %nop13245 = alloca i1, i1 0
+ %nop13246 = alloca i1, i1 0
+ %nop13247 = alloca i1, i1 0
+ %nop13248 = alloca i1, i1 0
+ %nop13249 = alloca i1, i1 0
+ %nop13250 = alloca i1, i1 0
+ %nop13251 = alloca i1, i1 0
+ %nop13252 = alloca i1, i1 0
+ %nop13253 = alloca i1, i1 0
+ %nop13254 = alloca i1, i1 0
+ %nop13255 = alloca i1, i1 0
+ %nop13256 = alloca i1, i1 0
+ %nop13257 = alloca i1, i1 0
+ %nop13258 = alloca i1, i1 0
+ %nop13259 = alloca i1, i1 0
+ %nop13260 = alloca i1, i1 0
+ %nop13261 = alloca i1, i1 0
+ %nop13262 = alloca i1, i1 0
+ %nop13263 = alloca i1, i1 0
+ %nop13264 = alloca i1, i1 0
+ %nop13265 = alloca i1, i1 0
+ %nop13266 = alloca i1, i1 0
+ %nop13267 = alloca i1, i1 0
+ %nop13268 = alloca i1, i1 0
+ %nop13269 = alloca i1, i1 0
+ %nop13270 = alloca i1, i1 0
+ %nop13271 = alloca i1, i1 0
+ %nop13272 = alloca i1, i1 0
+ %nop13273 = alloca i1, i1 0
+ %nop13274 = alloca i1, i1 0
+ %nop13275 = alloca i1, i1 0
+ %nop13276 = alloca i1, i1 0
+ %nop13277 = alloca i1, i1 0
+ %nop13278 = alloca i1, i1 0
+ %nop13279 = alloca i1, i1 0
+ %nop13280 = alloca i1, i1 0
+ %nop13281 = alloca i1, i1 0
+ %nop13282 = alloca i1, i1 0
+ %nop13283 = alloca i1, i1 0
+ %nop13284 = alloca i1, i1 0
+ %nop13285 = alloca i1, i1 0
+ %nop13286 = alloca i1, i1 0
+ %nop13287 = alloca i1, i1 0
+ %nop13288 = alloca i1, i1 0
+ %nop13289 = alloca i1, i1 0
+ %nop13290 = alloca i1, i1 0
+ %nop13291 = alloca i1, i1 0
+ %nop13292 = alloca i1, i1 0
+ %nop13293 = alloca i1, i1 0
+ %nop13294 = alloca i1, i1 0
+ %nop13295 = alloca i1, i1 0
+ %nop13296 = alloca i1, i1 0
+ %nop13297 = alloca i1, i1 0
+ %nop13298 = alloca i1, i1 0
+ %nop13299 = alloca i1, i1 0
+ %nop13300 = alloca i1, i1 0
+ %nop13301 = alloca i1, i1 0
+ %nop13302 = alloca i1, i1 0
+ %nop13303 = alloca i1, i1 0
+ %nop13304 = alloca i1, i1 0
+ %nop13305 = alloca i1, i1 0
+ %nop13306 = alloca i1, i1 0
+ %nop13307 = alloca i1, i1 0
+ %nop13308 = alloca i1, i1 0
+ %nop13309 = alloca i1, i1 0
+ %nop13310 = alloca i1, i1 0
+ %nop13311 = alloca i1, i1 0
+ %nop13312 = alloca i1, i1 0
+ %nop13313 = alloca i1, i1 0
+ %nop13314 = alloca i1, i1 0
+ %nop13315 = alloca i1, i1 0
+ %nop13316 = alloca i1, i1 0
+ %nop13317 = alloca i1, i1 0
+ %nop13318 = alloca i1, i1 0
+ %nop13319 = alloca i1, i1 0
+ %nop13320 = alloca i1, i1 0
+ %nop13321 = alloca i1, i1 0
+ %nop13322 = alloca i1, i1 0
+ %nop13323 = alloca i1, i1 0
+ %nop13324 = alloca i1, i1 0
+ %nop13325 = alloca i1, i1 0
+ %nop13326 = alloca i1, i1 0
+ %nop13327 = alloca i1, i1 0
+ %nop13328 = alloca i1, i1 0
+ %nop13329 = alloca i1, i1 0
+ %nop13330 = alloca i1, i1 0
+ %nop13331 = alloca i1, i1 0
+ %nop13332 = alloca i1, i1 0
+ %nop13333 = alloca i1, i1 0
+ %nop13334 = alloca i1, i1 0
+ %nop13335 = alloca i1, i1 0
+ %nop13336 = alloca i1, i1 0
+ %nop13337 = alloca i1, i1 0
+ %nop13338 = alloca i1, i1 0
+ %nop13339 = alloca i1, i1 0
+ %nop13340 = alloca i1, i1 0
+ %nop13341 = alloca i1, i1 0
+ %nop13342 = alloca i1, i1 0
+ %nop13343 = alloca i1, i1 0
+ %nop13344 = alloca i1, i1 0
+ %nop13345 = alloca i1, i1 0
+ %nop13346 = alloca i1, i1 0
+ %nop13347 = alloca i1, i1 0
+ %nop13348 = alloca i1, i1 0
+ %nop13349 = alloca i1, i1 0
+ %nop13350 = alloca i1, i1 0
+ %nop13351 = alloca i1, i1 0
+ %nop13352 = alloca i1, i1 0
+ %nop13353 = alloca i1, i1 0
+ %nop13354 = alloca i1, i1 0
+ %nop13355 = alloca i1, i1 0
+ %nop13356 = alloca i1, i1 0
+ %nop13357 = alloca i1, i1 0
+ %nop13358 = alloca i1, i1 0
+ %nop13359 = alloca i1, i1 0
+ %nop13360 = alloca i1, i1 0
+ %nop13361 = alloca i1, i1 0
+ %nop13362 = alloca i1, i1 0
+ %nop13363 = alloca i1, i1 0
+ %nop13364 = alloca i1, i1 0
+ %nop13365 = alloca i1, i1 0
+ %nop13366 = alloca i1, i1 0
+ %nop13367 = alloca i1, i1 0
+ %nop13368 = alloca i1, i1 0
+ %nop13369 = alloca i1, i1 0
+ %nop13370 = alloca i1, i1 0
+ %nop13371 = alloca i1, i1 0
+ %nop13372 = alloca i1, i1 0
+ %nop13373 = alloca i1, i1 0
+ %nop13374 = alloca i1, i1 0
+ %nop13375 = alloca i1, i1 0
+ %nop13376 = alloca i1, i1 0
+ %nop13377 = alloca i1, i1 0
+ %nop13378 = alloca i1, i1 0
+ %nop13379 = alloca i1, i1 0
+ %nop13380 = alloca i1, i1 0
+ %nop13381 = alloca i1, i1 0
+ %nop13382 = alloca i1, i1 0
+ %nop13383 = alloca i1, i1 0
+ %nop13384 = alloca i1, i1 0
+ %nop13385 = alloca i1, i1 0
+ %nop13386 = alloca i1, i1 0
+ %nop13387 = alloca i1, i1 0
+ %nop13388 = alloca i1, i1 0
+ %nop13389 = alloca i1, i1 0
+ %nop13390 = alloca i1, i1 0
+ %nop13391 = alloca i1, i1 0
+ %nop13392 = alloca i1, i1 0
+ %nop13393 = alloca i1, i1 0
+ %nop13394 = alloca i1, i1 0
+ %nop13395 = alloca i1, i1 0
+ %nop13396 = alloca i1, i1 0
+ %nop13397 = alloca i1, i1 0
+ %nop13398 = alloca i1, i1 0
+ %nop13399 = alloca i1, i1 0
+ %nop13400 = alloca i1, i1 0
+ %nop13401 = alloca i1, i1 0
+ %nop13402 = alloca i1, i1 0
+ %nop13403 = alloca i1, i1 0
+ %nop13404 = alloca i1, i1 0
+ %nop13405 = alloca i1, i1 0
+ %nop13406 = alloca i1, i1 0
+ %nop13407 = alloca i1, i1 0
+ %nop13408 = alloca i1, i1 0
+ %nop13409 = alloca i1, i1 0
+ %nop13410 = alloca i1, i1 0
+ %nop13411 = alloca i1, i1 0
+ %nop13412 = alloca i1, i1 0
+ %nop13413 = alloca i1, i1 0
+ %nop13414 = alloca i1, i1 0
+ %nop13415 = alloca i1, i1 0
+ %nop13416 = alloca i1, i1 0
+ %nop13417 = alloca i1, i1 0
+ %nop13418 = alloca i1, i1 0
+ %nop13419 = alloca i1, i1 0
+ %nop13420 = alloca i1, i1 0
+ %nop13421 = alloca i1, i1 0
+ %nop13422 = alloca i1, i1 0
+ %nop13423 = alloca i1, i1 0
+ %nop13424 = alloca i1, i1 0
+ %nop13425 = alloca i1, i1 0
+ %nop13426 = alloca i1, i1 0
+ %nop13427 = alloca i1, i1 0
+ %nop13428 = alloca i1, i1 0
+ %nop13429 = alloca i1, i1 0
+ %nop13430 = alloca i1, i1 0
+ %nop13431 = alloca i1, i1 0
+ %nop13432 = alloca i1, i1 0
+ %nop13433 = alloca i1, i1 0
+ %nop13434 = alloca i1, i1 0
+ %nop13435 = alloca i1, i1 0
+ %nop13436 = alloca i1, i1 0
+ %nop13437 = alloca i1, i1 0
+ %nop13438 = alloca i1, i1 0
+ %nop13439 = alloca i1, i1 0
+ %nop13440 = alloca i1, i1 0
+ %nop13441 = alloca i1, i1 0
+ %nop13442 = alloca i1, i1 0
+ %nop13443 = alloca i1, i1 0
+ %nop13444 = alloca i1, i1 0
+ %nop13445 = alloca i1, i1 0
+ %nop13446 = alloca i1, i1 0
+ %nop13447 = alloca i1, i1 0
+ %nop13448 = alloca i1, i1 0
+ %nop13449 = alloca i1, i1 0
+ %nop13450 = alloca i1, i1 0
+ %nop13451 = alloca i1, i1 0
+ %nop13452 = alloca i1, i1 0
+ %nop13453 = alloca i1, i1 0
+ %nop13454 = alloca i1, i1 0
+ %nop13455 = alloca i1, i1 0
+ %nop13456 = alloca i1, i1 0
+ %nop13457 = alloca i1, i1 0
+ %nop13458 = alloca i1, i1 0
+ %nop13459 = alloca i1, i1 0
+ %nop13460 = alloca i1, i1 0
+ %nop13461 = alloca i1, i1 0
+ %nop13462 = alloca i1, i1 0
+ %nop13463 = alloca i1, i1 0
+ %nop13464 = alloca i1, i1 0
+ %nop13465 = alloca i1, i1 0
+ %nop13466 = alloca i1, i1 0
+ %nop13467 = alloca i1, i1 0
+ %nop13468 = alloca i1, i1 0
+ %nop13469 = alloca i1, i1 0
+ %nop13470 = alloca i1, i1 0
+ %nop13471 = alloca i1, i1 0
+ %nop13472 = alloca i1, i1 0
+ %nop13473 = alloca i1, i1 0
+ %nop13474 = alloca i1, i1 0
+ %nop13475 = alloca i1, i1 0
+ %nop13476 = alloca i1, i1 0
+ %nop13477 = alloca i1, i1 0
+ %nop13478 = alloca i1, i1 0
+ %nop13479 = alloca i1, i1 0
+ %nop13480 = alloca i1, i1 0
+ %nop13481 = alloca i1, i1 0
+ %nop13482 = alloca i1, i1 0
+ %nop13483 = alloca i1, i1 0
+ %nop13484 = alloca i1, i1 0
+ %nop13485 = alloca i1, i1 0
+ %nop13486 = alloca i1, i1 0
+ %nop13487 = alloca i1, i1 0
+ %nop13488 = alloca i1, i1 0
+ %nop13489 = alloca i1, i1 0
+ %nop13490 = alloca i1, i1 0
+ %nop13491 = alloca i1, i1 0
+ %nop13492 = alloca i1, i1 0
+ %nop13493 = alloca i1, i1 0
+ %nop13494 = alloca i1, i1 0
+ %nop13495 = alloca i1, i1 0
+ %nop13496 = alloca i1, i1 0
+ %nop13497 = alloca i1, i1 0
+ %nop13498 = alloca i1, i1 0
+ %nop13499 = alloca i1, i1 0
+ %nop13500 = alloca i1, i1 0
+ %nop13501 = alloca i1, i1 0
+ %nop13502 = alloca i1, i1 0
+ %nop13503 = alloca i1, i1 0
+ %nop13504 = alloca i1, i1 0
+ %nop13505 = alloca i1, i1 0
+ %nop13506 = alloca i1, i1 0
+ %nop13507 = alloca i1, i1 0
+ %nop13508 = alloca i1, i1 0
+ %nop13509 = alloca i1, i1 0
+ %nop13510 = alloca i1, i1 0
+ %nop13511 = alloca i1, i1 0
+ %nop13512 = alloca i1, i1 0
+ %nop13513 = alloca i1, i1 0
+ %nop13514 = alloca i1, i1 0
+ %nop13515 = alloca i1, i1 0
+ %nop13516 = alloca i1, i1 0
+ %nop13517 = alloca i1, i1 0
+ %nop13518 = alloca i1, i1 0
+ %nop13519 = alloca i1, i1 0
+ %nop13520 = alloca i1, i1 0
+ %nop13521 = alloca i1, i1 0
+ %nop13522 = alloca i1, i1 0
+ %nop13523 = alloca i1, i1 0
+ %nop13524 = alloca i1, i1 0
+ %nop13525 = alloca i1, i1 0
+ %nop13526 = alloca i1, i1 0
+ %nop13527 = alloca i1, i1 0
+ %nop13528 = alloca i1, i1 0
+ %nop13529 = alloca i1, i1 0
+ %nop13530 = alloca i1, i1 0
+ %nop13531 = alloca i1, i1 0
+ %nop13532 = alloca i1, i1 0
+ %nop13533 = alloca i1, i1 0
+ %nop13534 = alloca i1, i1 0
+ %nop13535 = alloca i1, i1 0
+ %nop13536 = alloca i1, i1 0
+ %nop13537 = alloca i1, i1 0
+ %nop13538 = alloca i1, i1 0
+ %nop13539 = alloca i1, i1 0
+ %nop13540 = alloca i1, i1 0
+ %nop13541 = alloca i1, i1 0
+ %nop13542 = alloca i1, i1 0
+ %nop13543 = alloca i1, i1 0
+ %nop13544 = alloca i1, i1 0
+ %nop13545 = alloca i1, i1 0
+ %nop13546 = alloca i1, i1 0
+ %nop13547 = alloca i1, i1 0
+ %nop13548 = alloca i1, i1 0
+ %nop13549 = alloca i1, i1 0
+ %nop13550 = alloca i1, i1 0
+ %nop13551 = alloca i1, i1 0
+ %nop13552 = alloca i1, i1 0
+ %nop13553 = alloca i1, i1 0
+ %nop13554 = alloca i1, i1 0
+ %nop13555 = alloca i1, i1 0
+ %nop13556 = alloca i1, i1 0
+ %nop13557 = alloca i1, i1 0
+ %nop13558 = alloca i1, i1 0
+ %nop13559 = alloca i1, i1 0
+ %nop13560 = alloca i1, i1 0
+ %nop13561 = alloca i1, i1 0
+ %nop13562 = alloca i1, i1 0
+ %nop13563 = alloca i1, i1 0
+ %nop13564 = alloca i1, i1 0
+ %nop13565 = alloca i1, i1 0
+ %nop13566 = alloca i1, i1 0
+ %nop13567 = alloca i1, i1 0
+ %nop13568 = alloca i1, i1 0
+ %nop13569 = alloca i1, i1 0
+ %nop13570 = alloca i1, i1 0
+ %nop13571 = alloca i1, i1 0
+ %nop13572 = alloca i1, i1 0
+ %nop13573 = alloca i1, i1 0
+ %nop13574 = alloca i1, i1 0
+ %nop13575 = alloca i1, i1 0
+ %nop13576 = alloca i1, i1 0
+ %nop13577 = alloca i1, i1 0
+ %nop13578 = alloca i1, i1 0
+ %nop13579 = alloca i1, i1 0
+ %nop13580 = alloca i1, i1 0
+ %nop13581 = alloca i1, i1 0
+ %nop13582 = alloca i1, i1 0
+ %nop13583 = alloca i1, i1 0
+ %nop13584 = alloca i1, i1 0
+ %nop13585 = alloca i1, i1 0
+ %nop13586 = alloca i1, i1 0
+ %nop13587 = alloca i1, i1 0
+ %nop13588 = alloca i1, i1 0
+ %nop13589 = alloca i1, i1 0
+ %nop13590 = alloca i1, i1 0
+ %nop13591 = alloca i1, i1 0
+ %nop13592 = alloca i1, i1 0
+ %nop13593 = alloca i1, i1 0
+ %nop13594 = alloca i1, i1 0
+ %nop13595 = alloca i1, i1 0
+ %nop13596 = alloca i1, i1 0
+ %nop13597 = alloca i1, i1 0
+ %nop13598 = alloca i1, i1 0
+ %nop13599 = alloca i1, i1 0
+ %nop13600 = alloca i1, i1 0
+ %nop13601 = alloca i1, i1 0
+ %nop13602 = alloca i1, i1 0
+ %nop13603 = alloca i1, i1 0
+ %nop13604 = alloca i1, i1 0
+ %nop13605 = alloca i1, i1 0
+ %nop13606 = alloca i1, i1 0
+ %nop13607 = alloca i1, i1 0
+ %nop13608 = alloca i1, i1 0
+ %nop13609 = alloca i1, i1 0
+ %nop13610 = alloca i1, i1 0
+ %nop13611 = alloca i1, i1 0
+ %nop13612 = alloca i1, i1 0
+ %nop13613 = alloca i1, i1 0
+ %nop13614 = alloca i1, i1 0
+ %nop13615 = alloca i1, i1 0
+ %nop13616 = alloca i1, i1 0
+ %nop13617 = alloca i1, i1 0
+ %nop13618 = alloca i1, i1 0
+ %nop13619 = alloca i1, i1 0
+ %nop13620 = alloca i1, i1 0
+ %nop13621 = alloca i1, i1 0
+ %nop13622 = alloca i1, i1 0
+ %nop13623 = alloca i1, i1 0
+ %nop13624 = alloca i1, i1 0
+ %nop13625 = alloca i1, i1 0
+ %nop13626 = alloca i1, i1 0
+ %nop13627 = alloca i1, i1 0
+ %nop13628 = alloca i1, i1 0
+ %nop13629 = alloca i1, i1 0
+ %nop13630 = alloca i1, i1 0
+ %nop13631 = alloca i1, i1 0
+ %nop13632 = alloca i1, i1 0
+ %nop13633 = alloca i1, i1 0
+ %nop13634 = alloca i1, i1 0
+ %nop13635 = alloca i1, i1 0
+ %nop13636 = alloca i1, i1 0
+ %nop13637 = alloca i1, i1 0
+ %nop13638 = alloca i1, i1 0
+ %nop13639 = alloca i1, i1 0
+ %nop13640 = alloca i1, i1 0
+ %nop13641 = alloca i1, i1 0
+ %nop13642 = alloca i1, i1 0
+ %nop13643 = alloca i1, i1 0
+ %nop13644 = alloca i1, i1 0
+ %nop13645 = alloca i1, i1 0
+ %nop13646 = alloca i1, i1 0
+ %nop13647 = alloca i1, i1 0
+ %nop13648 = alloca i1, i1 0
+ %nop13649 = alloca i1, i1 0
+ %nop13650 = alloca i1, i1 0
+ %nop13651 = alloca i1, i1 0
+ %nop13652 = alloca i1, i1 0
+ %nop13653 = alloca i1, i1 0
+ %nop13654 = alloca i1, i1 0
+ %nop13655 = alloca i1, i1 0
+ %nop13656 = alloca i1, i1 0
+ %nop13657 = alloca i1, i1 0
+ %nop13658 = alloca i1, i1 0
+ %nop13659 = alloca i1, i1 0
+ %nop13660 = alloca i1, i1 0
+ %nop13661 = alloca i1, i1 0
+ %nop13662 = alloca i1, i1 0
+ %nop13663 = alloca i1, i1 0
+ %nop13664 = alloca i1, i1 0
+ %nop13665 = alloca i1, i1 0
+ %nop13666 = alloca i1, i1 0
+ %nop13667 = alloca i1, i1 0
+ %nop13668 = alloca i1, i1 0
+ %nop13669 = alloca i1, i1 0
+ %nop13670 = alloca i1, i1 0
+ %nop13671 = alloca i1, i1 0
+ %nop13672 = alloca i1, i1 0
+ %nop13673 = alloca i1, i1 0
+ %nop13674 = alloca i1, i1 0
+ %nop13675 = alloca i1, i1 0
+ %nop13676 = alloca i1, i1 0
+ %nop13677 = alloca i1, i1 0
+ %nop13678 = alloca i1, i1 0
+ %nop13679 = alloca i1, i1 0
+ %nop13680 = alloca i1, i1 0
+ %nop13681 = alloca i1, i1 0
+ %nop13682 = alloca i1, i1 0
+ %nop13683 = alloca i1, i1 0
+ %nop13684 = alloca i1, i1 0
+ %nop13685 = alloca i1, i1 0
+ %nop13686 = alloca i1, i1 0
+ %nop13687 = alloca i1, i1 0
+ %nop13688 = alloca i1, i1 0
+ %nop13689 = alloca i1, i1 0
+ %nop13690 = alloca i1, i1 0
+ %nop13691 = alloca i1, i1 0
+ %nop13692 = alloca i1, i1 0
+ %nop13693 = alloca i1, i1 0
+ %nop13694 = alloca i1, i1 0
+ %nop13695 = alloca i1, i1 0
+ %nop13696 = alloca i1, i1 0
+ %nop13697 = alloca i1, i1 0
+ %nop13698 = alloca i1, i1 0
+ %nop13699 = alloca i1, i1 0
+ %nop13700 = alloca i1, i1 0
+ %nop13701 = alloca i1, i1 0
+ %nop13702 = alloca i1, i1 0
+ %nop13703 = alloca i1, i1 0
+ %nop13704 = alloca i1, i1 0
+ %nop13705 = alloca i1, i1 0
+ %nop13706 = alloca i1, i1 0
+ %nop13707 = alloca i1, i1 0
+ %nop13708 = alloca i1, i1 0
+ %nop13709 = alloca i1, i1 0
+ %nop13710 = alloca i1, i1 0
+ %nop13711 = alloca i1, i1 0
+ %nop13712 = alloca i1, i1 0
+ %nop13713 = alloca i1, i1 0
+ %nop13714 = alloca i1, i1 0
+ %nop13715 = alloca i1, i1 0
+ %nop13716 = alloca i1, i1 0
+ %nop13717 = alloca i1, i1 0
+ %nop13718 = alloca i1, i1 0
+ %nop13719 = alloca i1, i1 0
+ %nop13720 = alloca i1, i1 0
+ %nop13721 = alloca i1, i1 0
+ %nop13722 = alloca i1, i1 0
+ %nop13723 = alloca i1, i1 0
+ %nop13724 = alloca i1, i1 0
+ %nop13725 = alloca i1, i1 0
+ %nop13726 = alloca i1, i1 0
+ %nop13727 = alloca i1, i1 0
+ %nop13728 = alloca i1, i1 0
+ %nop13729 = alloca i1, i1 0
+ %nop13730 = alloca i1, i1 0
+ %nop13731 = alloca i1, i1 0
+ %nop13732 = alloca i1, i1 0
+ %nop13733 = alloca i1, i1 0
+ %nop13734 = alloca i1, i1 0
+ %nop13735 = alloca i1, i1 0
+ %nop13736 = alloca i1, i1 0
+ %nop13737 = alloca i1, i1 0
+ %nop13738 = alloca i1, i1 0
+ %nop13739 = alloca i1, i1 0
+ %nop13740 = alloca i1, i1 0
+ %nop13741 = alloca i1, i1 0
+ %nop13742 = alloca i1, i1 0
+ %nop13743 = alloca i1, i1 0
+ %nop13744 = alloca i1, i1 0
+ %nop13745 = alloca i1, i1 0
+ %nop13746 = alloca i1, i1 0
+ %nop13747 = alloca i1, i1 0
+ %nop13748 = alloca i1, i1 0
+ %nop13749 = alloca i1, i1 0
+ %nop13750 = alloca i1, i1 0
+ %nop13751 = alloca i1, i1 0
+ %nop13752 = alloca i1, i1 0
+ %nop13753 = alloca i1, i1 0
+ %nop13754 = alloca i1, i1 0
+ %nop13755 = alloca i1, i1 0
+ %nop13756 = alloca i1, i1 0
+ %nop13757 = alloca i1, i1 0
+ %nop13758 = alloca i1, i1 0
+ %nop13759 = alloca i1, i1 0
+ %nop13760 = alloca i1, i1 0
+ %nop13761 = alloca i1, i1 0
+ %nop13762 = alloca i1, i1 0
+ %nop13763 = alloca i1, i1 0
+ %nop13764 = alloca i1, i1 0
+ %nop13765 = alloca i1, i1 0
+ %nop13766 = alloca i1, i1 0
+ %nop13767 = alloca i1, i1 0
+ %nop13768 = alloca i1, i1 0
+ %nop13769 = alloca i1, i1 0
+ %nop13770 = alloca i1, i1 0
+ %nop13771 = alloca i1, i1 0
+ %nop13772 = alloca i1, i1 0
+ %nop13773 = alloca i1, i1 0
+ %nop13774 = alloca i1, i1 0
+ %nop13775 = alloca i1, i1 0
+ %nop13776 = alloca i1, i1 0
+ %nop13777 = alloca i1, i1 0
+ %nop13778 = alloca i1, i1 0
+ %nop13779 = alloca i1, i1 0
+ %nop13780 = alloca i1, i1 0
+ %nop13781 = alloca i1, i1 0
+ %nop13782 = alloca i1, i1 0
+ %nop13783 = alloca i1, i1 0
+ %nop13784 = alloca i1, i1 0
+ %nop13785 = alloca i1, i1 0
+ %nop13786 = alloca i1, i1 0
+ %nop13787 = alloca i1, i1 0
+ %nop13788 = alloca i1, i1 0
+ %nop13789 = alloca i1, i1 0
+ %nop13790 = alloca i1, i1 0
+ %nop13791 = alloca i1, i1 0
+ %nop13792 = alloca i1, i1 0
+ %nop13793 = alloca i1, i1 0
+ %nop13794 = alloca i1, i1 0
+ %nop13795 = alloca i1, i1 0
+ %nop13796 = alloca i1, i1 0
+ %nop13797 = alloca i1, i1 0
+ %nop13798 = alloca i1, i1 0
+ %nop13799 = alloca i1, i1 0
+ %nop13800 = alloca i1, i1 0
+ %nop13801 = alloca i1, i1 0
+ %nop13802 = alloca i1, i1 0
+ %nop13803 = alloca i1, i1 0
+ %nop13804 = alloca i1, i1 0
+ %nop13805 = alloca i1, i1 0
+ %nop13806 = alloca i1, i1 0
+ %nop13807 = alloca i1, i1 0
+ %nop13808 = alloca i1, i1 0
+ %nop13809 = alloca i1, i1 0
+ %nop13810 = alloca i1, i1 0
+ %nop13811 = alloca i1, i1 0
+ %nop13812 = alloca i1, i1 0
+ %nop13813 = alloca i1, i1 0
+ %nop13814 = alloca i1, i1 0
+ %nop13815 = alloca i1, i1 0
+ %nop13816 = alloca i1, i1 0
+ %nop13817 = alloca i1, i1 0
+ %nop13818 = alloca i1, i1 0
+ %nop13819 = alloca i1, i1 0
+ %nop13820 = alloca i1, i1 0
+ %nop13821 = alloca i1, i1 0
+ %nop13822 = alloca i1, i1 0
+ %nop13823 = alloca i1, i1 0
+ %nop13824 = alloca i1, i1 0
+ %nop13825 = alloca i1, i1 0
+ %nop13826 = alloca i1, i1 0
+ %nop13827 = alloca i1, i1 0
+ %nop13828 = alloca i1, i1 0
+ %nop13829 = alloca i1, i1 0
+ %nop13830 = alloca i1, i1 0
+ %nop13831 = alloca i1, i1 0
+ %nop13832 = alloca i1, i1 0
+ %nop13833 = alloca i1, i1 0
+ %nop13834 = alloca i1, i1 0
+ %nop13835 = alloca i1, i1 0
+ %nop13836 = alloca i1, i1 0
+ %nop13837 = alloca i1, i1 0
+ %nop13838 = alloca i1, i1 0
+ %nop13839 = alloca i1, i1 0
+ %nop13840 = alloca i1, i1 0
+ %nop13841 = alloca i1, i1 0
+ %nop13842 = alloca i1, i1 0
+ %nop13843 = alloca i1, i1 0
+ %nop13844 = alloca i1, i1 0
+ %nop13845 = alloca i1, i1 0
+ %nop13846 = alloca i1, i1 0
+ %nop13847 = alloca i1, i1 0
+ %nop13848 = alloca i1, i1 0
+ %nop13849 = alloca i1, i1 0
+ %nop13850 = alloca i1, i1 0
+ %nop13851 = alloca i1, i1 0
+ %nop13852 = alloca i1, i1 0
+ %nop13853 = alloca i1, i1 0
+ %nop13854 = alloca i1, i1 0
+ %nop13855 = alloca i1, i1 0
+ %nop13856 = alloca i1, i1 0
+ %nop13857 = alloca i1, i1 0
+ %nop13858 = alloca i1, i1 0
+ %nop13859 = alloca i1, i1 0
+ %nop13860 = alloca i1, i1 0
+ %nop13861 = alloca i1, i1 0
+ %nop13862 = alloca i1, i1 0
+ %nop13863 = alloca i1, i1 0
+ %nop13864 = alloca i1, i1 0
+ %nop13865 = alloca i1, i1 0
+ %nop13866 = alloca i1, i1 0
+ %nop13867 = alloca i1, i1 0
+ %nop13868 = alloca i1, i1 0
+ %nop13869 = alloca i1, i1 0
+ %nop13870 = alloca i1, i1 0
+ %nop13871 = alloca i1, i1 0
+ %nop13872 = alloca i1, i1 0
+ %nop13873 = alloca i1, i1 0
+ %nop13874 = alloca i1, i1 0
+ %nop13875 = alloca i1, i1 0
+ %nop13876 = alloca i1, i1 0
+ %nop13877 = alloca i1, i1 0
+ %nop13878 = alloca i1, i1 0
+ %nop13879 = alloca i1, i1 0
+ %nop13880 = alloca i1, i1 0
+ %nop13881 = alloca i1, i1 0
+ %nop13882 = alloca i1, i1 0
+ %nop13883 = alloca i1, i1 0
+ %nop13884 = alloca i1, i1 0
+ %nop13885 = alloca i1, i1 0
+ %nop13886 = alloca i1, i1 0
+ %nop13887 = alloca i1, i1 0
+ %nop13888 = alloca i1, i1 0
+ %nop13889 = alloca i1, i1 0
+ %nop13890 = alloca i1, i1 0
+ %nop13891 = alloca i1, i1 0
+ %nop13892 = alloca i1, i1 0
+ %nop13893 = alloca i1, i1 0
+ %nop13894 = alloca i1, i1 0
+ %nop13895 = alloca i1, i1 0
+ %nop13896 = alloca i1, i1 0
+ %nop13897 = alloca i1, i1 0
+ %nop13898 = alloca i1, i1 0
+ %nop13899 = alloca i1, i1 0
+ %nop13900 = alloca i1, i1 0
+ %nop13901 = alloca i1, i1 0
+ %nop13902 = alloca i1, i1 0
+ %nop13903 = alloca i1, i1 0
+ %nop13904 = alloca i1, i1 0
+ %nop13905 = alloca i1, i1 0
+ %nop13906 = alloca i1, i1 0
+ %nop13907 = alloca i1, i1 0
+ %nop13908 = alloca i1, i1 0
+ %nop13909 = alloca i1, i1 0
+ %nop13910 = alloca i1, i1 0
+ %nop13911 = alloca i1, i1 0
+ %nop13912 = alloca i1, i1 0
+ %nop13913 = alloca i1, i1 0
+ %nop13914 = alloca i1, i1 0
+ %nop13915 = alloca i1, i1 0
+ %nop13916 = alloca i1, i1 0
+ %nop13917 = alloca i1, i1 0
+ %nop13918 = alloca i1, i1 0
+ %nop13919 = alloca i1, i1 0
+ %nop13920 = alloca i1, i1 0
+ %nop13921 = alloca i1, i1 0
+ %nop13922 = alloca i1, i1 0
+ %nop13923 = alloca i1, i1 0
+ %nop13924 = alloca i1, i1 0
+ %nop13925 = alloca i1, i1 0
+ %nop13926 = alloca i1, i1 0
+ %nop13927 = alloca i1, i1 0
+ %nop13928 = alloca i1, i1 0
+ %nop13929 = alloca i1, i1 0
+ %nop13930 = alloca i1, i1 0
+ %nop13931 = alloca i1, i1 0
+ %nop13932 = alloca i1, i1 0
+ %nop13933 = alloca i1, i1 0
+ %nop13934 = alloca i1, i1 0
+ %nop13935 = alloca i1, i1 0
+ %nop13936 = alloca i1, i1 0
+ %nop13937 = alloca i1, i1 0
+ %nop13938 = alloca i1, i1 0
+ %nop13939 = alloca i1, i1 0
+ %nop13940 = alloca i1, i1 0
+ %nop13941 = alloca i1, i1 0
+ %nop13942 = alloca i1, i1 0
+ %nop13943 = alloca i1, i1 0
+ %nop13944 = alloca i1, i1 0
+ %nop13945 = alloca i1, i1 0
+ %nop13946 = alloca i1, i1 0
+ %nop13947 = alloca i1, i1 0
+ %nop13948 = alloca i1, i1 0
+ %nop13949 = alloca i1, i1 0
+ %nop13950 = alloca i1, i1 0
+ %nop13951 = alloca i1, i1 0
+ %nop13952 = alloca i1, i1 0
+ %nop13953 = alloca i1, i1 0
+ %nop13954 = alloca i1, i1 0
+ %nop13955 = alloca i1, i1 0
+ %nop13956 = alloca i1, i1 0
+ %nop13957 = alloca i1, i1 0
+ %nop13958 = alloca i1, i1 0
+ %nop13959 = alloca i1, i1 0
+ %nop13960 = alloca i1, i1 0
+ %nop13961 = alloca i1, i1 0
+ %nop13962 = alloca i1, i1 0
+ %nop13963 = alloca i1, i1 0
+ %nop13964 = alloca i1, i1 0
+ %nop13965 = alloca i1, i1 0
+ %nop13966 = alloca i1, i1 0
+ %nop13967 = alloca i1, i1 0
+ %nop13968 = alloca i1, i1 0
+ %nop13969 = alloca i1, i1 0
+ %nop13970 = alloca i1, i1 0
+ %nop13971 = alloca i1, i1 0
+ %nop13972 = alloca i1, i1 0
+ %nop13973 = alloca i1, i1 0
+ %nop13974 = alloca i1, i1 0
+ %nop13975 = alloca i1, i1 0
+ %nop13976 = alloca i1, i1 0
+ %nop13977 = alloca i1, i1 0
+ %nop13978 = alloca i1, i1 0
+ %nop13979 = alloca i1, i1 0
+ %nop13980 = alloca i1, i1 0
+ %nop13981 = alloca i1, i1 0
+ %nop13982 = alloca i1, i1 0
+ %nop13983 = alloca i1, i1 0
+ %nop13984 = alloca i1, i1 0
+ %nop13985 = alloca i1, i1 0
+ %nop13986 = alloca i1, i1 0
+ %nop13987 = alloca i1, i1 0
+ %nop13988 = alloca i1, i1 0
+ %nop13989 = alloca i1, i1 0
+ %nop13990 = alloca i1, i1 0
+ %nop13991 = alloca i1, i1 0
+ %nop13992 = alloca i1, i1 0
+ %nop13993 = alloca i1, i1 0
+ %nop13994 = alloca i1, i1 0
+ %nop13995 = alloca i1, i1 0
+ %nop13996 = alloca i1, i1 0
+ %nop13997 = alloca i1, i1 0
+ %nop13998 = alloca i1, i1 0
+ %nop13999 = alloca i1, i1 0
+ %nop14000 = alloca i1, i1 0
+ %nop14001 = alloca i1, i1 0
+ %nop14002 = alloca i1, i1 0
+ %nop14003 = alloca i1, i1 0
+ %nop14004 = alloca i1, i1 0
+ %nop14005 = alloca i1, i1 0
+ %nop14006 = alloca i1, i1 0
+ %nop14007 = alloca i1, i1 0
+ %nop14008 = alloca i1, i1 0
+ %nop14009 = alloca i1, i1 0
+ %nop14010 = alloca i1, i1 0
+ %nop14011 = alloca i1, i1 0
+ %nop14012 = alloca i1, i1 0
+ %nop14013 = alloca i1, i1 0
+ %nop14014 = alloca i1, i1 0
+ %nop14015 = alloca i1, i1 0
+ %nop14016 = alloca i1, i1 0
+ %nop14017 = alloca i1, i1 0
+ %nop14018 = alloca i1, i1 0
+ %nop14019 = alloca i1, i1 0
+ %nop14020 = alloca i1, i1 0
+ %nop14021 = alloca i1, i1 0
+ %nop14022 = alloca i1, i1 0
+ %nop14023 = alloca i1, i1 0
+ %nop14024 = alloca i1, i1 0
+ %nop14025 = alloca i1, i1 0
+ %nop14026 = alloca i1, i1 0
+ %nop14027 = alloca i1, i1 0
+ %nop14028 = alloca i1, i1 0
+ %nop14029 = alloca i1, i1 0
+ %nop14030 = alloca i1, i1 0
+ %nop14031 = alloca i1, i1 0
+ %nop14032 = alloca i1, i1 0
+ %nop14033 = alloca i1, i1 0
+ %nop14034 = alloca i1, i1 0
+ %nop14035 = alloca i1, i1 0
+ %nop14036 = alloca i1, i1 0
+ %nop14037 = alloca i1, i1 0
+ %nop14038 = alloca i1, i1 0
+ %nop14039 = alloca i1, i1 0
+ %nop14040 = alloca i1, i1 0
+ %nop14041 = alloca i1, i1 0
+ %nop14042 = alloca i1, i1 0
+ %nop14043 = alloca i1, i1 0
+ %nop14044 = alloca i1, i1 0
+ %nop14045 = alloca i1, i1 0
+ %nop14046 = alloca i1, i1 0
+ %nop14047 = alloca i1, i1 0
+ %nop14048 = alloca i1, i1 0
+ %nop14049 = alloca i1, i1 0
+ %nop14050 = alloca i1, i1 0
+ %nop14051 = alloca i1, i1 0
+ %nop14052 = alloca i1, i1 0
+ %nop14053 = alloca i1, i1 0
+ %nop14054 = alloca i1, i1 0
+ %nop14055 = alloca i1, i1 0
+ %nop14056 = alloca i1, i1 0
+ %nop14057 = alloca i1, i1 0
+ %nop14058 = alloca i1, i1 0
+ %nop14059 = alloca i1, i1 0
+ %nop14060 = alloca i1, i1 0
+ %nop14061 = alloca i1, i1 0
+ %nop14062 = alloca i1, i1 0
+ %nop14063 = alloca i1, i1 0
+ %nop14064 = alloca i1, i1 0
+ %nop14065 = alloca i1, i1 0
+ %nop14066 = alloca i1, i1 0
+ %nop14067 = alloca i1, i1 0
+ %nop14068 = alloca i1, i1 0
+ %nop14069 = alloca i1, i1 0
+ %nop14070 = alloca i1, i1 0
+ %nop14071 = alloca i1, i1 0
+ %nop14072 = alloca i1, i1 0
+ %nop14073 = alloca i1, i1 0
+ %nop14074 = alloca i1, i1 0
+ %nop14075 = alloca i1, i1 0
+ %nop14076 = alloca i1, i1 0
+ %nop14077 = alloca i1, i1 0
+ %nop14078 = alloca i1, i1 0
+ %nop14079 = alloca i1, i1 0
+ %nop14080 = alloca i1, i1 0
+ %nop14081 = alloca i1, i1 0
+ %nop14082 = alloca i1, i1 0
+ %nop14083 = alloca i1, i1 0
+ %nop14084 = alloca i1, i1 0
+ %nop14085 = alloca i1, i1 0
+ %nop14086 = alloca i1, i1 0
+ %nop14087 = alloca i1, i1 0
+ %nop14088 = alloca i1, i1 0
+ %nop14089 = alloca i1, i1 0
+ %nop14090 = alloca i1, i1 0
+ %nop14091 = alloca i1, i1 0
+ %nop14092 = alloca i1, i1 0
+ %nop14093 = alloca i1, i1 0
+ %nop14094 = alloca i1, i1 0
+ %nop14095 = alloca i1, i1 0
+ %nop14096 = alloca i1, i1 0
+ %nop14097 = alloca i1, i1 0
+ %nop14098 = alloca i1, i1 0
+ %nop14099 = alloca i1, i1 0
+ %nop14100 = alloca i1, i1 0
+ %nop14101 = alloca i1, i1 0
+ %nop14102 = alloca i1, i1 0
+ %nop14103 = alloca i1, i1 0
+ %nop14104 = alloca i1, i1 0
+ %nop14105 = alloca i1, i1 0
+ %nop14106 = alloca i1, i1 0
+ %nop14107 = alloca i1, i1 0
+ %nop14108 = alloca i1, i1 0
+ %nop14109 = alloca i1, i1 0
+ %nop14110 = alloca i1, i1 0
+ %nop14111 = alloca i1, i1 0
+ %nop14112 = alloca i1, i1 0
+ %nop14113 = alloca i1, i1 0
+ %nop14114 = alloca i1, i1 0
+ %nop14115 = alloca i1, i1 0
+ %nop14116 = alloca i1, i1 0
+ %nop14117 = alloca i1, i1 0
+ %nop14118 = alloca i1, i1 0
+ %nop14119 = alloca i1, i1 0
+ %nop14120 = alloca i1, i1 0
+ %nop14121 = alloca i1, i1 0
+ %nop14122 = alloca i1, i1 0
+ %nop14123 = alloca i1, i1 0
+ %nop14124 = alloca i1, i1 0
+ %nop14125 = alloca i1, i1 0
+ %nop14126 = alloca i1, i1 0
+ %nop14127 = alloca i1, i1 0
+ %nop14128 = alloca i1, i1 0
+ %nop14129 = alloca i1, i1 0
+ %nop14130 = alloca i1, i1 0
+ %nop14131 = alloca i1, i1 0
+ %nop14132 = alloca i1, i1 0
+ %nop14133 = alloca i1, i1 0
+ %nop14134 = alloca i1, i1 0
+ %nop14135 = alloca i1, i1 0
+ %nop14136 = alloca i1, i1 0
+ %nop14137 = alloca i1, i1 0
+ %nop14138 = alloca i1, i1 0
+ %nop14139 = alloca i1, i1 0
+ %nop14140 = alloca i1, i1 0
+ %nop14141 = alloca i1, i1 0
+ %nop14142 = alloca i1, i1 0
+ %nop14143 = alloca i1, i1 0
+ %nop14144 = alloca i1, i1 0
+ %nop14145 = alloca i1, i1 0
+ %nop14146 = alloca i1, i1 0
+ %nop14147 = alloca i1, i1 0
+ %nop14148 = alloca i1, i1 0
+ %nop14149 = alloca i1, i1 0
+ %nop14150 = alloca i1, i1 0
+ %nop14151 = alloca i1, i1 0
+ %nop14152 = alloca i1, i1 0
+ %nop14153 = alloca i1, i1 0
+ %nop14154 = alloca i1, i1 0
+ %nop14155 = alloca i1, i1 0
+ %nop14156 = alloca i1, i1 0
+ %nop14157 = alloca i1, i1 0
+ %nop14158 = alloca i1, i1 0
+ %nop14159 = alloca i1, i1 0
+ %nop14160 = alloca i1, i1 0
+ %nop14161 = alloca i1, i1 0
+ %nop14162 = alloca i1, i1 0
+ %nop14163 = alloca i1, i1 0
+ %nop14164 = alloca i1, i1 0
+ %nop14165 = alloca i1, i1 0
+ %nop14166 = alloca i1, i1 0
+ %nop14167 = alloca i1, i1 0
+ %nop14168 = alloca i1, i1 0
+ %nop14169 = alloca i1, i1 0
+ %nop14170 = alloca i1, i1 0
+ %nop14171 = alloca i1, i1 0
+ %nop14172 = alloca i1, i1 0
+ %nop14173 = alloca i1, i1 0
+ %nop14174 = alloca i1, i1 0
+ %nop14175 = alloca i1, i1 0
+ %nop14176 = alloca i1, i1 0
+ %nop14177 = alloca i1, i1 0
+ %nop14178 = alloca i1, i1 0
+ %nop14179 = alloca i1, i1 0
+ %nop14180 = alloca i1, i1 0
+ %nop14181 = alloca i1, i1 0
+ %nop14182 = alloca i1, i1 0
+ %nop14183 = alloca i1, i1 0
+ %nop14184 = alloca i1, i1 0
+ %nop14185 = alloca i1, i1 0
+ %nop14186 = alloca i1, i1 0
+ %nop14187 = alloca i1, i1 0
+ %nop14188 = alloca i1, i1 0
+ %nop14189 = alloca i1, i1 0
+ %nop14190 = alloca i1, i1 0
+ %nop14191 = alloca i1, i1 0
+ %nop14192 = alloca i1, i1 0
+ %nop14193 = alloca i1, i1 0
+ %nop14194 = alloca i1, i1 0
+ %nop14195 = alloca i1, i1 0
+ %nop14196 = alloca i1, i1 0
+ %nop14197 = alloca i1, i1 0
+ %nop14198 = alloca i1, i1 0
+ %nop14199 = alloca i1, i1 0
+ %nop14200 = alloca i1, i1 0
+ %nop14201 = alloca i1, i1 0
+ %nop14202 = alloca i1, i1 0
+ %nop14203 = alloca i1, i1 0
+ %nop14204 = alloca i1, i1 0
+ %nop14205 = alloca i1, i1 0
+ %nop14206 = alloca i1, i1 0
+ %nop14207 = alloca i1, i1 0
+ %nop14208 = alloca i1, i1 0
+ %nop14209 = alloca i1, i1 0
+ %nop14210 = alloca i1, i1 0
+ %nop14211 = alloca i1, i1 0
+ %nop14212 = alloca i1, i1 0
+ %nop14213 = alloca i1, i1 0
+ %nop14214 = alloca i1, i1 0
+ %nop14215 = alloca i1, i1 0
+ %nop14216 = alloca i1, i1 0
+ %nop14217 = alloca i1, i1 0
+ %nop14218 = alloca i1, i1 0
+ %nop14219 = alloca i1, i1 0
+ %nop14220 = alloca i1, i1 0
+ %nop14221 = alloca i1, i1 0
+ %nop14222 = alloca i1, i1 0
+ %nop14223 = alloca i1, i1 0
+ %nop14224 = alloca i1, i1 0
+ %nop14225 = alloca i1, i1 0
+ %nop14226 = alloca i1, i1 0
+ %nop14227 = alloca i1, i1 0
+ %nop14228 = alloca i1, i1 0
+ %nop14229 = alloca i1, i1 0
+ %nop14230 = alloca i1, i1 0
+ %nop14231 = alloca i1, i1 0
+ %nop14232 = alloca i1, i1 0
+ %nop14233 = alloca i1, i1 0
+ %nop14234 = alloca i1, i1 0
+ %nop14235 = alloca i1, i1 0
+ %nop14236 = alloca i1, i1 0
+ %nop14237 = alloca i1, i1 0
+ %nop14238 = alloca i1, i1 0
+ %nop14239 = alloca i1, i1 0
+ %nop14240 = alloca i1, i1 0
+ %nop14241 = alloca i1, i1 0
+ %nop14242 = alloca i1, i1 0
+ %nop14243 = alloca i1, i1 0
+ %nop14244 = alloca i1, i1 0
+ %nop14245 = alloca i1, i1 0
+ %nop14246 = alloca i1, i1 0
+ %nop14247 = alloca i1, i1 0
+ %nop14248 = alloca i1, i1 0
+ %nop14249 = alloca i1, i1 0
+ %nop14250 = alloca i1, i1 0
+ %nop14251 = alloca i1, i1 0
+ %nop14252 = alloca i1, i1 0
+ %nop14253 = alloca i1, i1 0
+ %nop14254 = alloca i1, i1 0
+ %nop14255 = alloca i1, i1 0
+ %nop14256 = alloca i1, i1 0
+ %nop14257 = alloca i1, i1 0
+ %nop14258 = alloca i1, i1 0
+ %nop14259 = alloca i1, i1 0
+ %nop14260 = alloca i1, i1 0
+ %nop14261 = alloca i1, i1 0
+ %nop14262 = alloca i1, i1 0
+ %nop14263 = alloca i1, i1 0
+ %nop14264 = alloca i1, i1 0
+ %nop14265 = alloca i1, i1 0
+ %nop14266 = alloca i1, i1 0
+ %nop14267 = alloca i1, i1 0
+ %nop14268 = alloca i1, i1 0
+ %nop14269 = alloca i1, i1 0
+ %nop14270 = alloca i1, i1 0
+ %nop14271 = alloca i1, i1 0
+ %nop14272 = alloca i1, i1 0
+ %nop14273 = alloca i1, i1 0
+ %nop14274 = alloca i1, i1 0
+ %nop14275 = alloca i1, i1 0
+ %nop14276 = alloca i1, i1 0
+ %nop14277 = alloca i1, i1 0
+ %nop14278 = alloca i1, i1 0
+ %nop14279 = alloca i1, i1 0
+ %nop14280 = alloca i1, i1 0
+ %nop14281 = alloca i1, i1 0
+ %nop14282 = alloca i1, i1 0
+ %nop14283 = alloca i1, i1 0
+ %nop14284 = alloca i1, i1 0
+ %nop14285 = alloca i1, i1 0
+ %nop14286 = alloca i1, i1 0
+ %nop14287 = alloca i1, i1 0
+ %nop14288 = alloca i1, i1 0
+ %nop14289 = alloca i1, i1 0
+ %nop14290 = alloca i1, i1 0
+ %nop14291 = alloca i1, i1 0
+ %nop14292 = alloca i1, i1 0
+ %nop14293 = alloca i1, i1 0
+ %nop14294 = alloca i1, i1 0
+ %nop14295 = alloca i1, i1 0
+ %nop14296 = alloca i1, i1 0
+ %nop14297 = alloca i1, i1 0
+ %nop14298 = alloca i1, i1 0
+ %nop14299 = alloca i1, i1 0
+ %nop14300 = alloca i1, i1 0
+ %nop14301 = alloca i1, i1 0
+ %nop14302 = alloca i1, i1 0
+ %nop14303 = alloca i1, i1 0
+ %nop14304 = alloca i1, i1 0
+ %nop14305 = alloca i1, i1 0
+ %nop14306 = alloca i1, i1 0
+ %nop14307 = alloca i1, i1 0
+ %nop14308 = alloca i1, i1 0
+ %nop14309 = alloca i1, i1 0
+ %nop14310 = alloca i1, i1 0
+ %nop14311 = alloca i1, i1 0
+ %nop14312 = alloca i1, i1 0
+ %nop14313 = alloca i1, i1 0
+ %nop14314 = alloca i1, i1 0
+ %nop14315 = alloca i1, i1 0
+ %nop14316 = alloca i1, i1 0
+ %nop14317 = alloca i1, i1 0
+ %nop14318 = alloca i1, i1 0
+ %nop14319 = alloca i1, i1 0
+ %nop14320 = alloca i1, i1 0
+ %nop14321 = alloca i1, i1 0
+ %nop14322 = alloca i1, i1 0
+ %nop14323 = alloca i1, i1 0
+ %nop14324 = alloca i1, i1 0
+ %nop14325 = alloca i1, i1 0
+ %nop14326 = alloca i1, i1 0
+ %nop14327 = alloca i1, i1 0
+ %nop14328 = alloca i1, i1 0
+ %nop14329 = alloca i1, i1 0
+ %nop14330 = alloca i1, i1 0
+ %nop14331 = alloca i1, i1 0
+ %nop14332 = alloca i1, i1 0
+ %nop14333 = alloca i1, i1 0
+ %nop14334 = alloca i1, i1 0
+ %nop14335 = alloca i1, i1 0
+ %nop14336 = alloca i1, i1 0
+ %nop14337 = alloca i1, i1 0
+ %nop14338 = alloca i1, i1 0
+ %nop14339 = alloca i1, i1 0
+ %nop14340 = alloca i1, i1 0
+ %nop14341 = alloca i1, i1 0
+ %nop14342 = alloca i1, i1 0
+ %nop14343 = alloca i1, i1 0
+ %nop14344 = alloca i1, i1 0
+ %nop14345 = alloca i1, i1 0
+ %nop14346 = alloca i1, i1 0
+ %nop14347 = alloca i1, i1 0
+ %nop14348 = alloca i1, i1 0
+ %nop14349 = alloca i1, i1 0
+ %nop14350 = alloca i1, i1 0
+ %nop14351 = alloca i1, i1 0
+ %nop14352 = alloca i1, i1 0
+ %nop14353 = alloca i1, i1 0
+ %nop14354 = alloca i1, i1 0
+ %nop14355 = alloca i1, i1 0
+ %nop14356 = alloca i1, i1 0
+ %nop14357 = alloca i1, i1 0
+ %nop14358 = alloca i1, i1 0
+ %nop14359 = alloca i1, i1 0
+ %nop14360 = alloca i1, i1 0
+ %nop14361 = alloca i1, i1 0
+ %nop14362 = alloca i1, i1 0
+ %nop14363 = alloca i1, i1 0
+ %nop14364 = alloca i1, i1 0
+ %nop14365 = alloca i1, i1 0
+ %nop14366 = alloca i1, i1 0
+ %nop14367 = alloca i1, i1 0
+ %nop14368 = alloca i1, i1 0
+ %nop14369 = alloca i1, i1 0
+ %nop14370 = alloca i1, i1 0
+ %nop14371 = alloca i1, i1 0
+ %nop14372 = alloca i1, i1 0
+ %nop14373 = alloca i1, i1 0
+ %nop14374 = alloca i1, i1 0
+ %nop14375 = alloca i1, i1 0
+ %nop14376 = alloca i1, i1 0
+ %nop14377 = alloca i1, i1 0
+ %nop14378 = alloca i1, i1 0
+ %nop14379 = alloca i1, i1 0
+ %nop14380 = alloca i1, i1 0
+ %nop14381 = alloca i1, i1 0
+ %nop14382 = alloca i1, i1 0
+ %nop14383 = alloca i1, i1 0
+ %nop14384 = alloca i1, i1 0
+ %nop14385 = alloca i1, i1 0
+ %nop14386 = alloca i1, i1 0
+ %nop14387 = alloca i1, i1 0
+ %nop14388 = alloca i1, i1 0
+ %nop14389 = alloca i1, i1 0
+ %nop14390 = alloca i1, i1 0
+ %nop14391 = alloca i1, i1 0
+ %nop14392 = alloca i1, i1 0
+ %nop14393 = alloca i1, i1 0
+ %nop14394 = alloca i1, i1 0
+ %nop14395 = alloca i1, i1 0
+ %nop14396 = alloca i1, i1 0
+ %nop14397 = alloca i1, i1 0
+ %nop14398 = alloca i1, i1 0
+ %nop14399 = alloca i1, i1 0
+ %nop14400 = alloca i1, i1 0
+ %nop14401 = alloca i1, i1 0
+ %nop14402 = alloca i1, i1 0
+ %nop14403 = alloca i1, i1 0
+ %nop14404 = alloca i1, i1 0
+ %nop14405 = alloca i1, i1 0
+ %nop14406 = alloca i1, i1 0
+ %nop14407 = alloca i1, i1 0
+ %nop14408 = alloca i1, i1 0
+ %nop14409 = alloca i1, i1 0
+ %nop14410 = alloca i1, i1 0
+ %nop14411 = alloca i1, i1 0
+ %nop14412 = alloca i1, i1 0
+ %nop14413 = alloca i1, i1 0
+ %nop14414 = alloca i1, i1 0
+ %nop14415 = alloca i1, i1 0
+ %nop14416 = alloca i1, i1 0
+ %nop14417 = alloca i1, i1 0
+ %nop14418 = alloca i1, i1 0
+ %nop14419 = alloca i1, i1 0
+ %nop14420 = alloca i1, i1 0
+ %nop14421 = alloca i1, i1 0
+ %nop14422 = alloca i1, i1 0
+ %nop14423 = alloca i1, i1 0
+ %nop14424 = alloca i1, i1 0
+ %nop14425 = alloca i1, i1 0
+ %nop14426 = alloca i1, i1 0
+ %nop14427 = alloca i1, i1 0
+ %nop14428 = alloca i1, i1 0
+ %nop14429 = alloca i1, i1 0
+ %nop14430 = alloca i1, i1 0
+ %nop14431 = alloca i1, i1 0
+ %nop14432 = alloca i1, i1 0
+ %nop14433 = alloca i1, i1 0
+ %nop14434 = alloca i1, i1 0
+ %nop14435 = alloca i1, i1 0
+ %nop14436 = alloca i1, i1 0
+ %nop14437 = alloca i1, i1 0
+ %nop14438 = alloca i1, i1 0
+ %nop14439 = alloca i1, i1 0
+ %nop14440 = alloca i1, i1 0
+ %nop14441 = alloca i1, i1 0
+ %nop14442 = alloca i1, i1 0
+ %nop14443 = alloca i1, i1 0
+ %nop14444 = alloca i1, i1 0
+ %nop14445 = alloca i1, i1 0
+ %nop14446 = alloca i1, i1 0
+ %nop14447 = alloca i1, i1 0
+ %nop14448 = alloca i1, i1 0
+ %nop14449 = alloca i1, i1 0
+ %nop14450 = alloca i1, i1 0
+ %nop14451 = alloca i1, i1 0
+ %nop14452 = alloca i1, i1 0
+ %nop14453 = alloca i1, i1 0
+ %nop14454 = alloca i1, i1 0
+ %nop14455 = alloca i1, i1 0
+ %nop14456 = alloca i1, i1 0
+ %nop14457 = alloca i1, i1 0
+ %nop14458 = alloca i1, i1 0
+ %nop14459 = alloca i1, i1 0
+ %nop14460 = alloca i1, i1 0
+ %nop14461 = alloca i1, i1 0
+ %nop14462 = alloca i1, i1 0
+ %nop14463 = alloca i1, i1 0
+ %nop14464 = alloca i1, i1 0
+ %nop14465 = alloca i1, i1 0
+ %nop14466 = alloca i1, i1 0
+ %nop14467 = alloca i1, i1 0
+ %nop14468 = alloca i1, i1 0
+ %nop14469 = alloca i1, i1 0
+ %nop14470 = alloca i1, i1 0
+ %nop14471 = alloca i1, i1 0
+ %nop14472 = alloca i1, i1 0
+ %nop14473 = alloca i1, i1 0
+ %nop14474 = alloca i1, i1 0
+ %nop14475 = alloca i1, i1 0
+ %nop14476 = alloca i1, i1 0
+ %nop14477 = alloca i1, i1 0
+ %nop14478 = alloca i1, i1 0
+ %nop14479 = alloca i1, i1 0
+ %nop14480 = alloca i1, i1 0
+ %nop14481 = alloca i1, i1 0
+ %nop14482 = alloca i1, i1 0
+ %nop14483 = alloca i1, i1 0
+ %nop14484 = alloca i1, i1 0
+ %nop14485 = alloca i1, i1 0
+ %nop14486 = alloca i1, i1 0
+ %nop14487 = alloca i1, i1 0
+ %nop14488 = alloca i1, i1 0
+ %nop14489 = alloca i1, i1 0
+ %nop14490 = alloca i1, i1 0
+ %nop14491 = alloca i1, i1 0
+ %nop14492 = alloca i1, i1 0
+ %nop14493 = alloca i1, i1 0
+ %nop14494 = alloca i1, i1 0
+ %nop14495 = alloca i1, i1 0
+ %nop14496 = alloca i1, i1 0
+ %nop14497 = alloca i1, i1 0
+ %nop14498 = alloca i1, i1 0
+ %nop14499 = alloca i1, i1 0
+ %nop14500 = alloca i1, i1 0
+ %nop14501 = alloca i1, i1 0
+ %nop14502 = alloca i1, i1 0
+ %nop14503 = alloca i1, i1 0
+ %nop14504 = alloca i1, i1 0
+ %nop14505 = alloca i1, i1 0
+ %nop14506 = alloca i1, i1 0
+ %nop14507 = alloca i1, i1 0
+ %nop14508 = alloca i1, i1 0
+ %nop14509 = alloca i1, i1 0
+ %nop14510 = alloca i1, i1 0
+ %nop14511 = alloca i1, i1 0
+ %nop14512 = alloca i1, i1 0
+ %nop14513 = alloca i1, i1 0
+ %nop14514 = alloca i1, i1 0
+ %nop14515 = alloca i1, i1 0
+ %nop14516 = alloca i1, i1 0
+ %nop14517 = alloca i1, i1 0
+ %nop14518 = alloca i1, i1 0
+ %nop14519 = alloca i1, i1 0
+ %nop14520 = alloca i1, i1 0
+ %nop14521 = alloca i1, i1 0
+ %nop14522 = alloca i1, i1 0
+ %nop14523 = alloca i1, i1 0
+ %nop14524 = alloca i1, i1 0
+ %nop14525 = alloca i1, i1 0
+ %nop14526 = alloca i1, i1 0
+ %nop14527 = alloca i1, i1 0
+ %nop14528 = alloca i1, i1 0
+ %nop14529 = alloca i1, i1 0
+ %nop14530 = alloca i1, i1 0
+ %nop14531 = alloca i1, i1 0
+ %nop14532 = alloca i1, i1 0
+ %nop14533 = alloca i1, i1 0
+ %nop14534 = alloca i1, i1 0
+ %nop14535 = alloca i1, i1 0
+ %nop14536 = alloca i1, i1 0
+ %nop14537 = alloca i1, i1 0
+ %nop14538 = alloca i1, i1 0
+ %nop14539 = alloca i1, i1 0
+ %nop14540 = alloca i1, i1 0
+ %nop14541 = alloca i1, i1 0
+ %nop14542 = alloca i1, i1 0
+ %nop14543 = alloca i1, i1 0
+ %nop14544 = alloca i1, i1 0
+ %nop14545 = alloca i1, i1 0
+ %nop14546 = alloca i1, i1 0
+ %nop14547 = alloca i1, i1 0
+ %nop14548 = alloca i1, i1 0
+ %nop14549 = alloca i1, i1 0
+ %nop14550 = alloca i1, i1 0
+ %nop14551 = alloca i1, i1 0
+ %nop14552 = alloca i1, i1 0
+ %nop14553 = alloca i1, i1 0
+ %nop14554 = alloca i1, i1 0
+ %nop14555 = alloca i1, i1 0
+ %nop14556 = alloca i1, i1 0
+ %nop14557 = alloca i1, i1 0
+ %nop14558 = alloca i1, i1 0
+ %nop14559 = alloca i1, i1 0
+ %nop14560 = alloca i1, i1 0
+ %nop14561 = alloca i1, i1 0
+ %nop14562 = alloca i1, i1 0
+ %nop14563 = alloca i1, i1 0
+ %nop14564 = alloca i1, i1 0
+ %nop14565 = alloca i1, i1 0
+ %nop14566 = alloca i1, i1 0
+ %nop14567 = alloca i1, i1 0
+ %nop14568 = alloca i1, i1 0
+ %nop14569 = alloca i1, i1 0
+ %nop14570 = alloca i1, i1 0
+ %nop14571 = alloca i1, i1 0
+ %nop14572 = alloca i1, i1 0
+ %nop14573 = alloca i1, i1 0
+ %nop14574 = alloca i1, i1 0
+ %nop14575 = alloca i1, i1 0
+ %nop14576 = alloca i1, i1 0
+ %nop14577 = alloca i1, i1 0
+ %nop14578 = alloca i1, i1 0
+ %nop14579 = alloca i1, i1 0
+ %nop14580 = alloca i1, i1 0
+ %nop14581 = alloca i1, i1 0
+ %nop14582 = alloca i1, i1 0
+ %nop14583 = alloca i1, i1 0
+ %nop14584 = alloca i1, i1 0
+ %nop14585 = alloca i1, i1 0
+ %nop14586 = alloca i1, i1 0
+ %nop14587 = alloca i1, i1 0
+ %nop14588 = alloca i1, i1 0
+ %nop14589 = alloca i1, i1 0
+ %nop14590 = alloca i1, i1 0
+ %nop14591 = alloca i1, i1 0
+ %nop14592 = alloca i1, i1 0
+ %nop14593 = alloca i1, i1 0
+ %nop14594 = alloca i1, i1 0
+ %nop14595 = alloca i1, i1 0
+ %nop14596 = alloca i1, i1 0
+ %nop14597 = alloca i1, i1 0
+ %nop14598 = alloca i1, i1 0
+ %nop14599 = alloca i1, i1 0
+ %nop14600 = alloca i1, i1 0
+ %nop14601 = alloca i1, i1 0
+ %nop14602 = alloca i1, i1 0
+ %nop14603 = alloca i1, i1 0
+ %nop14604 = alloca i1, i1 0
+ %nop14605 = alloca i1, i1 0
+ %nop14606 = alloca i1, i1 0
+ %nop14607 = alloca i1, i1 0
+ %nop14608 = alloca i1, i1 0
+ %nop14609 = alloca i1, i1 0
+ %nop14610 = alloca i1, i1 0
+ %nop14611 = alloca i1, i1 0
+ %nop14612 = alloca i1, i1 0
+ %nop14613 = alloca i1, i1 0
+ %nop14614 = alloca i1, i1 0
+ %nop14615 = alloca i1, i1 0
+ %nop14616 = alloca i1, i1 0
+ %nop14617 = alloca i1, i1 0
+ %nop14618 = alloca i1, i1 0
+ %nop14619 = alloca i1, i1 0
+ %nop14620 = alloca i1, i1 0
+ %nop14621 = alloca i1, i1 0
+ %nop14622 = alloca i1, i1 0
+ %nop14623 = alloca i1, i1 0
+ %nop14624 = alloca i1, i1 0
+ %nop14625 = alloca i1, i1 0
+ %nop14626 = alloca i1, i1 0
+ %nop14627 = alloca i1, i1 0
+ %nop14628 = alloca i1, i1 0
+ %nop14629 = alloca i1, i1 0
+ %nop14630 = alloca i1, i1 0
+ %nop14631 = alloca i1, i1 0
+ %nop14632 = alloca i1, i1 0
+ %nop14633 = alloca i1, i1 0
+ %nop14634 = alloca i1, i1 0
+ %nop14635 = alloca i1, i1 0
+ %nop14636 = alloca i1, i1 0
+ %nop14637 = alloca i1, i1 0
+ %nop14638 = alloca i1, i1 0
+ %nop14639 = alloca i1, i1 0
+ %nop14640 = alloca i1, i1 0
+ %nop14641 = alloca i1, i1 0
+ %nop14642 = alloca i1, i1 0
+ %nop14643 = alloca i1, i1 0
+ %nop14644 = alloca i1, i1 0
+ %nop14645 = alloca i1, i1 0
+ %nop14646 = alloca i1, i1 0
+ %nop14647 = alloca i1, i1 0
+ %nop14648 = alloca i1, i1 0
+ %nop14649 = alloca i1, i1 0
+ %nop14650 = alloca i1, i1 0
+ %nop14651 = alloca i1, i1 0
+ %nop14652 = alloca i1, i1 0
+ %nop14653 = alloca i1, i1 0
+ %nop14654 = alloca i1, i1 0
+ %nop14655 = alloca i1, i1 0
+ %nop14656 = alloca i1, i1 0
+ %nop14657 = alloca i1, i1 0
+ %nop14658 = alloca i1, i1 0
+ %nop14659 = alloca i1, i1 0
+ %nop14660 = alloca i1, i1 0
+ %nop14661 = alloca i1, i1 0
+ %nop14662 = alloca i1, i1 0
+ %nop14663 = alloca i1, i1 0
+ %nop14664 = alloca i1, i1 0
+ %nop14665 = alloca i1, i1 0
+ %nop14666 = alloca i1, i1 0
+ %nop14667 = alloca i1, i1 0
+ %nop14668 = alloca i1, i1 0
+ %nop14669 = alloca i1, i1 0
+ %nop14670 = alloca i1, i1 0
+ %nop14671 = alloca i1, i1 0
+ %nop14672 = alloca i1, i1 0
+ %nop14673 = alloca i1, i1 0
+ %nop14674 = alloca i1, i1 0
+ %nop14675 = alloca i1, i1 0
+ %nop14676 = alloca i1, i1 0
+ %nop14677 = alloca i1, i1 0
+ %nop14678 = alloca i1, i1 0
+ %nop14679 = alloca i1, i1 0
+ %nop14680 = alloca i1, i1 0
+ %nop14681 = alloca i1, i1 0
+ %nop14682 = alloca i1, i1 0
+ %nop14683 = alloca i1, i1 0
+ %nop14684 = alloca i1, i1 0
+ %nop14685 = alloca i1, i1 0
+ %nop14686 = alloca i1, i1 0
+ %nop14687 = alloca i1, i1 0
+ %nop14688 = alloca i1, i1 0
+ %nop14689 = alloca i1, i1 0
+ %nop14690 = alloca i1, i1 0
+ %nop14691 = alloca i1, i1 0
+ %nop14692 = alloca i1, i1 0
+ %nop14693 = alloca i1, i1 0
+ %nop14694 = alloca i1, i1 0
+ %nop14695 = alloca i1, i1 0
+ %nop14696 = alloca i1, i1 0
+ %nop14697 = alloca i1, i1 0
+ %nop14698 = alloca i1, i1 0
+ %nop14699 = alloca i1, i1 0
+ %nop14700 = alloca i1, i1 0
+ %nop14701 = alloca i1, i1 0
+ %nop14702 = alloca i1, i1 0
+ %nop14703 = alloca i1, i1 0
+ %nop14704 = alloca i1, i1 0
+ %nop14705 = alloca i1, i1 0
+ %nop14706 = alloca i1, i1 0
+ %nop14707 = alloca i1, i1 0
+ %nop14708 = alloca i1, i1 0
+ %nop14709 = alloca i1, i1 0
+ %nop14710 = alloca i1, i1 0
+ %nop14711 = alloca i1, i1 0
+ %nop14712 = alloca i1, i1 0
+ %nop14713 = alloca i1, i1 0
+ %nop14714 = alloca i1, i1 0
+ %nop14715 = alloca i1, i1 0
+ %nop14716 = alloca i1, i1 0
+ %nop14717 = alloca i1, i1 0
+ %nop14718 = alloca i1, i1 0
+ %nop14719 = alloca i1, i1 0
+ %nop14720 = alloca i1, i1 0
+ %nop14721 = alloca i1, i1 0
+ %nop14722 = alloca i1, i1 0
+ %nop14723 = alloca i1, i1 0
+ %nop14724 = alloca i1, i1 0
+ %nop14725 = alloca i1, i1 0
+ %nop14726 = alloca i1, i1 0
+ %nop14727 = alloca i1, i1 0
+ %nop14728 = alloca i1, i1 0
+ %nop14729 = alloca i1, i1 0
+ %nop14730 = alloca i1, i1 0
+ %nop14731 = alloca i1, i1 0
+ %nop14732 = alloca i1, i1 0
+ %nop14733 = alloca i1, i1 0
+ %nop14734 = alloca i1, i1 0
+ %nop14735 = alloca i1, i1 0
+ %nop14736 = alloca i1, i1 0
+ %nop14737 = alloca i1, i1 0
+ %nop14738 = alloca i1, i1 0
+ %nop14739 = alloca i1, i1 0
+ %nop14740 = alloca i1, i1 0
+ %nop14741 = alloca i1, i1 0
+ %nop14742 = alloca i1, i1 0
+ %nop14743 = alloca i1, i1 0
+ %nop14744 = alloca i1, i1 0
+ %nop14745 = alloca i1, i1 0
+ %nop14746 = alloca i1, i1 0
+ %nop14747 = alloca i1, i1 0
+ %nop14748 = alloca i1, i1 0
+ %nop14749 = alloca i1, i1 0
+ %nop14750 = alloca i1, i1 0
+ %nop14751 = alloca i1, i1 0
+ %nop14752 = alloca i1, i1 0
+ %nop14753 = alloca i1, i1 0
+ %nop14754 = alloca i1, i1 0
+ %nop14755 = alloca i1, i1 0
+ %nop14756 = alloca i1, i1 0
+ %nop14757 = alloca i1, i1 0
+ %nop14758 = alloca i1, i1 0
+ %nop14759 = alloca i1, i1 0
+ %nop14760 = alloca i1, i1 0
+ %nop14761 = alloca i1, i1 0
+ %nop14762 = alloca i1, i1 0
+ %nop14763 = alloca i1, i1 0
+ %nop14764 = alloca i1, i1 0
+ %nop14765 = alloca i1, i1 0
+ %nop14766 = alloca i1, i1 0
+ %nop14767 = alloca i1, i1 0
+ %nop14768 = alloca i1, i1 0
+ %nop14769 = alloca i1, i1 0
+ %nop14770 = alloca i1, i1 0
+ %nop14771 = alloca i1, i1 0
+ %nop14772 = alloca i1, i1 0
+ %nop14773 = alloca i1, i1 0
+ %nop14774 = alloca i1, i1 0
+ %nop14775 = alloca i1, i1 0
+ %nop14776 = alloca i1, i1 0
+ %nop14777 = alloca i1, i1 0
+ %nop14778 = alloca i1, i1 0
+ %nop14779 = alloca i1, i1 0
+ %nop14780 = alloca i1, i1 0
+ %nop14781 = alloca i1, i1 0
+ %nop14782 = alloca i1, i1 0
+ %nop14783 = alloca i1, i1 0
+ %nop14784 = alloca i1, i1 0
+ %nop14785 = alloca i1, i1 0
+ %nop14786 = alloca i1, i1 0
+ %nop14787 = alloca i1, i1 0
+ %nop14788 = alloca i1, i1 0
+ %nop14789 = alloca i1, i1 0
+ %nop14790 = alloca i1, i1 0
+ %nop14791 = alloca i1, i1 0
+ %nop14792 = alloca i1, i1 0
+ %nop14793 = alloca i1, i1 0
+ %nop14794 = alloca i1, i1 0
+ %nop14795 = alloca i1, i1 0
+ %nop14796 = alloca i1, i1 0
+ %nop14797 = alloca i1, i1 0
+ %nop14798 = alloca i1, i1 0
+ %nop14799 = alloca i1, i1 0
+ %nop14800 = alloca i1, i1 0
+ %nop14801 = alloca i1, i1 0
+ %nop14802 = alloca i1, i1 0
+ %nop14803 = alloca i1, i1 0
+ %nop14804 = alloca i1, i1 0
+ %nop14805 = alloca i1, i1 0
+ %nop14806 = alloca i1, i1 0
+ %nop14807 = alloca i1, i1 0
+ %nop14808 = alloca i1, i1 0
+ %nop14809 = alloca i1, i1 0
+ %nop14810 = alloca i1, i1 0
+ %nop14811 = alloca i1, i1 0
+ %nop14812 = alloca i1, i1 0
+ %nop14813 = alloca i1, i1 0
+ %nop14814 = alloca i1, i1 0
+ %nop14815 = alloca i1, i1 0
+ %nop14816 = alloca i1, i1 0
+ %nop14817 = alloca i1, i1 0
+ %nop14818 = alloca i1, i1 0
+ %nop14819 = alloca i1, i1 0
+ %nop14820 = alloca i1, i1 0
+ %nop14821 = alloca i1, i1 0
+ %nop14822 = alloca i1, i1 0
+ %nop14823 = alloca i1, i1 0
+ %nop14824 = alloca i1, i1 0
+ %nop14825 = alloca i1, i1 0
+ %nop14826 = alloca i1, i1 0
+ %nop14827 = alloca i1, i1 0
+ %nop14828 = alloca i1, i1 0
+ %nop14829 = alloca i1, i1 0
+ %nop14830 = alloca i1, i1 0
+ %nop14831 = alloca i1, i1 0
+ %nop14832 = alloca i1, i1 0
+ %nop14833 = alloca i1, i1 0
+ %nop14834 = alloca i1, i1 0
+ %nop14835 = alloca i1, i1 0
+ %nop14836 = alloca i1, i1 0
+ %nop14837 = alloca i1, i1 0
+ %nop14838 = alloca i1, i1 0
+ %nop14839 = alloca i1, i1 0
+ %nop14840 = alloca i1, i1 0
+ %nop14841 = alloca i1, i1 0
+ %nop14842 = alloca i1, i1 0
+ %nop14843 = alloca i1, i1 0
+ %nop14844 = alloca i1, i1 0
+ %nop14845 = alloca i1, i1 0
+ %nop14846 = alloca i1, i1 0
+ %nop14847 = alloca i1, i1 0
+ %nop14848 = alloca i1, i1 0
+ %nop14849 = alloca i1, i1 0
+ %nop14850 = alloca i1, i1 0
+ %nop14851 = alloca i1, i1 0
+ %nop14852 = alloca i1, i1 0
+ %nop14853 = alloca i1, i1 0
+ %nop14854 = alloca i1, i1 0
+ %nop14855 = alloca i1, i1 0
+ %nop14856 = alloca i1, i1 0
+ %nop14857 = alloca i1, i1 0
+ %nop14858 = alloca i1, i1 0
+ %nop14859 = alloca i1, i1 0
+ %nop14860 = alloca i1, i1 0
+ %nop14861 = alloca i1, i1 0
+ %nop14862 = alloca i1, i1 0
+ %nop14863 = alloca i1, i1 0
+ %nop14864 = alloca i1, i1 0
+ %nop14865 = alloca i1, i1 0
+ %nop14866 = alloca i1, i1 0
+ %nop14867 = alloca i1, i1 0
+ %nop14868 = alloca i1, i1 0
+ %nop14869 = alloca i1, i1 0
+ %nop14870 = alloca i1, i1 0
+ %nop14871 = alloca i1, i1 0
+ %nop14872 = alloca i1, i1 0
+ %nop14873 = alloca i1, i1 0
+ %nop14874 = alloca i1, i1 0
+ %nop14875 = alloca i1, i1 0
+ %nop14876 = alloca i1, i1 0
+ %nop14877 = alloca i1, i1 0
+ %nop14878 = alloca i1, i1 0
+ %nop14879 = alloca i1, i1 0
+ %nop14880 = alloca i1, i1 0
+ %nop14881 = alloca i1, i1 0
+ %nop14882 = alloca i1, i1 0
+ %nop14883 = alloca i1, i1 0
+ %nop14884 = alloca i1, i1 0
+ %nop14885 = alloca i1, i1 0
+ %nop14886 = alloca i1, i1 0
+ %nop14887 = alloca i1, i1 0
+ %nop14888 = alloca i1, i1 0
+ %nop14889 = alloca i1, i1 0
+ %nop14890 = alloca i1, i1 0
+ %nop14891 = alloca i1, i1 0
+ %nop14892 = alloca i1, i1 0
+ %nop14893 = alloca i1, i1 0
+ %nop14894 = alloca i1, i1 0
+ %nop14895 = alloca i1, i1 0
+ %nop14896 = alloca i1, i1 0
+ %nop14897 = alloca i1, i1 0
+ %nop14898 = alloca i1, i1 0
+ %nop14899 = alloca i1, i1 0
+ %nop14900 = alloca i1, i1 0
+ %nop14901 = alloca i1, i1 0
+ %nop14902 = alloca i1, i1 0
+ %nop14903 = alloca i1, i1 0
+ %nop14904 = alloca i1, i1 0
+ %nop14905 = alloca i1, i1 0
+ %nop14906 = alloca i1, i1 0
+ %nop14907 = alloca i1, i1 0
+ %nop14908 = alloca i1, i1 0
+ %nop14909 = alloca i1, i1 0
+ %nop14910 = alloca i1, i1 0
+ %nop14911 = alloca i1, i1 0
+ %nop14912 = alloca i1, i1 0
+ %nop14913 = alloca i1, i1 0
+ %nop14914 = alloca i1, i1 0
+ %nop14915 = alloca i1, i1 0
+ %nop14916 = alloca i1, i1 0
+ %nop14917 = alloca i1, i1 0
+ %nop14918 = alloca i1, i1 0
+ %nop14919 = alloca i1, i1 0
+ %nop14920 = alloca i1, i1 0
+ %nop14921 = alloca i1, i1 0
+ %nop14922 = alloca i1, i1 0
+ %nop14923 = alloca i1, i1 0
+ %nop14924 = alloca i1, i1 0
+ %nop14925 = alloca i1, i1 0
+ %nop14926 = alloca i1, i1 0
+ %nop14927 = alloca i1, i1 0
+ %nop14928 = alloca i1, i1 0
+ %nop14929 = alloca i1, i1 0
+ %nop14930 = alloca i1, i1 0
+ %nop14931 = alloca i1, i1 0
+ %nop14932 = alloca i1, i1 0
+ %nop14933 = alloca i1, i1 0
+ %nop14934 = alloca i1, i1 0
+ %nop14935 = alloca i1, i1 0
+ %nop14936 = alloca i1, i1 0
+ %nop14937 = alloca i1, i1 0
+ %nop14938 = alloca i1, i1 0
+ %nop14939 = alloca i1, i1 0
+ %nop14940 = alloca i1, i1 0
+ %nop14941 = alloca i1, i1 0
+ %nop14942 = alloca i1, i1 0
+ %nop14943 = alloca i1, i1 0
+ %nop14944 = alloca i1, i1 0
+ %nop14945 = alloca i1, i1 0
+ %nop14946 = alloca i1, i1 0
+ %nop14947 = alloca i1, i1 0
+ %nop14948 = alloca i1, i1 0
+ %nop14949 = alloca i1, i1 0
+ %nop14950 = alloca i1, i1 0
+ %nop14951 = alloca i1, i1 0
+ %nop14952 = alloca i1, i1 0
+ %nop14953 = alloca i1, i1 0
+ %nop14954 = alloca i1, i1 0
+ %nop14955 = alloca i1, i1 0
+ %nop14956 = alloca i1, i1 0
+ %nop14957 = alloca i1, i1 0
+ %nop14958 = alloca i1, i1 0
+ %nop14959 = alloca i1, i1 0
+ %nop14960 = alloca i1, i1 0
+ %nop14961 = alloca i1, i1 0
+ %nop14962 = alloca i1, i1 0
+ %nop14963 = alloca i1, i1 0
+ %nop14964 = alloca i1, i1 0
+ %nop14965 = alloca i1, i1 0
+ %nop14966 = alloca i1, i1 0
+ %nop14967 = alloca i1, i1 0
+ %nop14968 = alloca i1, i1 0
+ %nop14969 = alloca i1, i1 0
+ %nop14970 = alloca i1, i1 0
+ %nop14971 = alloca i1, i1 0
+ %nop14972 = alloca i1, i1 0
+ %nop14973 = alloca i1, i1 0
+ %nop14974 = alloca i1, i1 0
+ %nop14975 = alloca i1, i1 0
+ %nop14976 = alloca i1, i1 0
+ %nop14977 = alloca i1, i1 0
+ %nop14978 = alloca i1, i1 0
+ %nop14979 = alloca i1, i1 0
+ %nop14980 = alloca i1, i1 0
+ %nop14981 = alloca i1, i1 0
+ %nop14982 = alloca i1, i1 0
+ %nop14983 = alloca i1, i1 0
+ %nop14984 = alloca i1, i1 0
+ %nop14985 = alloca i1, i1 0
+ %nop14986 = alloca i1, i1 0
+ %nop14987 = alloca i1, i1 0
+ %nop14988 = alloca i1, i1 0
+ %nop14989 = alloca i1, i1 0
+ %nop14990 = alloca i1, i1 0
+ %nop14991 = alloca i1, i1 0
+ %nop14992 = alloca i1, i1 0
+ %nop14993 = alloca i1, i1 0
+ %nop14994 = alloca i1, i1 0
+ %nop14995 = alloca i1, i1 0
+ %nop14996 = alloca i1, i1 0
+ %nop14997 = alloca i1, i1 0
+ %nop14998 = alloca i1, i1 0
+ %nop14999 = alloca i1, i1 0
+ %nop15000 = alloca i1, i1 0
+ %nop15001 = alloca i1, i1 0
+ %nop15002 = alloca i1, i1 0
+ %nop15003 = alloca i1, i1 0
+ %nop15004 = alloca i1, i1 0
+ %nop15005 = alloca i1, i1 0
+ %nop15006 = alloca i1, i1 0
+ %nop15007 = alloca i1, i1 0
+ %nop15008 = alloca i1, i1 0
+ %nop15009 = alloca i1, i1 0
+ %nop15010 = alloca i1, i1 0
+ %nop15011 = alloca i1, i1 0
+ %nop15012 = alloca i1, i1 0
+ %nop15013 = alloca i1, i1 0
+ %nop15014 = alloca i1, i1 0
+ %nop15015 = alloca i1, i1 0
+ %nop15016 = alloca i1, i1 0
+ %nop15017 = alloca i1, i1 0
+ %nop15018 = alloca i1, i1 0
+ %nop15019 = alloca i1, i1 0
+ %nop15020 = alloca i1, i1 0
+ %nop15021 = alloca i1, i1 0
+ %nop15022 = alloca i1, i1 0
+ %nop15023 = alloca i1, i1 0
+ %nop15024 = alloca i1, i1 0
+ %nop15025 = alloca i1, i1 0
+ %nop15026 = alloca i1, i1 0
+ %nop15027 = alloca i1, i1 0
+ %nop15028 = alloca i1, i1 0
+ %nop15029 = alloca i1, i1 0
+ %nop15030 = alloca i1, i1 0
+ %nop15031 = alloca i1, i1 0
+ %nop15032 = alloca i1, i1 0
+ %nop15033 = alloca i1, i1 0
+ %nop15034 = alloca i1, i1 0
+ %nop15035 = alloca i1, i1 0
+ %nop15036 = alloca i1, i1 0
+ %nop15037 = alloca i1, i1 0
+ %nop15038 = alloca i1, i1 0
+ %nop15039 = alloca i1, i1 0
+ %nop15040 = alloca i1, i1 0
+ %nop15041 = alloca i1, i1 0
+ %nop15042 = alloca i1, i1 0
+ %nop15043 = alloca i1, i1 0
+ %nop15044 = alloca i1, i1 0
+ %nop15045 = alloca i1, i1 0
+ %nop15046 = alloca i1, i1 0
+ %nop15047 = alloca i1, i1 0
+ %nop15048 = alloca i1, i1 0
+ %nop15049 = alloca i1, i1 0
+ %nop15050 = alloca i1, i1 0
+ %nop15051 = alloca i1, i1 0
+ %nop15052 = alloca i1, i1 0
+ %nop15053 = alloca i1, i1 0
+ %nop15054 = alloca i1, i1 0
+ %nop15055 = alloca i1, i1 0
+ %nop15056 = alloca i1, i1 0
+ %nop15057 = alloca i1, i1 0
+ %nop15058 = alloca i1, i1 0
+ %nop15059 = alloca i1, i1 0
+ %nop15060 = alloca i1, i1 0
+ %nop15061 = alloca i1, i1 0
+ %nop15062 = alloca i1, i1 0
+ %nop15063 = alloca i1, i1 0
+ %nop15064 = alloca i1, i1 0
+ %nop15065 = alloca i1, i1 0
+ %nop15066 = alloca i1, i1 0
+ %nop15067 = alloca i1, i1 0
+ %nop15068 = alloca i1, i1 0
+ %nop15069 = alloca i1, i1 0
+ %nop15070 = alloca i1, i1 0
+ %nop15071 = alloca i1, i1 0
+ %nop15072 = alloca i1, i1 0
+ %nop15073 = alloca i1, i1 0
+ %nop15074 = alloca i1, i1 0
+ %nop15075 = alloca i1, i1 0
+ %nop15076 = alloca i1, i1 0
+ %nop15077 = alloca i1, i1 0
+ %nop15078 = alloca i1, i1 0
+ %nop15079 = alloca i1, i1 0
+ %nop15080 = alloca i1, i1 0
+ %nop15081 = alloca i1, i1 0
+ %nop15082 = alloca i1, i1 0
+ %nop15083 = alloca i1, i1 0
+ %nop15084 = alloca i1, i1 0
+ %nop15085 = alloca i1, i1 0
+ %nop15086 = alloca i1, i1 0
+ %nop15087 = alloca i1, i1 0
+ %nop15088 = alloca i1, i1 0
+ %nop15089 = alloca i1, i1 0
+ %nop15090 = alloca i1, i1 0
+ %nop15091 = alloca i1, i1 0
+ %nop15092 = alloca i1, i1 0
+ %nop15093 = alloca i1, i1 0
+ %nop15094 = alloca i1, i1 0
+ %nop15095 = alloca i1, i1 0
+ %nop15096 = alloca i1, i1 0
+ %nop15097 = alloca i1, i1 0
+ %nop15098 = alloca i1, i1 0
+ %nop15099 = alloca i1, i1 0
+ %nop15100 = alloca i1, i1 0
+ %nop15101 = alloca i1, i1 0
+ %nop15102 = alloca i1, i1 0
+ %nop15103 = alloca i1, i1 0
+ %nop15104 = alloca i1, i1 0
+ %nop15105 = alloca i1, i1 0
+ %nop15106 = alloca i1, i1 0
+ %nop15107 = alloca i1, i1 0
+ %nop15108 = alloca i1, i1 0
+ %nop15109 = alloca i1, i1 0
+ %nop15110 = alloca i1, i1 0
+ %nop15111 = alloca i1, i1 0
+ %nop15112 = alloca i1, i1 0
+ %nop15113 = alloca i1, i1 0
+ %nop15114 = alloca i1, i1 0
+ %nop15115 = alloca i1, i1 0
+ %nop15116 = alloca i1, i1 0
+ %nop15117 = alloca i1, i1 0
+ %nop15118 = alloca i1, i1 0
+ %nop15119 = alloca i1, i1 0
+ %nop15120 = alloca i1, i1 0
+ %nop15121 = alloca i1, i1 0
+ %nop15122 = alloca i1, i1 0
+ %nop15123 = alloca i1, i1 0
+ %nop15124 = alloca i1, i1 0
+ %nop15125 = alloca i1, i1 0
+ %nop15126 = alloca i1, i1 0
+ %nop15127 = alloca i1, i1 0
+ %nop15128 = alloca i1, i1 0
+ %nop15129 = alloca i1, i1 0
+ %nop15130 = alloca i1, i1 0
+ %nop15131 = alloca i1, i1 0
+ %nop15132 = alloca i1, i1 0
+ %nop15133 = alloca i1, i1 0
+ %nop15134 = alloca i1, i1 0
+ %nop15135 = alloca i1, i1 0
+ %nop15136 = alloca i1, i1 0
+ %nop15137 = alloca i1, i1 0
+ %nop15138 = alloca i1, i1 0
+ %nop15139 = alloca i1, i1 0
+ %nop15140 = alloca i1, i1 0
+ %nop15141 = alloca i1, i1 0
+ %nop15142 = alloca i1, i1 0
+ %nop15143 = alloca i1, i1 0
+ %nop15144 = alloca i1, i1 0
+ %nop15145 = alloca i1, i1 0
+ %nop15146 = alloca i1, i1 0
+ %nop15147 = alloca i1, i1 0
+ %nop15148 = alloca i1, i1 0
+ %nop15149 = alloca i1, i1 0
+ %nop15150 = alloca i1, i1 0
+ %nop15151 = alloca i1, i1 0
+ %nop15152 = alloca i1, i1 0
+ %nop15153 = alloca i1, i1 0
+ %nop15154 = alloca i1, i1 0
+ %nop15155 = alloca i1, i1 0
+ %nop15156 = alloca i1, i1 0
+ %nop15157 = alloca i1, i1 0
+ %nop15158 = alloca i1, i1 0
+ %nop15159 = alloca i1, i1 0
+ %nop15160 = alloca i1, i1 0
+ %nop15161 = alloca i1, i1 0
+ %nop15162 = alloca i1, i1 0
+ %nop15163 = alloca i1, i1 0
+ %nop15164 = alloca i1, i1 0
+ %nop15165 = alloca i1, i1 0
+ %nop15166 = alloca i1, i1 0
+ %nop15167 = alloca i1, i1 0
+ %nop15168 = alloca i1, i1 0
+ %nop15169 = alloca i1, i1 0
+ %nop15170 = alloca i1, i1 0
+ %nop15171 = alloca i1, i1 0
+ %nop15172 = alloca i1, i1 0
+ %nop15173 = alloca i1, i1 0
+ %nop15174 = alloca i1, i1 0
+ %nop15175 = alloca i1, i1 0
+ %nop15176 = alloca i1, i1 0
+ %nop15177 = alloca i1, i1 0
+ %nop15178 = alloca i1, i1 0
+ %nop15179 = alloca i1, i1 0
+ %nop15180 = alloca i1, i1 0
+ %nop15181 = alloca i1, i1 0
+ %nop15182 = alloca i1, i1 0
+ %nop15183 = alloca i1, i1 0
+ %nop15184 = alloca i1, i1 0
+ %nop15185 = alloca i1, i1 0
+ %nop15186 = alloca i1, i1 0
+ %nop15187 = alloca i1, i1 0
+ %nop15188 = alloca i1, i1 0
+ %nop15189 = alloca i1, i1 0
+ %nop15190 = alloca i1, i1 0
+ %nop15191 = alloca i1, i1 0
+ %nop15192 = alloca i1, i1 0
+ %nop15193 = alloca i1, i1 0
+ %nop15194 = alloca i1, i1 0
+ %nop15195 = alloca i1, i1 0
+ %nop15196 = alloca i1, i1 0
+ %nop15197 = alloca i1, i1 0
+ %nop15198 = alloca i1, i1 0
+ %nop15199 = alloca i1, i1 0
+ %nop15200 = alloca i1, i1 0
+ %nop15201 = alloca i1, i1 0
+ %nop15202 = alloca i1, i1 0
+ %nop15203 = alloca i1, i1 0
+ %nop15204 = alloca i1, i1 0
+ %nop15205 = alloca i1, i1 0
+ %nop15206 = alloca i1, i1 0
+ %nop15207 = alloca i1, i1 0
+ %nop15208 = alloca i1, i1 0
+ %nop15209 = alloca i1, i1 0
+ %nop15210 = alloca i1, i1 0
+ %nop15211 = alloca i1, i1 0
+ %nop15212 = alloca i1, i1 0
+ %nop15213 = alloca i1, i1 0
+ %nop15214 = alloca i1, i1 0
+ %nop15215 = alloca i1, i1 0
+ %nop15216 = alloca i1, i1 0
+ %nop15217 = alloca i1, i1 0
+ %nop15218 = alloca i1, i1 0
+ %nop15219 = alloca i1, i1 0
+ %nop15220 = alloca i1, i1 0
+ %nop15221 = alloca i1, i1 0
+ %nop15222 = alloca i1, i1 0
+ %nop15223 = alloca i1, i1 0
+ %nop15224 = alloca i1, i1 0
+ %nop15225 = alloca i1, i1 0
+ %nop15226 = alloca i1, i1 0
+ %nop15227 = alloca i1, i1 0
+ %nop15228 = alloca i1, i1 0
+ %nop15229 = alloca i1, i1 0
+ %nop15230 = alloca i1, i1 0
+ %nop15231 = alloca i1, i1 0
+ %nop15232 = alloca i1, i1 0
+ %nop15233 = alloca i1, i1 0
+ %nop15234 = alloca i1, i1 0
+ %nop15235 = alloca i1, i1 0
+ %nop15236 = alloca i1, i1 0
+ %nop15237 = alloca i1, i1 0
+ %nop15238 = alloca i1, i1 0
+ %nop15239 = alloca i1, i1 0
+ %nop15240 = alloca i1, i1 0
+ %nop15241 = alloca i1, i1 0
+ %nop15242 = alloca i1, i1 0
+ %nop15243 = alloca i1, i1 0
+ %nop15244 = alloca i1, i1 0
+ %nop15245 = alloca i1, i1 0
+ %nop15246 = alloca i1, i1 0
+ %nop15247 = alloca i1, i1 0
+ %nop15248 = alloca i1, i1 0
+ %nop15249 = alloca i1, i1 0
+ %nop15250 = alloca i1, i1 0
+ %nop15251 = alloca i1, i1 0
+ %nop15252 = alloca i1, i1 0
+ %nop15253 = alloca i1, i1 0
+ %nop15254 = alloca i1, i1 0
+ %nop15255 = alloca i1, i1 0
+ %nop15256 = alloca i1, i1 0
+ %nop15257 = alloca i1, i1 0
+ %nop15258 = alloca i1, i1 0
+ %nop15259 = alloca i1, i1 0
+ %nop15260 = alloca i1, i1 0
+ %nop15261 = alloca i1, i1 0
+ %nop15262 = alloca i1, i1 0
+ %nop15263 = alloca i1, i1 0
+ %nop15264 = alloca i1, i1 0
+ %nop15265 = alloca i1, i1 0
+ %nop15266 = alloca i1, i1 0
+ %nop15267 = alloca i1, i1 0
+ %nop15268 = alloca i1, i1 0
+ %nop15269 = alloca i1, i1 0
+ %nop15270 = alloca i1, i1 0
+ %nop15271 = alloca i1, i1 0
+ %nop15272 = alloca i1, i1 0
+ %nop15273 = alloca i1, i1 0
+ %nop15274 = alloca i1, i1 0
+ %nop15275 = alloca i1, i1 0
+ %nop15276 = alloca i1, i1 0
+ %nop15277 = alloca i1, i1 0
+ %nop15278 = alloca i1, i1 0
+ %nop15279 = alloca i1, i1 0
+ %nop15280 = alloca i1, i1 0
+ %nop15281 = alloca i1, i1 0
+ %nop15282 = alloca i1, i1 0
+ %nop15283 = alloca i1, i1 0
+ %nop15284 = alloca i1, i1 0
+ %nop15285 = alloca i1, i1 0
+ %nop15286 = alloca i1, i1 0
+ %nop15287 = alloca i1, i1 0
+ %nop15288 = alloca i1, i1 0
+ %nop15289 = alloca i1, i1 0
+ %nop15290 = alloca i1, i1 0
+ %nop15291 = alloca i1, i1 0
+ %nop15292 = alloca i1, i1 0
+ %nop15293 = alloca i1, i1 0
+ %nop15294 = alloca i1, i1 0
+ %nop15295 = alloca i1, i1 0
+ %nop15296 = alloca i1, i1 0
+ %nop15297 = alloca i1, i1 0
+ %nop15298 = alloca i1, i1 0
+ %nop15299 = alloca i1, i1 0
+ %nop15300 = alloca i1, i1 0
+ %nop15301 = alloca i1, i1 0
+ %nop15302 = alloca i1, i1 0
+ %nop15303 = alloca i1, i1 0
+ %nop15304 = alloca i1, i1 0
+ %nop15305 = alloca i1, i1 0
+ %nop15306 = alloca i1, i1 0
+ %nop15307 = alloca i1, i1 0
+ %nop15308 = alloca i1, i1 0
+ %nop15309 = alloca i1, i1 0
+ %nop15310 = alloca i1, i1 0
+ %nop15311 = alloca i1, i1 0
+ %nop15312 = alloca i1, i1 0
+ %nop15313 = alloca i1, i1 0
+ %nop15314 = alloca i1, i1 0
+ %nop15315 = alloca i1, i1 0
+ %nop15316 = alloca i1, i1 0
+ %nop15317 = alloca i1, i1 0
+ %nop15318 = alloca i1, i1 0
+ %nop15319 = alloca i1, i1 0
+ %nop15320 = alloca i1, i1 0
+ %nop15321 = alloca i1, i1 0
+ %nop15322 = alloca i1, i1 0
+ %nop15323 = alloca i1, i1 0
+ %nop15324 = alloca i1, i1 0
+ %nop15325 = alloca i1, i1 0
+ %nop15326 = alloca i1, i1 0
+ %nop15327 = alloca i1, i1 0
+ %nop15328 = alloca i1, i1 0
+ %nop15329 = alloca i1, i1 0
+ %nop15330 = alloca i1, i1 0
+ %nop15331 = alloca i1, i1 0
+ %nop15332 = alloca i1, i1 0
+ %nop15333 = alloca i1, i1 0
+ %nop15334 = alloca i1, i1 0
+ %nop15335 = alloca i1, i1 0
+ %nop15336 = alloca i1, i1 0
+ %nop15337 = alloca i1, i1 0
+ %nop15338 = alloca i1, i1 0
+ %nop15339 = alloca i1, i1 0
+ %nop15340 = alloca i1, i1 0
+ %nop15341 = alloca i1, i1 0
+ %nop15342 = alloca i1, i1 0
+ %nop15343 = alloca i1, i1 0
+ %nop15344 = alloca i1, i1 0
+ %nop15345 = alloca i1, i1 0
+ %nop15346 = alloca i1, i1 0
+ %nop15347 = alloca i1, i1 0
+ %nop15348 = alloca i1, i1 0
+ %nop15349 = alloca i1, i1 0
+ %nop15350 = alloca i1, i1 0
+ %nop15351 = alloca i1, i1 0
+ %nop15352 = alloca i1, i1 0
+ %nop15353 = alloca i1, i1 0
+ %nop15354 = alloca i1, i1 0
+ %nop15355 = alloca i1, i1 0
+ %nop15356 = alloca i1, i1 0
+ %nop15357 = alloca i1, i1 0
+ %nop15358 = alloca i1, i1 0
+ %nop15359 = alloca i1, i1 0
+ %nop15360 = alloca i1, i1 0
+ %nop15361 = alloca i1, i1 0
+ %nop15362 = alloca i1, i1 0
+ %nop15363 = alloca i1, i1 0
+ %nop15364 = alloca i1, i1 0
+ %nop15365 = alloca i1, i1 0
+ %nop15366 = alloca i1, i1 0
+ %nop15367 = alloca i1, i1 0
+ %nop15368 = alloca i1, i1 0
+ %nop15369 = alloca i1, i1 0
+ %nop15370 = alloca i1, i1 0
+ %nop15371 = alloca i1, i1 0
+ %nop15372 = alloca i1, i1 0
+ %nop15373 = alloca i1, i1 0
+ %nop15374 = alloca i1, i1 0
+ %nop15375 = alloca i1, i1 0
+ %nop15376 = alloca i1, i1 0
+ %nop15377 = alloca i1, i1 0
+ %nop15378 = alloca i1, i1 0
+ %nop15379 = alloca i1, i1 0
+ %nop15380 = alloca i1, i1 0
+ %nop15381 = alloca i1, i1 0
+ %nop15382 = alloca i1, i1 0
+ %nop15383 = alloca i1, i1 0
+ %nop15384 = alloca i1, i1 0
+ %nop15385 = alloca i1, i1 0
+ %nop15386 = alloca i1, i1 0
+ %nop15387 = alloca i1, i1 0
+ %nop15388 = alloca i1, i1 0
+ %nop15389 = alloca i1, i1 0
+ %nop15390 = alloca i1, i1 0
+ %nop15391 = alloca i1, i1 0
+ %nop15392 = alloca i1, i1 0
+ %nop15393 = alloca i1, i1 0
+ %nop15394 = alloca i1, i1 0
+ %nop15395 = alloca i1, i1 0
+ %nop15396 = alloca i1, i1 0
+ %nop15397 = alloca i1, i1 0
+ %nop15398 = alloca i1, i1 0
+ %nop15399 = alloca i1, i1 0
+ %nop15400 = alloca i1, i1 0
+ %nop15401 = alloca i1, i1 0
+ %nop15402 = alloca i1, i1 0
+ %nop15403 = alloca i1, i1 0
+ %nop15404 = alloca i1, i1 0
+ %nop15405 = alloca i1, i1 0
+ %nop15406 = alloca i1, i1 0
+ %nop15407 = alloca i1, i1 0
+ %nop15408 = alloca i1, i1 0
+ %nop15409 = alloca i1, i1 0
+ %nop15410 = alloca i1, i1 0
+ %nop15411 = alloca i1, i1 0
+ %nop15412 = alloca i1, i1 0
+ %nop15413 = alloca i1, i1 0
+ %nop15414 = alloca i1, i1 0
+ %nop15415 = alloca i1, i1 0
+ %nop15416 = alloca i1, i1 0
+ %nop15417 = alloca i1, i1 0
+ %nop15418 = alloca i1, i1 0
+ %nop15419 = alloca i1, i1 0
+ %nop15420 = alloca i1, i1 0
+ %nop15421 = alloca i1, i1 0
+ %nop15422 = alloca i1, i1 0
+ %nop15423 = alloca i1, i1 0
+ %nop15424 = alloca i1, i1 0
+ %nop15425 = alloca i1, i1 0
+ %nop15426 = alloca i1, i1 0
+ %nop15427 = alloca i1, i1 0
+ %nop15428 = alloca i1, i1 0
+ %nop15429 = alloca i1, i1 0
+ %nop15430 = alloca i1, i1 0
+ %nop15431 = alloca i1, i1 0
+ %nop15432 = alloca i1, i1 0
+ %nop15433 = alloca i1, i1 0
+ %nop15434 = alloca i1, i1 0
+ %nop15435 = alloca i1, i1 0
+ %nop15436 = alloca i1, i1 0
+ %nop15437 = alloca i1, i1 0
+ %nop15438 = alloca i1, i1 0
+ %nop15439 = alloca i1, i1 0
+ %nop15440 = alloca i1, i1 0
+ %nop15441 = alloca i1, i1 0
+ %nop15442 = alloca i1, i1 0
+ %nop15443 = alloca i1, i1 0
+ %nop15444 = alloca i1, i1 0
+ %nop15445 = alloca i1, i1 0
+ %nop15446 = alloca i1, i1 0
+ %nop15447 = alloca i1, i1 0
+ %nop15448 = alloca i1, i1 0
+ %nop15449 = alloca i1, i1 0
+ %nop15450 = alloca i1, i1 0
+ %nop15451 = alloca i1, i1 0
+ %nop15452 = alloca i1, i1 0
+ %nop15453 = alloca i1, i1 0
+ %nop15454 = alloca i1, i1 0
+ %nop15455 = alloca i1, i1 0
+ %nop15456 = alloca i1, i1 0
+ %nop15457 = alloca i1, i1 0
+ %nop15458 = alloca i1, i1 0
+ %nop15459 = alloca i1, i1 0
+ %nop15460 = alloca i1, i1 0
+ %nop15461 = alloca i1, i1 0
+ %nop15462 = alloca i1, i1 0
+ %nop15463 = alloca i1, i1 0
+ %nop15464 = alloca i1, i1 0
+ %nop15465 = alloca i1, i1 0
+ %nop15466 = alloca i1, i1 0
+ %nop15467 = alloca i1, i1 0
+ %nop15468 = alloca i1, i1 0
+ %nop15469 = alloca i1, i1 0
+ %nop15470 = alloca i1, i1 0
+ %nop15471 = alloca i1, i1 0
+ %nop15472 = alloca i1, i1 0
+ %nop15473 = alloca i1, i1 0
+ %nop15474 = alloca i1, i1 0
+ %nop15475 = alloca i1, i1 0
+ %nop15476 = alloca i1, i1 0
+ %nop15477 = alloca i1, i1 0
+ %nop15478 = alloca i1, i1 0
+ %nop15479 = alloca i1, i1 0
+ %nop15480 = alloca i1, i1 0
+ %nop15481 = alloca i1, i1 0
+ %nop15482 = alloca i1, i1 0
+ %nop15483 = alloca i1, i1 0
+ %nop15484 = alloca i1, i1 0
+ %nop15485 = alloca i1, i1 0
+ %nop15486 = alloca i1, i1 0
+ %nop15487 = alloca i1, i1 0
+ %nop15488 = alloca i1, i1 0
+ %nop15489 = alloca i1, i1 0
+ %nop15490 = alloca i1, i1 0
+ %nop15491 = alloca i1, i1 0
+ %nop15492 = alloca i1, i1 0
+ %nop15493 = alloca i1, i1 0
+ %nop15494 = alloca i1, i1 0
+ %nop15495 = alloca i1, i1 0
+ %nop15496 = alloca i1, i1 0
+ %nop15497 = alloca i1, i1 0
+ %nop15498 = alloca i1, i1 0
+ %nop15499 = alloca i1, i1 0
+ %nop15500 = alloca i1, i1 0
+ %nop15501 = alloca i1, i1 0
+ %nop15502 = alloca i1, i1 0
+ %nop15503 = alloca i1, i1 0
+ %nop15504 = alloca i1, i1 0
+ %nop15505 = alloca i1, i1 0
+ %nop15506 = alloca i1, i1 0
+ %nop15507 = alloca i1, i1 0
+ %nop15508 = alloca i1, i1 0
+ %nop15509 = alloca i1, i1 0
+ %nop15510 = alloca i1, i1 0
+ %nop15511 = alloca i1, i1 0
+ %nop15512 = alloca i1, i1 0
+ %nop15513 = alloca i1, i1 0
+ %nop15514 = alloca i1, i1 0
+ %nop15515 = alloca i1, i1 0
+ %nop15516 = alloca i1, i1 0
+ %nop15517 = alloca i1, i1 0
+ %nop15518 = alloca i1, i1 0
+ %nop15519 = alloca i1, i1 0
+ %nop15520 = alloca i1, i1 0
+ %nop15521 = alloca i1, i1 0
+ %nop15522 = alloca i1, i1 0
+ %nop15523 = alloca i1, i1 0
+ %nop15524 = alloca i1, i1 0
+ %nop15525 = alloca i1, i1 0
+ %nop15526 = alloca i1, i1 0
+ %nop15527 = alloca i1, i1 0
+ %nop15528 = alloca i1, i1 0
+ %nop15529 = alloca i1, i1 0
+ %nop15530 = alloca i1, i1 0
+ %nop15531 = alloca i1, i1 0
+ %nop15532 = alloca i1, i1 0
+ %nop15533 = alloca i1, i1 0
+ %nop15534 = alloca i1, i1 0
+ %nop15535 = alloca i1, i1 0
+ %nop15536 = alloca i1, i1 0
+ %nop15537 = alloca i1, i1 0
+ %nop15538 = alloca i1, i1 0
+ %nop15539 = alloca i1, i1 0
+ %nop15540 = alloca i1, i1 0
+ %nop15541 = alloca i1, i1 0
+ %nop15542 = alloca i1, i1 0
+ %nop15543 = alloca i1, i1 0
+ %nop15544 = alloca i1, i1 0
+ %nop15545 = alloca i1, i1 0
+ %nop15546 = alloca i1, i1 0
+ %nop15547 = alloca i1, i1 0
+ %nop15548 = alloca i1, i1 0
+ %nop15549 = alloca i1, i1 0
+ %nop15550 = alloca i1, i1 0
+ %nop15551 = alloca i1, i1 0
+ %nop15552 = alloca i1, i1 0
+ %nop15553 = alloca i1, i1 0
+ %nop15554 = alloca i1, i1 0
+ %nop15555 = alloca i1, i1 0
+ %nop15556 = alloca i1, i1 0
+ %nop15557 = alloca i1, i1 0
+ %nop15558 = alloca i1, i1 0
+ %nop15559 = alloca i1, i1 0
+ %nop15560 = alloca i1, i1 0
+ %nop15561 = alloca i1, i1 0
+ %nop15562 = alloca i1, i1 0
+ %nop15563 = alloca i1, i1 0
+ %nop15564 = alloca i1, i1 0
+ %nop15565 = alloca i1, i1 0
+ %nop15566 = alloca i1, i1 0
+ %nop15567 = alloca i1, i1 0
+ %nop15568 = alloca i1, i1 0
+ %nop15569 = alloca i1, i1 0
+ %nop15570 = alloca i1, i1 0
+ %nop15571 = alloca i1, i1 0
+ %nop15572 = alloca i1, i1 0
+ %nop15573 = alloca i1, i1 0
+ %nop15574 = alloca i1, i1 0
+ %nop15575 = alloca i1, i1 0
+ %nop15576 = alloca i1, i1 0
+ %nop15577 = alloca i1, i1 0
+ %nop15578 = alloca i1, i1 0
+ %nop15579 = alloca i1, i1 0
+ %nop15580 = alloca i1, i1 0
+ %nop15581 = alloca i1, i1 0
+ %nop15582 = alloca i1, i1 0
+ %nop15583 = alloca i1, i1 0
+ %nop15584 = alloca i1, i1 0
+ %nop15585 = alloca i1, i1 0
+ %nop15586 = alloca i1, i1 0
+ %nop15587 = alloca i1, i1 0
+ %nop15588 = alloca i1, i1 0
+ %nop15589 = alloca i1, i1 0
+ %nop15590 = alloca i1, i1 0
+ %nop15591 = alloca i1, i1 0
+ %nop15592 = alloca i1, i1 0
+ %nop15593 = alloca i1, i1 0
+ %nop15594 = alloca i1, i1 0
+ %nop15595 = alloca i1, i1 0
+ %nop15596 = alloca i1, i1 0
+ %nop15597 = alloca i1, i1 0
+ %nop15598 = alloca i1, i1 0
+ %nop15599 = alloca i1, i1 0
+ %nop15600 = alloca i1, i1 0
+ %nop15601 = alloca i1, i1 0
+ %nop15602 = alloca i1, i1 0
+ %nop15603 = alloca i1, i1 0
+ %nop15604 = alloca i1, i1 0
+ %nop15605 = alloca i1, i1 0
+ %nop15606 = alloca i1, i1 0
+ %nop15607 = alloca i1, i1 0
+ %nop15608 = alloca i1, i1 0
+ %nop15609 = alloca i1, i1 0
+ %nop15610 = alloca i1, i1 0
+ %nop15611 = alloca i1, i1 0
+ %nop15612 = alloca i1, i1 0
+ %nop15613 = alloca i1, i1 0
+ %nop15614 = alloca i1, i1 0
+ %nop15615 = alloca i1, i1 0
+ %nop15616 = alloca i1, i1 0
+ %nop15617 = alloca i1, i1 0
+ %nop15618 = alloca i1, i1 0
+ %nop15619 = alloca i1, i1 0
+ %nop15620 = alloca i1, i1 0
+ %nop15621 = alloca i1, i1 0
+ %nop15622 = alloca i1, i1 0
+ %nop15623 = alloca i1, i1 0
+ %nop15624 = alloca i1, i1 0
+ %nop15625 = alloca i1, i1 0
+ %nop15626 = alloca i1, i1 0
+ %nop15627 = alloca i1, i1 0
+ %nop15628 = alloca i1, i1 0
+ %nop15629 = alloca i1, i1 0
+ %nop15630 = alloca i1, i1 0
+ %nop15631 = alloca i1, i1 0
+ %nop15632 = alloca i1, i1 0
+ %nop15633 = alloca i1, i1 0
+ %nop15634 = alloca i1, i1 0
+ %nop15635 = alloca i1, i1 0
+ %nop15636 = alloca i1, i1 0
+ %nop15637 = alloca i1, i1 0
+ %nop15638 = alloca i1, i1 0
+ %nop15639 = alloca i1, i1 0
+ %nop15640 = alloca i1, i1 0
+ %nop15641 = alloca i1, i1 0
+ %nop15642 = alloca i1, i1 0
+ %nop15643 = alloca i1, i1 0
+ %nop15644 = alloca i1, i1 0
+ %nop15645 = alloca i1, i1 0
+ %nop15646 = alloca i1, i1 0
+ %nop15647 = alloca i1, i1 0
+ %nop15648 = alloca i1, i1 0
+ %nop15649 = alloca i1, i1 0
+ %nop15650 = alloca i1, i1 0
+ %nop15651 = alloca i1, i1 0
+ %nop15652 = alloca i1, i1 0
+ %nop15653 = alloca i1, i1 0
+ %nop15654 = alloca i1, i1 0
+ %nop15655 = alloca i1, i1 0
+ %nop15656 = alloca i1, i1 0
+ %nop15657 = alloca i1, i1 0
+ %nop15658 = alloca i1, i1 0
+ %nop15659 = alloca i1, i1 0
+ %nop15660 = alloca i1, i1 0
+ %nop15661 = alloca i1, i1 0
+ %nop15662 = alloca i1, i1 0
+ %nop15663 = alloca i1, i1 0
+ %nop15664 = alloca i1, i1 0
+ %nop15665 = alloca i1, i1 0
+ %nop15666 = alloca i1, i1 0
+ %nop15667 = alloca i1, i1 0
+ %nop15668 = alloca i1, i1 0
+ %nop15669 = alloca i1, i1 0
+ %nop15670 = alloca i1, i1 0
+ %nop15671 = alloca i1, i1 0
+ %nop15672 = alloca i1, i1 0
+ %nop15673 = alloca i1, i1 0
+ %nop15674 = alloca i1, i1 0
+ %nop15675 = alloca i1, i1 0
+ %nop15676 = alloca i1, i1 0
+ %nop15677 = alloca i1, i1 0
+ %nop15678 = alloca i1, i1 0
+ %nop15679 = alloca i1, i1 0
+ %nop15680 = alloca i1, i1 0
+ %nop15681 = alloca i1, i1 0
+ %nop15682 = alloca i1, i1 0
+ %nop15683 = alloca i1, i1 0
+ %nop15684 = alloca i1, i1 0
+ %nop15685 = alloca i1, i1 0
+ %nop15686 = alloca i1, i1 0
+ %nop15687 = alloca i1, i1 0
+ %nop15688 = alloca i1, i1 0
+ %nop15689 = alloca i1, i1 0
+ %nop15690 = alloca i1, i1 0
+ %nop15691 = alloca i1, i1 0
+ %nop15692 = alloca i1, i1 0
+ %nop15693 = alloca i1, i1 0
+ %nop15694 = alloca i1, i1 0
+ %nop15695 = alloca i1, i1 0
+ %nop15696 = alloca i1, i1 0
+ %nop15697 = alloca i1, i1 0
+ %nop15698 = alloca i1, i1 0
+ %nop15699 = alloca i1, i1 0
+ %nop15700 = alloca i1, i1 0
+ %nop15701 = alloca i1, i1 0
+ %nop15702 = alloca i1, i1 0
+ %nop15703 = alloca i1, i1 0
+ %nop15704 = alloca i1, i1 0
+ %nop15705 = alloca i1, i1 0
+ %nop15706 = alloca i1, i1 0
+ %nop15707 = alloca i1, i1 0
+ %nop15708 = alloca i1, i1 0
+ %nop15709 = alloca i1, i1 0
+ %nop15710 = alloca i1, i1 0
+ %nop15711 = alloca i1, i1 0
+ %nop15712 = alloca i1, i1 0
+ %nop15713 = alloca i1, i1 0
+ %nop15714 = alloca i1, i1 0
+ %nop15715 = alloca i1, i1 0
+ %nop15716 = alloca i1, i1 0
+ %nop15717 = alloca i1, i1 0
+ %nop15718 = alloca i1, i1 0
+ %nop15719 = alloca i1, i1 0
+ %nop15720 = alloca i1, i1 0
+ %nop15721 = alloca i1, i1 0
+ %nop15722 = alloca i1, i1 0
+ %nop15723 = alloca i1, i1 0
+ %nop15724 = alloca i1, i1 0
+ %nop15725 = alloca i1, i1 0
+ %nop15726 = alloca i1, i1 0
+ %nop15727 = alloca i1, i1 0
+ %nop15728 = alloca i1, i1 0
+ %nop15729 = alloca i1, i1 0
+ %nop15730 = alloca i1, i1 0
+ %nop15731 = alloca i1, i1 0
+ %nop15732 = alloca i1, i1 0
+ %nop15733 = alloca i1, i1 0
+ %nop15734 = alloca i1, i1 0
+ %nop15735 = alloca i1, i1 0
+ %nop15736 = alloca i1, i1 0
+ %nop15737 = alloca i1, i1 0
+ %nop15738 = alloca i1, i1 0
+ %nop15739 = alloca i1, i1 0
+ %nop15740 = alloca i1, i1 0
+ %nop15741 = alloca i1, i1 0
+ %nop15742 = alloca i1, i1 0
+ %nop15743 = alloca i1, i1 0
+ %nop15744 = alloca i1, i1 0
+ %nop15745 = alloca i1, i1 0
+ %nop15746 = alloca i1, i1 0
+ %nop15747 = alloca i1, i1 0
+ %nop15748 = alloca i1, i1 0
+ %nop15749 = alloca i1, i1 0
+ %nop15750 = alloca i1, i1 0
+ %nop15751 = alloca i1, i1 0
+ %nop15752 = alloca i1, i1 0
+ %nop15753 = alloca i1, i1 0
+ %nop15754 = alloca i1, i1 0
+ %nop15755 = alloca i1, i1 0
+ %nop15756 = alloca i1, i1 0
+ %nop15757 = alloca i1, i1 0
+ %nop15758 = alloca i1, i1 0
+ %nop15759 = alloca i1, i1 0
+ %nop15760 = alloca i1, i1 0
+ %nop15761 = alloca i1, i1 0
+ %nop15762 = alloca i1, i1 0
+ %nop15763 = alloca i1, i1 0
+ %nop15764 = alloca i1, i1 0
+ %nop15765 = alloca i1, i1 0
+ %nop15766 = alloca i1, i1 0
+ %nop15767 = alloca i1, i1 0
+ %nop15768 = alloca i1, i1 0
+ %nop15769 = alloca i1, i1 0
+ %nop15770 = alloca i1, i1 0
+ %nop15771 = alloca i1, i1 0
+ %nop15772 = alloca i1, i1 0
+ %nop15773 = alloca i1, i1 0
+ %nop15774 = alloca i1, i1 0
+ %nop15775 = alloca i1, i1 0
+ %nop15776 = alloca i1, i1 0
+ %nop15777 = alloca i1, i1 0
+ %nop15778 = alloca i1, i1 0
+ %nop15779 = alloca i1, i1 0
+ %nop15780 = alloca i1, i1 0
+ %nop15781 = alloca i1, i1 0
+ %nop15782 = alloca i1, i1 0
+ %nop15783 = alloca i1, i1 0
+ %nop15784 = alloca i1, i1 0
+ %nop15785 = alloca i1, i1 0
+ %nop15786 = alloca i1, i1 0
+ %nop15787 = alloca i1, i1 0
+ %nop15788 = alloca i1, i1 0
+ %nop15789 = alloca i1, i1 0
+ %nop15790 = alloca i1, i1 0
+ %nop15791 = alloca i1, i1 0
+ %nop15792 = alloca i1, i1 0
+ %nop15793 = alloca i1, i1 0
+ %nop15794 = alloca i1, i1 0
+ %nop15795 = alloca i1, i1 0
+ %nop15796 = alloca i1, i1 0
+ %nop15797 = alloca i1, i1 0
+ %nop15798 = alloca i1, i1 0
+ %nop15799 = alloca i1, i1 0
+ %nop15800 = alloca i1, i1 0
+ %nop15801 = alloca i1, i1 0
+ %nop15802 = alloca i1, i1 0
+ %nop15803 = alloca i1, i1 0
+ %nop15804 = alloca i1, i1 0
+ %nop15805 = alloca i1, i1 0
+ %nop15806 = alloca i1, i1 0
+ %nop15807 = alloca i1, i1 0
+ %nop15808 = alloca i1, i1 0
+ %nop15809 = alloca i1, i1 0
+ %nop15810 = alloca i1, i1 0
+ %nop15811 = alloca i1, i1 0
+ %nop15812 = alloca i1, i1 0
+ %nop15813 = alloca i1, i1 0
+ %nop15814 = alloca i1, i1 0
+ %nop15815 = alloca i1, i1 0
+ %nop15816 = alloca i1, i1 0
+ %nop15817 = alloca i1, i1 0
+ %nop15818 = alloca i1, i1 0
+ %nop15819 = alloca i1, i1 0
+ %nop15820 = alloca i1, i1 0
+ %nop15821 = alloca i1, i1 0
+ %nop15822 = alloca i1, i1 0
+ %nop15823 = alloca i1, i1 0
+ %nop15824 = alloca i1, i1 0
+ %nop15825 = alloca i1, i1 0
+ %nop15826 = alloca i1, i1 0
+ %nop15827 = alloca i1, i1 0
+ %nop15828 = alloca i1, i1 0
+ %nop15829 = alloca i1, i1 0
+ %nop15830 = alloca i1, i1 0
+ %nop15831 = alloca i1, i1 0
+ %nop15832 = alloca i1, i1 0
+ %nop15833 = alloca i1, i1 0
+ %nop15834 = alloca i1, i1 0
+ %nop15835 = alloca i1, i1 0
+ %nop15836 = alloca i1, i1 0
+ %nop15837 = alloca i1, i1 0
+ %nop15838 = alloca i1, i1 0
+ %nop15839 = alloca i1, i1 0
+ %nop15840 = alloca i1, i1 0
+ %nop15841 = alloca i1, i1 0
+ %nop15842 = alloca i1, i1 0
+ %nop15843 = alloca i1, i1 0
+ %nop15844 = alloca i1, i1 0
+ %nop15845 = alloca i1, i1 0
+ %nop15846 = alloca i1, i1 0
+ %nop15847 = alloca i1, i1 0
+ %nop15848 = alloca i1, i1 0
+ %nop15849 = alloca i1, i1 0
+ %nop15850 = alloca i1, i1 0
+ %nop15851 = alloca i1, i1 0
+ %nop15852 = alloca i1, i1 0
+ %nop15853 = alloca i1, i1 0
+ %nop15854 = alloca i1, i1 0
+ %nop15855 = alloca i1, i1 0
+ %nop15856 = alloca i1, i1 0
+ %nop15857 = alloca i1, i1 0
+ %nop15858 = alloca i1, i1 0
+ %nop15859 = alloca i1, i1 0
+ %nop15860 = alloca i1, i1 0
+ %nop15861 = alloca i1, i1 0
+ %nop15862 = alloca i1, i1 0
+ %nop15863 = alloca i1, i1 0
+ %nop15864 = alloca i1, i1 0
+ %nop15865 = alloca i1, i1 0
+ %nop15866 = alloca i1, i1 0
+ %nop15867 = alloca i1, i1 0
+ %nop15868 = alloca i1, i1 0
+ %nop15869 = alloca i1, i1 0
+ %nop15870 = alloca i1, i1 0
+ %nop15871 = alloca i1, i1 0
+ %nop15872 = alloca i1, i1 0
+ %nop15873 = alloca i1, i1 0
+ %nop15874 = alloca i1, i1 0
+ %nop15875 = alloca i1, i1 0
+ %nop15876 = alloca i1, i1 0
+ %nop15877 = alloca i1, i1 0
+ %nop15878 = alloca i1, i1 0
+ %nop15879 = alloca i1, i1 0
+ %nop15880 = alloca i1, i1 0
+ %nop15881 = alloca i1, i1 0
+ %nop15882 = alloca i1, i1 0
+ %nop15883 = alloca i1, i1 0
+ %nop15884 = alloca i1, i1 0
+ %nop15885 = alloca i1, i1 0
+ %nop15886 = alloca i1, i1 0
+ %nop15887 = alloca i1, i1 0
+ %nop15888 = alloca i1, i1 0
+ %nop15889 = alloca i1, i1 0
+ %nop15890 = alloca i1, i1 0
+ %nop15891 = alloca i1, i1 0
+ %nop15892 = alloca i1, i1 0
+ %nop15893 = alloca i1, i1 0
+ %nop15894 = alloca i1, i1 0
+ %nop15895 = alloca i1, i1 0
+ %nop15896 = alloca i1, i1 0
+ %nop15897 = alloca i1, i1 0
+ %nop15898 = alloca i1, i1 0
+ %nop15899 = alloca i1, i1 0
+ %nop15900 = alloca i1, i1 0
+ %nop15901 = alloca i1, i1 0
+ %nop15902 = alloca i1, i1 0
+ %nop15903 = alloca i1, i1 0
+ %nop15904 = alloca i1, i1 0
+ %nop15905 = alloca i1, i1 0
+ %nop15906 = alloca i1, i1 0
+ %nop15907 = alloca i1, i1 0
+ %nop15908 = alloca i1, i1 0
+ %nop15909 = alloca i1, i1 0
+ %nop15910 = alloca i1, i1 0
+ %nop15911 = alloca i1, i1 0
+ %nop15912 = alloca i1, i1 0
+ %nop15913 = alloca i1, i1 0
+ %nop15914 = alloca i1, i1 0
+ %nop15915 = alloca i1, i1 0
+ %nop15916 = alloca i1, i1 0
+ %nop15917 = alloca i1, i1 0
+ %nop15918 = alloca i1, i1 0
+ %nop15919 = alloca i1, i1 0
+ %nop15920 = alloca i1, i1 0
+ %nop15921 = alloca i1, i1 0
+ %nop15922 = alloca i1, i1 0
+ %nop15923 = alloca i1, i1 0
+ %nop15924 = alloca i1, i1 0
+ %nop15925 = alloca i1, i1 0
+ %nop15926 = alloca i1, i1 0
+ %nop15927 = alloca i1, i1 0
+ %nop15928 = alloca i1, i1 0
+ %nop15929 = alloca i1, i1 0
+ %nop15930 = alloca i1, i1 0
+ %nop15931 = alloca i1, i1 0
+ %nop15932 = alloca i1, i1 0
+ %nop15933 = alloca i1, i1 0
+ %nop15934 = alloca i1, i1 0
+ %nop15935 = alloca i1, i1 0
+ %nop15936 = alloca i1, i1 0
+ %nop15937 = alloca i1, i1 0
+ %nop15938 = alloca i1, i1 0
+ %nop15939 = alloca i1, i1 0
+ %nop15940 = alloca i1, i1 0
+ %nop15941 = alloca i1, i1 0
+ %nop15942 = alloca i1, i1 0
+ %nop15943 = alloca i1, i1 0
+ %nop15944 = alloca i1, i1 0
+ %nop15945 = alloca i1, i1 0
+ %nop15946 = alloca i1, i1 0
+ %nop15947 = alloca i1, i1 0
+ %nop15948 = alloca i1, i1 0
+ %nop15949 = alloca i1, i1 0
+ %nop15950 = alloca i1, i1 0
+ %nop15951 = alloca i1, i1 0
+ %nop15952 = alloca i1, i1 0
+ %nop15953 = alloca i1, i1 0
+ %nop15954 = alloca i1, i1 0
+ %nop15955 = alloca i1, i1 0
+ %nop15956 = alloca i1, i1 0
+ %nop15957 = alloca i1, i1 0
+ %nop15958 = alloca i1, i1 0
+ %nop15959 = alloca i1, i1 0
+ %nop15960 = alloca i1, i1 0
+ %nop15961 = alloca i1, i1 0
+ %nop15962 = alloca i1, i1 0
+ %nop15963 = alloca i1, i1 0
+ %nop15964 = alloca i1, i1 0
+ %nop15965 = alloca i1, i1 0
+ %nop15966 = alloca i1, i1 0
+ %nop15967 = alloca i1, i1 0
+ %nop15968 = alloca i1, i1 0
+ %nop15969 = alloca i1, i1 0
+ %nop15970 = alloca i1, i1 0
+ %nop15971 = alloca i1, i1 0
+ %nop15972 = alloca i1, i1 0
+ %nop15973 = alloca i1, i1 0
+ %nop15974 = alloca i1, i1 0
+ %nop15975 = alloca i1, i1 0
+ %nop15976 = alloca i1, i1 0
+ %nop15977 = alloca i1, i1 0
+ %nop15978 = alloca i1, i1 0
+ %nop15979 = alloca i1, i1 0
+ %nop15980 = alloca i1, i1 0
+ %nop15981 = alloca i1, i1 0
+ %nop15982 = alloca i1, i1 0
+ %nop15983 = alloca i1, i1 0
+ %nop15984 = alloca i1, i1 0
+ %nop15985 = alloca i1, i1 0
+ %nop15986 = alloca i1, i1 0
+ %nop15987 = alloca i1, i1 0
+ %nop15988 = alloca i1, i1 0
+ %nop15989 = alloca i1, i1 0
+ %nop15990 = alloca i1, i1 0
+ %nop15991 = alloca i1, i1 0
+ %nop15992 = alloca i1, i1 0
+ %nop15993 = alloca i1, i1 0
+ %nop15994 = alloca i1, i1 0
+ %nop15995 = alloca i1, i1 0
+ %nop15996 = alloca i1, i1 0
+ %nop15997 = alloca i1, i1 0
+ %nop15998 = alloca i1, i1 0
+ %nop15999 = alloca i1, i1 0
+ %nop16000 = alloca i1, i1 0
+ %nop16001 = alloca i1, i1 0
+ %nop16002 = alloca i1, i1 0
+ %nop16003 = alloca i1, i1 0
+ %nop16004 = alloca i1, i1 0
+ %nop16005 = alloca i1, i1 0
+ %nop16006 = alloca i1, i1 0
+ %nop16007 = alloca i1, i1 0
+ %nop16008 = alloca i1, i1 0
+ %nop16009 = alloca i1, i1 0
+ %nop16010 = alloca i1, i1 0
+ %nop16011 = alloca i1, i1 0
+ %nop16012 = alloca i1, i1 0
+ %nop16013 = alloca i1, i1 0
+ %nop16014 = alloca i1, i1 0
+ %nop16015 = alloca i1, i1 0
+ %nop16016 = alloca i1, i1 0
+ %nop16017 = alloca i1, i1 0
+ %nop16018 = alloca i1, i1 0
+ %nop16019 = alloca i1, i1 0
+ %nop16020 = alloca i1, i1 0
+ %nop16021 = alloca i1, i1 0
+ %nop16022 = alloca i1, i1 0
+ %nop16023 = alloca i1, i1 0
+ %nop16024 = alloca i1, i1 0
+ %nop16025 = alloca i1, i1 0
+ %nop16026 = alloca i1, i1 0
+ %nop16027 = alloca i1, i1 0
+ %nop16028 = alloca i1, i1 0
+ %nop16029 = alloca i1, i1 0
+ %nop16030 = alloca i1, i1 0
+ %nop16031 = alloca i1, i1 0
+ %nop16032 = alloca i1, i1 0
+ %nop16033 = alloca i1, i1 0
+ %nop16034 = alloca i1, i1 0
+ %nop16035 = alloca i1, i1 0
+ %nop16036 = alloca i1, i1 0
+ %nop16037 = alloca i1, i1 0
+ %nop16038 = alloca i1, i1 0
+ %nop16039 = alloca i1, i1 0
+ %nop16040 = alloca i1, i1 0
+ %nop16041 = alloca i1, i1 0
+ %nop16042 = alloca i1, i1 0
+ %nop16043 = alloca i1, i1 0
+ %nop16044 = alloca i1, i1 0
+ %nop16045 = alloca i1, i1 0
+ %nop16046 = alloca i1, i1 0
+ %nop16047 = alloca i1, i1 0
+ %nop16048 = alloca i1, i1 0
+ %nop16049 = alloca i1, i1 0
+ %nop16050 = alloca i1, i1 0
+ %nop16051 = alloca i1, i1 0
+ %nop16052 = alloca i1, i1 0
+ %nop16053 = alloca i1, i1 0
+ %nop16054 = alloca i1, i1 0
+ %nop16055 = alloca i1, i1 0
+ %nop16056 = alloca i1, i1 0
+ %nop16057 = alloca i1, i1 0
+ %nop16058 = alloca i1, i1 0
+ %nop16059 = alloca i1, i1 0
+ %nop16060 = alloca i1, i1 0
+ %nop16061 = alloca i1, i1 0
+ %nop16062 = alloca i1, i1 0
+ %nop16063 = alloca i1, i1 0
+ %nop16064 = alloca i1, i1 0
+ %nop16065 = alloca i1, i1 0
+ %nop16066 = alloca i1, i1 0
+ %nop16067 = alloca i1, i1 0
+ %nop16068 = alloca i1, i1 0
+ %nop16069 = alloca i1, i1 0
+ %nop16070 = alloca i1, i1 0
+ %nop16071 = alloca i1, i1 0
+ %nop16072 = alloca i1, i1 0
+ %nop16073 = alloca i1, i1 0
+ %nop16074 = alloca i1, i1 0
+ %nop16075 = alloca i1, i1 0
+ %nop16076 = alloca i1, i1 0
+ %nop16077 = alloca i1, i1 0
+ %nop16078 = alloca i1, i1 0
+ %nop16079 = alloca i1, i1 0
+ %nop16080 = alloca i1, i1 0
+ %nop16081 = alloca i1, i1 0
+ %nop16082 = alloca i1, i1 0
+ %nop16083 = alloca i1, i1 0
+ %nop16084 = alloca i1, i1 0
+ %nop16085 = alloca i1, i1 0
+ %nop16086 = alloca i1, i1 0
+ %nop16087 = alloca i1, i1 0
+ %nop16088 = alloca i1, i1 0
+ %nop16089 = alloca i1, i1 0
+ %nop16090 = alloca i1, i1 0
+ %nop16091 = alloca i1, i1 0
+ %nop16092 = alloca i1, i1 0
+ %nop16093 = alloca i1, i1 0
+ %nop16094 = alloca i1, i1 0
+ %nop16095 = alloca i1, i1 0
+ %nop16096 = alloca i1, i1 0
+ %nop16097 = alloca i1, i1 0
+ %nop16098 = alloca i1, i1 0
+ %nop16099 = alloca i1, i1 0
+ %nop16100 = alloca i1, i1 0
+ %nop16101 = alloca i1, i1 0
+ %nop16102 = alloca i1, i1 0
+ %nop16103 = alloca i1, i1 0
+ %nop16104 = alloca i1, i1 0
+ %nop16105 = alloca i1, i1 0
+ %nop16106 = alloca i1, i1 0
+ %nop16107 = alloca i1, i1 0
+ %nop16108 = alloca i1, i1 0
+ %nop16109 = alloca i1, i1 0
+ %nop16110 = alloca i1, i1 0
+ %nop16111 = alloca i1, i1 0
+ %nop16112 = alloca i1, i1 0
+ %nop16113 = alloca i1, i1 0
+ %nop16114 = alloca i1, i1 0
+ %nop16115 = alloca i1, i1 0
+ %nop16116 = alloca i1, i1 0
+ %nop16117 = alloca i1, i1 0
+ %nop16118 = alloca i1, i1 0
+ %nop16119 = alloca i1, i1 0
+ %nop16120 = alloca i1, i1 0
+ %nop16121 = alloca i1, i1 0
+ %nop16122 = alloca i1, i1 0
+ %nop16123 = alloca i1, i1 0
+ %nop16124 = alloca i1, i1 0
+ %nop16125 = alloca i1, i1 0
+ %nop16126 = alloca i1, i1 0
+ %nop16127 = alloca i1, i1 0
+ %nop16128 = alloca i1, i1 0
+ %nop16129 = alloca i1, i1 0
+ %nop16130 = alloca i1, i1 0
+ %nop16131 = alloca i1, i1 0
+ %nop16132 = alloca i1, i1 0
+ %nop16133 = alloca i1, i1 0
+ %nop16134 = alloca i1, i1 0
+ %nop16135 = alloca i1, i1 0
+ %nop16136 = alloca i1, i1 0
+ %nop16137 = alloca i1, i1 0
+ %nop16138 = alloca i1, i1 0
+ %nop16139 = alloca i1, i1 0
+ %nop16140 = alloca i1, i1 0
+ %nop16141 = alloca i1, i1 0
+ %nop16142 = alloca i1, i1 0
+ %nop16143 = alloca i1, i1 0
+ %nop16144 = alloca i1, i1 0
+ %nop16145 = alloca i1, i1 0
+ %nop16146 = alloca i1, i1 0
+ %nop16147 = alloca i1, i1 0
+ %nop16148 = alloca i1, i1 0
+ %nop16149 = alloca i1, i1 0
+ %nop16150 = alloca i1, i1 0
+ %nop16151 = alloca i1, i1 0
+ %nop16152 = alloca i1, i1 0
+ %nop16153 = alloca i1, i1 0
+ %nop16154 = alloca i1, i1 0
+ %nop16155 = alloca i1, i1 0
+ %nop16156 = alloca i1, i1 0
+ %nop16157 = alloca i1, i1 0
+ %nop16158 = alloca i1, i1 0
+ %nop16159 = alloca i1, i1 0
+ %nop16160 = alloca i1, i1 0
+ %nop16161 = alloca i1, i1 0
+ %nop16162 = alloca i1, i1 0
+ %nop16163 = alloca i1, i1 0
+ %nop16164 = alloca i1, i1 0
+ %nop16165 = alloca i1, i1 0
+ %nop16166 = alloca i1, i1 0
+ %nop16167 = alloca i1, i1 0
+ %nop16168 = alloca i1, i1 0
+ %nop16169 = alloca i1, i1 0
+ %nop16170 = alloca i1, i1 0
+ %nop16171 = alloca i1, i1 0
+ %nop16172 = alloca i1, i1 0
+ %nop16173 = alloca i1, i1 0
+ %nop16174 = alloca i1, i1 0
+ %nop16175 = alloca i1, i1 0
+ %nop16176 = alloca i1, i1 0
+ %nop16177 = alloca i1, i1 0
+ %nop16178 = alloca i1, i1 0
+ %nop16179 = alloca i1, i1 0
+ %nop16180 = alloca i1, i1 0
+ %nop16181 = alloca i1, i1 0
+ %nop16182 = alloca i1, i1 0
+ %nop16183 = alloca i1, i1 0
+ %nop16184 = alloca i1, i1 0
+ %nop16185 = alloca i1, i1 0
+ %nop16186 = alloca i1, i1 0
+ %nop16187 = alloca i1, i1 0
+ %nop16188 = alloca i1, i1 0
+ %nop16189 = alloca i1, i1 0
+ %nop16190 = alloca i1, i1 0
+ %nop16191 = alloca i1, i1 0
+ %nop16192 = alloca i1, i1 0
+ %nop16193 = alloca i1, i1 0
+ %nop16194 = alloca i1, i1 0
+ %nop16195 = alloca i1, i1 0
+ %nop16196 = alloca i1, i1 0
+ %nop16197 = alloca i1, i1 0
+ %nop16198 = alloca i1, i1 0
+ %nop16199 = alloca i1, i1 0
+ %nop16200 = alloca i1, i1 0
+ %nop16201 = alloca i1, i1 0
+ %nop16202 = alloca i1, i1 0
+ %nop16203 = alloca i1, i1 0
+ %nop16204 = alloca i1, i1 0
+ %nop16205 = alloca i1, i1 0
+ %nop16206 = alloca i1, i1 0
+ %nop16207 = alloca i1, i1 0
+ %nop16208 = alloca i1, i1 0
+ %nop16209 = alloca i1, i1 0
+ %nop16210 = alloca i1, i1 0
+ %nop16211 = alloca i1, i1 0
+ %nop16212 = alloca i1, i1 0
+ %nop16213 = alloca i1, i1 0
+ %nop16214 = alloca i1, i1 0
+ %nop16215 = alloca i1, i1 0
+ %nop16216 = alloca i1, i1 0
+ %nop16217 = alloca i1, i1 0
+ %nop16218 = alloca i1, i1 0
+ %nop16219 = alloca i1, i1 0
+ %nop16220 = alloca i1, i1 0
+ %nop16221 = alloca i1, i1 0
+ %nop16222 = alloca i1, i1 0
+ %nop16223 = alloca i1, i1 0
+ %nop16224 = alloca i1, i1 0
+ %nop16225 = alloca i1, i1 0
+ %nop16226 = alloca i1, i1 0
+ %nop16227 = alloca i1, i1 0
+ %nop16228 = alloca i1, i1 0
+ %nop16229 = alloca i1, i1 0
+ %nop16230 = alloca i1, i1 0
+ %nop16231 = alloca i1, i1 0
+ %nop16232 = alloca i1, i1 0
+ %nop16233 = alloca i1, i1 0
+ %nop16234 = alloca i1, i1 0
+ %nop16235 = alloca i1, i1 0
+ %nop16236 = alloca i1, i1 0
+ %nop16237 = alloca i1, i1 0
+ %nop16238 = alloca i1, i1 0
+ %nop16239 = alloca i1, i1 0
+ %nop16240 = alloca i1, i1 0
+ %nop16241 = alloca i1, i1 0
+ %nop16242 = alloca i1, i1 0
+ %nop16243 = alloca i1, i1 0
+ %nop16244 = alloca i1, i1 0
+ %nop16245 = alloca i1, i1 0
+ %nop16246 = alloca i1, i1 0
+ %nop16247 = alloca i1, i1 0
+ %nop16248 = alloca i1, i1 0
+ %nop16249 = alloca i1, i1 0
+ %nop16250 = alloca i1, i1 0
+ %nop16251 = alloca i1, i1 0
+ %nop16252 = alloca i1, i1 0
+ %nop16253 = alloca i1, i1 0
+ %nop16254 = alloca i1, i1 0
+ %nop16255 = alloca i1, i1 0
+ %nop16256 = alloca i1, i1 0
+ %nop16257 = alloca i1, i1 0
+ %nop16258 = alloca i1, i1 0
+ %nop16259 = alloca i1, i1 0
+ %nop16260 = alloca i1, i1 0
+ %nop16261 = alloca i1, i1 0
+ %nop16262 = alloca i1, i1 0
+ %nop16263 = alloca i1, i1 0
+ %nop16264 = alloca i1, i1 0
+ %nop16265 = alloca i1, i1 0
+ %nop16266 = alloca i1, i1 0
+ %nop16267 = alloca i1, i1 0
+ %nop16268 = alloca i1, i1 0
+ %nop16269 = alloca i1, i1 0
+ %nop16270 = alloca i1, i1 0
+ %nop16271 = alloca i1, i1 0
+ %nop16272 = alloca i1, i1 0
+ %nop16273 = alloca i1, i1 0
+ %nop16274 = alloca i1, i1 0
+ %nop16275 = alloca i1, i1 0
+ %nop16276 = alloca i1, i1 0
+ %nop16277 = alloca i1, i1 0
+ %nop16278 = alloca i1, i1 0
+ %nop16279 = alloca i1, i1 0
+ %nop16280 = alloca i1, i1 0
+ %nop16281 = alloca i1, i1 0
+ %nop16282 = alloca i1, i1 0
+ %nop16283 = alloca i1, i1 0
+ %nop16284 = alloca i1, i1 0
+ %nop16285 = alloca i1, i1 0
+ %nop16286 = alloca i1, i1 0
+ %nop16287 = alloca i1, i1 0
+ %nop16288 = alloca i1, i1 0
+ %nop16289 = alloca i1, i1 0
+ %nop16290 = alloca i1, i1 0
+ %nop16291 = alloca i1, i1 0
+ %nop16292 = alloca i1, i1 0
+ %nop16293 = alloca i1, i1 0
+ %nop16294 = alloca i1, i1 0
+ %nop16295 = alloca i1, i1 0
+ %nop16296 = alloca i1, i1 0
+ %nop16297 = alloca i1, i1 0
+ %nop16298 = alloca i1, i1 0
+ %nop16299 = alloca i1, i1 0
+ %nop16300 = alloca i1, i1 0
+ %nop16301 = alloca i1, i1 0
+ %nop16302 = alloca i1, i1 0
+ %nop16303 = alloca i1, i1 0
+ %nop16304 = alloca i1, i1 0
+ %nop16305 = alloca i1, i1 0
+ %nop16306 = alloca i1, i1 0
+ %nop16307 = alloca i1, i1 0
+ %nop16308 = alloca i1, i1 0
+ %nop16309 = alloca i1, i1 0
+ %nop16310 = alloca i1, i1 0
+ %nop16311 = alloca i1, i1 0
+ %nop16312 = alloca i1, i1 0
+ %nop16313 = alloca i1, i1 0
+ %nop16314 = alloca i1, i1 0
+ %nop16315 = alloca i1, i1 0
+ %nop16316 = alloca i1, i1 0
+ %nop16317 = alloca i1, i1 0
+ %nop16318 = alloca i1, i1 0
+ %nop16319 = alloca i1, i1 0
+ %nop16320 = alloca i1, i1 0
+ %nop16321 = alloca i1, i1 0
+ %nop16322 = alloca i1, i1 0
+ %nop16323 = alloca i1, i1 0
+ %nop16324 = alloca i1, i1 0
+ %nop16325 = alloca i1, i1 0
+ %nop16326 = alloca i1, i1 0
+ %nop16327 = alloca i1, i1 0
+ %nop16328 = alloca i1, i1 0
+ %nop16329 = alloca i1, i1 0
+ %nop16330 = alloca i1, i1 0
+ %nop16331 = alloca i1, i1 0
+ %nop16332 = alloca i1, i1 0
+ %nop16333 = alloca i1, i1 0
+ %nop16334 = alloca i1, i1 0
+ %nop16335 = alloca i1, i1 0
+ %nop16336 = alloca i1, i1 0
+ %nop16337 = alloca i1, i1 0
+ %nop16338 = alloca i1, i1 0
+ %nop16339 = alloca i1, i1 0
+ %nop16340 = alloca i1, i1 0
+ %nop16341 = alloca i1, i1 0
+ %nop16342 = alloca i1, i1 0
+ %nop16343 = alloca i1, i1 0
+ %nop16344 = alloca i1, i1 0
+ %nop16345 = alloca i1, i1 0
+ %nop16346 = alloca i1, i1 0
+ %nop16347 = alloca i1, i1 0
+ %nop16348 = alloca i1, i1 0
+ %nop16349 = alloca i1, i1 0
+ %nop16350 = alloca i1, i1 0
+ %nop16351 = alloca i1, i1 0
+ %nop16352 = alloca i1, i1 0
+ %nop16353 = alloca i1, i1 0
+ %nop16354 = alloca i1, i1 0
+ %nop16355 = alloca i1, i1 0
+ %nop16356 = alloca i1, i1 0
+ %nop16357 = alloca i1, i1 0
+ %nop16358 = alloca i1, i1 0
+ %nop16359 = alloca i1, i1 0
+ %nop16360 = alloca i1, i1 0
+ %nop16361 = alloca i1, i1 0
+ %nop16362 = alloca i1, i1 0
+ %nop16363 = alloca i1, i1 0
+ %nop16364 = alloca i1, i1 0
+ %nop16365 = alloca i1, i1 0
+ %nop16366 = alloca i1, i1 0
+ %nop16367 = alloca i1, i1 0
+ %nop16368 = alloca i1, i1 0
+ %nop16369 = alloca i1, i1 0
+ %nop16370 = alloca i1, i1 0
+ %nop16371 = alloca i1, i1 0
+ %nop16372 = alloca i1, i1 0
+ %nop16373 = alloca i1, i1 0
+ %nop16374 = alloca i1, i1 0
+ %nop16375 = alloca i1, i1 0
+ %nop16376 = alloca i1, i1 0
+ %nop16377 = alloca i1, i1 0
+ br label %for.inc
+
+for.inc:
+ %3 = load i32* %i, align 4
+ %inc = add nsw i32 %3, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+; CHECK: addiu $sp, $sp, -8
+; CHECK: sw $ra, 0($sp)
+; CHECK: lui $[[REG1:[0-9]+]], 65534
+; CHECK: addiu $[[REG1]], $[[REG1]], -12
+; CHECK: addu $[[REG1]], $ra, $[[REG1]]
+; CHECK: lw $ra, 0($sp)
+; CHECK: jr $[[REG1]]
+; CHECK: addiu $sp, $sp, 8
+
+for.end:
+ ret i32 0
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false"
+ "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"
+ "no-infs-fp-math"="false" "no-nans-fp-math"="false"
+ "stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
+ "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16-hf-attr.ll b/test/CodeGen/Mips/mips16-hf-attr.ll
new file mode 100644
index 0000000..d9ad629
--- /dev/null
+++ b/test/CodeGen/Mips/mips16-hf-attr.ll
@@ -0,0 +1,45 @@
+; Check that stubs generation for mips16 hard-float mode does not depend
+; on the function 'use-soft-float' attribute's value.
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel \
+; RUN: -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s
+
+define void @bar_sf() #0 {
+; CHECK: bar_sf:
+entry:
+ %call1 = call float @foo(float 1.000000e+00)
+; CHECK: lw $2, %call16(foo)($3)
+; CHECK: lw $5, %got(__mips16_call_stub_sf_1)($3)
+ ret void
+}
+
+define void @bar_hf() #1 {
+; CHECK: bar_hf:
+entry:
+ %call1 = call float @foo(float 1.000000e+00)
+; CHECK: lw $2, %call16(foo)($3)
+; CHECK: lw $5, %got(__mips16_call_stub_sf_1)($3)
+ ret void
+}
+
+declare float @foo(float) #2
+
+attributes #0 = {
+ nounwind
+ "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+ "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+ "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+ "unsafe-fp-math"="false" "use-soft-float"="false"
+}
+attributes #1 = {
+ nounwind
+ "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+ "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+ "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+ "unsafe-fp-math"="false" "use-soft-float"="true"
+}
+attributes #2 = {
+ "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+ "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+ "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+ "unsafe-fp-math"="false" "use-soft-float"="true"
+}
diff --git a/test/CodeGen/Mips/mips16_32_1.ll b/test/CodeGen/Mips/mips16_32_1.ll
index e156641..f6096b4 100644
--- a/test/CodeGen/Mips/mips16_32_1.ll
+++ b/test/CodeGen/Mips/mips16_32_1.ll
@@ -6,9 +6,8 @@ entry:
ret void
}
-; CHECK: .set mips16 # @foo
+; CHECK: .set mips16
; CHECK: .ent foo
-; CHECK: save {{.+}}
-; CHECK: restore {{.+}}
+; CHECK: jrc $ra
; CHECK: .end foo
attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_10.ll b/test/CodeGen/Mips/mips16_32_10.ll
index 7c017b8..ff9831e 100644
--- a/test/CodeGen/Mips/mips16_32_10.ll
+++ b/test/CodeGen/Mips/mips16_32_10.ll
@@ -4,7 +4,7 @@ define void @foo() #0 {
entry:
ret void
}
-; 16: .set nomips16 # @foo
+; 16: .set nomips16
; 16: .ent foo
; 16: .set noreorder
; 16: .set nomacro
@@ -21,11 +21,10 @@ entry:
ret void
}
-; 16: .set mips16 # @nofoo
+; 16: .set mips16
; 16: .ent nofoo
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end nofoo
define i32 @main() #2 {
@@ -33,7 +32,7 @@ entry:
ret i32 0
}
-; 16: .set nomips16 # @main
+; 16: .set nomips16
; 16: .ent main
; 16: .set noreorder
; 16: .set nomacro
diff --git a/test/CodeGen/Mips/mips16_32_3.ll b/test/CodeGen/Mips/mips16_32_3.ll
index dd94ec1..c5a29a0 100644
--- a/test/CodeGen/Mips/mips16_32_3.ll
+++ b/test/CodeGen/Mips/mips16_32_3.ll
@@ -6,22 +6,20 @@ entry:
ret void
}
-; 16: .set mips16 # @foo
+; 16: .set mips16
; 16: .ent foo
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end foo
-; 32: .set mips16 # @foo
+; 32: .set mips16
; 32: .ent foo
-; 32: save {{.+}}
-; 32: restore {{.+}}
+; 32: jrc $ra
; 32: .end foo
define void @nofoo() #1 {
entry:
ret void
}
-; 16: .set nomips16 # @nofoo
+; 16: .set nomips16
; 16: .ent nofoo
; 16: .set noreorder
; 16: .set nomacro
@@ -32,7 +30,7 @@ entry:
; 16: .set macro
; 16: .set reorder
; 16: .end nofoo
-; 32: .set nomips16 # @nofoo
+; 32: .set nomips16
; 32: .ent nofoo
; 32: .set noreorder
; 32: .set nomacro
@@ -48,12 +46,11 @@ entry:
ret i32 0
}
-; 16: .set mips16 # @main
+; 16: .set mips16
; 16: .ent main
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end main
-; 32: .set nomips16 # @main
+; 32: .set nomips16
; 32: .ent main
; 32: .set noreorder
; 32: .set nomacro
diff --git a/test/CodeGen/Mips/mips16_32_4.ll b/test/CodeGen/Mips/mips16_32_4.ll
index 5e49071..1238363 100644
--- a/test/CodeGen/Mips/mips16_32_4.ll
+++ b/test/CodeGen/Mips/mips16_32_4.ll
@@ -6,22 +6,20 @@ entry:
ret void
}
-; 16: .set mips16 # @foo
+; 16: .set mips16
; 16: .ent foo
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end foo
-; 32: .set mips16 # @foo
+; 32: .set mips16
; 32: .ent foo
-; 32: save {{.+}}
-; 32: restore {{.+}}
+; 32: jrc $ra
; 32: .end foo
define void @nofoo() #1 {
entry:
ret void
}
-; 16: .set nomips16 # @nofoo
+; 16: .set nomips16
; 16: .ent nofoo
; 16: .set noreorder
; 16: .set nomacro
@@ -32,7 +30,7 @@ entry:
; 16: .set macro
; 16: .set reorder
; 16: .end nofoo
-; 32: .set nomips16 # @nofoo
+; 32: .set nomips16
; 32: .ent nofoo
; 32: .set noreorder
; 32: .set nomacro
@@ -48,15 +46,13 @@ entry:
ret i32 0
}
-; 16: .set mips16 # @main
+; 16: .set mips16
; 16: .ent main
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end main
-; 32: .set mips16 # @main
+; 32: .set mips16
; 32: .ent main
-; 32: save {{.+}}
-; 32: restore {{.+}}
+; 32: jrc $ra
; 32: .end main
diff --git a/test/CodeGen/Mips/mips16_32_5.ll b/test/CodeGen/Mips/mips16_32_5.ll
index 17900a2..5d4c8a1 100644
--- a/test/CodeGen/Mips/mips16_32_5.ll
+++ b/test/CodeGen/Mips/mips16_32_5.ll
@@ -6,22 +6,20 @@ entry:
ret void
}
-; 16: .set mips16 # @foo
+; 16: .set mips16
; 16: .ent foo
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end foo
-; 32: .set mips16 # @foo
+; 32: .set mips16
; 32: .ent foo
-; 32: save {{.+}}
-; 32: restore {{.+}}
+; 32: jrc $ra
; 32: .end foo
define void @nofoo() #1 {
entry:
ret void
}
-; 16: .set nomips16 # @nofoo
+; 16: .set nomips16
; 16: .ent nofoo
; 16: .set noreorder
; 16: .set nomacro
@@ -32,7 +30,7 @@ entry:
; 16: .set macro
; 16: .set reorder
; 16: .end nofoo
-; 32: .set nomips16 # @nofoo
+; 32: .set nomips16
; 32: .ent nofoo
; 32: .set noreorder
; 32: .set nomacro
@@ -48,7 +46,7 @@ entry:
ret i32 0
}
-; 16: .set nomips16 # @main
+; 16: .set nomips16
; 16: .ent main
; 16: .set noreorder
; 16: .set nomacro
@@ -60,7 +58,7 @@ entry:
; 16: .set reorder
; 16: .end main
-; 32: .set nomips16 # @main
+; 32: .set nomips16
; 32: .ent main
; 32: .set noreorder
; 32: .set nomacro
diff --git a/test/CodeGen/Mips/mips16_32_6.ll b/test/CodeGen/Mips/mips16_32_6.ll
index a77031a..63323b6 100644
--- a/test/CodeGen/Mips/mips16_32_6.ll
+++ b/test/CodeGen/Mips/mips16_32_6.ll
@@ -6,12 +6,11 @@ entry:
ret void
}
-; 16: .set mips16 # @foo
+; 16: .set mips16
; 16: .ent foo
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end foo
-; 32: .set nomips16 # @foo
+; 32: .set nomips16
; 32: .ent foo
; 32: .set noreorder
; 32: .set nomacro
@@ -27,7 +26,7 @@ entry:
ret void
}
-; 16: .set nomips16 # @nofoo
+; 16: .set nomips16
; 16: .ent nofoo
; 16: .set noreorder
; 16: .set nomacro
@@ -38,7 +37,7 @@ entry:
; 16: .set macro
; 16: .set reorder
; 16: .end nofoo
-; 32: .set nomips16 # @nofoo
+; 32: .set nomips16
; 32: .ent nofoo
; 32: .set noreorder
; 32: .set nomacro
@@ -54,7 +53,7 @@ entry:
ret i32 0
}
-; 16: .set nomips16 # @main
+; 16: .set nomips16
; 16: .ent main
; 16: .set noreorder
; 16: .set nomacro
@@ -66,7 +65,7 @@ entry:
; 16: .set reorder
; 16: .end main
-; 32: .set nomips16 # @main
+; 32: .set nomips16
; 32: .ent main
; 32: .set noreorder
; 32: .set nomacro
diff --git a/test/CodeGen/Mips/mips16_32_7.ll b/test/CodeGen/Mips/mips16_32_7.ll
index 895b5d4..480a23c 100644
--- a/test/CodeGen/Mips/mips16_32_7.ll
+++ b/test/CodeGen/Mips/mips16_32_7.ll
@@ -6,12 +6,11 @@ entry:
ret void
}
-; 16: .set mips16 # @foo
+; 16: .set mips16
; 16: .ent foo
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end foo
-; 32: .set nomips16 # @foo
+; 32: .set nomips16
; 32: .ent foo
; 32: .set noreorder
; 32: .set nomacro
@@ -27,7 +26,7 @@ entry:
ret void
}
-; 16: .set nomips16 # @nofoo
+; 16: .set nomips16
; 16: .ent nofoo
; 16: .set noreorder
; 16: .set nomacro
@@ -38,7 +37,7 @@ entry:
; 16: .set macro
; 16: .set reorder
; 16: .end nofoo
-; 32: .set nomips16 # @nofoo
+; 32: .set nomips16
; 32: .ent nofoo
; 32: .set noreorder
; 32: .set nomacro
@@ -54,16 +53,14 @@ entry:
ret i32 0
}
-; 16: .set mips16 # @main
+; 16: .set mips16
; 16: .ent main
-; 16: save {{.+}}
-; 16: restore {{.+}}
+; 16: jrc $ra
; 16: .end main
-; 32: .set mips16 # @main
+; 32: .set mips16
; 32: .ent main
-; 32: save {{.+}}
-; 32: restore {{.+}}
+; 32: jrc $ra
; 32: .end main
diff --git a/test/CodeGen/Mips/mips16_32_8.ll b/test/CodeGen/Mips/mips16_32_8.ll
index 4152d68..2f5bc21 100644
--- a/test/CodeGen/Mips/mips16_32_8.ll
+++ b/test/CodeGen/Mips/mips16_32_8.ll
@@ -14,10 +14,9 @@ entry:
ret void
}
-; 32: .set mips16 # @foo
+; 32: .set mips16
; 32: .ent foo
-; 32: save {{.+}}
-; 32: restore {{.+}}
+; 32: jrc $ra
; 32: .end foo
define void @nofoo() #1 {
@@ -33,7 +32,7 @@ entry:
ret void
}
-; 32: .set nomips16 # @nofoo
+; 32: .set nomips16
; 32: .ent nofoo
; 32: .set noreorder
; 32: .set nomacro
@@ -57,7 +56,7 @@ entry:
ret i32 0
}
-; 32: .set nomips16 # @main
+; 32: .set nomips16
; 32: .ent main
; 32: .set noreorder
; 32: .set nomacro
diff --git a/test/CodeGen/Mips/mips16_32_9.ll b/test/CodeGen/Mips/mips16_32_9.ll
index c9b494f..8543147 100644
--- a/test/CodeGen/Mips/mips16_32_9.ll
+++ b/test/CodeGen/Mips/mips16_32_9.ll
@@ -5,17 +5,16 @@ entry:
ret void
}
-; 32: .set mips16 # @foo
+; 32: .set mips16
; 32: .ent foo
-; 32: save {{.+}}
-; 32: restore {{.+}}
+; 32: jrc $ra
; 32: .end foo
define void @nofoo() #1 {
entry:
ret void
}
-; 32: .set nomips16 # @nofoo
+; 32: .set nomips16
; 32: .ent nofoo
; 32: .set noreorder
; 32: .set nomacro
@@ -31,10 +30,9 @@ entry:
ret i32 0
}
-; 32: .set mips16 # @main
+; 32: .set mips16
; 32: .ent main
-; 32: save {{.+}}
-; 32: restore {{.+}}
+; 32: jrc $ra
; 32: .end main
diff --git a/test/CodeGen/Mips/mips16_fpret.ll b/test/CodeGen/Mips/mips16_fpret.ll
index c132f63..fe87604 100644
--- a/test/CodeGen/Mips/mips16_fpret.ll
+++ b/test/CodeGen/Mips/mips16_fpret.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=1
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=2
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=3
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=4
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=2
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=3
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=4
@x = global float 0x41F487E980000000, align 4
diff --git a/test/CodeGen/Mips/mips16fpe.ll b/test/CodeGen/Mips/mips16fpe.ll
index 10c5163..987980e 100644
--- a/test/CodeGen/Mips/mips16fpe.ll
+++ b/test/CodeGen/Mips/mips16fpe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 -soft-float -mips16-hard-float < %s | FileCheck %s -check-prefix=16hf
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16hf
@x = global float 5.000000e+00, align 4
@y = global float 1.500000e+01, align 4
diff --git a/test/CodeGen/Mips/mips64fpldst.ll b/test/CodeGen/Mips/mips64fpldst.ll
index 24647b2..368ab83 100644
--- a/test/CodeGen/Mips/mips64fpldst.ll
+++ b/test/CodeGen/Mips/mips64fpldst.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n32 | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
@f0 = common global float 0.000000e+00, align 4
@d0 = common global double 0.000000e+00, align 8
diff --git a/test/CodeGen/Mips/mips64intldst.ll b/test/CodeGen/Mips/mips64intldst.ll
index 0e310a8..62244f6 100644
--- a/test/CodeGen/Mips/mips64intldst.ll
+++ b/test/CodeGen/Mips/mips64intldst.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n32 | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
@c = common global i8 0, align 4
@s = common global i16 0, align 4
diff --git a/test/CodeGen/Mips/msa/2r_vector_scalar.ll b/test/CodeGen/Mips/msa/2r_vector_scalar.ll
index 6f6e1b9..64e459e 100644
--- a/test/CodeGen/Mips/msa/2r_vector_scalar.ll
+++ b/test/CodeGen/Mips/msa/2r_vector_scalar.ll
@@ -1,8 +1,14 @@
; Test the MSA intrinsics that are encoded with the 2R instruction format and
; convert scalars to vectors.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
@llvm_mips_fill_b_ARG1 = global i32 23, align 16
@llvm_mips_fill_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
@@ -17,11 +23,12 @@ entry:
declare <16 x i8> @llvm.mips.fill.b(i32) nounwind
-; CHECK: llvm_mips_fill_b_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]],
-; CHECK-DAG: fill.b [[R2:\$w[0-9]+]], [[R1]]
-; CHECK-DAG: st.b [[R2]],
-; CHECK: .size llvm_mips_fill_b_test
+; MIPS-ANY: llvm_mips_fill_b_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]],
+; MIPS64-DAG: ld [[R1:\$[0-9]+]],
+; MIPS-ANY-DAG: fill.b [[R2:\$w[0-9]+]], [[R1]]
+; MIPS-ANY-DAG: st.b [[R2]],
+; MIPS-ANY: .size llvm_mips_fill_b_test
;
@llvm_mips_fill_h_ARG1 = global i32 23, align 16
@llvm_mips_fill_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
@@ -36,11 +43,12 @@ entry:
declare <8 x i16> @llvm.mips.fill.h(i32) nounwind
-; CHECK: llvm_mips_fill_h_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]],
-; CHECK-DAG: fill.h [[R2:\$w[0-9]+]], [[R1]]
-; CHECK-DAG: st.h [[R2]],
-; CHECK: .size llvm_mips_fill_h_test
+; MIPS-ANY: llvm_mips_fill_h_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]],
+; MIPS64-DAG: ld [[R1:\$[0-9]+]],
+; MIPS-ANY-DAG: fill.h [[R2:\$w[0-9]+]], [[R1]]
+; MIPS-ANY-DAG: st.h [[R2]],
+; MIPS-ANY: .size llvm_mips_fill_h_test
;
@llvm_mips_fill_w_ARG1 = global i32 23, align 16
@llvm_mips_fill_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
@@ -55,11 +63,12 @@ entry:
declare <4 x i32> @llvm.mips.fill.w(i32) nounwind
-; CHECK: llvm_mips_fill_w_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]],
-; CHECK-DAG: fill.w [[R2:\$w[0-9]+]], [[R1]]
-; CHECK-DAG: st.w [[R2]],
-; CHECK: .size llvm_mips_fill_w_test
+; MIPS-ANY: llvm_mips_fill_w_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]],
+; MIPS64-DAG: ld [[R1:\$[0-9]+]],
+; MIPS-ANY-DAG: fill.w [[R2:\$w[0-9]+]], [[R1]]
+; MIPS-ANY-DAG: st.w [[R2]],
+; MIPS-ANY: .size llvm_mips_fill_w_test
;
@llvm_mips_fill_d_ARG1 = global i64 23, align 16
@llvm_mips_fill_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
@@ -74,14 +83,18 @@ entry:
declare <2 x i64> @llvm.mips.fill.d(i64) nounwind
-; CHECK: llvm_mips_fill_d_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
-; CHECK-DAG: lw [[R2:\$[0-9]+]], 4(
-; CHECK-DAG: ldi.b [[R3:\$w[0-9]+]], 0
-; CHECK-DAG: insert.w [[R3]][0], [[R1]]
-; CHECK-DAG: insert.w [[R3]][1], [[R2]]
-; CHECK-DAG: insert.w [[R3]][2], [[R1]]
-; CHECK-DAG: insert.w [[R3]][3], [[R2]]
-; CHECK-DAG: st.w [[R3]],
-; CHECK: .size llvm_mips_fill_d_test
-;
+; MIPS-ANY: llvm_mips_fill_d_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], 0(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], 4(
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_fill_d_ARG1)
+; MIPS32-DAG: ldi.b [[R3:\$w[0-9]+]], 0
+; MIPS32-DAG: insert.w [[R3]][0], [[R1]]
+; MIPS32-DAG: insert.w [[R3]][1], [[R2]]
+; MIPS32-DAG: insert.w [[R3]][2], [[R1]]
+; MIPS32-DAG: insert.w [[R3]][3], [[R2]]
+; MIPS64-DAG: fill.d [[WD:\$w[0-9]+]], [[R1]]
+; MIPS32-DAG: st.w [[R3]],
+; MIPS64-DAG: ld [[RD:\$[0-9]+]], %got_disp(llvm_mips_fill_d_RES)
+; MIPS64-DAG: st.d [[WD]], 0([[RD]])
+; MIPS-ANY: .size llvm_mips_fill_d_test
+;
\ No newline at end of file
diff --git a/test/CodeGen/Mips/msa/3r-s.ll b/test/CodeGen/Mips/msa/3r-s.ll
index 30cf265..581c3bf 100644
--- a/test/CodeGen/Mips/msa/3r-s.ll
+++ b/test/CodeGen/Mips/msa/3r-s.ll
@@ -5,98 +5,114 @@
; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
@llvm_mips_sld_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
-@llvm_mips_sld_b_ARG2 = global i32 10, align 16
+@llvm_mips_sld_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sld_b_ARG3 = global i32 10, align 16
@llvm_mips_sld_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
define void @llvm_mips_sld_b_test() nounwind {
entry:
%0 = load <16 x i8>* @llvm_mips_sld_b_ARG1
- %1 = load i32* @llvm_mips_sld_b_ARG2
- %2 = tail call <16 x i8> @llvm.mips.sld.b(<16 x i8> %0, i32 %1)
- store <16 x i8> %2, <16 x i8>* @llvm_mips_sld_b_RES
+ %1 = load <16 x i8>* @llvm_mips_sld_b_ARG2
+ %2 = load i32* @llvm_mips_sld_b_ARG3
+ %3 = tail call <16 x i8> @llvm.mips.sld.b(<16 x i8> %0, <16 x i8> %1, i32 %2)
+ store <16 x i8> %3, <16 x i8>* @llvm_mips_sld_b_RES
ret void
}
-declare <16 x i8> @llvm.mips.sld.b(<16 x i8>, i32) nounwind
+declare <16 x i8> @llvm.mips.sld.b(<16 x i8>, <16 x i8>, i32) nounwind
; CHECK: llvm_mips_sld_b_test:
; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_b_ARG1)
; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_b_ARG2)
-; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
-; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
-; CHECK-DAG: sld.b [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_sld_b_ARG3)
+; CHECK-DAG: ld.b [[WD:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R3]])
+; CHECK-DAG: sld.b [[WD]], [[WS]]{{\[}}[[RT]]{{\]}}
; CHECK-DAG: st.b [[WD]]
; CHECK: .size llvm_mips_sld_b_test
;
@llvm_mips_sld_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
-@llvm_mips_sld_h_ARG2 = global i32 10, align 16
+@llvm_mips_sld_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sld_h_ARG3 = global i32 10, align 16
@llvm_mips_sld_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
define void @llvm_mips_sld_h_test() nounwind {
entry:
%0 = load <8 x i16>* @llvm_mips_sld_h_ARG1
- %1 = load i32* @llvm_mips_sld_h_ARG2
- %2 = tail call <8 x i16> @llvm.mips.sld.h(<8 x i16> %0, i32 %1)
- store <8 x i16> %2, <8 x i16>* @llvm_mips_sld_h_RES
+ %1 = load <8 x i16>* @llvm_mips_sld_h_ARG2
+ %2 = load i32* @llvm_mips_sld_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.sld.h(<8 x i16> %0, <8 x i16> %1, i32 %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_sld_h_RES
ret void
}
-declare <8 x i16> @llvm.mips.sld.h(<8 x i16>, i32) nounwind
+declare <8 x i16> @llvm.mips.sld.h(<8 x i16>, <8 x i16>, i32) nounwind
; CHECK: llvm_mips_sld_h_test:
; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_h_ARG1)
-; CHECK-DAG: lw [[RT:\$[0-9]+]], %got(llvm_mips_sld_h_ARG2)
-; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
-; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
-; CHECK-DAG: sld.h [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_h_ARG2)
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_sld_h_ARG3)
+; CHECK-DAG: ld.h [[WD:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R3]])
+; CHECK-DAG: sld.h [[WD]], [[WS]]{{\[}}[[RT]]{{\]}}
; CHECK-DAG: st.h [[WD]]
; CHECK: .size llvm_mips_sld_h_test
;
@llvm_mips_sld_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
-@llvm_mips_sld_w_ARG2 = global i32 10, align 16
+@llvm_mips_sld_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sld_w_ARG3 = global i32 10, align 16
@llvm_mips_sld_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
define void @llvm_mips_sld_w_test() nounwind {
entry:
%0 = load <4 x i32>* @llvm_mips_sld_w_ARG1
- %1 = load i32* @llvm_mips_sld_w_ARG2
- %2 = tail call <4 x i32> @llvm.mips.sld.w(<4 x i32> %0, i32 %1)
- store <4 x i32> %2, <4 x i32>* @llvm_mips_sld_w_RES
+ %1 = load <4 x i32>* @llvm_mips_sld_w_ARG2
+ %2 = load i32* @llvm_mips_sld_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.sld.w(<4 x i32> %0, <4 x i32> %1, i32 %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_sld_w_RES
ret void
}
-declare <4 x i32> @llvm.mips.sld.w(<4 x i32>, i32) nounwind
+declare <4 x i32> @llvm.mips.sld.w(<4 x i32>, <4 x i32>, i32) nounwind
; CHECK: llvm_mips_sld_w_test:
; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_w_ARG1)
-; CHECK-DAG: lw [[RT:\$[0-9]+]], %got(llvm_mips_sld_w_ARG2)
-; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
-; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
-; CHECK-DAG: sld.w [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_w_ARG2)
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_sld_w_ARG3)
+; CHECK-DAG: ld.w [[WD:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R3]])
+; CHECK-DAG: sld.w [[WD]], [[WS]]{{\[}}[[RT]]{{\]}}
; CHECK-DAG: st.w [[WD]]
; CHECK: .size llvm_mips_sld_w_test
;
@llvm_mips_sld_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
-@llvm_mips_sld_d_ARG2 = global i32 10, align 16
+@llvm_mips_sld_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sld_d_ARG3 = global i32 10, align 16
@llvm_mips_sld_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
define void @llvm_mips_sld_d_test() nounwind {
entry:
%0 = load <2 x i64>* @llvm_mips_sld_d_ARG1
- %1 = load i32* @llvm_mips_sld_d_ARG2
- %2 = tail call <2 x i64> @llvm.mips.sld.d(<2 x i64> %0, i32 %1)
- store <2 x i64> %2, <2 x i64>* @llvm_mips_sld_d_RES
+ %1 = load <2 x i64>* @llvm_mips_sld_d_ARG2
+ %2 = load i32* @llvm_mips_sld_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.sld.d(<2 x i64> %0, <2 x i64> %1, i32 %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_sld_d_RES
ret void
}
-declare <2 x i64> @llvm.mips.sld.d(<2 x i64>, i32) nounwind
+declare <2 x i64> @llvm.mips.sld.d(<2 x i64>, <2 x i64>, i32) nounwind
; CHECK: llvm_mips_sld_d_test:
; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_d_ARG1)
-; CHECK-DAG: lw [[RT:\$[0-9]+]], %got(llvm_mips_sld_d_ARG2)
-; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
-; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
-; CHECK-DAG: sld.d [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_d_ARG2)
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_sld_d_ARG3)
+; CHECK-DAG: ld.d [[WD:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R3]])
+; CHECK-DAG: sld.d [[WD]], [[WS]]{{\[}}[[RT]]{{\]}}
; CHECK-DAG: st.d [[WD]]
; CHECK: .size llvm_mips_sld_d_test
;
diff --git a/test/CodeGen/Mips/msa/arithmetic_float.ll b/test/CodeGen/Mips/msa/arithmetic_float.ll
index dc38721..86e57ac 100644
--- a/test/CodeGen/Mips/msa/arithmetic_float.ll
+++ b/test/CodeGen/Mips/msa/arithmetic_float.ll
@@ -295,7 +295,8 @@ define void @fexp2_v2f64_2(<2 x double>* %c, <2 x double>* %a) nounwind {
; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
%2 = tail call <2 x double> @llvm.exp2.v2f64 (<2 x double> %1)
%3 = fmul <2 x double> <double 2.0, double 2.0>, %2
- ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo(
+ ; CHECK-DAG: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[G_PTR]])
; CHECK-DAG: fexp2.d [[R4:\$w[0-9]+]], [[R3]], [[R1]]
store <2 x double> %3, <2 x double>* %c
; CHECK-DAG: st.d [[R4]], 0($4)
diff --git a/test/CodeGen/Mips/msa/basic_operations.ll b/test/CodeGen/Mips/msa/basic_operations.ll
index 0169a07..2725e9a 100644
--- a/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/test/CodeGen/Mips/msa/basic_operations.ll
@@ -18,10 +18,12 @@ define void @const_v16i8() nounwind {
; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 1
store volatile <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 31>, <16 x i8>*@v16i8
- ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6>, <16 x i8>*@v16i8
- ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <16 x i8> <i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0>, <16 x i8>*@v16i8
; MIPS32-BE: ldi.h [[R1:\$w[0-9]+]], 256
@@ -35,7 +37,8 @@ define void @const_v16i8() nounwind {
; MIPS32-AE-DAG: fill.w [[R1:\$w[0-9]+]], [[R2]]
store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, <16 x i8>*@v16i8
- ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[G_PTR]])
ret void
; MIPS32-AE: .size const_v16i8
@@ -51,7 +54,8 @@ define void @const_v8i16() nounwind {
; MIPS32-AE: ldi.h [[R1:\$w[0-9]+]], 1
store volatile <8 x i16> <i16 1, i16 1, i16 1, i16 2, i16 1, i16 1, i16 1, i16 31>, <8 x i16>*@v8i16
- ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <8 x i16> <i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028>, <8 x i16>*@v8i16
; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 4
@@ -64,7 +68,8 @@ define void @const_v8i16() nounwind {
; MIPS32-AE-DAG: fill.w [[R1:\$w[0-9]+]], [[R2]]
store volatile <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>, <8 x i16>*@v8i16
- ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[G_PTR]])
ret void
; MIPS32-AE: .size const_v8i16
@@ -80,7 +85,8 @@ define void @const_v4i32() nounwind {
; MIPS32-AE: ldi.w [[R1:\$w[0-9]+]], 1
store volatile <4 x i32> <i32 1, i32 1, i32 1, i32 31>, <4 x i32>*@v4i32
- ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <4 x i32> <i32 16843009, i32 16843009, i32 16843009, i32 16843009>, <4 x i32>*@v4i32
; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 1
@@ -89,10 +95,12 @@ define void @const_v4i32() nounwind {
; MIPS32-AE: ldi.h [[R1:\$w[0-9]+]], 1
store volatile <4 x i32> <i32 1, i32 2, i32 1, i32 2>, <4 x i32>*@v4i32
- ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <4 x i32> <i32 3, i32 4, i32 5, i32 6>, <4 x i32>*@v4i32
- ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
ret void
; MIPS32-AE: .size const_v4i32
@@ -117,10 +125,12 @@ define void @const_v2i64() nounwind {
; MIPS32-AE: ldi.d [[R1:\$w[0-9]+]], 1
store volatile <2 x i64> <i64 1, i64 31>, <2 x i64>*@v2i64
- ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <2 x i64> <i64 3, i64 4>, <2 x i64>*@v2i64
- ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32-AE: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
ret void
; MIPS32-AE: .size const_v2i64
diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll
index 1f53810..c8cef44 100644
--- a/test/CodeGen/Mips/msa/basic_operations_float.ll
+++ b/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -17,7 +17,8 @@ define void @const_v4f32() nounwind {
; MIPS32: fill.w [[R2:\$w[0-9]+]], [[R1]]
store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 31.0>, <4 x float>*@v4f32
- ; MIPS32: ld.w [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <4 x float> <float 65537.0, float 65537.0, float 65537.0, float 65537.0>, <4 x float>*@v4f32
; MIPS32: lui [[R1:\$[0-9]+]], 18304
@@ -25,10 +26,12 @@ define void @const_v4f32() nounwind {
; MIPS32: fill.w [[R3:\$w[0-9]+]], [[R2]]
store volatile <4 x float> <float 1.0, float 2.0, float 1.0, float 2.0>, <4 x float>*@v4f32
- ; MIPS32: ld.w [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <4 x float> <float 3.0, float 4.0, float 5.0, float 6.0>, <4 x float>*@v4f32
- ; MIPS32: ld.w [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
ret void
; MIPS32: .size const_v4f32
@@ -41,22 +44,28 @@ define void @const_v2f64() nounwind {
; MIPS32: ldi.b [[R1:\$w[0-9]+]], 0
store volatile <2 x double> <double 72340172838076673.0, double 72340172838076673.0>, <2 x double>*@v2f64
- ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <2 x double> <double 281479271743489.0, double 281479271743489.0>, <2 x double>*@v2f64
- ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <2 x double> <double 4294967297.0, double 4294967297.0>, <2 x double>*@v2f64
- ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <2 x double> <double 1.0, double 1.0>, <2 x double>*@v2f64
- ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <2 x double> <double 1.0, double 31.0>, <2 x double>*@v2f64
- ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]])
store volatile <2 x double> <double 3.0, double 4.0>, <2 x double>*@v2f64
- ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+ ; MIPS32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]])
ret void
; MIPS32: .size const_v2f64
@@ -128,6 +137,24 @@ define float @extract_v4f32_elt0() nounwind {
; MIPS32: .size extract_v4f32_elt0
}
+define float @extract_v4f32_elt2() nounwind {
+ ; MIPS32: extract_v4f32_elt2:
+
+ %1 = load <4 x float>* @v4f32
+ ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ %2 = fadd <4 x float> %1, %1
+ ; MIPS32-DAG: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <4 x float> %2, i32 2
+ ; Element 2 can be obtained by splatting it across the vector and extracting
+ ; $w0:sub_lo
+ ; MIPS32-DAG: splati.w $w0, [[R1]][2]
+
+ ret float %3
+ ; MIPS32: .size extract_v4f32_elt2
+}
+
define double @extract_v2f64() nounwind {
; MIPS32: extract_v2f64:
diff --git a/test/CodeGen/Mips/msa/bitwise.ll b/test/CodeGen/Mips/msa/bitwise.ll
index 9a88c47..5d57198 100644
--- a/test/CodeGen/Mips/msa/bitwise.ll
+++ b/test/CodeGen/Mips/msa/bitwise.ll
@@ -990,9 +990,10 @@ define void @bsel_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b, <16 x i8>*
%6 = and <16 x i8> %2, %4
%7 = or <16 x i8> %5, %6
; bmnz is the same operation
- ; CHECK-DAG: bmnz.v [[R1]], [[R2]], [[R3]]
+ ; (vselect Mask, IfSet, IfClr) -> (BMNZ IfClr, IfSet, Mask)
+ ; CHECK-DAG: bmnz.v [[R2]], [[R1]], [[R3]]
store <16 x i8> %7, <16 x i8>* %c
- ; CHECK-DAG: st.b [[R1]], 0($4)
+ ; CHECK-DAG: st.b [[R2]], 0($4)
ret void
; CHECK: .size bsel_v16i8
diff --git a/test/CodeGen/Mips/msa/compare.ll b/test/CodeGen/Mips/msa/compare.ll
index 6408d7b..87ca148 100644
--- a/test/CodeGen/Mips/msa/compare.ll
+++ b/test/CodeGen/Mips/msa/compare.ll
@@ -761,7 +761,8 @@ define void @bsel_s_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
%4 = icmp sgt <8 x i16> %1, %2
; CHECK-DAG: clt_s.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%5 = select <8 x i1> %4, <8 x i16> %1, <8 x i16> %3
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]]
store <8 x i16> %5, <8 x i16>* %d
; CHECK-DAG: st.h [[R4]], 0($4)
@@ -782,7 +783,8 @@ define void @bsel_s_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
%4 = icmp sgt <4 x i32> %1, %2
; CHECK-DAG: clt_s.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%5 = select <4 x i1> %4, <4 x i32> %1, <4 x i32> %3
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]]
store <4 x i32> %5, <4 x i32>* %d
; CHECK-DAG: st.w [[R4]], 0($4)
@@ -803,7 +805,8 @@ define void @bsel_s_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
%4 = icmp sgt <2 x i64> %1, %2
; CHECK-DAG: clt_s.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%5 = select <2 x i1> %4, <2 x i64> %1, <2 x i64> %3
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]]
store <2 x i64> %5, <2 x i64>* %d
; CHECK-DAG: st.d [[R4]], 0($4)
@@ -846,7 +849,8 @@ define void @bsel_u_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
%4 = icmp ugt <8 x i16> %1, %2
; CHECK-DAG: clt_u.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%5 = select <8 x i1> %4, <8 x i16> %1, <8 x i16> %3
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]]
store <8 x i16> %5, <8 x i16>* %d
; CHECK-DAG: st.h [[R4]], 0($4)
@@ -867,7 +871,8 @@ define void @bsel_u_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
%4 = icmp ugt <4 x i32> %1, %2
; CHECK-DAG: clt_u.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%5 = select <4 x i1> %4, <4 x i32> %1, <4 x i32> %3
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]]
store <4 x i32> %5, <4 x i32>* %d
; CHECK-DAG: st.w [[R4]], 0($4)
@@ -888,7 +893,8 @@ define void @bsel_u_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
%4 = icmp ugt <2 x i64> %1, %2
; CHECK-DAG: clt_u.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%5 = select <2 x i1> %4, <2 x i64> %1, <2 x i64> %3
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]]
store <2 x i64> %5, <2 x i64>* %d
; CHECK-DAG: st.d [[R4]], 0($4)
@@ -906,7 +912,7 @@ define void @bseli_s_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
%3 = icmp sgt <16 x i8> %1, %2
; CHECK-DAG: clt_s.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
- %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %4 = select <16 x i1> %3, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8> %1
; CHECK-DAG: bseli.b [[R4]], [[R1]], 1
store <16 x i8> %4, <16 x i8>* %d
; CHECK-DAG: st.b [[R4]], 0($4)
@@ -925,7 +931,7 @@ define void @bseli_s_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
%3 = icmp sgt <8 x i16> %1, %2
; CHECK-DAG: clt_s.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
- %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %4 = select <8 x i1> %3, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %1
; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
store <8 x i16> %4, <8 x i16>* %d
@@ -945,7 +951,7 @@ define void @bseli_s_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
%3 = icmp sgt <4 x i32> %1, %2
; CHECK-DAG: clt_s.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
- %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %4 = select <4 x i1> %3, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> %1
; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
store <4 x i32> %4, <4 x i32>* %d
@@ -965,7 +971,7 @@ define void @bseli_s_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
%3 = icmp sgt <2 x i64> %1, %2
; CHECK-DAG: clt_s.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
- %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ %4 = select <2 x i1> %3, <2 x i64> <i64 1, i64 1>, <2 x i64> %1
; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
store <2 x i64> %4, <2 x i64>* %d
@@ -985,7 +991,7 @@ define void @bseli_u_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
%3 = icmp ugt <16 x i8> %1, %2
; CHECK-DAG: clt_u.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
- %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %4 = select <16 x i1> %3, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8> %1
; CHECK-DAG: bseli.b [[R4]], [[R1]], 1
store <16 x i8> %4, <16 x i8>* %d
; CHECK-DAG: st.b [[R4]], 0($4)
@@ -1004,7 +1010,7 @@ define void @bseli_u_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
%3 = icmp ugt <8 x i16> %1, %2
; CHECK-DAG: clt_u.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
- %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %4 = select <8 x i1> %3, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16> %1
; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
store <8 x i16> %4, <8 x i16>* %d
@@ -1024,7 +1030,7 @@ define void @bseli_u_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
%3 = icmp ugt <4 x i32> %1, %2
; CHECK-DAG: clt_u.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
- %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %4 = select <4 x i1> %3, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> %1
; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
store <4 x i32> %4, <4 x i32>* %d
@@ -1044,7 +1050,7 @@ define void @bseli_u_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
%3 = icmp ugt <2 x i64> %1, %2
; CHECK-DAG: clt_u.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
- %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ %4 = select <2 x i1> %3, <2 x i64> <i64 1, i64 1>, <2 x i64> %1
; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
store <2 x i64> %4, <2 x i64>* %d
diff --git a/test/CodeGen/Mips/msa/compare_float.ll b/test/CodeGen/Mips/msa/compare_float.ll
index 2fc61f8..e93221b 100644
--- a/test/CodeGen/Mips/msa/compare_float.ll
+++ b/test/CodeGen/Mips/msa/compare_float.ll
@@ -32,12 +32,9 @@ define void @false_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) noun
store <2 x i64> %4, <2 x i64>* %c
ret void
- ; FIXME: This code is correct, but poor. Ideally it would be similar to
- ; the code in @false_v4f32
+ ; (setcc $a, $b, SETFALSE) is always folded
; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0
- ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
- ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
- ; CHECK-DAG: st.d [[R4]], 0($4)
+ ; CHECK-DAG: st.w [[R1]], 0($4)
; CHECK: .size false_v2f64
}
@@ -509,12 +506,9 @@ define void @true_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounw
store <2 x i64> %4, <2 x i64>* %c
ret void
- ; FIXME: This code is correct, but poor. Ideally it would be similar to
- ; the code in @true_v4f32
- ; CHECK-DAG: ldi.d [[R1:\$w[0-9]+]], 1
- ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
- ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
- ; CHECK-DAG: st.d [[R4]], 0($4)
+ ; (setcc $a, $b, SETTRUE) is always folded.
+ ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], -1
+ ; CHECK-DAG: st.w [[R1]], 0($4)
; CHECK: .size true_v2f64
}
@@ -531,7 +525,8 @@ define void @bsel_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
%4 = fcmp ogt <4 x float> %1, %2
; CHECK-DAG: fclt.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%5 = select <4 x i1> %4, <4 x float> %1, <4 x float> %3
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]]
store <4 x float> %5, <4 x float>* %d
; CHECK-DAG: st.w [[R4]], 0($4)
@@ -552,7 +547,8 @@ define void @bsel_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
%4 = fcmp ogt <2 x double> %1, %2
; CHECK-DAG: fclt.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%5 = select <2 x i1> %4, <2 x double> %1, <2 x double> %3
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3]], [[R1]]
store <2 x double> %5, <2 x double>* %d
; CHECK-DAG: st.d [[R4]], 0($4)
@@ -571,7 +567,8 @@ define void @bseli_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
%3 = fcmp ogt <4 x float> %1, %2
; CHECK-DAG: fclt.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%4 = select <4 x i1> %3, <4 x float> %1, <4 x float> zeroinitializer
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3:\$w[0-9]+]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3:\$w[0-9]+]], [[R1]]
store <4 x float> %4, <4 x float>* %d
; CHECK-DAG: st.w [[R4]], 0($4)
@@ -590,7 +587,8 @@ define void @bseli_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
%3 = fcmp ogt <2 x double> %1, %2
; CHECK-DAG: fclt.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
%4 = select <2 x i1> %3, <2 x double> %1, <2 x double> zeroinitializer
- ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3:\$w[0-9]+]]
+ ; Note that IfSet and IfClr are swapped since the condition is inverted
+ ; CHECK-DAG: bsel.v [[R4]], [[R3:\$w[0-9]+]], [[R1]]
store <2 x double> %4, <2 x double>* %d
; CHECK-DAG: st.d [[R4]], 0($4)
diff --git a/test/CodeGen/Mips/msa/elm_copy.ll b/test/CodeGen/Mips/msa/elm_copy.ll
index ed3e52c..0dd75fa 100644
--- a/test/CodeGen/Mips/msa/elm_copy.ll
+++ b/test/CodeGen/Mips/msa/elm_copy.ll
@@ -1,8 +1,14 @@
; Test the MSA intrinsics that are encoded with the ELM instruction format and
; are element extraction operations.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
@llvm_mips_copy_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_copy_s_b_RES = global i32 0, align 16
@@ -17,11 +23,15 @@ entry:
declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32) nounwind
-; CHECK: llvm_mips_copy_s_b_test:
-; CHECK: ld.b
-; CHECK: copy_s.b
-; CHECK: sw
-; CHECK: .size llvm_mips_copy_s_b_test
+; MIPS-ANY: llvm_mips_copy_s_b_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_copy_s_b_ARG1)
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_copy_s_b_ARG1)
+; MIPS-ANY-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: copy_s.b [[RD:\$[0-9]+]], [[WS]][1]
+; MIPS32-DAG: lw [[RES:\$[0-9]+]], %got(llvm_mips_copy_s_b_RES)
+; MIPS64-DAG: ld [[RES:\$[0-9]+]], %got_disp(llvm_mips_copy_s_b_RES)
+; MIPS-ANY-DAG: sw [[RD]], 0([[RES]])
+; MIPS-ANY: .size llvm_mips_copy_s_b_test
;
@llvm_mips_copy_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
@llvm_mips_copy_s_h_RES = global i32 0, align 16
@@ -36,11 +46,15 @@ entry:
declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32) nounwind
-; CHECK: llvm_mips_copy_s_h_test:
-; CHECK: ld.h
-; CHECK: copy_s.h
-; CHECK: sw
-; CHECK: .size llvm_mips_copy_s_h_test
+; MIPS-ANY: llvm_mips_copy_s_h_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_copy_s_h_ARG1)
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_copy_s_h_ARG1)
+; MIPS-ANY-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: copy_s.h [[RD:\$[0-9]+]], [[WS]][1]
+; MIPS32-DAG: lw [[RES:\$[0-9]+]], %got(llvm_mips_copy_s_h_RES)
+; MIPS64-DAG: ld [[RES:\$[0-9]+]], %got_disp(llvm_mips_copy_s_h_RES)
+; MIPS-ANY-DAG: sw [[RD]], 0([[RES]])
+; MIPS-ANY: .size llvm_mips_copy_s_h_test
;
@llvm_mips_copy_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@llvm_mips_copy_s_w_RES = global i32 0, align 16
@@ -55,11 +69,15 @@ entry:
declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32) nounwind
-; CHECK: llvm_mips_copy_s_w_test:
-; CHECK: ld.w
-; CHECK: copy_s.w
-; CHECK: sw
-; CHECK: .size llvm_mips_copy_s_w_test
+; MIPS-ANY: llvm_mips_copy_s_w_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_copy_s_w_ARG1)
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_copy_s_w_ARG1)
+; MIPS-ANY-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: copy_s.w [[RD:\$[0-9]+]], [[WS]][1]
+; MIPS32-DAG: lw [[RES:\$[0-9]+]], %got(llvm_mips_copy_s_w_RES)
+; MIPS64-DAG: ld [[RES:\$[0-9]+]], %got_disp(llvm_mips_copy_s_w_RES)
+; MIPS-ANY-DAG: sw [[RD]], 0([[RES]])
+; MIPS-ANY: .size llvm_mips_copy_s_w_test
;
@llvm_mips_copy_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
@llvm_mips_copy_s_d_RES = global i64 0, align 16
@@ -74,13 +92,20 @@ entry:
declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32) nounwind
-; CHECK: llvm_mips_copy_s_d_test:
-; CHECK: ld.w
-; CHECK: copy_s.w
-; CHECK: copy_s.w
-; CHECK: sw
-; CHECK: sw
-; CHECK: .size llvm_mips_copy_s_d_test
+; MIPS-ANY: llvm_mips_copy_s_d_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_copy_s_d_ARG1)
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_copy_s_d_ARG1)
+; MIPS32-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS64-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS32-DAG: copy_s.w [[RD1:\$[0-9]+]], [[WS]][2]
+; MIPS32-DAG: copy_s.w [[RD2:\$[0-9]+]], [[WS]][3]
+; MIPS64-DAG: copy_s.d [[RD:\$[0-9]+]], [[WS]][1]
+; MIPS32-DAG: lw [[RES:\$[0-9]+]], %got(llvm_mips_copy_s_d_RES)
+; MIPS64-DAG: ld [[RES:\$[0-9]+]], %got_disp(llvm_mips_copy_s_d_RES)
+; MIPS32-DAG: sw [[RD1]], 0([[RES]])
+; MIPS32-DAG: sw [[RD2]], 4([[RES]])
+; MIPS64-DAG: sd [[RD]], 0([[RES]])
+; MIPS-ANY: .size llvm_mips_copy_s_d_test
;
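; A quick index calculation behind the MIPS32 checks above (only the lane
; numbering implied by them is assumed): a <2 x i64> viewed as <4 x i32> puts
; i64 element d into i32 elements 2*d and 2*d+1, so extracting element 1 needs
;   copy_s.w $t0, [[WS]][2]    ; 2*1      ($t0/$t1 are illustrative)
;   copy_s.w $t1, [[WS]][3]    ; 2*1 + 1
; plus two sw at offsets 0 and 4 of the result, whereas MIPS64 gets away with
; a single copy_s.d and one sd, as the MIPS64 lines check.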
@llvm_mips_copy_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_copy_u_b_RES = global i32 0, align 16
@@ -95,11 +120,15 @@ entry:
declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32) nounwind
-; CHECK: llvm_mips_copy_u_b_test:
-; CHECK: ld.b
-; CHECK: copy_u.b
-; CHECK: sw
-; CHECK: .size llvm_mips_copy_u_b_test
+; MIPS-ANY: llvm_mips_copy_u_b_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_copy_u_b_ARG1)
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_copy_u_b_ARG1)
+; MIPS-ANY-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: copy_u.b [[RD:\$[0-9]+]], [[WS]][1]
+; MIPS32-DAG: lw [[RES:\$[0-9]+]], %got(llvm_mips_copy_u_b_RES)
+; MIPS64-DAG: ld [[RES:\$[0-9]+]], %got_disp(llvm_mips_copy_u_b_RES)
+; MIPS-ANY-DAG: sw [[RD]], 0([[RES]])
+; MIPS-ANY: .size llvm_mips_copy_u_b_test
;
@llvm_mips_copy_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
@llvm_mips_copy_u_h_RES = global i32 0, align 16
@@ -114,11 +143,15 @@ entry:
declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32) nounwind
-; CHECK: llvm_mips_copy_u_h_test:
-; CHECK: ld.h
-; CHECK: copy_u.h
-; CHECK: sw
-; CHECK: .size llvm_mips_copy_u_h_test
+; MIPS-ANY: llvm_mips_copy_u_h_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_copy_u_h_ARG1)
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_copy_u_h_ARG1)
+; MIPS-ANY-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: copy_u.h [[RD:\$[0-9]+]], [[WS]][1]
+; MIPS32-DAG: lw [[RES:\$[0-9]+]], %got(llvm_mips_copy_u_h_RES)
+; MIPS64-DAG: ld [[RES:\$[0-9]+]], %got_disp(llvm_mips_copy_u_h_RES)
+; MIPS-ANY-DAG: sw [[RD]], 0([[RES]])
+; MIPS-ANY: .size llvm_mips_copy_u_h_test
;
@llvm_mips_copy_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@llvm_mips_copy_u_w_RES = global i32 0, align 16
@@ -133,11 +166,15 @@ entry:
declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32) nounwind
-; CHECK: llvm_mips_copy_u_w_test:
-; CHECK: ld.w
-; CHECK: copy_u.w
-; CHECK: sw
-; CHECK: .size llvm_mips_copy_u_w_test
+; MIPS-ANY: llvm_mips_copy_u_w_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_copy_u_w_ARG1)
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_copy_u_w_ARG1)
+; MIPS-ANY-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: copy_u.w [[RD:\$[0-9]+]], [[WS]][1]
+; MIPS32-DAG: lw [[RES:\$[0-9]+]], %got(llvm_mips_copy_u_w_RES)
+; MIPS64-DAG: ld [[RES:\$[0-9]+]], %got_disp(llvm_mips_copy_u_w_RES)
+; MIPS-ANY-DAG: sw [[RD]], 0([[RES]])
+; MIPS-ANY: .size llvm_mips_copy_u_w_test
;
@llvm_mips_copy_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
@llvm_mips_copy_u_d_RES = global i64 0, align 16
@@ -152,11 +189,18 @@ entry:
declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32) nounwind
-; CHECK: llvm_mips_copy_u_d_test:
-; CHECK: ld.w
-; CHECK: copy_s.w
-; CHECK: copy_s.w
-; CHECK: sw
-; CHECK: sw
-; CHECK: .size llvm_mips_copy_u_d_test
+; MIPS-ANY: llvm_mips_copy_u_d_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_copy_u_d_ARG1)
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_copy_u_d_ARG1)
+; MIPS32-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS64-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; MIPS32-DAG: copy_s.w [[RD1:\$[0-9]+]], [[WS]][2]
+; MIPS32-DAG: copy_s.w [[RD2:\$[0-9]+]], [[WS]][3]
+; MIPS64-DAG: copy_u.d [[RD:\$[0-9]+]], [[WS]][1]
+; MIPS32-DAG: lw [[RES:\$[0-9]+]], %got(llvm_mips_copy_u_d_RES)
+; MIPS64-DAG: ld [[RES:\$[0-9]+]], %got_disp(llvm_mips_copy_u_d_RES)
+; MIPS32-DAG: sw [[RD1]], 0([[RES]])
+; MIPS32-DAG: sw [[RD2]], 4([[RES]])
+; MIPS64-DAG: sd [[RD]], 0([[RES]])
+; MIPS-ANY: .size llvm_mips_copy_u_d_test
;
diff --git a/test/CodeGen/Mips/msa/elm_insv.ll b/test/CodeGen/Mips/msa/elm_insv.ll
index fa7ceaf..c746e52 100644
--- a/test/CodeGen/Mips/msa/elm_insv.ll
+++ b/test/CodeGen/Mips/msa/elm_insv.ll
@@ -1,8 +1,14 @@
; Test the MSA element insertion intrinsics that are encoded with the ELM
; instruction format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
@llvm_mips_insert_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_insert_b_ARG3 = global i32 27, align 16
@@ -19,12 +25,12 @@ entry:
declare <16 x i8> @llvm.mips.insert.b(<16 x i8>, i32, i32) nounwind
-; CHECK: llvm_mips_insert_b_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
-; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0(
-; CHECK-DAG: insert.b [[R2]][1], [[R1]]
-; CHECK-DAG: st.b [[R2]], 0(
-; CHECK: .size llvm_mips_insert_b_test
+; MIPS-ANY: llvm_mips_insert_b_test:
+; MIPS-ANY-DAG: lw [[R1:\$[0-9]+]], 0(
+; MIPS-ANY-DAG: ld.b [[R2:\$w[0-9]+]], 0(
+; MIPS-ANY-DAG: insert.b [[R2]][1], [[R1]]
+; MIPS-ANY-DAG: st.b [[R2]], 0(
+; MIPS-ANY: .size llvm_mips_insert_b_test
;
@llvm_mips_insert_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
@llvm_mips_insert_h_ARG3 = global i32 27, align 16
@@ -41,12 +47,12 @@ entry:
declare <8 x i16> @llvm.mips.insert.h(<8 x i16>, i32, i32) nounwind
-; CHECK: llvm_mips_insert_h_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
-; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0(
-; CHECK-DAG: insert.h [[R2]][1], [[R1]]
-; CHECK-DAG: st.h [[R2]], 0(
-; CHECK: .size llvm_mips_insert_h_test
+; MIPS-ANY: llvm_mips_insert_h_test:
+; MIPS-ANY-DAG: lw [[R1:\$[0-9]+]], 0(
+; MIPS-ANY-DAG: ld.h [[R2:\$w[0-9]+]], 0(
+; MIPS-ANY-DAG: insert.h [[R2]][1], [[R1]]
+; MIPS-ANY-DAG: st.h [[R2]], 0(
+; MIPS-ANY: .size llvm_mips_insert_h_test
;
@llvm_mips_insert_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@llvm_mips_insert_w_ARG3 = global i32 27, align 16
@@ -63,12 +69,12 @@ entry:
declare <4 x i32> @llvm.mips.insert.w(<4 x i32>, i32, i32) nounwind
-; CHECK: llvm_mips_insert_w_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
-; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0(
-; CHECK-DAG: insert.w [[R2]][1], [[R1]]
-; CHECK-DAG: st.w [[R2]], 0(
-; CHECK: .size llvm_mips_insert_w_test
+; MIPS-ANY: llvm_mips_insert_w_test:
+; MIPS-ANY-DAG: lw [[R1:\$[0-9]+]], 0(
+; MIPS-ANY-DAG: ld.w [[R2:\$w[0-9]+]], 0(
+; MIPS-ANY-DAG: insert.w [[R2]][1], [[R1]]
+; MIPS-ANY-DAG: st.w [[R2]], 0(
+; MIPS-ANY: .size llvm_mips_insert_w_test
;
@llvm_mips_insert_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
@llvm_mips_insert_d_ARG3 = global i64 27, align 16
@@ -85,14 +91,18 @@ entry:
declare <2 x i64> @llvm.mips.insert.d(<2 x i64>, i32, i64) nounwind
-; CHECK: llvm_mips_insert_d_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
-; CHECK-DAG: lw [[R2:\$[0-9]+]], 4(
-; CHECK-DAG: ld.w [[R3:\$w[0-9]+]],
-; CHECK-DAG: insert.w [[R3]][2], [[R1]]
-; CHECK-DAG: insert.w [[R3]][3], [[R2]]
-; CHECK-DAG: st.w [[R3]],
-; CHECK: .size llvm_mips_insert_d_test
+; MIPS-ANY: llvm_mips_insert_d_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], 0(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], 4(
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], 0(
+; MIPS32-DAG: ld.w [[R3:\$w[0-9]+]],
+; MIPS64-DAG: ld.d [[W1:\$w[0-9]+]],
+; MIPS32-DAG: insert.w [[R3]][2], [[R1]]
+; MIPS32-DAG: insert.w [[R3]][3], [[R2]]
+; MIPS64-DAG: insert.d [[W1]][1], [[R1]]
+; MIPS32-DAG: st.w [[R3]],
+; MIPS64-DAG: st.d [[W1]],
+; MIPS-ANY: .size llvm_mips_insert_d_test
;
@llvm_mips_insve_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_insve_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
@@ -109,14 +119,16 @@ entry:
declare <16 x i8> @llvm.mips.insve.b(<16 x i8>, i32, <16 x i8>) nounwind
-; CHECK: llvm_mips_insve_b_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_b_ARG1)(
-; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_b_ARG3)(
-; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
-; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
-; CHECK-DAG: insve.b [[R3]][1], [[R4]][0]
-; CHECK-DAG: st.b [[R3]],
-; CHECK: .size llvm_mips_insve_b_test
+; MIPS-ANY: llvm_mips_insve_b_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_b_ARG1)(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_b_ARG3)(
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_insve_b_ARG1)(
+; MIPS64-DAG: ld [[R2:\$[0-9]+]], %got_disp(llvm_mips_insve_b_ARG3)(
+; MIPS-ANY-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; MIPS-ANY-DAG: insve.b [[R3]][1], [[R4]][0]
+; MIPS-ANY-DAG: st.b [[R3]],
+; MIPS-ANY: .size llvm_mips_insve_b_test
;
@llvm_mips_insve_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
@llvm_mips_insve_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
@@ -133,14 +145,16 @@ entry:
declare <8 x i16> @llvm.mips.insve.h(<8 x i16>, i32, <8 x i16>) nounwind
-; CHECK: llvm_mips_insve_h_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_h_ARG1)(
-; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_h_ARG3)(
-; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
-; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R2]])
-; CHECK-DAG: insve.h [[R3]][1], [[R4]][0]
-; CHECK-DAG: st.h [[R3]],
-; CHECK: .size llvm_mips_insve_h_test
+; MIPS-ANY: llvm_mips_insve_h_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_h_ARG1)(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_h_ARG3)(
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_insve_h_ARG1)(
+; MIPS64-DAG: ld [[R2:\$[0-9]+]], %got_disp(llvm_mips_insve_h_ARG3)(
+; MIPS-ANY-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R2]])
+; MIPS-ANY-DAG: insve.h [[R3]][1], [[R4]][0]
+; MIPS-ANY-DAG: st.h [[R3]],
+; MIPS-ANY: .size llvm_mips_insve_h_test
;
@llvm_mips_insve_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@llvm_mips_insve_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
@@ -157,14 +171,16 @@ entry:
declare <4 x i32> @llvm.mips.insve.w(<4 x i32>, i32, <4 x i32>) nounwind
-; CHECK: llvm_mips_insve_w_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_w_ARG1)(
-; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_w_ARG3)(
-; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
-; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R2]])
-; CHECK-DAG: insve.w [[R3]][1], [[R4]][0]
-; CHECK-DAG: st.w [[R3]],
-; CHECK: .size llvm_mips_insve_w_test
+; MIPS-ANY: llvm_mips_insve_w_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_w_ARG1)(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_w_ARG3)(
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_insve_w_ARG1)(
+; MIPS64-DAG: ld [[R2:\$[0-9]+]], %got_disp(llvm_mips_insve_w_ARG3)(
+; MIPS-ANY-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R2]])
+; MIPS-ANY-DAG: insve.w [[R3]][1], [[R4]][0]
+; MIPS-ANY-DAG: st.w [[R3]],
+; MIPS-ANY: .size llvm_mips_insve_w_test
;
@llvm_mips_insve_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
@llvm_mips_insve_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
@@ -181,12 +197,14 @@ entry:
declare <2 x i64> @llvm.mips.insve.d(<2 x i64>, i32, <2 x i64>) nounwind
-; CHECK: llvm_mips_insve_d_test:
-; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_d_ARG1)(
-; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_d_ARG3)(
-; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
-; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R2]])
-; CHECK-DAG: insve.d [[R3]][1], [[R4]][0]
-; CHECK-DAG: st.d [[R3]],
-; CHECK: .size llvm_mips_insve_d_test
+; MIPS-ANY: llvm_mips_insve_d_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_d_ARG1)(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_d_ARG3)(
+; MIPS64-DAG: ld [[R1:\$[0-9]+]], %got_disp(llvm_mips_insve_d_ARG1)(
+; MIPS64-DAG: ld [[R2:\$[0-9]+]], %got_disp(llvm_mips_insve_d_ARG3)(
+; MIPS-ANY-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS-ANY-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R2]])
+; MIPS-ANY-DAG: insve.d [[R3]][1], [[R4]][0]
+; MIPS-ANY-DAG: st.d [[R3]],
+; MIPS-ANY: .size llvm_mips_insve_d_test
;
diff --git a/test/CodeGen/Mips/msa/elm_shift_slide.ll b/test/CodeGen/Mips/msa/elm_shift_slide.ll
index 39d670d..00a6544 100644
--- a/test/CodeGen/Mips/msa/elm_shift_slide.ll
+++ b/test/CodeGen/Mips/msa/elm_shift_slide.ll
@@ -5,17 +5,19 @@
; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
@llvm_mips_sldi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sldi_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_sldi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
define void @llvm_mips_sldi_b_test() nounwind {
entry:
%0 = load <16 x i8>* @llvm_mips_sldi_b_ARG1
- %1 = tail call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %0, i32 1)
- store <16 x i8> %1, <16 x i8>* @llvm_mips_sldi_b_RES
+ %1 = load <16 x i8>* @llvm_mips_sldi_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %0, <16 x i8> %1, i32 1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_sldi_b_RES
ret void
}
-declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, i32) nounwind
+declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, <16 x i8>, i32) nounwind
; CHECK: llvm_mips_sldi_b_test:
; CHECK: ld.b
@@ -24,17 +26,19 @@ declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, i32) nounwind
; CHECK: .size llvm_mips_sldi_b_test
;
@llvm_mips_sldi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sldi_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
@llvm_mips_sldi_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
define void @llvm_mips_sldi_h_test() nounwind {
entry:
%0 = load <8 x i16>* @llvm_mips_sldi_h_ARG1
- %1 = tail call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %0, i32 1)
- store <8 x i16> %1, <8 x i16>* @llvm_mips_sldi_h_RES
+ %1 = load <8 x i16>* @llvm_mips_sldi_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %0, <8 x i16> %1, i32 1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_sldi_h_RES
ret void
}
-declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, i32) nounwind
+declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, <8 x i16>, i32) nounwind
; CHECK: llvm_mips_sldi_h_test:
; CHECK: ld.h
@@ -43,17 +47,19 @@ declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, i32) nounwind
; CHECK: .size llvm_mips_sldi_h_test
;
@llvm_mips_sldi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sldi_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@llvm_mips_sldi_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
define void @llvm_mips_sldi_w_test() nounwind {
entry:
%0 = load <4 x i32>* @llvm_mips_sldi_w_ARG1
- %1 = tail call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %0, i32 1)
- store <4 x i32> %1, <4 x i32>* @llvm_mips_sldi_w_RES
+ %1 = load <4 x i32>* @llvm_mips_sldi_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %0, <4 x i32> %1, i32 1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_sldi_w_RES
ret void
}
-declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, i32) nounwind
+declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, <4 x i32>, i32) nounwind
; CHECK: llvm_mips_sldi_w_test:
; CHECK: ld.w
@@ -62,17 +68,19 @@ declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, i32) nounwind
; CHECK: .size llvm_mips_sldi_w_test
;
@llvm_mips_sldi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sldi_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
@llvm_mips_sldi_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
define void @llvm_mips_sldi_d_test() nounwind {
entry:
%0 = load <2 x i64>* @llvm_mips_sldi_d_ARG1
- %1 = tail call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %0, i32 1)
- store <2 x i64> %1, <2 x i64>* @llvm_mips_sldi_d_RES
+ %1 = load <2 x i64>* @llvm_mips_sldi_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %0, <2 x i64> %1, i32 1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_sldi_d_RES
ret void
}
-declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, i32) nounwind
+declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, <2 x i64>, i32) nounwind
; CHECK: llvm_mips_sldi_d_test:
; CHECK: ld.d
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
index 3088e1b..07e67bf 100644
--- a/test/CodeGen/Mips/msa/frameindex.ll
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -83,3 +83,312 @@ define void @loadstore_v16i8_just_over_simm16() nounwind {
ret void
; MIPS32-AE: .size loadstore_v16i8_just_over_simm16
}
+
+define void @loadstore_v8i16_near() nounwind {
+ ; MIPS32-AE: loadstore_v8i16_near:
+
+ %1 = alloca <8 x i16>
+ %2 = load volatile <8 x i16>* %1
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0($sp)
+ store volatile <8 x i16> %2, <8 x i16>* %1
+ ; MIPS32-AE: st.h [[R1]], 0($sp)
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v8i16_near
+}
+
+define void @loadstore_v8i16_unaligned() nounwind {
+ ; MIPS32-AE: loadstore_v8i16_unaligned:
+
+ %1 = alloca [2 x <8 x i16>]
+ %2 = bitcast [2 x <8 x i16>]* %1 to i8*
+ %3 = getelementptr i8* %2, i32 1
+ %4 = bitcast i8* %3 to [2 x <8 x i16>]*
+ %5 = getelementptr [2 x <8 x i16>]* %4, i32 0, i32 0
+
+ %6 = load volatile <8 x i16>* %5
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <8 x i16> %6, <8 x i16>* %5
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+ ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v8i16_unaligned
+}
+
+define void @loadstore_v8i16_just_under_simm10() nounwind {
+ ; MIPS32-AE: loadstore_v8i16_just_under_simm10:
+
+ %1 = alloca <8 x i16>
+ %2 = alloca [1008 x i8] ; Push the frame right up to 1024 bytes
+
+ %3 = load volatile <8 x i16>* %1
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 1008($sp)
+ store volatile <8 x i16> %3, <8 x i16>* %1
+ ; MIPS32-AE: st.h [[R1]], 1008($sp)
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v8i16_just_under_simm10
+}
+
+define void @loadstore_v8i16_just_over_simm10() nounwind {
+ ; MIPS32-AE: loadstore_v8i16_just_over_simm10:
+
+ %1 = alloca <8 x i16>
+ %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes
+
+ %3 = load volatile <8 x i16>* %1
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1024
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <8 x i16> %3, <8 x i16>* %1
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1024
+ ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v8i16_just_over_simm10
+}
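+; Rough arithmetic behind the 1008/1024 boundary above (assuming the MSA
+; ld.df/st.df offset is a signed 10-bit immediate scaled by the element size;
+; register numbers below are only illustrative):
+;   ld.h/st.h reach = [-512, 511] * 2 bytes = [-1024, 1022]
+;   offset 1008 <= 1022  ->  encodable directly:   ld.h  $w0, 1008($sp)
+;   offset 1024  > 1022  ->  base formed first:    addiu $1, $sp, 1024
+;                                                  ld.h  $w0, 0($1)
+; The same calculation gives +/-2044 for .w and +/-4088 for .d, matching the
+; 2032/2048 and 4080/4096 boundaries used by the later tests in this file.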
+
+define void @loadstore_v8i16_just_under_simm16() nounwind {
+ ; MIPS32-AE: loadstore_v8i16_just_under_simm16:
+
+ %1 = alloca <8 x i16>
+ %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+
+ %3 = load volatile <8 x i16>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <8 x i16> %3, <8 x i16>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v8i16_just_under_simm16
+}
+
+define void @loadstore_v8i16_just_over_simm16() nounwind {
+ ; MIPS32-AE: loadstore_v8i16_just_over_simm16:
+
+ %1 = alloca <8 x i16>
+ %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+
+ %3 = load volatile <8 x i16>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <8 x i16> %3, <8 x i16>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v8i16_just_over_simm16
+}
+
+define void @loadstore_v4i32_near() nounwind {
+ ; MIPS32-AE: loadstore_v4i32_near:
+
+ %1 = alloca <4 x i32>
+ %2 = load volatile <4 x i32>* %1
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0($sp)
+ store volatile <4 x i32> %2, <4 x i32>* %1
+ ; MIPS32-AE: st.w [[R1]], 0($sp)
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v4i32_near
+}
+
+define void @loadstore_v4i32_unaligned() nounwind {
+ ; MIPS32-AE: loadstore_v4i32_unaligned:
+
+ %1 = alloca [2 x <4 x i32>]
+ %2 = bitcast [2 x <4 x i32>]* %1 to i8*
+ %3 = getelementptr i8* %2, i32 1
+ %4 = bitcast i8* %3 to [2 x <4 x i32>]*
+ %5 = getelementptr [2 x <4 x i32>]* %4, i32 0, i32 0
+
+ %6 = load volatile <4 x i32>* %5
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <4 x i32> %6, <4 x i32>* %5
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+ ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v4i32_unaligned
+}
+
+define void @loadstore_v4i32_just_under_simm10() nounwind {
+ ; MIPS32-AE: loadstore_v4i32_just_under_simm10:
+
+ %1 = alloca <4 x i32>
+ %2 = alloca [2032 x i8] ; Push the frame right up to 2048 bytes
+
+ %3 = load volatile <4 x i32>* %1
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 2032($sp)
+ store volatile <4 x i32> %3, <4 x i32>* %1
+ ; MIPS32-AE: st.w [[R1]], 2032($sp)
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v4i32_just_under_simm10
+}
+
+define void @loadstore_v4i32_just_over_simm10() nounwind {
+ ; MIPS32-AE: loadstore_v4i32_just_over_simm10:
+
+ %1 = alloca <4 x i32>
+ %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes
+
+ %3 = load volatile <4 x i32>* %1
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 2048
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <4 x i32> %3, <4 x i32>* %1
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 2048
+ ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v4i32_just_over_simm10
+}
+
+define void @loadstore_v4i32_just_under_simm16() nounwind {
+ ; MIPS32-AE: loadstore_v4i32_just_under_simm16:
+
+ %1 = alloca <4 x i32>
+ %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+
+ %3 = load volatile <4 x i32>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <4 x i32> %3, <4 x i32>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v4i32_just_under_simm16
+}
+
+define void @loadstore_v4i32_just_over_simm16() nounwind {
+ ; MIPS32-AE: loadstore_v4i32_just_over_simm16:
+
+ %1 = alloca <4 x i32>
+ %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+
+ %3 = load volatile <4 x i32>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <4 x i32> %3, <4 x i32>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v4i32_just_over_simm16
+}
+
+define void @loadstore_v2i64_near() nounwind {
+ ; MIPS32-AE: loadstore_v2i64_near:
+
+ %1 = alloca <2 x i64>
+ %2 = load volatile <2 x i64>* %1
+ ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0($sp)
+ store volatile <2 x i64> %2, <2 x i64>* %1
+ ; MIPS32-AE: st.d [[R1]], 0($sp)
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v2i64_near
+}
+
+define void @loadstore_v2i64_unaligned() nounwind {
+ ; MIPS32-AE: loadstore_v2i64_unaligned:
+
+ %1 = alloca [2 x <2 x i64>]
+ %2 = bitcast [2 x <2 x i64>]* %1 to i8*
+ %3 = getelementptr i8* %2, i32 1
+ %4 = bitcast i8* %3 to [2 x <2 x i64>]*
+ %5 = getelementptr [2 x <2 x i64>]* %4, i32 0, i32 0
+
+ %6 = load volatile <2 x i64>* %5
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+ ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <2 x i64> %6, <2 x i64>* %5
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+ ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v2i64_unaligned
+}
+
+define void @loadstore_v2i64_just_under_simm10() nounwind {
+ ; MIPS32-AE: loadstore_v2i64_just_under_simm10:
+
+ %1 = alloca <2 x i64>
+ %2 = alloca [4080 x i8] ; Push the frame right up to 4096 bytes
+
+ %3 = load volatile <2 x i64>* %1
+ ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 4080($sp)
+ store volatile <2 x i64> %3, <2 x i64>* %1
+ ; MIPS32-AE: st.d [[R1]], 4080($sp)
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v2i64_just_under_simm10
+}
+
+define void @loadstore_v2i64_just_over_simm10() nounwind {
+ ; MIPS32-AE: loadstore_v2i64_just_over_simm10:
+
+ %1 = alloca <2 x i64>
+ %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes
+
+ %3 = load volatile <2 x i64>* %1
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 4096
+ ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <2 x i64> %3, <2 x i64>* %1
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 4096
+ ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v2i64_just_over_simm10
+}
+
+define void @loadstore_v2i64_just_under_simm16() nounwind {
+ ; MIPS32-AE: loadstore_v2i64_just_under_simm16:
+
+ %1 = alloca <2 x i64>
+ %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+
+ %3 = load volatile <2 x i64>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <2 x i64> %3, <2 x i64>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v2i64_just_under_simm16
+}
+
+define void @loadstore_v2i64_just_over_simm16() nounwind {
+ ; MIPS32-AE: loadstore_v2i64_just_over_simm16:
+
+ %1 = alloca <2 x i64>
+ %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+
+ %3 = load volatile <2 x i64>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <2 x i64> %3, <2 x i64>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v2i64_just_over_simm16
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll b/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll
index 24e27cb..f25ab22 100644
--- a/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll
+++ b/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll
@@ -10,7 +10,7 @@
; The legalizer legalized the <4 x i8>'s into <4 x i32>'s, then a call to
; isVSplat() returned the splat value for <i8 -1, i8 -1, ...> as a 32-bit APInt
; (255), but the zeroinitializer splat value as an 8-bit APInt (0). The
-; assertion occured when trying to check the values were bitwise inverses of
+; assertion occurred when trying to check the values were bitwise inverses of
; each other.
;
; It should at least successfully build.
diff --git a/test/CodeGen/Mips/msa/shift-dagcombine.ll b/test/CodeGen/Mips/msa/shift-dagcombine.ll
index 0d809fb..322acff 100644
--- a/test/CodeGen/Mips/msa/shift-dagcombine.ll
+++ b/test/CodeGen/Mips/msa/shift-dagcombine.ll
@@ -37,7 +37,8 @@ define void @lshr_v4i32(<4 x i32>* %c) nounwind {
%2 = lshr <4 x i32> <i32 -2, i32 -4, i32 -8, i32 -16>,
<i32 0, i32 1, i32 2, i32 3>
; CHECK-NOT: srl
- ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], %lo
+ ; CHECK-DAG: addiu [[CPOOL:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0([[CPOOL]])
; CHECK-NOT: srl
store volatile <4 x i32> %2, <4 x i32>* %c
; CHECK-DAG: st.w [[R1]], 0($4)
diff --git a/test/CodeGen/Mips/msa/shuffle.ll b/test/CodeGen/Mips/msa/shuffle.ll
index 316c669..faeec5d 100644
--- a/test/CodeGen/Mips/msa/shuffle.ll
+++ b/test/CodeGen/Mips/msa/shuffle.ll
@@ -7,7 +7,8 @@ define void @vshf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind
%1 = load <16 x i8>* %a
; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R1]]
store <16 x i8> %2, <16 x i8>* %c
; CHECK-DAG: st.b [[R3]], 0($4)
@@ -37,7 +38,8 @@ define void @vshf_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind
%2 = load <16 x i8>* %b
; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
%3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 16>
- ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R2]]
store <16 x i8> %3, <16 x i8>* %c
; CHECK-DAG: st.b [[R3]], 0($4)
@@ -54,8 +56,11 @@ define void @vshf_v16i8_3(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind
%2 = load <16 x i8>* %b
; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
%3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 17, i32 24, i32 25, i32 18, i32 19, i32 20, i32 28, i32 19, i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
- ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
- ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R2]]
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[PTR_A]])
+ ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
+ ; the operands to get the right answer.
+ ; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R1]]
store <16 x i8> %3, <16 x i8>* %c
; CHECK-DAG: st.b [[R3]], 0($4)
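; A sketch of the index mapping this relies on (the operand roles are
; inferred from the shuffles in this file, so treat them as an assumption):
; shufflevector indices 0-15 name elements of %1 and 16-31 name elements of
; %2, while vshf.b serves its low-numbered indices from the last register
; operand. Index 17 above must pick element 1 of %2 and index 1 element 1 of
; %1, which only works with the registers emitted in reversed order:
;   vshf.b [[R3]], [[R2]], [[R1]]   ; [[R1]] = %1 covers indices 0-15
;                                   ; [[R2]] = %2 covers indices 16-31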
@@ -83,7 +88,8 @@ define void @vshf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind
%1 = load <8 x i16>* %a
; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R1]]
store <8 x i16> %2, <8 x i16>* %c
; CHECK-DAG: st.h [[R3]], 0($4)
@@ -113,7 +119,8 @@ define void @vshf_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind
%2 = load <8 x i16>* %b
; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
- ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R2]]
store <8 x i16> %3, <8 x i16>* %c
; CHECK-DAG: st.h [[R3]], 0($4)
@@ -130,8 +137,11 @@ define void @vshf_v8i16_3(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind
%2 = load <8 x i16>* %b
; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
- ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
- ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R2]]
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[PTR_A]])
+ ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
+ ; the operands to get the right answer.
+ ; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R1]]
store <8 x i16> %3, <8 x i16>* %c
; CHECK-DAG: st.h [[R3]], 0($4)
@@ -207,8 +217,11 @@ define void @vshf_v4i32_3(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind
%2 = load <4 x i32>* %b
; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
%3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 6, i32 4>
- ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], %lo
- ; CHECK-DAG: vshf.w [[R3]], [[R1]], [[R2]]
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[PTR_A]])
+ ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
+ ; the operands to get the right answer.
+ ; CHECK-DAG: vshf.w [[R3]], [[R2]], [[R1]]
store <4 x i32> %3, <4 x i32>* %c
; CHECK-DAG: st.w [[R3]], 0($4)
@@ -236,7 +249,8 @@ define void @vshf_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind
%1 = load <2 x i64>* %a
; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
- ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R1]]
store <2 x i64> %2, <2 x i64>* %c
; CHECK-DAG: st.d [[R3]], 0($4)
@@ -266,7 +280,8 @@ define void @vshf_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind
%2 = load <2 x i64>* %b
; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
%3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 3, i32 2>
- ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R2]]
store <2 x i64> %3, <2 x i64>* %c
; CHECK-DAG: st.d [[R3]], 0($4)
@@ -283,8 +298,11 @@ define void @vshf_v2i64_3(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind
%2 = load <2 x i64>* %b
; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
%3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 2>
- ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
- ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R2]]
+ ; CHECK-DAG: addiu [[PTR_A:\$[0-9]+]], {{.*}}, %lo($
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[PTR_A]])
+ ; The concatenation step of vshf is bitwise not vectorwise so we must reverse
+ ; the operands to get the right answer.
+ ; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R1]]
store <2 x i64> %3, <2 x i64>* %c
; CHECK-DAG: st.d [[R3]], 0($4)
diff --git a/test/CodeGen/Mips/msa/special.ll b/test/CodeGen/Mips/msa/special.ll
index 60a4369..f65a14f 100644
--- a/test/CodeGen/Mips/msa/special.ll
+++ b/test/CodeGen/Mips/msa/special.ll
@@ -1,6 +1,9 @@
; Test the MSA intrinsics that are encoded with the SPECIAL instruction format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s --check-prefix=MIPS32
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck %s --check-prefix=MIPS64
define i32 @llvm_mips_lsa_test(i32 %a, i32 %b) nounwind {
entry:
@@ -10,9 +13,9 @@ entry:
declare i32 @llvm.mips.lsa(i32, i32, i32) nounwind
-; CHECK: llvm_mips_lsa_test:
-; CHECK: lsa {{\$[0-9]+}}, {{\$[0-9]+}}, {{\$[0-9]+}}, 2
-; CHECK: .size llvm_mips_lsa_test
+; MIPS32: llvm_mips_lsa_test:
+; MIPS32: lsa {{\$[0-9]+}}, $5, $4, 2
+; MIPS32: .size llvm_mips_lsa_test
define i32 @lsa_test(i32 %a, i32 %b) nounwind {
entry:
@@ -21,6 +24,29 @@ entry:
ret i32 %1
}
-; CHECK: lsa_test:
-; CHECK: lsa {{\$[0-9]+}}, {{\$[0-9]+}}, {{\$[0-9]+}}, 2
-; CHECK: .size lsa_test
+; MIPS32: lsa_test:
+; MIPS32: lsa {{\$[0-9]+}}, $5, $4, 2
+; MIPS32: .size lsa_test
+
+define i64 @llvm_mips_dlsa_test(i64 %a, i64 %b) nounwind {
+entry:
+ %0 = tail call i64 @llvm.mips.dlsa(i64 %a, i64 %b, i32 2)
+ ret i64 %0
+}
+
+declare i64 @llvm.mips.dlsa(i64, i64, i32) nounwind
+
+; MIPS64: llvm_mips_dlsa_test:
+; MIPS64: dlsa {{\$[0-9]+}}, $5, $4, 2
+; MIPS64: .size llvm_mips_dlsa_test
+
+define i64 @dlsa_test(i64 %a, i64 %b) nounwind {
+entry:
+ %0 = shl i64 %b, 2
+ %1 = add i64 %a, %0
+ ret i64 %1
+}
+
+; MIPS64: dlsa_test:
+; MIPS64: dlsa {{\$[0-9]+}}, $5, $4, 2
+; MIPS64: .size dlsa_test
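+; dlsa_test above doubles as a worked example of what the instruction
+; computes: with %a in $4 and %b in $5 (the first two integer argument
+; registers), the IR pair
+;   %0 = shl i64 %b, 2
+;   %1 = add i64 %a, %0
+; collapses into `dlsa $2, $5, $4, 2`, i.e. result = (%b << 2) + %a with the
+; shift amount carried as the final operand; the lsa checks earlier in the
+; file follow the same shape for i32.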
diff --git a/test/CodeGen/Mips/msa/vec.ll b/test/CodeGen/Mips/msa/vec.ll
index 5bddf5a..d5b97f5 100644
--- a/test/CodeGen/Mips/msa/vec.ll
+++ b/test/CodeGen/Mips/msa/vec.ll
@@ -104,12 +104,12 @@ entry:
ret void
}
-; CHECK: and_v_b_test:
-; CHECK: ld.b
-; CHECK: ld.b
-; CHECK: and.v
-; CHECK: st.b
-; CHECK: .size and_v_b_test
+; ANYENDIAN: and_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size and_v_b_test
;
define void @and_v_h_test() nounwind {
entry:
@@ -120,12 +120,12 @@ entry:
ret void
}
-; CHECK: and_v_h_test:
-; CHECK: ld.h
-; CHECK: ld.h
-; CHECK: and.v
-; CHECK: st.h
-; CHECK: .size and_v_h_test
+; ANYENDIAN: and_v_h_test:
+; ANYENDIAN: ld.h
+; ANYENDIAN: ld.h
+; ANYENDIAN: and.v
+; ANYENDIAN: st.h
+; ANYENDIAN: .size and_v_h_test
;
define void @and_v_w_test() nounwind {
@@ -137,12 +137,12 @@ entry:
ret void
}
-; CHECK: and_v_w_test:
-; CHECK: ld.w
-; CHECK: ld.w
-; CHECK: and.v
-; CHECK: st.w
-; CHECK: .size and_v_w_test
+; ANYENDIAN: and_v_w_test:
+; ANYENDIAN: ld.w
+; ANYENDIAN: ld.w
+; ANYENDIAN: and.v
+; ANYENDIAN: st.w
+; ANYENDIAN: .size and_v_w_test
;
define void @and_v_d_test() nounwind {
@@ -154,12 +154,12 @@ entry:
ret void
}
-; CHECK: and_v_d_test:
-; CHECK: ld.d
-; CHECK: ld.d
-; CHECK: and.v
-; CHECK: st.d
-; CHECK: .size and_v_d_test
+; ANYENDIAN: and_v_d_test:
+; ANYENDIAN: ld.d
+; ANYENDIAN: ld.d
+; ANYENDIAN: and.v
+; ANYENDIAN: st.d
+; ANYENDIAN: .size and_v_d_test
;
@llvm_mips_bmnz_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_bmnz_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
@@ -431,9 +431,9 @@ entry:
; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
-; bmnz.v is the same as bsel.v with wt and wd_in swapped
-; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
-; ANYENDIAN-DAG: st.b [[R6]], 0(
+; bmnz.v is the same as bsel.v with (wd_in, wt, ws) -> (wt, ws, wd_in)
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R6]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
; ANYENDIAN: .size llvm_mips_bsel_v_b_test
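; The operand rotation above falls out of the bit equations of the two
; instructions (written here from the usual MSA definitions, so treat them as
; an assumption):
;   bsel.v $wd, $ws, $wt : wd <- (ws & ~wd) | (wt & wd)   ; wd holds the mask
;   bmnz.v $wd, $ws, $wt : wd <- (wd & ~wt) | (ws & wt)   ; wt holds the mask
; Substituting (wd, ws, wt) = ([[R5]], [[R6]], [[R4]]) into the bmnz.v form
; gives ([[R5]] & ~[[R4]]) | ([[R6]] & [[R4]]), i.e. bsel.v with [[R4]] as
; the mask, and the result is left in [[R5]] as the st.b check expects.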
@llvm_mips_bsel_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
@@ -462,9 +462,9 @@ entry:
; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
-; bmnz.v is the same as bsel.v with wt and wd_in swapped
-; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
-; ANYENDIAN-DAG: st.b [[R6]], 0(
+; bmnz.v is the same as bsel.v with (wd_in, wt, ws) -> (wt, ws, wd_in)
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R6]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
; ANYENDIAN: .size llvm_mips_bsel_v_h_test
@llvm_mips_bsel_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@@ -493,9 +493,9 @@ entry:
; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
-; bmnz.v is the same as bsel.v with wt and wd_in swapped
-; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
-; ANYENDIAN-DAG: st.b [[R6]], 0(
+; bmnz.v is the same as bsel.v with (wd_in, wt, ws) -> (wt, ws, wd_in)
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R6]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
; ANYENDIAN: .size llvm_mips_bsel_v_w_test
@llvm_mips_bsel_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
@@ -524,9 +524,9 @@ entry:
; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
-; bmnz.v is the same as bsel.v with wt and wd_in swapped
-; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
-; ANYENDIAN-DAG: st.b [[R6]], 0(
+; bmnz.v is the same as bsel.v with (wd_in, wt, ws) -> (wt, ws, wd_in)
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R6]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
; ANYENDIAN: .size llvm_mips_bsel_v_d_test
@llvm_mips_nor_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@@ -722,12 +722,12 @@ entry:
ret void
}
-; CHECK: or_v_b_test:
-; CHECK: ld.b
-; CHECK: ld.b
-; CHECK: or.v
-; CHECK: st.b
-; CHECK: .size or_v_b_test
+; ANYENDIAN: or_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size or_v_b_test
;
define void @or_v_h_test() nounwind {
entry:
@@ -738,12 +738,12 @@ entry:
ret void
}
-; CHECK: or_v_h_test:
-; CHECK: ld.h
-; CHECK: ld.h
-; CHECK: or.v
-; CHECK: st.h
-; CHECK: .size or_v_h_test
+; ANYENDIAN: or_v_h_test:
+; ANYENDIAN: ld.h
+; ANYENDIAN: ld.h
+; ANYENDIAN: or.v
+; ANYENDIAN: st.h
+; ANYENDIAN: .size or_v_h_test
;
define void @or_v_w_test() nounwind {
@@ -755,12 +755,12 @@ entry:
ret void
}
-; CHECK: or_v_w_test:
-; CHECK: ld.w
-; CHECK: ld.w
-; CHECK: or.v
-; CHECK: st.w
-; CHECK: .size or_v_w_test
+; ANYENDIAN: or_v_w_test:
+; ANYENDIAN: ld.w
+; ANYENDIAN: ld.w
+; ANYENDIAN: or.v
+; ANYENDIAN: st.w
+; ANYENDIAN: .size or_v_w_test
;
define void @or_v_d_test() nounwind {
@@ -772,12 +772,12 @@ entry:
ret void
}
-; CHECK: or_v_d_test:
-; CHECK: ld.d
-; CHECK: ld.d
-; CHECK: or.v
-; CHECK: st.d
-; CHECK: .size or_v_d_test
+; ANYENDIAN: or_v_d_test:
+; ANYENDIAN: ld.d
+; ANYENDIAN: ld.d
+; ANYENDIAN: or.v
+; ANYENDIAN: st.d
+; ANYENDIAN: .size or_v_d_test
;
@llvm_mips_xor_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_xor_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
@@ -880,12 +880,12 @@ entry:
ret void
}
-; CHECK: xor_v_b_test:
-; CHECK: ld.b
-; CHECK: ld.b
-; CHECK: xor.v
-; CHECK: st.b
-; CHECK: .size xor_v_b_test
+; ANYENDIAN: xor_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size xor_v_b_test
;
define void @xor_v_h_test() nounwind {
entry:
@@ -896,12 +896,12 @@ entry:
ret void
}
-; CHECK: xor_v_h_test:
-; CHECK: ld.h
-; CHECK: ld.h
-; CHECK: xor.v
-; CHECK: st.h
-; CHECK: .size xor_v_h_test
+; ANYENDIAN: xor_v_h_test:
+; ANYENDIAN: ld.h
+; ANYENDIAN: ld.h
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.h
+; ANYENDIAN: .size xor_v_h_test
;
define void @xor_v_w_test() nounwind {
@@ -913,12 +913,12 @@ entry:
ret void
}
-; CHECK: xor_v_w_test:
-; CHECK: ld.w
-; CHECK: ld.w
-; CHECK: xor.v
-; CHECK: st.w
-; CHECK: .size xor_v_w_test
+; ANYENDIAN: xor_v_w_test:
+; ANYENDIAN: ld.w
+; ANYENDIAN: ld.w
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.w
+; ANYENDIAN: .size xor_v_w_test
;
define void @xor_v_d_test() nounwind {
@@ -930,12 +930,12 @@ entry:
ret void
}
-; CHECK: xor_v_d_test:
-; CHECK: ld.d
-; CHECK: ld.d
-; CHECK: xor.v
-; CHECK: st.d
-; CHECK: .size xor_v_d_test
+; ANYENDIAN: xor_v_d_test:
+; ANYENDIAN: ld.d
+; ANYENDIAN: ld.d
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.d
+; ANYENDIAN: .size xor_v_d_test
;
declare <16 x i8> @llvm.mips.and.v(<16 x i8>, <16 x i8>) nounwind
declare <16 x i8> @llvm.mips.bmnz.v(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
diff --git a/test/CodeGen/Mips/nacl-align.ll b/test/CodeGen/Mips/nacl-align.ll
new file mode 100644
index 0000000..e61b834
--- /dev/null
+++ b/test/CodeGen/Mips/nacl-align.ll
@@ -0,0 +1,96 @@
+; RUN: llc -filetype=asm -mtriple=mipsel-none-nacl -relocation-model=static \
+; RUN: -O3 < %s | FileCheck %s
+
+
+; This test checks that NaCl functions are bundle-aligned.
+
+define void @test0() {
+ ret void
+
+; CHECK: .align 4
+; CHECK-NOT: .align
+; CHECK-LABEL: test0:
+
+}
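+; The numbers being checked follow from the directive itself (the bundle size
+; is an inference, not stated in this file): MIPS `.align 4` means align to
+; 2^4 = 16 bytes, so the test is effectively asserting that every function
+; entry lands on a 16-byte NaCl bundle boundary; the tests below apply the
+; same requirement to jump-table targets and address-taken blocks.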
+
+
+; This test checks that blocks reached through a jump table are
+; bundle-aligned.
+
+define i32 @test1(i32 %i) {
+entry:
+ switch i32 %i, label %default [
+ i32 0, label %bb1
+ i32 1, label %bb2
+ i32 2, label %bb3
+ i32 3, label %bb4
+ ]
+
+bb1:
+ ret i32 111
+bb2:
+ ret i32 222
+bb3:
+ ret i32 333
+bb4:
+ ret i32 444
+default:
+ ret i32 555
+
+
+; CHECK-LABEL: test1:
+
+; CHECK: .align 4
+; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: addiu $2, $zero, 111
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: addiu $2, $zero, 222
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: addiu $2, $zero, 333
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: addiu $2, $zero, 444
+
+}
+
+
+; This test checks that a block whose address is taken is bundle-aligned in NaCl.
+
+@bb_array = constant [2 x i8*] [i8* blockaddress(@test2, %bb1),
+ i8* blockaddress(@test2, %bb2)], align 4
+
+define i32 @test2(i32 %i) {
+entry:
+ %elementptr = getelementptr inbounds [2 x i8*]* @bb_array, i32 0, i32 %i
+ %0 = load i8** %elementptr, align 4
+ indirectbr i8* %0, [label %bb1, label %bb2]
+
+bb1:
+ ret i32 111
+bb2:
+ ret i32 222
+
+
+; CHECK-LABEL: test2:
+
+; Note that there are two consecutive labels - one temporary and one for the
+; basic block.
+
+; CHECK: .align 4
+; CHECK-NEXT: ${{[a-zA-Z0-9]+}}:
+; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: addiu $2, $zero, 111
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: ${{[a-zA-Z0-9]+}}:
+; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: addiu $2, $zero, 222
+
+}
diff --git a/test/CodeGen/Mips/nacl-branch-delay.ll b/test/CodeGen/Mips/nacl-branch-delay.ll
new file mode 100644
index 0000000..d251eee
--- /dev/null
+++ b/test/CodeGen/Mips/nacl-branch-delay.ll
@@ -0,0 +1,71 @@
+; RUN: llc -filetype=asm -mtriple=mipsel-none-linux -relocation-model=static \
+; RUN: -O3 < %s | FileCheck %s
+
+; RUN: llc -filetype=asm -mtriple=mipsel-none-nacl -relocation-model=static \
+; RUN: -O3 < %s | FileCheck %s -check-prefix=CHECK-NACL
+
+@x = global i32 0, align 4
+declare void @f1(i32)
+declare void @f2()
+
+
+define void @test1() {
+ %1 = load i32* @x, align 4
+ call void @f1(i32 %1)
+ ret void
+
+
+; CHECK-LABEL: test1
+
+; We first make sure that for non-NaCl targets the branch-delay slot contains
+; dangerous instructions.
+
+; Check that branch-delay slot is used to load argument from x before function
+; call.
+
+; CHECK: jal
+; CHECK-NEXT: lw $4, %lo(x)(${{[0-9]+}})
+
+; Check that branch-delay slot is used for adjusting sp before return.
+
+; CHECK: jr $ra
+; CHECK-NEXT: addiu $sp, $sp, {{[0-9]+}}
+
+
+; For NaCl, check that branch-delay slot doesn't contain dangerous instructions.
+
+; CHECK-NACL: jal
+; CHECK-NACL-NEXT: nop
+
+; CHECK-NACL: jr $ra
+; CHECK-NACL-NEXT: nop
+}
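+; Condensing the checks above (instruction sequences are taken from the CHECK
+; lines, with $N and N standing in for the regex placeholders; why these count
+; as dangerous is an inference): the plain mipsel run fills both delay slots
+; with real work,
+;   jal f1  /  lw $4, %lo(x)($N)        jr $ra  /  addiu $sp, $sp, N
+; while the NaCl run may not put loads, stores or $sp updates in a delay
+; slot, presumably because those are exactly what its sandbox has to
+; instrument, so both slots hold nops instead.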
+
+
+define void @test2() {
+ store i32 1, i32* @x, align 4
+ tail call void @f2()
+ ret void
+
+
+; CHECK-LABEL: test2
+
+; Check that branch-delay slot is used for storing to x before function call.
+
+; CHECK: jal
+; CHECK-NEXT: sw ${{[0-9]+}}, %lo(x)(${{[0-9]+}})
+
+; Check that branch-delay slot is used for adjusting sp before return.
+
+; CHECK: jr $ra
+; CHECK-NEXT: addiu $sp, $sp, {{[0-9]+}}
+
+
+; For NaCl, check that branch-delay slot doesn't contain dangerous instructions.
+
+; CHECK-NACL: jal
+; CHECK-NACL-NEXT: nop
+
+; CHECK-NACL: jr $ra
+; CHECK-NACL-NEXT: nop
+}
diff --git a/test/CodeGen/Mips/nacl-reserved-regs.ll b/test/CodeGen/Mips/nacl-reserved-regs.ll
new file mode 100644
index 0000000..ae21283
--- /dev/null
+++ b/test/CodeGen/Mips/nacl-reserved-regs.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=mipsel -O3 < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-none-nacl-gnu -O3 < %s \
+; RUN: | FileCheck %s -check-prefix=CHECK-NACL
+
+@var = external global i32
+
+define void @f() {
+ %val1 = load volatile i32* @var
+ %val2 = load volatile i32* @var
+ %val3 = load volatile i32* @var
+ %val4 = load volatile i32* @var
+ %val5 = load volatile i32* @var
+ %val6 = load volatile i32* @var
+ %val7 = load volatile i32* @var
+ %val8 = load volatile i32* @var
+ %val9 = load volatile i32* @var
+ %val10 = load volatile i32* @var
+ %val11 = load volatile i32* @var
+ %val12 = load volatile i32* @var
+ %val13 = load volatile i32* @var
+ %val14 = load volatile i32* @var
+ %val15 = load volatile i32* @var
+ %val16 = load volatile i32* @var
+ store volatile i32 %val1, i32* @var
+ store volatile i32 %val2, i32* @var
+ store volatile i32 %val3, i32* @var
+ store volatile i32 %val4, i32* @var
+ store volatile i32 %val5, i32* @var
+ store volatile i32 %val6, i32* @var
+ store volatile i32 %val7, i32* @var
+ store volatile i32 %val8, i32* @var
+ store volatile i32 %val9, i32* @var
+ store volatile i32 %val10, i32* @var
+ store volatile i32 %val11, i32* @var
+ store volatile i32 %val12, i32* @var
+ store volatile i32 %val13, i32* @var
+ store volatile i32 %val14, i32* @var
+ store volatile i32 %val15, i32* @var
+ store volatile i32 %val16, i32* @var
+ ret void
+
+; Check that t6, t7 and t8 are used in non-NaCl code.
+; CHECK: lw $14
+; CHECK: lw $15
+; CHECK: lw $24
+
+; t6, t7 and t8 are reserved in NaCl.
+; CHECK-NACL-NOT: lw $14
+; CHECK-NACL-NOT: lw $15
+; CHECK-NACL-NOT: lw $24
+}
diff --git a/test/CodeGen/Mips/nomips16.ll b/test/CodeGen/Mips/nomips16.ll
index bf7c667..0affb16 100644
--- a/test/CodeGen/Mips/nomips16.ll
+++ b/test/CodeGen/Mips/nomips16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s
@x = global float 0.000000e+00, align 4
@.str = private unnamed_addr constant [20 x i8] c"in main: mips16 %f\0A\00", align 1
diff --git a/test/CodeGen/Mips/null.ll b/test/CodeGen/Mips/null.ll
index 00c66a9..bc78a27 100644
--- a/test/CodeGen/Mips/null.ll
+++ b/test/CodeGen/Mips/null.ll
@@ -5,7 +5,7 @@ define i32 @main() nounwind {
entry:
ret i32 0
-; 16: .set mips16 # @main
+; 16: .set mips16
; 16: jrc $ra
diff --git a/test/CodeGen/Mips/octeon.ll b/test/CodeGen/Mips/octeon.ll
new file mode 100644
index 0000000..d5ff9bd
--- /dev/null
+++ b/test/CodeGen/Mips/octeon.ll
@@ -0,0 +1,29 @@
+; RUN: llc -O1 < %s -march=mips64 -mcpu=octeon | FileCheck %s -check-prefix=OCTEON
+; RUN: llc -O1 < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefix=MIPS64
+
+define i64 @addi64(i64 %a, i64 %b) nounwind {
+entry:
+; OCTEON-LABEL: addi64:
+; OCTEON: jr $ra
+; OCTEON: baddu $2, $4, $5
+; MIPS64-LABEL: addi64:
+; MIPS64: daddu
+; MIPS64: jr
+; MIPS64: andi
+ %add = add i64 %a, %b
+ %and = and i64 %add, 255
+ ret i64 %and
+}
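+; The baddu match above is a 2-for-1: the IR computes (a + b) & 255 and
+; Octeon's baddu produces the byte-sized sum directly, so the separate and
+; disappears (semantics read off this test rather than a manual, and the
+; register numbers on the left are illustrative):
+;   daddu $1, $4, $5                 baddu $2, $4, $5
+;   andi  $2, $1, 255        ==>     (one instruction with -mcpu=octeon)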
+
+define i64 @mul(i64 %a, i64 %b) nounwind {
+entry:
+; OCTEON-LABEL: mul:
+; OCTEON: jr $ra
+; OCTEON: dmul $2, $4, $5
+; MIPS64-LABEL: mul:
+; MIPS64: dmult
+; MIPS64: jr
+; MIPS64: mflo
+ %res = mul i64 %a, %b
+ ret i64 %res
+}
diff --git a/test/CodeGen/Mips/octeon_popcnt.ll b/test/CodeGen/Mips/octeon_popcnt.ll
new file mode 100644
index 0000000..52c37f6
--- /dev/null
+++ b/test/CodeGen/Mips/octeon_popcnt.ll
@@ -0,0 +1,47 @@
+; RUN: llc -O1 -march=mips64 -mcpu=octeon < %s | FileCheck %s -check-prefix=OCTEON
+; RUN: llc -O1 -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=MIPS64
+
+define i8 @cnt8(i8 %x) nounwind readnone {
+ %cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
+ ret i8 %cnt
+; OCTEON-LABEL: cnt8:
+; OCTEON: jr $ra
+; OCTEON: pop $2, $1
+; MIPS64-LABEL: cnt8:
+; MIPS64-NOT: pop
+}
+
+define i16 @cnt16(i16 %x) nounwind readnone {
+ %cnt = tail call i16 @llvm.ctpop.i16(i16 %x)
+ ret i16 %cnt
+; OCTEON-LABEL: cnt16:
+; OCTEON: jr $ra
+; OCTEON: pop $2, $1
+; MIPS64-LABEL: cnt16:
+; MIPS64-NOT: pop
+}
+
+define i32 @cnt32(i32 %x) nounwind readnone {
+ %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %cnt
+; OCTEON-LABEL: cnt32:
+; OCTEON: jr $ra
+; OCTEON: pop $2, $4
+; MIPS64-LABEL: cnt32:
+; MIPS64-NOT: pop
+}
+
+define i64 @cnt64(i64 %x) nounwind readnone {
+ %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
+ ret i64 %cnt
+; OCTEON-LABEL: cnt64:
+; OCTEON: jr $ra
+; OCTEON: dpop $2, $4
+; MIPS64-LABEL: cnt64:
+; MIPS64-NOT: dpop
+}
+
+declare i8 @llvm.ctpop.i8(i8) nounwind readnone
+declare i16 @llvm.ctpop.i16(i16) nounwind readnone
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
diff --git a/test/CodeGen/Mips/optimize-pic-o0.ll b/test/CodeGen/Mips/optimize-pic-o0.ll
new file mode 100644
index 0000000..554d49e
--- /dev/null
+++ b/test/CodeGen/Mips/optimize-pic-o0.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=mipsel -O0 < %s | FileCheck %s
+
+; Function Attrs: nounwind
+define i32 @main() {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ call void bitcast (void (...)* @foo to void ()*)()
+; CHECK: jalr $25
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %1 = load i32* %i, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %2 = load i32* %retval
+ ret i32 %2
+}
+
+declare void @foo(...)
diff --git a/test/CodeGen/Mips/powif64_16.ll b/test/CodeGen/Mips/powif64_16.ll
index 35a7ca9..4875727 100644
--- a/test/CodeGen/Mips/powif64_16.ll
+++ b/test/CodeGen/Mips/powif64_16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s
declare float @llvm.powi.f32(float %Val, i32 %power)
declare double @llvm.powi.f64(double %Val, i32 %power)
diff --git a/test/CodeGen/Mips/rotate.ll b/test/CodeGen/Mips/rotate.ll
index 813bbdf..70eff6e 100644
--- a/test/CodeGen/Mips/rotate.ll
+++ b/test/CodeGen/Mips/rotate.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float < %s | FileCheck %s -check-prefix=mips16
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 < %s | FileCheck %s -check-prefix=mips16
; CHECK: rotrv $2, $4
; mips16: .ent rot0
diff --git a/test/CodeGen/Mips/s2rem.ll b/test/CodeGen/Mips/s2rem.ll
new file mode 100644
index 0000000..9edb5be
--- /dev/null
+++ b/test/CodeGen/Mips/s2rem.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC
+
+
+@xi = common global i32 0, align 4
+@x = common global float 0.000000e+00, align 4
+@xd = common global double 0.000000e+00, align 8
+
+; Function Attrs: nounwind
+define void @it() #0 {
+entry:
+ %call = call i32 @i(i32 1)
+ store i32 %call, i32* @xi, align 4
+ ret void
+; PIC: .ent it
+; STATIC: .ent it
+; PIC: save $16, $17, $ra, [[FS:[0-9]+]]
+; STATIC: save $16, $ra, [[FS:[0-9]+]]
+; PIC: restore $16, $17, $ra, [[FS]]
+; STATIC: restore $16, $ra, [[FS]]
+; PIC: .end it
+; STATIC: .end it
+}
+
+declare i32 @i(i32) #1
+
+; Function Attrs: nounwind
+define void @ft() #0 {
+entry:
+ %call = call float @f()
+ store float %call, float* @x, align 4
+ ret void
+; PIC: .ent ft
+; PIC: save $16, $17, $ra, $18, [[FS:[0-9]+]]
+; PIC: restore $16, $17, $ra, $18, [[FS]]
+; PIC: .end ft
+}
+
+declare float @f() #1
+
+; Function Attrs: nounwind
+define void @dt() #0 {
+entry:
+ %call = call double @d()
+ store double %call, double* @xd, align 8
+ ret void
+; PIC: .ent dt
+; PIC: save $16, $17, $ra, $18, [[FS:[0-9]+]]
+; PIC: restore $16, $17, $ra, $18, [[FS]]
+; PIC: .end dt
+}
+
+declare double @d() #1
+
+; Function Attrs: nounwind
+define void @fft() #0 {
+entry:
+ %0 = load float* @x, align 4
+ %call = call float @ff(float %0)
+ store float %call, float* @x, align 4
+ ret void
+; PIC: .ent fft
+; PIC: save $16, $17, $ra, $18, [[FS:[0-9]+]]
+; PIC: restore $16, $17, $ra, $18, [[FS]]
+; PIC: .end fft
+}
+
+declare float @ff(float) #1
+
+; Function Attrs: nounwind
+define void @vft() #0 {
+entry:
+ %0 = load float* @x, align 4
+ call void @vf(float %0)
+ ret void
+; PIC: .ent vft
+; STATIC: .ent vft
+; PIC: save $16, $ra, [[FS:[0-9]+]]
+; STATIC: save $16, $ra, [[FS:[0-9]+]]
+; PIC: restore $16, $ra, [[FS]]
+; STATIC: restore $16, $ra, [[FS]]
+; PIC: .end vft
+; STATIC: .end vft
+}
+
+declare void @vf(float) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
diff --git a/test/CodeGen/Mips/sel1c.ll b/test/CodeGen/Mips/sel1c.ll
index 4c4784d..edd2e3e 100644
--- a/test/CodeGen/Mips/sel1c.ll
+++ b/test/CodeGen/Mips/sel1c.ll
@@ -10,7 +10,7 @@ entry:
%0 = load i32* @i, align 4
%1 = load i32* @j, align 4
%cmp = icmp eq i32 %0, %1
- %cond = select i1 %cmp, i32 1, i32 2
+ %cond = select i1 %cmp, i32 1, i32 3
store i32 %cond, i32* @k, align 4
ret void
; cond-b-short: bteqz $BB0_{{[0-9]+}} # 16 bit inst
diff --git a/test/CodeGen/Mips/sel2c.ll b/test/CodeGen/Mips/sel2c.ll
index 25dfaa9..4b21124 100644
--- a/test/CodeGen/Mips/sel2c.ll
+++ b/test/CodeGen/Mips/sel2c.ll
@@ -10,7 +10,7 @@ entry:
%0 = load i32* @i, align 4
%1 = load i32* @j, align 4
%cmp = icmp ne i32 %0, %1
- %cond = select i1 %cmp, i32 1, i32 2
+ %cond = select i1 %cmp, i32 1, i32 3
store i32 %cond, i32* @k, align 4
; cond-b-short: btnez $BB0_{{[0-9]+}} # 16 bit inst
ret void
diff --git a/test/CodeGen/Mips/sr1.ll b/test/CodeGen/Mips/sr1.ll
new file mode 100644
index 0000000..610693d
--- /dev/null
+++ b/test/CodeGen/Mips/sr1.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=NEG
+
+@f = common global float 0.000000e+00, align 4
+
+; Function Attrs: nounwind
+define void @foo1() #0 {
+entry:
+ %c = alloca [10 x i8], align 1
+ %arraydecay = getelementptr inbounds [10 x i8]* %c, i32 0, i32 0
+ call void @x(i8* %arraydecay)
+ %arraydecay1 = getelementptr inbounds [10 x i8]* %c, i32 0, i32 0
+ call void @x(i8* %arraydecay1)
+ ret void
+; CHECK: .ent foo1
+; CHECK: save $16, $17, $ra, [[FS:[0-9]+]] # 16 bit inst
+; CHECK: restore $16, $17, $ra, [[FS]] # 16 bit inst
+; CHECK: .end foo1
+}
+
+declare void @x(i8*) #1
+
+; Function Attrs: nounwind
+define void @foo2() #0 {
+entry:
+ %c = alloca [150 x i8], align 1
+ %arraydecay = getelementptr inbounds [150 x i8]* %c, i32 0, i32 0
+ call void @x(i8* %arraydecay)
+ %arraydecay1 = getelementptr inbounds [150 x i8]* %c, i32 0, i32 0
+ call void @x(i8* %arraydecay1)
+ ret void
+; CHECK: .ent foo2
+; CHECK: save $16, $17, $ra, [[FS:[0-9]+]]
+; CHECK: restore $16, $17, $ra, [[FS]]
+; CHECK: .end foo2
+}
+
+; Function Attrs: nounwind
+define void @foo3() #0 {
+entry:
+ %call = call float @xf()
+ store float %call, float* @f, align 4
+ ret void
+; CHECK: .ent foo3
+; CHECK: save $16, $17, $ra, $18, [[FS:[0-9]+]]
+; CHECK: restore $16, $17, $ra, $18, [[FS]]
+; CHECK: .end foo3
+; NEG: .ent foo3
+; NEG-NOT: save $16, $17, $ra, $18, [[FS:[0-9]+]] # 16 bit inst
+; NEG-NOT: restore $16, $17, $ra, $18, [[FS]] # 16 bit inst
+; NEG: .end foo3
+}
+
+declare float @xf() #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
diff --git a/test/CodeGen/Mips/tail16.ll b/test/CodeGen/Mips/tail16.ll
new file mode 100644
index 0000000..4e62e55
--- /dev/null
+++ b/test/CodeGen/Mips/tail16.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s
+
+; Function Attrs: nounwind optsize
+define float @h() {
+entry:
+ %call = tail call float bitcast (float (...)* @g to float ()*)()
+ ret float %call
+; CHECK: .ent h
+; CHECK: save $16, $ra, $18, 32
+; CHECK: lw ${{[0-9]+}}, %got(__mips16_call_stub_sf_0)(${{[0-9]+}})
+; CHECK: restore $16, $ra, $18, 32
+; CHECK: .end h
+}
+
+; Function Attrs: optsize
+declare float @g(...)
+
+
+
+
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
index 23a8f93..b14ad5b 100644
--- a/test/CodeGen/Mips/tls.ll
+++ b/test/CodeGen/Mips/tls.ll
@@ -1,10 +1,10 @@
; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \
-; RUN: FileCheck %s -check-prefix=PIC
+; RUN: FileCheck %s -check-prefix=PIC -check-prefix=CHECK
; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler < \
-; RUN: %s | FileCheck %s -check-prefix=STATIC
+; RUN: %s | FileCheck %s -check-prefix=STATIC -check-prefix=CHECK
; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler \
; RUN: -mips-fix-global-base-reg=false < %s | \
-; RUN: FileCheck %s -check-prefix=STATICGP
+; RUN: FileCheck %s -check-prefix=STATICGP -check-prefix=CHECK
@t1 = thread_local global i32 0, align 4
diff --git a/test/CodeGen/Mips/trap1.ll b/test/CodeGen/Mips/trap1.ll
index bfcd7fe..9075513 100644
--- a/test/CodeGen/Mips/trap1.ll
+++ b/test/CodeGen/Mips/trap1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=pic
declare void @llvm.trap()
diff --git a/test/CodeGen/NVPTX/addrspacecast.ll b/test/CodeGen/NVPTX/addrspacecast.ll
new file mode 100644
index 0000000..98ea655
--- /dev/null
+++ b/test/CodeGen/NVPTX/addrspacecast.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefix=PTX64
+
+
+define i32 @conv1(i32 addrspace(1)* %ptr) {
+; PTX32: conv1
+; PTX32: cvta.global.u32
+; PTX32: ld.u32
+; PTX64: conv1
+; PTX64: cvta.global.u64
+; PTX64: ld.u32
+ %genptr = addrspacecast i32 addrspace(1)* %ptr to i32*
+ %val = load i32* %genptr
+ ret i32 %val
+}
+
+define i32 @conv2(i32 addrspace(3)* %ptr) {
+; PTX32: conv2
+; PTX32: cvta.shared.u32
+; PTX32: ld.u32
+; PTX64: conv2
+; PTX64: cvta.shared.u64
+; PTX64: ld.u32
+ %genptr = addrspacecast i32 addrspace(3)* %ptr to i32*
+ %val = load i32* %genptr
+ ret i32 %val
+}
+
+define i32 @conv3(i32 addrspace(4)* %ptr) {
+; PTX32: conv3
+; PTX32: cvta.const.u32
+; PTX32: ld.u32
+; PTX64: conv3
+; PTX64: cvta.const.u64
+; PTX64: ld.u32
+ %genptr = addrspacecast i32 addrspace(4)* %ptr to i32*
+ %val = load i32* %genptr
+ ret i32 %val
+}
+
+define i32 @conv4(i32 addrspace(5)* %ptr) {
+; PTX32: conv4
+; PTX32: cvta.local.u32
+; PTX32: ld.u32
+; PTX64: conv4
+; PTX64: cvta.local.u64
+; PTX64: ld.u32
+ %genptr = addrspacecast i32 addrspace(5)* %ptr to i32*
+ %val = load i32* %genptr
+ ret i32 %val
+}
+
+define i32 @conv5(i32* %ptr) {
+; PTX32: conv5
+; PTX32: cvta.to.global.u32
+; PTX32: ld.global.u32
+; PTX64: conv5
+; PTX64: cvta.to.global.u64
+; PTX64: ld.global.u32
+ %specptr = addrspacecast i32* %ptr to i32 addrspace(1)*
+ %val = load i32 addrspace(1)* %specptr
+ ret i32 %val
+}
+
+define i32 @conv6(i32* %ptr) {
+; PTX32: conv6
+; PTX32: cvta.to.shared.u32
+; PTX32: ld.shared.u32
+; PTX64: conv6
+; PTX64: cvta.to.shared.u64
+; PTX64: ld.shared.u32
+ %specptr = addrspacecast i32* %ptr to i32 addrspace(3)*
+ %val = load i32 addrspace(3)* %specptr
+ ret i32 %val
+}
+
+define i32 @conv7(i32* %ptr) {
+; PTX32: conv7
+; PTX32: cvta.to.const.u32
+; PTX32: ld.const.u32
+; PTX64: conv7
+; PTX64: cvta.to.const.u64
+; PTX64: ld.const.u32
+ %specptr = addrspacecast i32* %ptr to i32 addrspace(4)*
+ %val = load i32 addrspace(4)* %specptr
+ ret i32 %val
+}
+
+define i32 @conv8(i32* %ptr) {
+; PTX32: conv8
+; PTX32: cvta.to.local.u32
+; PTX32: ld.local.u32
+; PTX64: conv8
+; PTX64: cvta.to.local.u64
+; PTX64: ld.local.u32
+ %specptr = addrspacecast i32* %ptr to i32 addrspace(5)*
+ %val = load i32 addrspace(5)* %specptr
+ ret i32 %val
+}
diff --git a/test/CodeGen/NVPTX/aggr-param.ll b/test/CodeGen/NVPTX/aggr-param.ll
new file mode 100644
index 0000000..21deb7e
--- /dev/null
+++ b/test/CodeGen/NVPTX/aggr-param.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; Make sure aggregate param types get emitted properly.
+
+%struct.float4 = type { float, float, float, float }
+
+; CHECK: .visible .func bar
+; CHECK: .param .align 4 .b8 bar_param_0[16]
+define void @bar(%struct.float4 %f) {
+entry:
+ ret void
+}
+
+; CHECK: .visible .func foo
+; CHECK: .param .align 4 .b8 foo_param_0[20]
+define void @foo([5 x i32] %f) {
+entry:
+ ret void
+}
+
diff --git a/test/CodeGen/NVPTX/bug17709.ll b/test/CodeGen/NVPTX/bug17709.ll
index 92f0fcb1..076c446 100644
--- a/test/CodeGen/NVPTX/bug17709.ll
+++ b/test/CodeGen/NVPTX/bug17709.ll
@@ -4,7 +4,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
-define linker_private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {
+define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {
entry:
;unreachable
%t0 = insertvalue {double, double} undef, double 1.0, 0
diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
new file mode 100644
index 0000000..28dfa46
--- /dev/null
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+; Checks how NVPTX lowers alloca buffers and how they are passed to functions.
+;
+; Produced with the following CUDA code:
+; extern "C" __attribute__((device)) void callee(float* f, char* buf);
+;
+; extern "C" __attribute__((global)) void kernel_func(float* a) {
+; char buf[4 * sizeof(float)];
+; *(reinterpret_cast<float*>(&buf[0])) = a[0];
+; *(reinterpret_cast<float*>(&buf[1])) = a[1];
+; *(reinterpret_cast<float*>(&buf[2])) = a[2];
+; *(reinterpret_cast<float*>(&buf[3])) = a[3];
+; callee(a, buf);
+; }
+
+; CHECK: .visible .entry kernel_func
+define void @kernel_func(float* %a) {
+entry:
+ %buf = alloca [16 x i8], align 4
+
+; CHECK: .local .align 4 .b8 __local_depot0[16]
+; CHECK: mov.u64 %rl[[BUF_REG:[0-9]+]]
+; CHECK: cvta.local.u64 %SP, %rl[[BUF_REG]]
+
+; CHECK: ld.param.u64 %rl[[A_REG:[0-9]+]], [kernel_func_param_0]
+; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rl[[A_REG]]]
+; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
+
+ %0 = load float* %a, align 4
+ %1 = bitcast [16 x i8]* %buf to float*
+ store float %0, float* %1, align 4
+ %arrayidx2 = getelementptr inbounds float* %a, i64 1
+ %2 = load float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 1
+ %3 = bitcast i8* %arrayidx3 to float*
+ store float %2, float* %3, align 4
+ %arrayidx4 = getelementptr inbounds float* %a, i64 2
+ %4 = load float* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 2
+ %5 = bitcast i8* %arrayidx5 to float*
+ store float %4, float* %5, align 4
+ %arrayidx6 = getelementptr inbounds float* %a, i64 3
+ %6 = load float* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 3
+ %7 = bitcast i8* %arrayidx7 to float*
+ store float %6, float* %7, align 4
+
+; CHECK: add.u64 %rl[[SP_REG:[0-9]+]], %SP, 0
+; CHECK: .param .b64 param0;
+; CHECK-NEXT: st.param.b64 [param0+0], %rl[[A_REG]]
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: st.param.b64 [param1+0], %rl[[SP_REG]]
+; CHECK-NEXT: call.uni
+; CHECK-NEXT: callee,
+
+ %arraydecay = getelementptr inbounds [16 x i8]* %buf, i64 0, i64 0
+ call void @callee(float* %a, i8* %arraydecay) #2
+ ret void
+}
+
+declare void @callee(float*, i8*)
+
+!nvvm.annotations = !{!0}
+
+!0 = metadata !{void (float*)* @kernel_func, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/div-ri.ll b/test/CodeGen/NVPTX/div-ri.ll
new file mode 100644
index 0000000..7f796e0
--- /dev/null
+++ b/test/CodeGen/NVPTX/div-ri.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 | FileCheck %s
+
+define float @foo(float %a) {
+; CHECK: div.approx.f32
+ %div = fdiv float %a, 13.0
+ ret float %div
+}
+
diff --git a/test/CodeGen/NVPTX/ldparam-v4.ll b/test/CodeGen/NVPTX/ldparam-v4.ll
new file mode 100644
index 0000000..ec306aa
--- /dev/null
+++ b/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+declare <4 x float> @bar()
+
+define void @foo(<4 x float>* %ptr) {
+; CHECK: ld.param.v4.f32
+ %val = tail call <4 x float> @bar()
+ store <4 x float> %val, <4 x float>* %ptr
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
new file mode 100644
index 0000000..64745fc
--- /dev/null
+++ b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -0,0 +1,74 @@
+; RUN: opt < %s -O3 -S | FileCheck %s
+
+; Make sure the call to syncthreads is not duplicated here by the LLVM
+; optimizations, because it has the noduplicate attribute set.
+
+; CHECK: call void @llvm.cuda.syncthreads
+; CHECK-NOT: call void @llvm.cuda.syncthreads
+
+; Function Attrs: nounwind
+define void @foo(float* %output) #1 {
+entry:
+ %output.addr = alloca float*, align 8
+ store float* %output, float** %output.addr, align 8
+ %0 = load float** %output.addr, align 8
+ %arrayidx = getelementptr inbounds float* %0, i64 0
+ %1 = load float* %arrayidx, align 4
+ %conv = fpext float %1 to double
+ %cmp = fcmp olt double %conv, 1.000000e+01
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %2 = load float** %output.addr, align 8
+ %3 = load float* %2, align 4
+ %conv1 = fpext float %3 to double
+ %add = fadd double %conv1, 1.000000e+00
+ %conv2 = fptrunc double %add to float
+ store float %conv2, float* %2, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %4 = load float** %output.addr, align 8
+ %5 = load float* %4, align 4
+ %conv3 = fpext float %5 to double
+ %add4 = fadd double %conv3, 2.000000e+00
+ %conv5 = fptrunc double %add4 to float
+ store float %conv5, float* %4, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ call void @llvm.cuda.syncthreads()
+ %6 = load float** %output.addr, align 8
+ %arrayidx6 = getelementptr inbounds float* %6, i64 0
+ %7 = load float* %arrayidx6, align 4
+ %conv7 = fpext float %7 to double
+ %cmp8 = fcmp olt double %conv7, 1.000000e+01
+ br i1 %cmp8, label %if.then9, label %if.else13
+
+if.then9: ; preds = %if.end
+ %8 = load float** %output.addr, align 8
+ %9 = load float* %8, align 4
+ %conv10 = fpext float %9 to double
+ %add11 = fadd double %conv10, 3.000000e+00
+ %conv12 = fptrunc double %add11 to float
+ store float %conv12, float* %8, align 4
+ br label %if.end17
+
+if.else13: ; preds = %if.end
+ %10 = load float** %output.addr, align 8
+ %11 = load float* %10, align 4
+ %conv14 = fpext float %11 to double
+ %add15 = fadd double %conv14, 4.000000e+00
+ %conv16 = fptrunc double %add15 to float
+ store float %conv16, float* %10, align 4
+ br label %if.end17
+
+if.end17: ; preds = %if.else13, %if.then9
+ ret void
+}
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.cuda.syncthreads() #2
+
+!0 = metadata !{void (float*)* @foo, metadata !"kernel", i32 1}
+!1 = metadata !{null, metadata !"align", i32 8}
diff --git a/test/CodeGen/NVPTX/symbol-naming.ll b/test/CodeGen/NVPTX/symbol-naming.ll
new file mode 100644
index 0000000..bd1333f
--- /dev/null
+++ b/test/CodeGen/NVPTX/symbol-naming.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+; Verify that the NVPTX target removes invalid symbol names prior to emitting
+; PTX.
+
+; PTX32-NOT: .str
+; PTX64-NOT: .str
+
+; PTX32-DAG: _$_str1
+; PTX32-DAG: _$_str
+
+; PTX64-DAG: _$_str1
+; PTX64-DAG: _$_str
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+
+@.str = private unnamed_addr constant [13 x i8] c"%d %f %c %d\0A\00", align 1
+@_$_str = private unnamed_addr constant [13 x i8] c"%d %f %c %d\0A\00", align 1
+
+
+; Function Attrs: nounwind
+define void @foo(i32 %a, float %b, i8 signext %c, i32 %e) {
+entry:
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0))
+ ret void
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/NVPTX/vec-param-load.ll b/test/CodeGen/NVPTX/vec-param-load.ll
index a384348..4193ac4 100644
--- a/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/test/CodeGen/NVPTX/vec-param-load.ll
@@ -5,9 +5,9 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define <16 x float> @foo(<16 x float> %a) {
; Make sure we index into vectors properly
-; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0];
-; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16];
-; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32];
; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+48];
+; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32];
+; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16];
+; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0];
ret <16 x float> %a
}
diff --git a/test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll b/test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll
index 73736c5..5eb6e37 100644
--- a/test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll
+++ b/test/CodeGen/PowerPC/2007-04-24-InlineAsm-I-Modifier.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8.8.0 | grep "foo r3, r4"
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8.8.0 | grep "bari r3, 47"
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8.8.0 -no-integrated-as | grep "foo r3, r4"
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8.8.0 -no-integrated-as | grep "bari r3, 47"
; PR1351
diff --git a/test/CodeGen/PowerPC/2007-05-03-InlineAsm-S-Constraint.ll b/test/CodeGen/PowerPC/2007-05-03-InlineAsm-S-Constraint.ll
index 1df5140..490aa0c 100644
--- a/test/CodeGen/PowerPC/2007-05-03-InlineAsm-S-Constraint.ll
+++ b/test/CodeGen/PowerPC/2007-05-03-InlineAsm-S-Constraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -no-integrated-as < %s
; PR1382
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll b/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
index 3d3728d..ccf5297 100644
--- a/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
+++ b/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -mcpu=g5 < %s | FileCheck %s
;; Formerly crashed, see PR 1508
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
target triple = "powerpc64-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/2008-12-12-EH.ll b/test/CodeGen/PowerPC/2008-12-12-EH.ll
deleted file mode 100644
index a2a5e9e..0000000
--- a/test/CodeGen/PowerPC/2008-12-12-EH.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -disable-cfi -march=ppc32 -mtriple=powerpc-apple-darwin9 | grep ^__Z1fv.eh
-
-define void @_Z1fv() {
-entry:
- br label %return
-
-return:
- ret void
-}
diff --git a/test/CodeGen/PowerPC/2009-08-23-linkerprivate.ll b/test/CodeGen/PowerPC/2009-08-23-linkerprivate.ll
deleted file mode 100644
index ae2acd4..0000000
--- a/test/CodeGen/PowerPC/2009-08-23-linkerprivate.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin | FileCheck %s
-
-; ModuleID = '/Volumes/MacOS9/tests/WebKit/JavaScriptCore/profiler/ProfilerServer.mm'
-
-@"\01l_objc_msgSend_fixup_alloc" = linker_private_weak hidden global i32 0, section "__DATA, __objc_msgrefs, coalesced", align 16
-
-; CHECK: .globl l_objc_msgSend_fixup_alloc
-; CHECK: .weak_definition l_objc_msgSend_fixup_alloc
diff --git a/test/CodeGen/PowerPC/Atomics-32.ll b/test/CodeGen/PowerPC/Atomics-32.ll
index 64f1495..b5c03e2 100644
--- a/test/CodeGen/PowerPC/Atomics-32.ll
+++ b/test/CodeGen/PowerPC/Atomics-32.ll
@@ -529,63 +529,63 @@ define void @test_compare_and_swap() nounwind {
entry:
%0 = load i8* @uc, align 1
%1 = load i8* @sc, align 1
- %2 = cmpxchg i8* @sc, i8 %0, i8 %1 monotonic
+ %2 = cmpxchg i8* @sc, i8 %0, i8 %1 monotonic monotonic
store i8 %2, i8* @sc, align 1
%3 = load i8* @uc, align 1
%4 = load i8* @sc, align 1
- %5 = cmpxchg i8* @uc, i8 %3, i8 %4 monotonic
+ %5 = cmpxchg i8* @uc, i8 %3, i8 %4 monotonic monotonic
store i8 %5, i8* @uc, align 1
%6 = load i8* @uc, align 1
%7 = zext i8 %6 to i16
%8 = load i8* @sc, align 1
%9 = sext i8 %8 to i16
%10 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
- %11 = cmpxchg i16* %10, i16 %7, i16 %9 monotonic
+ %11 = cmpxchg i16* %10, i16 %7, i16 %9 monotonic monotonic
store i16 %11, i16* @ss, align 2
%12 = load i8* @uc, align 1
%13 = zext i8 %12 to i16
%14 = load i8* @sc, align 1
%15 = sext i8 %14 to i16
%16 = bitcast i8* bitcast (i16* @us to i8*) to i16*
- %17 = cmpxchg i16* %16, i16 %13, i16 %15 monotonic
+ %17 = cmpxchg i16* %16, i16 %13, i16 %15 monotonic monotonic
store i16 %17, i16* @us, align 2
%18 = load i8* @uc, align 1
%19 = zext i8 %18 to i32
%20 = load i8* @sc, align 1
%21 = sext i8 %20 to i32
%22 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %23 = cmpxchg i32* %22, i32 %19, i32 %21 monotonic
+ %23 = cmpxchg i32* %22, i32 %19, i32 %21 monotonic monotonic
store i32 %23, i32* @si, align 4
%24 = load i8* @uc, align 1
%25 = zext i8 %24 to i32
%26 = load i8* @sc, align 1
%27 = sext i8 %26 to i32
%28 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %29 = cmpxchg i32* %28, i32 %25, i32 %27 monotonic
+ %29 = cmpxchg i32* %28, i32 %25, i32 %27 monotonic monotonic
store i32 %29, i32* @ui, align 4
%30 = load i8* @uc, align 1
%31 = zext i8 %30 to i32
%32 = load i8* @sc, align 1
%33 = sext i8 %32 to i32
%34 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
- %35 = cmpxchg i32* %34, i32 %31, i32 %33 monotonic
+ %35 = cmpxchg i32* %34, i32 %31, i32 %33 monotonic monotonic
store i32 %35, i32* @sl, align 4
%36 = load i8* @uc, align 1
%37 = zext i8 %36 to i32
%38 = load i8* @sc, align 1
%39 = sext i8 %38 to i32
%40 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
- %41 = cmpxchg i32* %40, i32 %37, i32 %39 monotonic
+ %41 = cmpxchg i32* %40, i32 %37, i32 %39 monotonic monotonic
store i32 %41, i32* @ul, align 4
%42 = load i8* @uc, align 1
%43 = load i8* @sc, align 1
- %44 = cmpxchg i8* @sc, i8 %42, i8 %43 monotonic
+ %44 = cmpxchg i8* @sc, i8 %42, i8 %43 monotonic monotonic
%45 = icmp eq i8 %44, %42
%46 = zext i1 %45 to i32
store i32 %46, i32* @ui, align 4
%47 = load i8* @uc, align 1
%48 = load i8* @sc, align 1
- %49 = cmpxchg i8* @uc, i8 %47, i8 %48 monotonic
+ %49 = cmpxchg i8* @uc, i8 %47, i8 %48 monotonic monotonic
%50 = icmp eq i8 %49, %47
%51 = zext i1 %50 to i32
store i32 %51, i32* @ui, align 4
@@ -594,7 +594,7 @@ entry:
%54 = load i8* @sc, align 1
%55 = sext i8 %54 to i16
%56 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
- %57 = cmpxchg i16* %56, i16 %53, i16 %55 monotonic
+ %57 = cmpxchg i16* %56, i16 %53, i16 %55 monotonic monotonic
%58 = icmp eq i16 %57, %53
%59 = zext i1 %58 to i32
store i32 %59, i32* @ui, align 4
@@ -603,7 +603,7 @@ entry:
%62 = load i8* @sc, align 1
%63 = sext i8 %62 to i16
%64 = bitcast i8* bitcast (i16* @us to i8*) to i16*
- %65 = cmpxchg i16* %64, i16 %61, i16 %63 monotonic
+ %65 = cmpxchg i16* %64, i16 %61, i16 %63 monotonic monotonic
%66 = icmp eq i16 %65, %61
%67 = zext i1 %66 to i32
store i32 %67, i32* @ui, align 4
@@ -612,7 +612,7 @@ entry:
%70 = load i8* @sc, align 1
%71 = sext i8 %70 to i32
%72 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %73 = cmpxchg i32* %72, i32 %69, i32 %71 monotonic
+ %73 = cmpxchg i32* %72, i32 %69, i32 %71 monotonic monotonic
%74 = icmp eq i32 %73, %69
%75 = zext i1 %74 to i32
store i32 %75, i32* @ui, align 4
@@ -621,7 +621,7 @@ entry:
%78 = load i8* @sc, align 1
%79 = sext i8 %78 to i32
%80 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %81 = cmpxchg i32* %80, i32 %77, i32 %79 monotonic
+ %81 = cmpxchg i32* %80, i32 %77, i32 %79 monotonic monotonic
%82 = icmp eq i32 %81, %77
%83 = zext i1 %82 to i32
store i32 %83, i32* @ui, align 4
@@ -630,7 +630,7 @@ entry:
%86 = load i8* @sc, align 1
%87 = sext i8 %86 to i32
%88 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
- %89 = cmpxchg i32* %88, i32 %85, i32 %87 monotonic
+ %89 = cmpxchg i32* %88, i32 %85, i32 %87 monotonic monotonic
%90 = icmp eq i32 %89, %85
%91 = zext i1 %90 to i32
store i32 %91, i32* @ui, align 4
@@ -639,7 +639,7 @@ entry:
%94 = load i8* @sc, align 1
%95 = sext i8 %94 to i32
%96 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
- %97 = cmpxchg i32* %96, i32 %93, i32 %95 monotonic
+ %97 = cmpxchg i32* %96, i32 %93, i32 %95 monotonic monotonic
%98 = icmp eq i32 %97, %93
%99 = zext i1 %98 to i32
store i32 %99, i32* @ui, align 4
diff --git a/test/CodeGen/PowerPC/Atomics-64.ll b/test/CodeGen/PowerPC/Atomics-64.ll
index d35b848..122b54e 100644
--- a/test/CodeGen/PowerPC/Atomics-64.ll
+++ b/test/CodeGen/PowerPC/Atomics-64.ll
@@ -536,64 +536,64 @@ define void @test_compare_and_swap() nounwind {
entry:
%0 = load i8* @uc, align 1
%1 = load i8* @sc, align 1
- %2 = cmpxchg i8* @sc, i8 %0, i8 %1 monotonic
+ %2 = cmpxchg i8* @sc, i8 %0, i8 %1 monotonic monotonic
store i8 %2, i8* @sc, align 1
%3 = load i8* @uc, align 1
%4 = load i8* @sc, align 1
- %5 = cmpxchg i8* @uc, i8 %3, i8 %4 monotonic
+ %5 = cmpxchg i8* @uc, i8 %3, i8 %4 monotonic monotonic
store i8 %5, i8* @uc, align 1
%6 = load i8* @uc, align 1
%7 = zext i8 %6 to i16
%8 = load i8* @sc, align 1
%9 = sext i8 %8 to i16
%10 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
- %11 = cmpxchg i16* %10, i16 %7, i16 %9 monotonic
+ %11 = cmpxchg i16* %10, i16 %7, i16 %9 monotonic monotonic
store i16 %11, i16* @ss, align 2
%12 = load i8* @uc, align 1
%13 = zext i8 %12 to i16
%14 = load i8* @sc, align 1
%15 = sext i8 %14 to i16
%16 = bitcast i8* bitcast (i16* @us to i8*) to i16*
- %17 = cmpxchg i16* %16, i16 %13, i16 %15 monotonic
+ %17 = cmpxchg i16* %16, i16 %13, i16 %15 monotonic monotonic
store i16 %17, i16* @us, align 2
%18 = load i8* @uc, align 1
%19 = zext i8 %18 to i32
%20 = load i8* @sc, align 1
%21 = sext i8 %20 to i32
%22 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %23 = cmpxchg i32* %22, i32 %19, i32 %21 monotonic
+ %23 = cmpxchg i32* %22, i32 %19, i32 %21 monotonic monotonic
store i32 %23, i32* @si, align 4
%24 = load i8* @uc, align 1
%25 = zext i8 %24 to i32
%26 = load i8* @sc, align 1
%27 = sext i8 %26 to i32
%28 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %29 = cmpxchg i32* %28, i32 %25, i32 %27 monotonic
+ %29 = cmpxchg i32* %28, i32 %25, i32 %27 monotonic monotonic
store i32 %29, i32* @ui, align 4
%30 = load i8* @uc, align 1
%31 = zext i8 %30 to i64
%32 = load i8* @sc, align 1
%33 = sext i8 %32 to i64
%34 = bitcast i8* bitcast (i64* @sl to i8*) to i64*
- %35 = cmpxchg i64* %34, i64 %31, i64 %33 monotonic
+ %35 = cmpxchg i64* %34, i64 %31, i64 %33 monotonic monotonic
store i64 %35, i64* @sl, align 8
%36 = load i8* @uc, align 1
%37 = zext i8 %36 to i64
%38 = load i8* @sc, align 1
%39 = sext i8 %38 to i64
%40 = bitcast i8* bitcast (i64* @ul to i8*) to i64*
- %41 = cmpxchg i64* %40, i64 %37, i64 %39 monotonic
+ %41 = cmpxchg i64* %40, i64 %37, i64 %39 monotonic monotonic
store i64 %41, i64* @ul, align 8
%42 = load i8* @uc, align 1
%43 = load i8* @sc, align 1
- %44 = cmpxchg i8* @sc, i8 %42, i8 %43 monotonic
+ %44 = cmpxchg i8* @sc, i8 %42, i8 %43 monotonic monotonic
%45 = icmp eq i8 %44, %42
%46 = zext i1 %45 to i8
%47 = zext i8 %46 to i32
store i32 %47, i32* @ui, align 4
%48 = load i8* @uc, align 1
%49 = load i8* @sc, align 1
- %50 = cmpxchg i8* @uc, i8 %48, i8 %49 monotonic
+ %50 = cmpxchg i8* @uc, i8 %48, i8 %49 monotonic monotonic
%51 = icmp eq i8 %50, %48
%52 = zext i1 %51 to i8
%53 = zext i8 %52 to i32
@@ -603,7 +603,7 @@ entry:
%56 = load i8* @sc, align 1
%57 = sext i8 %56 to i16
%58 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
- %59 = cmpxchg i16* %58, i16 %55, i16 %57 monotonic
+ %59 = cmpxchg i16* %58, i16 %55, i16 %57 monotonic monotonic
%60 = icmp eq i16 %59, %55
%61 = zext i1 %60 to i8
%62 = zext i8 %61 to i32
@@ -613,7 +613,7 @@ entry:
%65 = load i8* @sc, align 1
%66 = sext i8 %65 to i16
%67 = bitcast i8* bitcast (i16* @us to i8*) to i16*
- %68 = cmpxchg i16* %67, i16 %64, i16 %66 monotonic
+ %68 = cmpxchg i16* %67, i16 %64, i16 %66 monotonic monotonic
%69 = icmp eq i16 %68, %64
%70 = zext i1 %69 to i8
%71 = zext i8 %70 to i32
@@ -623,7 +623,7 @@ entry:
%74 = load i8* @sc, align 1
%75 = sext i8 %74 to i32
%76 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %77 = cmpxchg i32* %76, i32 %73, i32 %75 monotonic
+ %77 = cmpxchg i32* %76, i32 %73, i32 %75 monotonic monotonic
%78 = icmp eq i32 %77, %73
%79 = zext i1 %78 to i8
%80 = zext i8 %79 to i32
@@ -633,7 +633,7 @@ entry:
%83 = load i8* @sc, align 1
%84 = sext i8 %83 to i32
%85 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %86 = cmpxchg i32* %85, i32 %82, i32 %84 monotonic
+ %86 = cmpxchg i32* %85, i32 %82, i32 %84 monotonic monotonic
%87 = icmp eq i32 %86, %82
%88 = zext i1 %87 to i8
%89 = zext i8 %88 to i32
@@ -643,7 +643,7 @@ entry:
%92 = load i8* @sc, align 1
%93 = sext i8 %92 to i64
%94 = bitcast i8* bitcast (i64* @sl to i8*) to i64*
- %95 = cmpxchg i64* %94, i64 %91, i64 %93 monotonic
+ %95 = cmpxchg i64* %94, i64 %91, i64 %93 monotonic monotonic
%96 = icmp eq i64 %95, %91
%97 = zext i1 %96 to i8
%98 = zext i8 %97 to i32
@@ -653,7 +653,7 @@ entry:
%101 = load i8* @sc, align 1
%102 = sext i8 %101 to i64
%103 = bitcast i8* bitcast (i64* @ul to i8*) to i64*
- %104 = cmpxchg i64* %103, i64 %100, i64 %102 monotonic
+ %104 = cmpxchg i64* %103, i64 %100, i64 %102 monotonic monotonic
%105 = icmp eq i64 %104, %100
%106 = zext i1 %105 to i8
%107 = zext i8 %106 to i32
diff --git a/test/CodeGen/PowerPC/aa-tbaa.ll b/test/CodeGen/PowerPC/aa-tbaa.ll
new file mode 100644
index 0000000..d7f80fa
--- /dev/null
+++ b/test/CodeGen/PowerPC/aa-tbaa.ll
@@ -0,0 +1,41 @@
+; RUN: llc -enable-misched -misched=shuffle -enable-aa-sched-mi -post-RA-scheduler=0 -mcpu=ppc64 < %s | FileCheck %s
+
+; REQUIRES: asserts
+; -misched=shuffle is NDEBUG only!
+
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%"class.llvm::MCOperand" = type { i8, %union.anon.110 }
+%union.anon.110 = type { i64 }
+
+define void @foo(i32 %v) {
+entry:
+ %MCOp = alloca %"class.llvm::MCOperand", align 8
+ br label %next
+
+; CHECK-LABEL: @foo
+
+next:
+ %sunkaddr18 = ptrtoint %"class.llvm::MCOperand"* %MCOp to i64
+ %sunkaddr19 = add i64 %sunkaddr18, 8
+ %sunkaddr20 = inttoptr i64 %sunkaddr19 to double*
+ store double 0.000000e+00, double* %sunkaddr20, align 8, !tbaa !1
+ %sunkaddr21 = ptrtoint %"class.llvm::MCOperand"* %MCOp to i64
+ %sunkaddr22 = add i64 %sunkaddr21, 8
+ %sunkaddr23 = inttoptr i64 %sunkaddr22 to i32*
+ store i32 %v, i32* %sunkaddr23, align 8, !tbaa !2
+ ret void
+
+; Make sure that the 64-bit store comes first, regardless of what TBAA says
+; about the two not aliasing!
+; CHECK: li [[REG:[0-9]+]], 0
+; CHECK: std [[REG]], -[[OFF:[0-9]+]](1)
+; CHECK: stw 3, -[[OFF]](1)
+; CHECK: blr
+}
+
+!0 = metadata !{ metadata !"root" }
+!1 = metadata !{ metadata !"set1", metadata !0 }
+!2 = metadata !{ metadata !"set2", metadata !0 }
+
diff --git a/test/CodeGen/PowerPC/anon_aggr.ll b/test/CodeGen/PowerPC/anon_aggr.ll
index 1525e05..3bae5c6 100644
--- a/test/CodeGen/PowerPC/anon_aggr.ll
+++ b/test/CodeGen/PowerPC/anon_aggr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -fast-isel=false < %s | FileCheck %s
+; RUN: llc -O0 -mcpu=ppc64 -mtriple=powerpc64-unknown-linux-gnu -fast-isel=false < %s | FileCheck %s
; RUN: llc -O0 -mcpu=g4 -mtriple=powerpc-apple-darwin8 < %s | FileCheck -check-prefix=DARWIN32 %s
; RUN: llc -O0 -mcpu=ppc970 -mtriple=powerpc64-apple-darwin8 < %s | FileCheck -check-prefix=DARWIN64 %s
@@ -119,9 +119,9 @@ unequal:
; CHECK: ld 3, -[[OFFSET1]](1)
; DARWIN32: _func3:
-; DARWIN32: addi r[[REG1:[0-9]+]], r[[REGSP:[0-9]+]], 40
+; DARWIN32: addi r[[REG1:[0-9]+]], r[[REGSP:[0-9]+]], 36
; DARWIN32: addi r[[REG2:[0-9]+]], r[[REGSP]], 24
-; DARWIN32: lwz r[[REG3:[0-9]+]], 48(r[[REGSP]])
+; DARWIN32: lwz r[[REG3:[0-9]+]], 44(r[[REGSP]])
; DARWIN32: lwz r[[REG4:[0-9]+]], 32(r[[REGSP]])
; DARWIN32: cmplw cr{{[0-9]+}}, r[[REG4]], r[[REG3]]
; DARWIN32: stw r[[REG3]], -[[OFFSET1:[0-9]+]]
diff --git a/test/CodeGen/PowerPC/atomic-1.ll b/test/CodeGen/PowerPC/atomic-1.ll
index 1737916..083df47 100644
--- a/test/CodeGen/PowerPC/atomic-1.ll
+++ b/test/CodeGen/PowerPC/atomic-1.ll
@@ -11,7 +11,7 @@ define i32 @exchange_and_add(i32* %mem, i32 %val) nounwind {
define i32 @exchange_and_cmp(i32* %mem) nounwind {
; CHECK-LABEL: exchange_and_cmp:
; CHECK: lwarx
- %tmp = cmpxchg i32* %mem, i32 0, i32 1 monotonic
+ %tmp = cmpxchg i32* %mem, i32 0, i32 1 monotonic monotonic
; CHECK: stwcx.
; CHECK: stwcx.
ret i32 %tmp
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index e56a779..261335e 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -11,7 +11,7 @@ define i64 @exchange_and_add(i64* %mem, i64 %val) nounwind {
define i64 @exchange_and_cmp(i64* %mem) nounwind {
; CHECK-LABEL: exchange_and_cmp:
; CHECK: ldarx
- %tmp = cmpxchg i64* %mem, i64 0, i64 1 monotonic
+ %tmp = cmpxchg i64* %mem, i64 0, i64 1 monotonic monotonic
; CHECK: stdcx.
; CHECK: stdcx.
ret i64 %tmp
diff --git a/test/CodeGen/PowerPC/bdzlr.ll b/test/CodeGen/PowerPC/bdzlr.ll
index e487558..29b74c6 100644
--- a/test/CodeGen/PowerPC/bdzlr.ll
+++ b/test/CodeGen/PowerPC/bdzlr.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-crbits | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s -check-prefix=CHECK-CRB
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -54,6 +55,12 @@ for.end: ; preds = %for.body, %if.end,
; CHECK: bnelr
; CHECK: bdzlr
; CHECK-NOT: blr
+
+; CHECK-CRB: @lua_xmove
+; CHECK-CRB: bclr 12,
+; CHECK-CRB: bclr 12,
+; CHECK-CRB: bdzlr
+; CHECK-CRB-NOT: blr
}
attributes #0 = { nounwind }
diff --git a/test/CodeGen/PowerPC/byval-agg-info.ll b/test/CodeGen/PowerPC/byval-agg-info.ll
new file mode 100644
index 0000000..89ad8e4
--- /dev/null
+++ b/test/CodeGen/PowerPC/byval-agg-info.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -print-after=prologepilog >%t 2>&1 && FileCheck <%t %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.anon = type { i32, i32 }
+
+declare void @foo(%struct.anon* %v)
+define void @test(i32 %a, i32 %b, %struct.anon* byval nocapture %v) {
+entry:
+ call void @foo(%struct.anon* %v)
+ ret void
+}
+
+; Make sure that the MMO on the store has no offset from the byval
+; variable itself (we used to have mem:ST8[%v+64]).
+; CHECK: STD %X5<kill>, 176, %X1; mem:ST8[%v](align=16)
+
diff --git a/test/CodeGen/PowerPC/coalesce-ext.ll b/test/CodeGen/PowerPC/coalesce-ext.ll
index f19175c..eb7cd26 100644
--- a/test/CodeGen/PowerPC/coalesce-ext.ll
+++ b/test/CodeGen/PowerPC/coalesce-ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=ppc64 -mtriple=powerpc64-apple-darwin < %s | FileCheck %s
+; RUN: llc -march=ppc64 -mcpu=g5 -mtriple=powerpc64-apple-darwin < %s | FileCheck %s
; Check that the peephole optimizer knows about sext and zext instructions.
; CHECK: test1sext
define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
diff --git a/test/CodeGen/PowerPC/complex-return.ll b/test/CodeGen/PowerPC/complex-return.ll
index 3eb30e9..5ac7524 100644
--- a/test/CodeGen/PowerPC/complex-return.ll
+++ b/test/CodeGen/PowerPC/complex-return.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+; RUN: llc -mcpu=ppc64 -O0 < %s | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/crash.ll b/test/CodeGen/PowerPC/crash.ll
new file mode 100644
index 0000000..5cecca7
--- /dev/null
+++ b/test/CodeGen/PowerPC/crash.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7
+
+define void @test1(i1 %x, i8 %x2, i8* %x3, i64 %x4) {
+entry:
+ %tmp3 = and i64 %x4, 16
+ %bf.shl = trunc i64 %tmp3 to i8
+ %bf.clear = and i8 %x2, -17
+ %bf.set = or i8 %bf.shl, %bf.clear
+ br i1 %x, label %if.then, label %if.end
+
+if.then:
+ ret void
+
+if.end:
+ store i8 %bf.set, i8* %x3, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/crbit-asm.ll b/test/CodeGen/PowerPC/crbit-asm.ll
new file mode 100644
index 0000000..373e334
--- /dev/null
+++ b/test/CodeGen/PowerPC/crbit-asm.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define zeroext i1 @testi1(i1 zeroext %b1, i1 zeroext %b2) #0 {
+entry:
+ %0 = tail call i8 asm "crand $0, $1, $2", "=^wc,^wc,^wc"(i1 %b1, i1 %b2) #0
+ %1 = and i8 %0, 1
+ %tobool3 = icmp ne i8 %1, 0
+ ret i1 %tobool3
+
+; CHECK-LABEL: @testi1
+; CHECK-DAG: andi. {{[0-9]+}}, 3, 1
+; CHECK-DAG: li [[REG1:[0-9]+]], 0
+; CHECK-DAG: cror [[REG2:[0-9]+]], 1, 1
+; CHECK-DAG: andi. {{[0-9]+}}, 4, 1
+; CHECK-DAG: crand [[REG3:[0-9]+]], [[REG2]], 1
+; CHECK-DAG: li [[REG4:[0-9]+]], 1
+; CHECK: isel 3, [[REG4]], [[REG1]], [[REG3]]
+; CHECK: blr
+}
+
+define signext i32 @testi32(i32 signext %b1, i32 signext %b2) #0 {
+entry:
+ %0 = tail call i32 asm "crand $0, $1, $2", "=^wc,^wc,^wc"(i32 %b1, i32 %b2) #0
+ ret i32 %0
+
+; The ABI sign_extend should combine with the any_extend from the asm result,
+; and the result will be 0 or -1. This highlights the fact that only the first
+; bit is meaningful.
+; CHECK-LABEL: @testi32
+; CHECK-DAG: andi. {{[0-9]+}}, 3, 1
+; CHECK-DAG: li [[REG1:[0-9]+]], 0
+; CHECK-DAG: cror [[REG2:[0-9]+]], 1, 1
+; CHECK-DAG: andi. {{[0-9]+}}, 4, 1
+; CHECK-DAG: crand [[REG3:[0-9]+]], [[REG2]], 1
+; CHECK-DAG: li [[REG4:[0-9]+]], -1
+; CHECK: isel 3, [[REG4]], [[REG1]], [[REG3]]
+; CHECK: blr
+}
+
+define zeroext i8 @testi8(i8 zeroext %b1, i8 zeroext %b2) #0 {
+entry:
+ %0 = tail call i8 asm "crand $0, $1, $2", "=^wc,^wc,^wc"(i8 %b1, i8 %b2) #0
+ ret i8 %0
+
+; CHECK-LABEL: @testi8
+; CHECK-DAG: andi. {{[0-9]+}}, 3, 1
+; CHECK-DAG: li [[REG1:[0-9]+]], 0
+; CHECK-DAG: cror [[REG2:[0-9]+]], 1, 1
+; CHECK-DAG: andi. {{[0-9]+}}, 4, 1
+; CHECK-DAG: crand [[REG3:[0-9]+]], [[REG2]], 1
+; CHECK-DAG: li [[REG4:[0-9]+]], 1
+; CHECK: isel 3, [[REG4]], [[REG1]], [[REG3]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/crbits.ll b/test/CodeGen/PowerPC/crbits.ll
new file mode 100644
index 0000000..06e9001
--- /dev/null
+++ b/test/CodeGen/PowerPC/crbits.ll
@@ -0,0 +1,192 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define zeroext i1 @test1(float %v1, float %v2) #0 {
+entry:
+ %cmp = fcmp oge float %v1, %v2
+ %cmp2 = fcmp ole float %v2, 0.000000e+00
+ %and5 = and i1 %cmp, %cmp2
+ ret i1 %and5
+
+; CHECK-LABEL: @test1
+; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
+; CHECK-DAG: li [[REG1:[0-9]+]], 1
+; CHECK-DAG: lfs [[REG2:[0-9]+]],
+; CHECK-DAG: fcmpu {{[0-9]+}}, 2, [[REG2]]
+; CHECK: crnor
+; CHECK: crnor
+; CHECK: crnand [[REG4:[0-9]+]],
+; CHECK: isel 3, 0, [[REG1]], [[REG4]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define zeroext i1 @test2(float %v1, float %v2) #0 {
+entry:
+ %cmp = fcmp oge float %v1, %v2
+ %cmp2 = fcmp ole float %v2, 0.000000e+00
+ %xor5 = xor i1 %cmp, %cmp2
+ ret i1 %xor5
+
+; CHECK-LABEL: @test2
+; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
+; CHECK-DAG: li [[REG1:[0-9]+]], 1
+; CHECK-DAG: lfs [[REG2:[0-9]+]],
+; CHECK-DAG: fcmpu {{[0-9]+}}, 2, [[REG2]]
+; CHECK: crnor
+; CHECK: crnor
+; CHECK: creqv [[REG4:[0-9]+]],
+; CHECK: isel 3, 0, [[REG1]], [[REG4]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define zeroext i1 @test3(float %v1, float %v2, i32 signext %x) #0 {
+entry:
+ %cmp = fcmp oge float %v1, %v2
+ %cmp2 = fcmp ole float %v2, 0.000000e+00
+ %cmp4 = icmp ne i32 %x, -2
+ %and7 = and i1 %cmp2, %cmp4
+ %xor8 = xor i1 %cmp, %and7
+ ret i1 %xor8
+
+; CHECK-LABEL: @test3
+; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
+; CHECK-DAG: li [[REG1:[0-9]+]], 1
+; CHECK-DAG: lfs [[REG2:[0-9]+]],
+; CHECK-DAG: fcmpu {{[0-9]+}}, 2, [[REG2]]
+; CHECK: crnor
+; CHECK: crnor
+; CHECK: crandc
+; CHECK: creqv [[REG4:[0-9]+]],
+; CHECK: isel 3, 0, [[REG1]], [[REG4]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define zeroext i1 @test4(i1 zeroext %v1, i1 zeroext %v2, i1 zeroext %v3) #0 {
+entry:
+ %and8 = and i1 %v1, %v2
+ %or9 = or i1 %and8, %v3
+ ret i1 %or9
+
+; CHECK-DAG: @test4
+; CHECK: and [[REG1:[0-9]+]], 3, 4
+; CHECK: or 3, [[REG1]], 5
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define zeroext i1 @test5(i1 zeroext %v1, i1 zeroext %v2, i32 signext %v3) #0 {
+entry:
+ %and6 = and i1 %v1, %v2
+ %cmp = icmp ne i32 %v3, -2
+ %or7 = or i1 %and6, %cmp
+ ret i1 %or7
+
+; CHECK-LABEL: @test5
+; CHECK-DAG: and [[REG1:[0-9]+]], 3, 4
+; CHECK-DAG: cmpwi {{[0-9]+}}, 5, -2
+; CHECK-DAG: li [[REG3:[0-9]+]], 1
+; CHECK-DAG: andi. {{[0-9]+}}, [[REG1]], 1
+; CHECK-DAG: crandc [[REG5:[0-9]+]],
+; CHECK: isel 3, 0, [[REG3]], [[REG5]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define zeroext i1 @test6(i1 zeroext %v1, i1 zeroext %v2, i32 signext %v3) #0 {
+entry:
+ %cmp = icmp ne i32 %v3, -2
+ %or6 = or i1 %cmp, %v2
+ %and7 = and i1 %or6, %v1
+ ret i1 %and7
+
+; CHECK-LABEL: @test6
+; CHECK-DAG: andi. {{[0-9]+}}, 3, 1
+; CHECK-DAG: cmpwi {{[0-9]+}}, 5, -2
+; CHECK-DAG: cror [[REG1:[0-9]+]], 1, 1
+; CHECK-DAG: andi. {{[0-9]+}}, 4, 1
+; CHECK-DAG: li [[REG2:[0-9]+]], 1
+; CHECK-DAG: crorc [[REG4:[0-9]+]], 1,
+; CHECK-DAG: crnand [[REG5:[0-9]+]], [[REG4]], [[REG1]]
+; CHECK: isel 3, 0, [[REG2]], [[REG5]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @test7(i1 zeroext %v2, i32 signext %i1, i32 signext %i2) #0 {
+entry:
+ %cond = select i1 %v2, i32 %i1, i32 %i2
+ ret i32 %cond
+
+; CHECK-LABEL: @test7
+; CHECK: andi. {{[0-9]+}}, 3, 1
+; CHECK: isel 3, 4, 5, 1
+; CHECK: blr
+}
+
+define signext i32 @exttest7(i32 signext %a) #0 {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ %cond = select i1 %cmp, i32 7, i32 8
+ ret i32 %cond
+
+; CHECK-LABEL: @exttest7
+; CHECK-DAG: cmplwi {{[0-9]+}}, 3, 5
+; CHECK-DAG: li [[REG1:[0-9]+]], 8
+; CHECK-DAG: li [[REG2:[0-9]+]], 7
+; CHECK: isel 3, [[REG2]], [[REG1]],
+; CHECK-NOT: rldicl
+; CHECK: blr
+}
+
+define zeroext i32 @exttest8() #0 {
+entry:
+ %v0 = load i64* undef, align 8
+ %sub = sub i64 80, %v0
+ %div = lshr i64 %sub, 1
+ %conv13 = trunc i64 %div to i32
+ %cmp14 = icmp ugt i32 %conv13, 80
+ %.conv13 = select i1 %cmp14, i32 0, i32 %conv13
+ ret i32 %.conv13
+; CHECK-LABEL: @exttest8
+; This is a don't-crash test: %conv13 is both one of the possible select output
+; values and also an input to the conditional feeding it.
+}
+
+; Function Attrs: nounwind readnone
+define float @test8(i1 zeroext %v2, float %v1, float %v3) #0 {
+entry:
+ %cond = select i1 %v2, float %v1, float %v3
+ ret float %cond
+
+; CHECK-LABEL: @test8
+; CHECK: andi. {{[0-9]+}}, 3, 1
+; CHECK: bclr 12, 1, 0
+; CHECK: fmr 1, 2
+; CHECK: blr
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @test10(i32 signext %v1, i32 signext %v2) #0 {
+entry:
+ %tobool = icmp ne i32 %v1, 0
+ %lnot = icmp eq i32 %v2, 0
+ %and3 = and i1 %tobool, %lnot
+ %and = zext i1 %and3 to i32
+ ret i32 %and
+
+; CHECK-LABEL: @test10
+; CHECK-DAG: cmpwi {{[0-9]+}}, 3, 0
+; CHECK-DAG: cmpwi {{[0-9]+}}, 4, 0
+; CHECK-DAG: li [[REG2:[0-9]+]], 1
+; CHECK-DAG: crorc [[REG3:[0-9]+]],
+; CHECK: isel 3, 0, [[REG2]], [[REG3]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/ctrloop-large-ec.ll b/test/CodeGen/PowerPC/ctrloop-large-ec.ll
index c18bdab..cce23fa 100644
--- a/test/CodeGen/PowerPC/ctrloop-large-ec.ll
+++ b/test/CodeGen/PowerPC/ctrloop-large-ec.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=ppc32 < %s
+; RUN: llc -mcpu=ppc32 < %s | FileCheck %s
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
target triple = "powerpc-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/ctrloop-udivti3.ll b/test/CodeGen/PowerPC/ctrloop-udivti3.ll
new file mode 100644
index 0000000..d07a11f
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-udivti3.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define hidden void @_mpd_shortdiv(i64 %n) #0 {
+entry:
+ br i1 undef, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %i.018.in = phi i64 [ %n, %for.body.lr.ph ], [ %i.018, %for.body ]
+ %i.018 = add i64 %i.018.in, -1
+ %add.i = or i128 undef, undef
+ %div.i = udiv i128 %add.i, 0
+ %conv3.i11 = trunc i128 %div.i to i64
+ store i64 %conv3.i11, i64* undef, align 8
+ %cmp = icmp eq i64 %i.018, 0
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: @_mpd_shortdiv
+; CHECK-NOT: mtctr
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index cb93dec..0d6c4a6 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll
@@ -18,7 +18,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.module.flags = !{!22}
!0 = metadata !{i32 720913, metadata !21, i32 12, metadata !"clang version 3.1", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, metadata !"", metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
+!1 = metadata !{}
!3 = metadata !{metadata !5}
!5 = metadata !{i32 720942, metadata !21, null, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !13, i32 0} ; [ DW_TAG_subprogram ]
!6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
diff --git a/test/CodeGen/PowerPC/early-ret2.ll b/test/CodeGen/PowerPC/early-ret2.ll
index a274e2c..a8e456f 100644
--- a/test/CodeGen/PowerPC/early-ret2.ll
+++ b/test/CodeGen/PowerPC/early-ret2.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-crbits | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s -check-prefix=CHECK-CRB
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -17,6 +18,9 @@ while.end: ; preds = %while.body, %while.
; CHECK: @_Z8example3iPiS_
; CHECK: bnelr
+
+; CHECK-CRB: @_Z8example3iPiS_
+; CHECK-CRB: bclr 12,
}
attributes #0 = { noinline nounwind }
diff --git a/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll b/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll
new file mode 100644
index 0000000..db0d8ed
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll
@@ -0,0 +1,153 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr5 | FileCheck %s --check-prefix=ELF64
+
+; Test sitofp
+
+define void @sitofp_double_i32(i32 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i32
+ %b.addr = alloca double, align 8
+ %conv = sitofp i32 %a to double
+; ELF64: std {{[0-9]+}}, -[[OFFSET:[0-9]+]](1)
+; ELF64: lfd {{[0-9]+}}, -[[OFFSET]](1)
+; ELF64: fcfid
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @sitofp_double_i64(i64 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i64
+ %b.addr = alloca double, align 8
+ %conv = sitofp i64 %a to double
+; ELF64: std {{[0-9]+}}, -[[OFFSET:[0-9]+]](1)
+; ELF64: lfd {{[0-9]+}}, -[[OFFSET]](1)
+; ELF64: fcfid
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @sitofp_double_i16(i16 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i16
+ %b.addr = alloca double, align 8
+ %conv = sitofp i16 %a to double
+; ELF64: extsh
+; ELF64: std {{[0-9]+}}, -[[OFFSET:[0-9]+]](1)
+; ELF64: lfd {{[0-9]+}}, -[[OFFSET]](1)
+; ELF64: fcfid
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @sitofp_double_i8(i8 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i8
+ %b.addr = alloca double, align 8
+ %conv = sitofp i8 %a to double
+; ELF64: extsb
+; ELF64: std {{[0-9]+}}, -[[OFFSET:[0-9]+]](1)
+; ELF64: lfd {{[0-9]+}}, -[[OFFSET]](1)
+; ELF64: fcfid
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+; Test fptosi
+
+define void @fptosi_float_i32(float %a) nounwind ssp {
+entry:
+; ELF64: fptosi_float_i32
+ %b.addr = alloca i32, align 4
+ %conv = fptosi float %a to i32
+; ELF64: fctiwz
+; ELF64: stfd
+; ELF64: lwa
+ store i32 %conv, i32* %b.addr, align 4
+ ret void
+}
+
+define void @fptosi_float_i64(float %a) nounwind ssp {
+entry:
+; ELF64: fptosi_float_i64
+ %b.addr = alloca i64, align 4
+ %conv = fptosi float %a to i64
+; ELF64: fctidz
+; ELF64: stfd
+; ELF64: ld
+ store i64 %conv, i64* %b.addr, align 4
+ ret void
+}
+
+define void @fptosi_double_i32(double %a) nounwind ssp {
+entry:
+; ELF64: fptosi_double_i32
+ %b.addr = alloca i32, align 8
+ %conv = fptosi double %a to i32
+; ELF64: fctiwz
+; ELF64: stfd
+; ELF64: lwa
+ store i32 %conv, i32* %b.addr, align 8
+ ret void
+}
+
+define void @fptosi_double_i64(double %a) nounwind ssp {
+entry:
+; ELF64: fptosi_double_i64
+ %b.addr = alloca i64, align 8
+ %conv = fptosi double %a to i64
+; ELF64: fctidz
+; ELF64: stfd
+; ELF64: ld
+ store i64 %conv, i64* %b.addr, align 8
+ ret void
+}
+
+; Test fptoui
+
+define void @fptoui_float_i32(float %a) nounwind ssp {
+entry:
+; ELF64: fptoui_float_i32
+ %b.addr = alloca i32, align 4
+ %conv = fptoui float %a to i32
+; ELF64: fctidz
+; ELF64: stfd
+; ELF64: lwz
+ store i32 %conv, i32* %b.addr, align 4
+ ret void
+}
+
+define void @fptoui_float_i64(float %a) nounwind ssp {
+entry:
+; ELF64: fptoui_float_i64
+ %b.addr = alloca i64, align 4
+ %conv = fptoui float %a to i64
+; ELF64: fctiduz
+; ELF64: stfd
+; ELF64: ld
+ store i64 %conv, i64* %b.addr, align 4
+ ret void
+}
+
+define void @fptoui_double_i32(double %a) nounwind ssp {
+entry:
+; ELF64: fptoui_double_i32
+ %b.addr = alloca i32, align 8
+ %conv = fptoui double %a to i32
+; ELF64: fctidz
+; ELF64: stfd
+; ELF64: lwz
+ store i32 %conv, i32* %b.addr, align 8
+ ret void
+}
+
+define void @fptoui_double_i64(double %a) nounwind ssp {
+entry:
+; ELF64: fptoui_double_i64
+ %b.addr = alloca i64, align 8
+ %conv = fptoui double %a to i64
+; ELF64: fctiduz
+; ELF64: stfd
+; ELF64: ld
+ store i64 %conv, i64* %b.addr, align 8
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/float-to-int.ll b/test/CodeGen/PowerPC/float-to-int.ll
index 39cd4f9..9c897cb 100644
--- a/test/CodeGen/PowerPC/float-to-int.ll
+++ b/test/CodeGen/PowerPC/float-to-int.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck -check-prefix=CHECK-VSX %s
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -12,6 +13,12 @@ define i64 @foo(float %a) nounwind {
; CHECK: stfd [[REG]],
; CHECK: ld 3,
; CHECK: blr
+
+; CHECK-VSX: @foo
+; CHECK-VSX: xscvdpsxds [[REG:[0-9]+]], 1
+; CHECK-VSX: stxsdx [[REG]],
+; CHECK-VSX: ld 3,
+; CHECK-VSX: blr
}
define i64 @foo2(double %a) nounwind {
@@ -23,6 +30,12 @@ define i64 @foo2(double %a) nounwind {
; CHECK: stfd [[REG]],
; CHECK: ld 3,
; CHECK: blr
+
+; CHECK-VSX: @foo2
+; CHECK-VSX: xscvdpsxds [[REG:[0-9]+]], 1
+; CHECK-VSX: stxsdx [[REG]],
+; CHECK-VSX: ld 3,
+; CHECK-VSX: blr
}
define i64 @foo3(float %a) nounwind {
@@ -34,6 +47,12 @@ define i64 @foo3(float %a) nounwind {
; CHECK: stfd [[REG]],
; CHECK: ld 3,
; CHECK: blr
+
+; CHECK-VSX: @foo3
+; CHECK-VSX: xscvdpuxds [[REG:[0-9]+]], 1
+; CHECK-VSX: stxsdx [[REG]],
+; CHECK-VSX: ld 3,
+; CHECK-VSX: blr
}
define i64 @foo4(double %a) nounwind {
@@ -45,6 +64,12 @@ define i64 @foo4(double %a) nounwind {
; CHECK: stfd [[REG]],
; CHECK: ld 3,
; CHECK: blr
+
+; CHECK-VSX: @foo4
+; CHECK-VSX: xscvdpuxds [[REG:[0-9]+]], 1
+; CHECK-VSX: stxsdx [[REG]],
+; CHECK-VSX: ld 3,
+; CHECK-VSX: blr
}
define i32 @goo(float %a) nounwind {
@@ -56,6 +81,12 @@ define i32 @goo(float %a) nounwind {
; CHECK: stfiwx [[REG]],
; CHECK: lwz 3,
; CHECK: blr
+
+; CHECK-VSX: @goo
+; CHECK-VSX: xscvdpsxws [[REG:[0-9]+]], 1
+; CHECK-VSX: stfiwx [[REG]],
+; CHECK-VSX: lwz 3,
+; CHECK-VSX: blr
}
define i32 @goo2(double %a) nounwind {
@@ -67,6 +98,12 @@ define i32 @goo2(double %a) nounwind {
; CHECK: stfiwx [[REG]],
; CHECK: lwz 3,
; CHECK: blr
+
+; CHECK-VSX: @goo2
+; CHECK-VSX: xscvdpsxws [[REG:[0-9]+]], 1
+; CHECK-VSX: stfiwx [[REG]],
+; CHECK-VSX: lwz 3,
+; CHECK-VSX: blr
}
define i32 @goo3(float %a) nounwind {
@@ -78,6 +115,12 @@ define i32 @goo3(float %a) nounwind {
; CHECK: stfiwx [[REG]],
; CHECK: lwz 3,
; CHECK: blr
+
+; CHECK-VSX: @goo3
+; CHECK-VSX: xscvdpuxws [[REG:[0-9]+]], 1
+; CHECK-VSX: stfiwx [[REG]],
+; CHECK-VSX: lwz 3,
+; CHECK-VSX: blr
}
define i32 @goo4(double %a) nounwind {
@@ -89,5 +132,11 @@ define i32 @goo4(double %a) nounwind {
; CHECK: stfiwx [[REG]],
; CHECK: lwz 3,
; CHECK: blr
+
+; CHECK-VSX: @goo4
+; CHECK-VSX: xscvdpuxws [[REG:[0-9]+]], 1
+; CHECK-VSX: stfiwx [[REG]],
+; CHECK-VSX: lwz 3,
+; CHECK-VSX: blr
}
diff --git a/test/CodeGen/PowerPC/fold-zero.ll b/test/CodeGen/PowerPC/fold-zero.ll
index c7ec6fa..c1eea43 100644
--- a/test/CodeGen/PowerPC/fold-zero.ll
+++ b/test/CodeGen/PowerPC/fold-zero.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-crbits | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck -check-prefix=CHECK-CRB %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -12,3 +13,13 @@ define i32 @test1(i1 %a, i32 %c) nounwind {
; CHECK: blr
}
+define i32 @test2(i1 %a, i32 %c) nounwind {
+ %x = select i1 %a, i32 0, i32 %c
+ ret i32 %x
+
+; CHECK-CRB: @test2
+; CHECK-CRB-NOT: li {{[0-9]+}}, 0
+; CHECK-CRB: isel 3, 0,
+; CHECK-CRB: blr
+}
+
diff --git a/test/CodeGen/PowerPC/hello-reloc.s b/test/CodeGen/PowerPC/hello-reloc.s
index 9bbfb38..1e3fb8f 100644
--- a/test/CodeGen/PowerPC/hello-reloc.s
+++ b/test/CodeGen/PowerPC/hello-reloc.s
@@ -1,14 +1,10 @@
; This tests for the basic implementation of PPCMachObjectWriter.cpp,
; which is responsible for writing mach-o relocation entries for (PIC)
; PowerPC objects.
-; NOTE: Darwin PPC asm syntax is not yet supported by PPCAsmParser,
-; so this test case uses ELF PPC asm syntax to produce a mach-o object.
-; Once PPCAsmParser supports darwin asm syntax, this test case should
-; be updated accordingly.
; RUN: llvm-mc -filetype=obj -relocation-model=pic -mcpu=g4 -triple=powerpc-apple-darwin8 %s -o - | llvm-readobj -relocations | FileCheck -check-prefix=DARWIN-G4-DUMP %s
-; .machine ppc7400
+ .machine ppc7400
.section __TEXT,__textcoal_nt,coalesced,pure_instructions
.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
.section __TEXT,__text,regular,pure_instructions
@@ -16,40 +12,40 @@
.align 4
_main: ; @main
; BB#0: ; %entry
- mflr 0
- stw 31, -4(1)
- stw 0, 8(1)
- stwu 1, -80(1)
+ mflr r0
+ stw r31, -4(r1)
+ stw r0, 8(r1)
+ stwu r1, -80(r1)
bl L0$pb
L0$pb:
- mr 31, 1
- li 5, 0
+ mr r31, r1
+ li r5, 0
mflr 2
- stw 3, 68(31)
- stw 5, 72(31)
- stw 4, 64(31)
- addis 2, 2, (L_.str-L0$pb)@ha
- la 3, (L_.str-L0$pb)@l(2)
+ stw r3, 68(r31)
+ stw r5, 72(r31)
+ stw r4, 64(r31)
+ addis r2, r2, ha16(L_.str-L0$pb)
+ la r3, lo16(L_.str-L0$pb)(r2)
bl L_puts$stub
- li 3, 0
- addi 1, 1, 80
- lwz 0, 8(1)
- lwz 31, -4(1)
- mtlr 0
+ li r3, 0
+ addi r1, r1, 80
+ lwz r0, 8(r1)
+ lwz r31, -4(r1)
+ mtlr r0
blr
.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
.align 4
L_puts$stub:
.indirect_symbol _puts
- mflr 0
+ mflr r0
bcl 20, 31, L_puts$stub$tmp
L_puts$stub$tmp:
- mflr 11
- addis 11, 11, (L_puts$lazy_ptr-L_puts$stub$tmp)@ha
- mtlr 0
- lwzu 12, (L_puts$lazy_ptr-L_puts$stub$tmp)@l(11)
- mtctr 12
+ mflr r11
+ addis r11, r11, ha16(L_puts$lazy_ptr-L_puts$stub$tmp)
+ mtlr r0
+ lwzu r12, lo16(L_puts$lazy_ptr-L_puts$stub$tmp)(r11)
+ mtctr r12
bctr
.section __DATA,__la_symbol_ptr,lazy_symbol_pointers
L_puts$lazy_ptr:
diff --git a/test/CodeGen/PowerPC/i1-to-double.ll b/test/CodeGen/PowerPC/i1-to-double.ll
new file mode 100644
index 0000000..e3d9fc2
--- /dev/null
+++ b/test/CodeGen/PowerPC/i1-to-double.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=ppc32 -mcpu=ppc32 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+define double @test(i1 %X) {
+ %Y = uitofp i1 %X to double
+ ret double %Y
+}
+
+; CHECK-LABEL: @test
+
+; CHECK: andi. {{[0-9]+}}, 3, 1
+; CHECK: bc 12, 1,
+
+; CHECK: li 3, .LCP[[L1:[A-Z0-9_]+]]@l
+; CHECK: addis 3, 3, .LCP[[L1]]@ha
+; CHECK: lfs 1, 0(3)
+; CHECK: blr
+
+; CHECK: li 3, .LCP[[L2:[A-Z0-9_]+]]@l
+; CHECK: addis 3, 3, .LCP[[L2]]@ha
+; CHECK: lfs 1, 0(3)
+; CHECK: blr
+
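As an aside on the new test above: uitofp on an i1 yields 1.0 for true and 0.0 for false, which is why the checks expect a compare-and-branch ending in one of two constant-pool loads. A minimal sketch of an equivalent formulation (the @i1_to_double_select name is hypothetical, not part of this patch):

define double @i1_to_double_select(i1 %X) {
  ; Same result as "uitofp i1 %X to double": 1.0 when %X is true, 0.0 otherwise.
  %Y = select i1 %X, double 1.0, double 0.0
  ret double %Y
}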
diff --git a/test/CodeGen/PowerPC/i32-to-float.ll b/test/CodeGen/PowerPC/i32-to-float.ll
index 2707d03..371f4e8 100644
--- a/test/CodeGen/PowerPC/i32-to-float.ll
+++ b/test/CodeGen/PowerPC/i32-to-float.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr6 | FileCheck -check-prefix=CHECK-PWR6 %s
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 | FileCheck -check-prefix=CHECK-A2 %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck -check-prefix=CHECK-VSX %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -29,6 +30,12 @@ entry:
; CHECK-A2: lfiwax [[REG:[0-9]+]],
; CHECK-A2: fcfids 1, [[REG]]
; CHECK-A2: blr
+
+; CHECK-VSX: @foo
+; CHECK-VSX: stw 3,
+; CHECK-VSX: lfiwax [[REG:[0-9]+]],
+; CHECK-VSX: fcfids 1, [[REG]]
+; CHECK-VSX: blr
}
define double @goo(i32 %a) nounwind {
@@ -54,6 +61,12 @@ entry:
; CHECK-A2: lfiwax [[REG:[0-9]+]],
; CHECK-A2: fcfid 1, [[REG]]
; CHECK-A2: blr
+
+; CHECK-VSX: @goo
+; CHECK-VSX: stw 3,
+; CHECK-VSX: lfiwax [[REG:[0-9]+]],
+; CHECK-VSX: xscvsxddp 1, [[REG]]
+; CHECK-VSX: blr
}
define float @foou(i32 %a) nounwind {
@@ -66,6 +79,12 @@ entry:
; CHECK-A2: lfiwzx [[REG:[0-9]+]],
; CHECK-A2: fcfidus 1, [[REG]]
; CHECK-A2: blr
+
+; CHECK-VSX: @foou
+; CHECK-VSX: stw 3,
+; CHECK-VSX: lfiwzx [[REG:[0-9]+]],
+; CHECK-VSX: fcfidus 1, [[REG]]
+; CHECK-VSX: blr
}
define double @goou(i32 %a) nounwind {
@@ -78,5 +97,11 @@ entry:
; CHECK-A2: lfiwzx [[REG:[0-9]+]],
; CHECK-A2: fcfidu 1, [[REG]]
; CHECK-A2: blr
+
+; CHECK-VSX: @goou
+; CHECK-VSX: stw 3,
+; CHECK-VSX: lfiwzx [[REG:[0-9]+]],
+; CHECK-VSX: xscvuxddp 1, [[REG]]
+; CHECK-VSX: blr
}
diff --git a/test/CodeGen/PowerPC/i64-to-float.ll b/test/CodeGen/PowerPC/i64-to-float.ll
index b81d109..025a875 100644
--- a/test/CodeGen/PowerPC/i64-to-float.ll
+++ b/test/CodeGen/PowerPC/i64-to-float.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck -check-prefix=CHECK-VSX %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -12,6 +13,12 @@ entry:
; CHECK: lfd [[REG:[0-9]+]],
; CHECK: fcfids 1, [[REG]]
; CHECK: blr
+
+; CHECK-VSX: @foo
+; CHECK-VSX: std 3,
+; CHECK-VSX: lxsdx [[REG:[0-9]+]],
+; CHECK-VSX: fcfids 1, [[REG]]
+; CHECK-VSX: blr
}
define double @goo(i64 %a) nounwind {
@@ -24,6 +31,12 @@ entry:
; CHECK: lfd [[REG:[0-9]+]],
; CHECK: fcfid 1, [[REG]]
; CHECK: blr
+
+; CHECK-VSX: @goo
+; CHECK-VSX: std 3,
+; CHECK-VSX: lxsdx [[REG:[0-9]+]],
+; CHECK-VSX: xscvsxddp 1, [[REG]]
+; CHECK-VSX: blr
}
define float @foou(i64 %a) nounwind {
@@ -36,6 +49,12 @@ entry:
; CHECK: lfd [[REG:[0-9]+]],
; CHECK: fcfidus 1, [[REG]]
; CHECK: blr
+
+; CHECK-VSX: @foou
+; CHECK-VSX: std 3,
+; CHECK-VSX: lxsdx [[REG:[0-9]+]],
+; CHECK-VSX: fcfidus 1, [[REG]]
+; CHECK-VSX: blr
}
define double @goou(i64 %a) nounwind {
@@ -48,5 +67,11 @@ entry:
; CHECK: lfd [[REG:[0-9]+]],
; CHECK: fcfidu 1, [[REG]]
; CHECK: blr
+
+; CHECK-VSX: @goou
+; CHECK-VSX: std 3,
+; CHECK-VSX: lxsdx [[REG:[0-9]+]],
+; CHECK-VSX: xscvuxddp 1, [[REG]]
+; CHECK-VSX: blr
}
diff --git a/test/CodeGen/PowerPC/inlineasm-copy.ll b/test/CodeGen/PowerPC/inlineasm-copy.ll
index 59c3388..0d5f6a6 100644
--- a/test/CodeGen/PowerPC/inlineasm-copy.ll
+++ b/test/CodeGen/PowerPC/inlineasm-copy.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=ppc32 -no-integrated-as -verify-machineinstrs | FileCheck %s
; CHECK-NOT: mr
define i32 @test(i32 %Y, i32 %X) {
diff --git a/test/CodeGen/PowerPC/jaggedstructs.ll b/test/CodeGen/PowerPC/jaggedstructs.ll
index 82d4fef..9365e58 100644
--- a/test/CodeGen/PowerPC/jaggedstructs.ll
+++ b/test/CodeGen/PowerPC/jaggedstructs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 -O0 -fast-isel=false < %s | FileCheck %s
+; RUN: llc -mcpu=ppc64 -O0 -fast-isel=false < %s | FileCheck %s
; This tests receiving and re-passing parameters consisting of structures
; of size 3, 5, 6, and 7. They are to be found/placed right-adjusted in
diff --git a/test/CodeGen/PowerPC/lsa.ll b/test/CodeGen/PowerPC/lsa.ll
index 8a6338e..a892a4c 100644
--- a/test/CodeGen/PowerPC/lsa.ll
+++ b/test/CodeGen/PowerPC/lsa.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=ppc64 | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/mature-mc-support.ll b/test/CodeGen/PowerPC/mature-mc-support.ll
new file mode 100644
index 0000000..7c83e18
--- /dev/null
+++ b/test/CodeGen/PowerPC/mature-mc-support.ll
@@ -0,0 +1,27 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+; FIXME: PowerPC doesn't use the integrated assembler by default in all cases,
+; so we only test that -filetype=obj tries to parse the assembly.
+; FIXME: PowerPC doesn't appear to support -filetype=obj for ppc64le
+
+; SKIP: not llc -march=ppc32 < %s > /dev/null 2> %t1
+; SKIP: FileCheck %s < %t1
+
+; RUN: not llc -march=ppc32 -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+; SKIP: not llc -march=ppc64 < %s > /dev/null 2> %t3
+; SKIP: FileCheck %s < %t3
+
+; RUN: not llc -march=ppc64 -filetype=obj < %s > /dev/null 2> %t4
+; RUN: FileCheck %s < %t4
+
+; SKIP: not llc -march=ppc64le < %s > /dev/null 2> %t5
+; SKIP: FileCheck %s < %t5
+
+; SKIP: not llc -march=ppc64le -filetype=obj < %s > /dev/null 2> %t6
+; SKIP: FileCheck %s < %t6
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/PowerPC/optcmp.ll b/test/CodeGen/PowerPC/optcmp.ll
index 35aabfa..d929eae 100644
--- a/test/CodeGen/PowerPC/optcmp.ll
+++ b/test/CodeGen/PowerPC/optcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -disable-ppc-cmp-opt=0 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -mattr=-crbits -disable-ppc-cmp-opt=0 | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll b/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll
new file mode 100644
index 0000000..6e0aec2
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=ppc32 -mcpu=ppc32 | FileCheck %s
+; RUN: llc < %s -march=ppc32 -mcpu=ppc32 -mtriple=powerpc-darwin | FileCheck %s -check-prefix=CHECK-D
+target triple = "powerpc-unknown-linux-gnu"
+
+declare void @printf(i8*, ...)
+
+define void @main() {
+ call void (i8*, ...)* @printf(i8* undef, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @main
+; CHECK-DAG: li 4, 0
+; CHECK-DAG: crxor 6, 6, 6
+; CHECK: bl printf
+
+; CHECK-D-LABEL: @main
+; CHECK-D: li r4, 0
+; CHECK-D: bl L_printf$stub
+
diff --git a/test/CodeGen/PowerPC/ppc32-vacopy.ll b/test/CodeGen/PowerPC/ppc32-vacopy.ll
index bc39412..fa54045 100644
--- a/test/CodeGen/PowerPC/ppc32-vacopy.ll
+++ b/test/CodeGen/PowerPC/ppc32-vacopy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple="powerpc-unknown-linux-gnu" < %s | FileCheck %s
+; RUN: llc -mtriple="powerpc-unknown-linux-gnu" -mcpu=ppc64 < %s | FileCheck %s
; PR15286
%va_list = type {i8, i8, i16, i8*, i8*}
diff --git a/test/CodeGen/PowerPC/pr17168.ll b/test/CodeGen/PowerPC/pr17168.ll
index 2848221..24bcda0 100644
--- a/test/CodeGen/PowerPC/pr17168.ll
+++ b/test/CodeGen/PowerPC/pr17168.ll
@@ -56,7 +56,7 @@ attributes #1 = { nounwind readnone }
!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 190311)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !298, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
!1 = metadata !{metadata !"bt.c", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
-!2 = metadata !{i32 0}
+!2 = metadata !{}
!3 = metadata !{metadata !4, metadata !82, metadata !102, metadata !114, metadata !132, metadata !145, metadata !154, metadata !155, metadata !162, metadata !183, metadata !200, metadata !201, metadata !207, metadata !208, metadata !215, metadata !221, metadata !230, metadata !238, metadata !246, metadata !255, metadata !260, metadata !261, metadata !268, metadata !274, metadata !279, metadata !280, metadata !287, metadata !293}
!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 74, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !12, i32 74} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
diff --git a/test/CodeGen/PowerPC/private.ll b/test/CodeGen/PowerPC/private.ll
index f9405f6..633fa65 100644
--- a/test/CodeGen/PowerPC/private.ll
+++ b/test/CodeGen/PowerPC/private.ll
@@ -1,24 +1,28 @@
; Test to make sure that the 'private' is used correctly.
;
-; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu > %t
-; RUN: grep .Lfoo: %t
-; RUN: grep bl.*\.Lfoo %t
-; RUN: grep .Lbaz: %t
-; RUN: grep lis.*\.Lbaz %t
-; RUN: llc < %s -mtriple=powerpc-apple-darwin > %t
-; RUN: grep L_foo: %t
-; RUN: grep bl.*\L_foo %t
-; RUN: grep L_baz: %t
-; RUN: grep lis.*\L_baz %t
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | \
+; RUN: FileCheck --check-prefix=LINUX %s
+;
+; RUN: llc < %s -mtriple=powerpc-apple-darwin | \
+; RUN: FileCheck --check-prefix=OSX %s
+; LINUX: .Lfoo:
+; OSX: l_foo:
define private void @foo() nounwind {
ret void
}
-@baz = private global i32 4
-
define i32 @bar() nounwind {
+; LINUX: bl{{.*}}.Lfoo
+; OSX: bl{{.*}}l_foo
call void @foo()
+
+; LINUX: lis{{.*}}.Lbaz
+; OSX: lis{{.*}}l_baz
%1 = load i32* @baz, align 4
ret i32 %1
}
+
+; LINUX: .Lbaz:
+; OSX: l_baz:
+@baz = private global i32 4
diff --git a/test/CodeGen/PowerPC/pwr7-gt-nop.ll b/test/CodeGen/PowerPC/pwr7-gt-nop.ll
new file mode 100644
index 0000000..8c8545d
--- /dev/null
+++ b/test/CodeGen/PowerPC/pwr7-gt-nop.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mcpu=pwr7 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c, float* nocapture %d) #0 {
+
+; CHECK-LABEL: @foo
+
+entry:
+ %0 = load float* %b, align 4
+ store float %0, float* %a, align 4
+ %1 = load float* %c, align 4
+ store float %1, float* %b, align 4
+ %2 = load float* %a, align 4
+ store float %2, float* %d, align 4
+ ret void
+
+; CHECK: lfs [[REG1:[0-9]+]], 0(4)
+; CHECK: stfs [[REG1]], 0(3)
+; CHECK: ori 2, 2, 0
+; CHECK: lfs [[REG2:[0-9]+]], 0(5)
+; CHECK: stfs [[REG2]], 0(4)
+; CHECK: ori 2, 2, 0
+; CHECK: lfs [[REG3:[0-9]+]], 0(3)
+; CHECK: stfs [[REG3]], 0(6)
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/rlwimi-and.ll b/test/CodeGen/PowerPC/rlwimi-and.ll
index 7963249..213363e 100644
--- a/test/CodeGen/PowerPC/rlwimi-and.ll
+++ b/test/CodeGen/PowerPC/rlwimi-and.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=-crbits < %s | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-bgq-linux"
diff --git a/test/CodeGen/PowerPC/sdag-ppcf128.ll b/test/CodeGen/PowerPC/sdag-ppcf128.ll
index 535ece6..c46bc6b 100644
--- a/test/CodeGen/PowerPC/sdag-ppcf128.ll
+++ b/test/CodeGen/PowerPC/sdag-ppcf128.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=-crbits < %s | FileCheck %s
;
; PR14751: Unsupported type in SelectionDAG::getConstantFP()
diff --git a/test/CodeGen/PowerPC/setcc_no_zext.ll b/test/CodeGen/PowerPC/setcc_no_zext.ll
index 9b2036e..467e921 100644
--- a/test/CodeGen/PowerPC/setcc_no_zext.ll
+++ b/test/CodeGen/PowerPC/setcc_no_zext.ll
@@ -1,5 +1,9 @@
; RUN: llc < %s -march=ppc32 | not grep rlwinm
+; FIXME: This optimization has temporarily regressed with crbits enabled by
+; default at the default CodeOpt level.
+; XFAIL: *
+
define i32 @setcc_one_or_zero(i32* %a) {
entry:
%tmp.1 = icmp ne i32* %a, null ; <i1> [#uses=1]
diff --git a/test/CodeGen/PowerPC/seteq-0.ll b/test/CodeGen/PowerPC/seteq-0.ll
index 7319583..b7dd780 100644
--- a/test/CodeGen/PowerPC/seteq-0.ll
+++ b/test/CodeGen/PowerPC/seteq-0.ll
@@ -1,9 +1,12 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | \
-; RUN: grep "srwi r., r., 5"
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s
define i32 @eq0(i32 %a) {
%tmp.1 = icmp eq i32 %a, 0 ; <i1> [#uses=1]
%tmp.2 = zext i1 %tmp.1 to i32 ; <i32> [#uses=1]
ret i32 %tmp.2
+
+; CHECK: cntlzw [[REG:r[0-9]+]], r3
+; CHECK: rlwinm r3, [[REG]], 27, 31, 31
+; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/sjlj.ll b/test/CodeGen/PowerPC/sjlj.ll
index 414640b..f9f887a 100644
--- a/test/CodeGen/PowerPC/sjlj.ll
+++ b/test/CodeGen/PowerPC/sjlj.ll
@@ -134,8 +134,8 @@ return: ; preds = %if.end, %if.then
; CHECK: @main2
; CHECK: addis [[REG:[0-9]+]], 2, env_sigill@toc@ha
-; CHECK: std 31, env_sigill@toc@l([[REG]])
-; CHECK: addi [[REGB:[0-9]+]], [[REG]], env_sigill@toc@l
+; CHECK-DAG: std 31, env_sigill@toc@l([[REG]])
+; CHECK-DAG: addi [[REGB:[0-9]+]], [[REG]], env_sigill@toc@l
; CHECK-DAG: std [[REGB]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill
; CHECK-DAG: std 1, 16([[REGB]])
; CHECK-DAG: std 2, 24([[REGB]])
diff --git a/test/CodeGen/PowerPC/spill-nor0.ll b/test/CodeGen/PowerPC/spill-nor0.ll
new file mode 100644
index 0000000..65bdc09
--- /dev/null
+++ b/test/CodeGen/PowerPC/spill-nor0.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -O0 -mcpu=ppc64 | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @_ZN4llvm3sys17RunningOnValgrindEv() #0 {
+entry:
+ br i1 undef, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ ret void
+
+if.end: ; preds = %entry
+ %0 = call i64 asm sideeffect "mr 3,$1\0A\09mr 4,$2\0A\09rotldi 0,0,3 ; rotldi 0,0,13\0A\09rotldi 0,0,61 ; rotldi 0,0,51\0A\09or 1,1,1\0A\09mr $0,3", "=b,b,b,~{cc},~{memory},~{r3},~{r4}"(i32 0, i64* undef) #0
+ unreachable
+
+; CHECK-LABEL: @_ZN4llvm3sys17RunningOnValgrindEv
+; CHECK: stw
+; CHECK: lwz
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/srl-mask.ll b/test/CodeGen/PowerPC/srl-mask.ll
new file mode 100644
index 0000000..2749df9
--- /dev/null
+++ b/test/CodeGen/PowerPC/srl-mask.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i64 @foo(i64 %x) #0 {
+entry:
+; CHECK-LABEL: @foo
+ %a = lshr i64 %x, 35
+ %b = and i64 %a, 65535
+; CHECK: rldicl 3, 3, 29, 48
+ ret i64 %b
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/stfiwx.ll b/test/CodeGen/PowerPC/stfiwx.ll
index 1ad558c..588e44f 100644
--- a/test/CodeGen/PowerPC/stfiwx.ll
+++ b/test/CodeGen/PowerPC/stfiwx.ll
@@ -1,18 +1,27 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=stfiwx -o %t1
-; RUN: grep stfiwx %t1
-; RUN: not grep r1 %t1
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=-stfiwx \
-; RUN: -o %t2
-; RUN: not grep stfiwx %t2
-; RUN: grep r1 %t2
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=stfiwx | FileCheck %s
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=-stfiwx | FileCheck -check-prefix=CHECK-LS %s
-define void @test(float %a, i32* %b) nounwind {
+define void @test1(float %a, i32* %b) nounwind {
+; CHECK-LABEL: @test1
+; CHECK-LS-LABEL: @test1
%tmp.2 = fptosi float %a to i32 ; <i32> [#uses=1]
store i32 %tmp.2, i32* %b
ret void
+
+; CHECK-NOT: lwz
+; CHECK-NOT: stw
+; CHECK: stfiwx
+; CHECK: blr
+
+; CHECK-LS: lwz
+; CHECK-LS: stw
+; CHECK-LS-NOT: stfiwx
+; CHECK-LS: blr
}
define void @test2(float %a, i32* %b, i32 %i) nounwind {
+; CHECK-LABEL: @test2
+; CHECK-LS-LABEL: @test2
%tmp.2 = getelementptr i32* %b, i32 1 ; <i32*> [#uses=1]
%tmp.5 = getelementptr i32* %b, i32 %i ; <i32*> [#uses=1]
%tmp.7 = fptosi float %a to i32 ; <i32> [#uses=3]
@@ -20,5 +29,15 @@ define void @test2(float %a, i32* %b, i32 %i) nounwind {
store i32 %tmp.7, i32* %tmp.2
store i32 %tmp.7, i32* %b
ret void
+
+; CHECK-NOT: lwz
+; CHECK-NOT: stw
+; CHECK: stfiwx
+; CHECK: blr
+
+; CHECK-LS: lwz
+; CHECK-LS: stw
+; CHECK-LS-NOT: stfiwx
+; CHECK-LS: blr
}
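The two RUN lines above differ only in the stfiwx feature bit. A rough sketch of the two lowerings being checked, assuming the usual big-endian stack-slot round trip when stfiwx is unavailable; register names and offsets are placeholders, not taken from the test:

; +stfiwx:  fctiwz f0, f1        ; convert to a 32-bit integer inside the FPR
;           stfiwx f0, 0, r4     ; store the integer word straight from the FPR
; -stfiwx:  fctiwz f0, f1
;           stfd   f0, -8(r1)    ; spill the whole FPR to a stack slot
;           lwz    r2, -4(r1)    ; reload the low (integer) word
;           stw    r2, 0(r4)     ; store it to the destination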
diff --git a/test/CodeGen/PowerPC/structsinmem.ll b/test/CodeGen/PowerPC/structsinmem.ll
index 5b8dead..b5552af 100644
--- a/test/CodeGen/PowerPC/structsinmem.ll
+++ b/test/CodeGen/PowerPC/structsinmem.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim -fast-isel=false < %s | FileCheck %s
+; RUN: llc -mcpu=ppc64 -O0 -disable-fp-elim -fast-isel=false < %s | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll
index fb3bd7c..cfe32e9 100644
--- a/test/CodeGen/PowerPC/structsinregs.ll
+++ b/test/CodeGen/PowerPC/structsinregs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim -fast-isel=false < %s | FileCheck %s
+; RUN: llc -mcpu=ppc64 -O0 -disable-fp-elim -fast-isel=false < %s | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/subsumes-pred-regs.ll b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
index 97ac788..da637cd 100644
--- a/test/CodeGen/PowerPC/subsumes-pred-regs.ll
+++ b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=ppc64 | FileCheck %s
+; RUN: llc < %s -mcpu=ppc64 -mattr=-crbits | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/tls-2.ll b/test/CodeGen/PowerPC/tls-2.ll
deleted file mode 100644
index c2faf90..0000000
--- a/test/CodeGen/PowerPC/tls-2.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-freebsd10.0"
-; RUN: llc -O1 < %s -march=ppc64 | FileCheck %s
-
-@a = thread_local global i32 0, align 4
-
-;CHECK-LABEL: localexec:
-define i32 @localexec() nounwind {
-entry:
-;CHECK: addis [[REG1:[0-9]+]], 13, a@tprel@ha
-;CHECK-NEXT: li [[REG2:[0-9]+]], 42
-;CHECK-NEXT: stw [[REG2]], a@tprel@l([[REG1]])
- store i32 42, i32* @a, align 4
- ret i32 0
-}
diff --git a/test/CodeGen/PowerPC/tls-gd.ll b/test/CodeGen/PowerPC/tls-gd.ll
deleted file mode 100644
index 5f0ef9a..0000000
--- a/test/CodeGen/PowerPC/tls-gd.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck %s
-
-; Test correct assembly code generation for thread-local storage using
-; the general dynamic model.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = thread_local global i32 0, align 4
-
-define signext i32 @main() nounwind {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- %0 = load i32* @a, align 4
- ret i32 %0
-}
-
-; CHECK: addis [[REG:[0-9]+]], 2, a@got@tlsgd@ha
-; CHECK-NEXT: addi 3, [[REG]], a@got@tlsgd@l
-; CHECK: bl __tls_get_addr(a@tlsgd)
-; CHECK-NEXT: nop
-
diff --git a/test/CodeGen/PowerPC/tls-ie.ll b/test/CodeGen/PowerPC/tls-ie.ll
deleted file mode 100644
index c5cfba7..0000000
--- a/test/CodeGen/PowerPC/tls-ie.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 <%s | FileCheck %s
-
-; Test correct assembly code generation for thread-local storage
-; using the initial-exec model.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = external thread_local global i32
-
-define signext i32 @main() nounwind {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- %0 = load i32* @a, align 4
- ret i32 %0
-}
-
-; CHECK: addis [[REG1:[0-9]+]], 2, a@got@tprel@ha
-; CHECK: ld [[REG2:[0-9]+]], a@got@tprel@l([[REG1]])
-; CHECK: add {{[0-9]+}}, [[REG2]], a@tls
-
diff --git a/test/CodeGen/PowerPC/tls-ld-2.ll b/test/CodeGen/PowerPC/tls-ld-2.ll
deleted file mode 100644
index 4399b33..0000000
--- a/test/CodeGen/PowerPC/tls-ld-2.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O1 -relocation-model=pic < %s | FileCheck %s
-
-; Test peephole optimization for thread-local storage using the
-; local dynamic model.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = hidden thread_local global i32 0, align 4
-
-define signext i32 @main() nounwind {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- %0 = load i32* @a, align 4
- ret i32 %0
-}
-
-; CHECK: addis [[REG:[0-9]+]], 2, a@got@tlsld@ha
-; CHECK-NEXT: addi 3, [[REG]], a@got@tlsld@l
-; CHECK: bl __tls_get_addr(a@tlsld)
-; CHECK-NEXT: nop
-; CHECK: addis [[REG2:[0-9]+]], 3, a@dtprel@ha
-; CHECK-NEXT: lwa {{[0-9]+}}, a@dtprel@l([[REG2]])
diff --git a/test/CodeGen/PowerPC/tls-ld.ll b/test/CodeGen/PowerPC/tls-ld.ll
deleted file mode 100644
index db02a56..0000000
--- a/test/CodeGen/PowerPC/tls-ld.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck %s
-
-; Test correct assembly code generation for thread-local storage using
-; the local dynamic model.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = hidden thread_local global i32 0, align 4
-
-define signext i32 @main() nounwind {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- %0 = load i32* @a, align 4
- ret i32 %0
-}
-
-; CHECK: addis [[REG:[0-9]+]], 2, a@got@tlsld@ha
-; CHECK-NEXT: addi 3, [[REG]], a@got@tlsld@l
-; CHECK: bl __tls_get_addr(a@tlsld)
-; CHECK-NEXT: nop
-; CHECK: addis [[REG2:[0-9]+]], 3, a@dtprel@ha
-; CHECK-NEXT: addi {{[0-9]+}}, [[REG2]], a@dtprel@l
diff --git a/test/CodeGen/PowerPC/tls-pic.ll b/test/CodeGen/PowerPC/tls-pic.ll
new file mode 100644
index 0000000..9f3ab6e
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-pic.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=ppc64 -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck -check-prefix=OPT0 %s
+; RUN: llc -march=ppc64 -mcpu=pwr7 -O1 -relocation-model=pic < %s | FileCheck -check-prefix=OPT1 %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+; Test correct assembly code generation for thread-local storage using
+; the local dynamic model.
+
+@a = hidden thread_local global i32 0, align 4
+
+define signext i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = load i32* @a, align 4
+ ret i32 %0
+}
+
+; OPT0-LABEL: main:
+; OPT0: addis [[REG:[0-9]+]], 2, a@got@tlsld@ha
+; OPT0-NEXT: addi 3, [[REG]], a@got@tlsld@l
+; OPT0: bl __tls_get_addr(a@tlsld)
+; OPT0-NEXT: nop
+; OPT0: addis [[REG2:[0-9]+]], 3, a@dtprel@ha
+; OPT0-NEXT: addi {{[0-9]+}}, [[REG2]], a@dtprel@l
+
+; Test peephole optimization for thread-local storage using the
+; local dynamic model.
+
+; OPT1-LABEL: main:
+; OPT1: addis [[REG:[0-9]+]], 2, a@got@tlsld@ha
+; OPT1-NEXT: addi 3, [[REG]], a@got@tlsld@l
+; OPT1: bl __tls_get_addr(a@tlsld)
+; OPT1-NEXT: nop
+; OPT1: addis [[REG2:[0-9]+]], 3, a@dtprel@ha
+; OPT1-NEXT: lwa {{[0-9]+}}, a@dtprel@l([[REG2]])
+
+; Test correct assembly code generation for thread-local storage using
+; the general dynamic model.
+
+@a2 = thread_local global i32 0, align 4
+
+define signext i32 @main2() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = load i32* @a2, align 4
+ ret i32 %0
+}
+
+; OPT1-LABEL: main2
+; OPT1: addis [[REG:[0-9]+]], 2, a2@got@tlsgd@ha
+; OPT1-NEXT: addi 3, [[REG]], a2@got@tlsgd@l
+; OPT1: bl __tls_get_addr(a2@tlsgd)
+; OPT1-NEXT: nop
+
diff --git a/test/CodeGen/PowerPC/tls.ll b/test/CodeGen/PowerPC/tls.ll
index 4e0a822..59b4de7 100644
--- a/test/CodeGen/PowerPC/tls.ll
+++ b/test/CodeGen/PowerPC/tls.ll
@@ -1,7 +1,8 @@
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-freebsd10.0"
-; RUN: llc -O0 < %s -march=ppc64 | FileCheck -check-prefix=OPT0 %s
-; RUN: llc -O1 < %s -march=ppc64 | FileCheck -check-prefix=OPT1 %s
+; RUN: llc -O0 < %s -march=ppc64 -mcpu=ppc64 | FileCheck -check-prefix=OPT0 %s
+; RUN: llc -O1 < %s -march=ppc64 -mcpu=ppc64 | FileCheck -check-prefix=OPT1 %s
+; RUN: llc -O0 < %s -march=ppc32 -mcpu=ppc | FileCheck -check-prefix=OPT0-PPC32 %s
+
+target triple = "powerpc64-unknown-linux-gnu"
@a = thread_local global i32 0, align 4
@@ -19,3 +20,27 @@ entry:
store i32 42, i32* @a, align 4
ret i32 0
}
+
+; Test correct assembly code generation for thread-local storage
+; using the initial-exec model.
+
+@a2 = external thread_local global i32
+
+define signext i32 @main2() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = load i32* @a2, align 4
+ ret i32 %0
+}
+
+; OPT1-LABEL: main2:
+; OPT1: addis [[REG1:[0-9]+]], 2, a2@got@tprel@ha
+; OPT1: ld [[REG2:[0-9]+]], a2@got@tprel@l([[REG1]])
+; OPT1: add {{[0-9]+}}, [[REG2]], a2@tls
+
+;OPT0-PPC32-LABEL: main2:
+;OPT0-PPC32: li [[REG1:[0-9]+]], _GLOBAL_OFFSET_TABLE_@l
+;OPT0-PPC32: addis [[REG1]], [[REG1]], _GLOBAL_OFFSET_TABLE_@ha
+;OPT0-PPC32: lwz [[REG2:[0-9]+]], a2@got@tprel@l([[REG1]])
+;OPT0-PPC32: add 3, [[REG2]], a2@tls
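The main2/OPT0-PPC32 checks added above exercise the initial-exec TLS model. A sketch of what the sequence computes (my paraphrase; registers other than those in the checks are illustrative):

; lwz  rX, a2@got@tprel@l(rGOT)   ; load a2's offset from the thread pointer out of the GOT
; add  r3, rX, a2@tls             ; add the thread pointer (r2 on ppc32, r13 on ppc64)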
diff --git a/test/CodeGen/PowerPC/unaligned.ll b/test/CodeGen/PowerPC/unaligned.ll
index d050803..d469c62 100644
--- a/test/CodeGen/PowerPC/unaligned.ll
+++ b/test/CodeGen/PowerPC/unaligned.ll
@@ -65,9 +65,9 @@ entry:
; These loads and stores are legalized into aligned loads and stores
; using aligned stack slots.
; CHECK: @foo6
-; CHECK: ld
-; CHECK: ld
-; CHECK: std
-; CHECK: std
+; CHECK-DAG: ld
+; CHECK-DAG: ld
+; CHECK-DAG: stdx
+; CHECK: stdx
}
diff --git a/test/CodeGen/PowerPC/unwind-dw2-g.ll b/test/CodeGen/PowerPC/unwind-dw2-g.ll
index 260d036..24b5207 100644
--- a/test/CodeGen/PowerPC/unwind-dw2-g.ll
+++ b/test/CodeGen/PowerPC/unwind-dw2-g.ll
@@ -23,7 +23,7 @@ attributes #0 = { nounwind }
!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
!1 = metadata !{metadata !"/tmp/unwind-dw2.c", metadata !"/tmp"}
-!2 = metadata !{i32 0}
+!2 = metadata !{}
!3 = metadata !{metadata !4}
!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
index 83e0e02..4bce8c8 100644
--- a/test/CodeGen/PowerPC/vec_cmp.ll
+++ b/test/CodeGen/PowerPC/vec_cmp.ll
@@ -1,6 +1,6 @@
; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
-; Check vector comparisons using altivec. For non native types, just basic
+; Check vector comparisons using altivec. For non-native types, just basic
; comparison instruction check is done. For altivec-supported types (16i8,
; 8i16, 4i32, and 4f32) all the comparison operators (==, !=, >, >=, <, <=)
; are checked.
diff --git a/test/CodeGen/PowerPC/vsx-args.ll b/test/CodeGen/PowerPC/vsx-args.ll
new file mode 100644
index 0000000..520aeb5
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-args.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <2 x double> @sv(<2 x double>, <2 x i64>, <4 x float>) #0
+
+define <2 x double> @main(<4 x float> %a, <2 x double> %b, <2 x i64> %c) #1 {
+entry:
+ %ca = tail call <2 x double> @sv(<2 x double> %b, <2 x i64> %c, <4 x float> %a)
+ %v = fadd <2 x double> %ca, <double 1.0, double 1.0>
+ ret <2 x double> %v
+
+; CHECK-LABEL: @main
+; CHECK-DAG: vor [[V:[0-9]+]], 2, 2
+; CHECK-DAG: xxlor 34, 35, 35
+; CHECK-DAG: xxlor 35, 36, 36
+; CHECK-DAG: vor 4, [[V]], [[V]]
+; CHECK-DAG: bl sv
+; CHECK-DAG: lxvd2x [[VC:[0-9]+]],
+; CHECK: xvadddp 34, 34, [[VC]]
+; CHECK: blr
+}
+
+attributes #0 = { noinline nounwind readnone }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
new file mode 100644
index 0000000..da4a204
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -0,0 +1,238 @@
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+
+; Also run with -schedule-ppc-vsx-fma-mutation-early as a stress test for the
+; live-interval-updating logic.
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx -schedule-ppc-vsx-fma-mutation-early
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @test1(double %a, double %b, double %c, double %e, double* nocapture %d) #0 {
+entry:
+ %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
+ store double %0, double* %d, align 8
+ %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
+ %arrayidx1 = getelementptr inbounds double* %d, i64 1
+ store double %1, double* %arrayidx1, align 8
+ ret void
+
+; CHECK-LABEL: @test1
+; CHECK-DAG: li [[C1:[0-9]+]], 8
+; CHECK-DAG: xsmaddmdp 3, 2, 1
+; CHECK-DAG: xsmaddadp 1, 2, 4
+; CHECK-DAG: stxsdx 3, 0, 7
+; CHECK-DAG: stxsdx 1, 7, [[C1]]
+; CHECK: blr
+}
+
+define void @test2(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
+entry:
+ %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
+ store double %0, double* %d, align 8
+ %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
+ %arrayidx1 = getelementptr inbounds double* %d, i64 1
+ store double %1, double* %arrayidx1, align 8
+ %2 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
+ %arrayidx2 = getelementptr inbounds double* %d, i64 2
+ store double %2, double* %arrayidx2, align 8
+ ret void
+
+; CHECK-LABEL: @test2
+; CHECK-DAG: li [[C1:[0-9]+]], 8
+; CHECK-DAG: li [[C2:[0-9]+]], 16
+; CHECK-DAG: xsmaddmdp 3, 2, 1
+; CHECK-DAG: xsmaddmdp 4, 2, 1
+; CHECK-DAG: xsmaddadp 1, 2, 5
+; CHECK-DAG: stxsdx 3, 0, 8
+; CHECK-DAG: stxsdx 4, 8, [[C1]]
+; CHECK-DAG: stxsdx 1, 8, [[C2]]
+; CHECK: blr
+}
+
+define void @test3(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
+entry:
+ %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
+ store double %0, double* %d, align 8
+ %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
+ %2 = tail call double @llvm.fma.f64(double %b, double %c, double %1)
+ %arrayidx1 = getelementptr inbounds double* %d, i64 3
+ store double %2, double* %arrayidx1, align 8
+ %3 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
+ %arrayidx2 = getelementptr inbounds double* %d, i64 2
+ store double %3, double* %arrayidx2, align 8
+ %arrayidx3 = getelementptr inbounds double* %d, i64 1
+ store double %1, double* %arrayidx3, align 8
+ ret void
+
+; CHECK-LABEL: @test3
+; CHECK-DAG: fmr [[F1:[0-9]+]], 1
+; CHECK-DAG: li [[C1:[0-9]+]], 24
+; CHECK-DAG: li [[C2:[0-9]+]], 16
+; CHECK-DAG: li [[C3:[0-9]+]], 8
+; CHECK-DAG: xsmaddmdp 4, 2, 1
+; CHECK-DAG: xsmaddadp 1, 2, 5
+
+; Note: We could convert this next FMA to M-type as well, but it would require
+; re-ordering the instructions.
+; CHECK-DAG: xsmaddadp [[F1]], 2, 3
+
+; CHECK-DAG: xsmaddmdp 2, 3, 4
+; CHECK-DAG: stxsdx [[F1]], 0, 8
+; CHECK-DAG: stxsdx 2, 8, [[C1]]
+; CHECK-DAG: stxsdx 1, 8, [[C2]]
+; CHECK-DAG: stxsdx 4, 8, [[C3]]
+; CHECK: blr
+}
+
+define void @test4(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
+entry:
+ %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
+ store double %0, double* %d, align 8
+ %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
+ %arrayidx1 = getelementptr inbounds double* %d, i64 1
+ store double %1, double* %arrayidx1, align 8
+ %2 = tail call double @llvm.fma.f64(double %b, double %c, double %1)
+ %arrayidx3 = getelementptr inbounds double* %d, i64 3
+ store double %2, double* %arrayidx3, align 8
+ %3 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
+ %arrayidx4 = getelementptr inbounds double* %d, i64 2
+ store double %3, double* %arrayidx4, align 8
+ ret void
+
+; CHECK-LABEL: @test4
+; CHECK-DAG: fmr [[F1:[0-9]+]], 1
+; CHECK-DAG: li [[C1:[0-9]+]], 8
+; CHECK-DAG: li [[C2:[0-9]+]], 16
+; CHECK-DAG: xsmaddmdp 4, 2, 1
+
+; Note: We could convert this next FMA to M-type as well, but it would require
+; re-ordering the instructions.
+; CHECK-DAG: xsmaddadp 1, 2, 5
+
+; CHECK-DAG: xsmaddadp [[F1]], 2, 3
+; CHECK-DAG: stxsdx [[F1]], 0, 8
+; CHECK-DAG: stxsdx 4, 8, [[C1]]
+; CHECK-DAG: li [[C3:[0-9]+]], 24
+; CHECK-DAG: xsmaddadp 4, 2, 3
+; CHECK-DAG: stxsdx 4, 8, [[C3]]
+; CHECK-DAG: stxsdx 1, 8, [[C2]]
+; CHECK: blr
+}
+
+declare double @llvm.fma.f64(double, double, double) #0
+
+define void @testv1(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %e, <2 x double>* nocapture %d) #0 {
+entry:
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> %a)
+ store <2 x double> %0, <2 x double>* %d, align 8
+ %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %e, <2 x double> %a)
+ %arrayidx1 = getelementptr inbounds <2 x double>* %d, i64 1
+ store <2 x double> %1, <2 x double>* %arrayidx1, align 8
+ ret void
+
+; CHECK-LABEL: @testv1
+; CHECK-DAG: xvmaddmdp 36, 35, 34
+; CHECK-DAG: xvmaddadp 34, 35, 37
+; CHECK-DAG: li [[C1:[0-9]+]], 16
+; CHECK-DAG: stxvd2x 36, 0, 3
+; CHECK-DAG: stxvd2x 34, 3, [[C1:[0-9]+]]
+; CHECK: blr
+}
+
+define void @testv2(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %e, <2 x double> %f, <2 x double>* nocapture %d) #0 {
+entry:
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> %a)
+ store <2 x double> %0, <2 x double>* %d, align 8
+ %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %e, <2 x double> %a)
+ %arrayidx1 = getelementptr inbounds <2 x double>* %d, i64 1
+ store <2 x double> %1, <2 x double>* %arrayidx1, align 8
+ %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %f, <2 x double> %a)
+ %arrayidx2 = getelementptr inbounds <2 x double>* %d, i64 2
+ store <2 x double> %2, <2 x double>* %arrayidx2, align 8
+ ret void
+
+; CHECK-LABEL: @testv2
+; CHECK-DAG: xvmaddmdp 36, 35, 34
+; CHECK-DAG: xvmaddmdp 37, 35, 34
+; CHECK-DAG: li [[C1:[0-9]+]], 16
+; CHECK-DAG: li [[C2:[0-9]+]], 32
+; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: stxvd2x 36, 0, 3
+; CHECK-DAG: stxvd2x 37, 3, [[C1:[0-9]+]]
+; CHECK-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
+; CHECK: blr
+}
+
+define void @testv3(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %e, <2 x double> %f, <2 x double>* nocapture %d) #0 {
+entry:
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> %a)
+ store <2 x double> %0, <2 x double>* %d, align 8
+ %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %e, <2 x double> %a)
+ %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> %1)
+ %arrayidx1 = getelementptr inbounds <2 x double>* %d, i64 3
+ store <2 x double> %2, <2 x double>* %arrayidx1, align 8
+ %3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %f, <2 x double> %a)
+ %arrayidx2 = getelementptr inbounds <2 x double>* %d, i64 2
+ store <2 x double> %3, <2 x double>* %arrayidx2, align 8
+ %arrayidx3 = getelementptr inbounds <2 x double>* %d, i64 1
+ store <2 x double> %1, <2 x double>* %arrayidx3, align 8
+ ret void
+
+; CHECK-LABEL: @testv3
+; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
+; CHECK-DAG: xvmaddmdp 37, 35, 34
+; CHECK-DAG: li [[C1:[0-9]+]], 48
+; CHECK-DAG: li [[C2:[0-9]+]], 32
+; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: li [[C3:[0-9]+]], 16
+
+; Note: We could convert this next FMA to M-type as well, but it would require
+; re-ordering the instructions.
+; CHECK-DAG: xvmaddadp [[V1]], 35, 36
+
+; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: stxvd2x 32, 0, 3
+; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 34, 3, [[C2]]
+; CHECK-DAG: stxvd2x 37, 3, [[C3]]
+; CHECK: blr
+}
+
+define void @testv4(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %e, <2 x double> %f, <2 x double>* nocapture %d) #0 {
+entry:
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> %a)
+ store <2 x double> %0, <2 x double>* %d, align 8
+ %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %e, <2 x double> %a)
+ %arrayidx1 = getelementptr inbounds <2 x double>* %d, i64 1
+ store <2 x double> %1, <2 x double>* %arrayidx1, align 8
+ %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> %1)
+ %arrayidx3 = getelementptr inbounds <2 x double>* %d, i64 3
+ store <2 x double> %2, <2 x double>* %arrayidx3, align 8
+ %3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %f, <2 x double> %a)
+ %arrayidx4 = getelementptr inbounds <2 x double>* %d, i64 2
+ store <2 x double> %3, <2 x double>* %arrayidx4, align 8
+ ret void
+
+; CHECK-LABEL: @testv4
+; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
+; CHECK-DAG: xvmaddmdp 37, 35, 34
+; CHECK-DAG: li [[C1:[0-9]+]], 16
+; CHECK-DAG: li [[C2:[0-9]+]], 32
+; CHECK-DAG: xvmaddadp 34, 35, 38
+
+; Note: We could convert this next FMA to M-type as well, but it would require
+; re-ordering the instructions.
+; CHECK-DAG: xvmaddadp [[V1]], 35, 36
+
+; CHECK-DAG: stxvd2x 32, 0, 3
+; CHECK-DAG: stxvd2x 37, 3, [[C1]]
+; CHECK-DAG: li [[C3:[0-9]+]], 48
+; CHECK-DAG: xvmaddadp 37, 35, 36
+; CHECK-DAG: stxvd2x 37, 3, [[C3]]
+; CHECK-DAG: stxvd2x 34, 3, [[C2]]
+; CHECK: blr
+}
+
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #0
+
+attributes #0 = { nounwind readnone }
+
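The "Note:" comments in the checks above refer to the two register forms of the VSX FMA instructions. A short sketch of the distinction, as I read the ISA (operand roles paraphrased, not taken from this patch):

; A-form: xsmaddadp T, A, B   computes T = A*B + T   (the addend register is overwritten)
; M-form: xsmaddmdp T, A, B   computes T = A*T + B   (a multiplicand register is overwritten)
; When one addend value feeds several FMAs, keeping every use in A-form would clobber the
; addend at the first use; the mutation exercised above rewrites the earlier uses to M-form
; so the addend register stays live until its final (A-form) use.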
diff --git a/test/CodeGen/PowerPC/vsx-self-copy.ll b/test/CodeGen/PowerPC/vsx-self-copy.ll
new file mode 100644
index 0000000..23615ca
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-self-copy.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mcpu=pwr7 -mattr=+vsx < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define double @takFP(double %x, double %y, double %z) #0 {
+entry:
+ br i1 undef, label %if.then, label %return
+
+if.then: ; preds = %if.then, %entry
+ %x.tr16 = phi double [ %call, %if.then ], [ %x, %entry ]
+ %call = tail call double @takFP(double undef, double undef, double undef)
+ %call4 = tail call double @takFP(double undef, double %x.tr16, double undef)
+ %cmp = fcmp olt double undef, %call
+ br i1 %cmp, label %if.then, label %return
+
+return: ; preds = %if.then, %entry
+ %z.tr.lcssa = phi double [ %z, %entry ], [ %call4, %if.then ]
+ ret double %z.tr.lcssa
+
+; CHECK: @takFP
+; CHECK-NOT: xxlor 0, 0, 0
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/vsx-spill.ll b/test/CodeGen/PowerPC/vsx-spill.ll
new file mode 100644
index 0000000..29bc6fc
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-spill.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mcpu=pwr7 -mattr=+vsx < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define double @foo1(double %a) nounwind {
+entry:
+ call void asm sideeffect "", "~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"() nounwind
+ br label %return
+
+; CHECK: @foo1
+; CHECK: xxlor [[R1:[0-9]+]], 1, 1
+; CHECK: xxlor 1, [[R1]], [[R1]]
+; CHECK: blr
+
+return: ; preds = %entry
+ ret double %a
+}
+
+define double @foo2(double %a) nounwind {
+entry:
+ %b = fadd double %a, %a
+ call void asm sideeffect "", "~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"() nounwind
+ br label %return
+
+; CHECK: @foo2
+; CHECK: {{xxlor|xsadddp}} [[R1:[0-9]+]], 1, 1
+; CHECK: {{xxlor|xsadddp}} 1, [[R1]], [[R1]]
+; CHECK: blr
+
+return: ; preds = %entry
+ ret double %b
+}
+
+define double @foo3(double %a) nounwind {
+entry:
+ call void asm sideeffect "", "~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31},~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() nounwind
+ br label %return
+
+; CHECK: @foo3
+; CHECK: stxsdx 1,
+; CHECK: lxsdx [[R1:[0-9]+]],
+; CHECK: xsadddp 1, [[R1]], [[R1]]
+; CHECK: blr
+
+return: ; preds = %entry
+ %b = fadd double %a, %a
+ ret double %b
+}
+
diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll
new file mode 100644
index 0000000..f5ac577
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx.ll
@@ -0,0 +1,651 @@
+; RUN: llc -mcpu=pwr7 -mattr=+vsx < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define double @test1(double %a, double %b) {
+entry:
+ %v = fmul double %a, %b
+ ret double %v
+
+; CHECK-LABEL: @test1
+; CHECK: xsmuldp 1, 1, 2
+; CHECK: blr
+}
+
+define double @test2(double %a, double %b) {
+entry:
+ %v = fdiv double %a, %b
+ ret double %v
+
+; CHECK-LABEL: @test2
+; CHECK: xsdivdp 1, 1, 2
+; CHECK: blr
+}
+
+define double @test3(double %a, double %b) {
+entry:
+ %v = fadd double %a, %b
+ ret double %v
+
+; CHECK-LABEL: @test3
+; CHECK: xsadddp 1, 1, 2
+; CHECK: blr
+}
+
+define <2 x double> @test4(<2 x double> %a, <2 x double> %b) {
+entry:
+ %v = fadd <2 x double> %a, %b
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test4
+; CHECK: xvadddp 34, 34, 35
+; CHECK: blr
+}
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %v = xor <4 x i32> %a, %b
+ ret <4 x i32> %v
+
+; CHECK-LABEL: @test5
+; CHECK: xxlxor 34, 34, 35
+; CHECK: blr
+}
+
+define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+entry:
+ %v = xor <8 x i16> %a, %b
+ ret <8 x i16> %v
+
+; CHECK-LABEL: @test6
+; CHECK: xxlxor 34, 34, 35
+; CHECK: blr
+}
+
+define <16 x i8> @test7(<16 x i8> %a, <16 x i8> %b) {
+entry:
+ %v = xor <16 x i8> %a, %b
+ ret <16 x i8> %v
+
+; CHECK-LABEL: @test7
+; CHECK: xxlxor 34, 34, 35
+; CHECK: blr
+}
+
+define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %v = or <4 x i32> %a, %b
+ ret <4 x i32> %v
+
+; CHECK-LABEL: @test8
+; CHECK: xxlor 34, 34, 35
+; CHECK: blr
+}
+
+define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+entry:
+ %v = or <8 x i16> %a, %b
+ ret <8 x i16> %v
+
+; CHECK-LABEL: @test9
+; CHECK: xxlor 34, 34, 35
+; CHECK: blr
+}
+
+define <16 x i8> @test10(<16 x i8> %a, <16 x i8> %b) {
+entry:
+ %v = or <16 x i8> %a, %b
+ ret <16 x i8> %v
+
+; CHECK-LABEL: @test10
+; CHECK: xxlor 34, 34, 35
+; CHECK: blr
+}
+
+define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %v = and <4 x i32> %a, %b
+ ret <4 x i32> %v
+
+; CHECK-LABEL: @test11
+; CHECK: xxland 34, 34, 35
+; CHECK: blr
+}
+
+define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
+entry:
+ %v = and <8 x i16> %a, %b
+ ret <8 x i16> %v
+
+; CHECK-LABEL: @test12
+; CHECK: xxland 34, 34, 35
+; CHECK: blr
+}
+
+define <16 x i8> @test13(<16 x i8> %a, <16 x i8> %b) {
+entry:
+ %v = and <16 x i8> %a, %b
+ ret <16 x i8> %v
+
+; CHECK-LABEL: @test13
+; CHECK: xxland 34, 34, 35
+; CHECK: blr
+}
+
+define <4 x i32> @test14(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %v = or <4 x i32> %a, %b
+ %w = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %w
+
+; CHECK-LABEL: @test14
+; CHECK: xxlnor 34, 34, 35
+; CHECK: blr
+}
+
+define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
+entry:
+ %v = or <8 x i16> %a, %b
+ %w = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <8 x i16> %w
+
+; CHECK-LABEL: @test15
+; CHECK: xxlnor 34, 34, 35
+; CHECK: blr
+}
+
+define <16 x i8> @test16(<16 x i8> %a, <16 x i8> %b) {
+entry:
+ %v = or <16 x i8> %a, %b
+ %w = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <16 x i8> %w
+
+; CHECK-LABEL: @test16
+; CHECK: xxlnor 34, 34, 35
+; CHECK: blr
+}
+
+define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %w = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %v = and <4 x i32> %a, %w
+ ret <4 x i32> %v
+
+; CHECK-LABEL: @test17
+; CHECK: xxlandc 34, 34, 35
+; CHECK: blr
+}
+
+define <8 x i16> @test18(<8 x i16> %a, <8 x i16> %b) {
+entry:
+ %w = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %v = and <8 x i16> %a, %w
+ ret <8 x i16> %v
+
+; CHECK-LABEL: @test18
+; CHECK: xxlandc 34, 34, 35
+; CHECK: blr
+}
+
+define <16 x i8> @test19(<16 x i8> %a, <16 x i8> %b) {
+entry:
+ %w = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %v = and <16 x i8> %a, %w
+ ret <16 x i8> %v
+
+; CHECK-LABEL: @test19
+; CHECK: xxlandc 34, 34, 35
+; CHECK: blr
+}
+
+define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
+entry:
+ %m = icmp eq <4 x i32> %c, %d
+ %v = select <4 x i1> %m, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %v
+
+; CHECK-LABEL: @test20
+; CHECK: vcmpequw {{[0-9]+}}, 4, 5
+; CHECK: xxsel 34, 35, 34, {{[0-9]+}}
+; CHECK: blr
+}
+
+define <4 x float> @test21(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
+entry:
+ %m = fcmp oeq <4 x float> %c, %d
+ %v = select <4 x i1> %m, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %v
+
+; CHECK-LABEL: @test21
+; CHECK: xvcmpeqsp [[V1:[0-9]+]], 36, 37
+; CHECK: xxsel 34, 35, 34, [[V1]]
+; CHECK: blr
+}
+
+define <4 x float> @test22(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
+entry:
+ %m = fcmp ueq <4 x float> %c, %d
+ %v = select <4 x i1> %m, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %v
+
+; CHECK-LABEL: @test22
+; CHECK-DAG: xvcmpeqsp {{[0-9]+}}, 37, 37
+; CHECK-DAG: xvcmpeqsp {{[0-9]+}}, 36, 36
+; CHECK-DAG: xvcmpeqsp {{[0-9]+}}, 36, 37
+; CHECK-DAG: xxlnor
+; CHECK-DAG: xxlnor
+; CHECK-DAG: xxlor
+; CHECK-DAG: xxlor
+; CHECK: xxsel 34, 35, 34, {{[0-9]+}}
+; CHECK: blr
+}
+
+define <8 x i16> @test23(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
+entry:
+ %m = icmp eq <8 x i16> %c, %d
+ %v = select <8 x i1> %m, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %v
+
+; CHECK-LABEL: @test23
+; CHECK: vcmpequh {{[0-9]+}}, 4, 5
+; CHECK: xxsel 34, 35, 34, {{[0-9]+}}
+; CHECK: blr
+}
+
+define <16 x i8> @test24(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+entry:
+ %m = icmp eq <16 x i8> %c, %d
+ %v = select <16 x i1> %m, <16 x i8> %a, <16 x i8> %b
+ ret <16 x i8> %v
+
+; CHECK-LABEL: @test24
+; CHECK: vcmpequb {{[0-9]+}}, 4, 5
+; CHECK: xxsel 34, 35, 34, {{[0-9]+}}
+; CHECK: blr
+}
+
+define <2 x double> @test25(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
+entry:
+ %m = fcmp oeq <2 x double> %c, %d
+ %v = select <2 x i1> %m, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test25
+; CHECK: xvcmpeqdp [[V1:[0-9]+]], 36, 37
+; CHECK: xxsel 34, 35, 34, [[V1]]
+; CHECK: blr
+}
+
+define <2 x i64> @test26(<2 x i64> %a, <2 x i64> %b) {
+ %v = add <2 x i64> %a, %b
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test26
+
+; Make sure we use only two stores (one for each operand).
+; CHECK: stxvd2x 35,
+; CHECK: stxvd2x 34,
+; CHECK-NOT: stxvd2x
+
+; FIXME: The code quality here is not good; just make sure we do something for now.
+; CHECK: add
+; CHECK: add
+; CHECK: blr
+}
+
+define <2 x i64> @test27(<2 x i64> %a, <2 x i64> %b) {
+ %v = and <2 x i64> %a, %b
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test27
+; CHECK: xxland 34, 34, 35
+; CHECK: blr
+}
+
+define <2 x double> @test28(<2 x double>* %a) {
+ %v = load <2 x double>* %a, align 16
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test28
+; CHECK: lxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test29(<2 x double>* %a, <2 x double> %b) {
+ store <2 x double> %b, <2 x double>* %a, align 16
+ ret void
+
+; CHECK-LABEL: @test29
+; CHECK: stxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define <2 x double> @test28u(<2 x double>* %a) {
+ %v = load <2 x double>* %a, align 8
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test28u
+; CHECK: lxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test29u(<2 x double>* %a, <2 x double> %b) {
+ store <2 x double> %b, <2 x double>* %a, align 8
+ ret void
+
+; CHECK-LABEL: @test29u
+; CHECK: stxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define <2 x i64> @test30(<2 x i64>* %a) {
+ %v = load <2 x i64>* %a, align 16
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test30
+; CHECK: lxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test31(<2 x i64>* %a, <2 x i64> %b) {
+ store <2 x i64> %b, <2 x i64>* %a, align 16
+ ret void
+
+; CHECK-LABEL: @test31
+; CHECK: stxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define <2 x double> @test40(<2 x i64> %a) {
+ %v = uitofp <2 x i64> %a to <2 x double>
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test40
+; CHECK: xvcvuxddp 34, 34
+; CHECK: blr
+}
+
+define <2 x double> @test41(<2 x i64> %a) {
+ %v = sitofp <2 x i64> %a to <2 x double>
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test41
+; CHECK: xvcvsxddp 34, 34
+; CHECK: blr
+}
+
+define <2 x i64> @test42(<2 x double> %a) {
+ %v = fptoui <2 x double> %a to <2 x i64>
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test42
+; CHECK: xvcvdpuxds 34, 34
+; CHECK: blr
+}
+
+define <2 x i64> @test43(<2 x double> %a) {
+ %v = fptosi <2 x double> %a to <2 x i64>
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test43
+; CHECK: xvcvdpsxds 34, 34
+; CHECK: blr
+}
+
+define <2 x float> @test44(<2 x i64> %a) {
+ %v = uitofp <2 x i64> %a to <2 x float>
+ ret <2 x float> %v
+
+; CHECK-LABEL: @test44
+; FIXME: The code quality here looks pretty bad.
+; CHECK: blr
+}
+
+define <2 x float> @test45(<2 x i64> %a) {
+ %v = sitofp <2 x i64> %a to <2 x float>
+ ret <2 x float> %v
+
+; CHECK-LABEL: @test45
+; FIXME: The code quality here looks pretty bad.
+; CHECK: blr
+}
+
+define <2 x i64> @test46(<2 x float> %a) {
+ %v = fptoui <2 x float> %a to <2 x i64>
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test46
+; FIXME: The code quality here looks pretty bad.
+; CHECK: blr
+}
+
+define <2 x i64> @test47(<2 x float> %a) {
+ %v = fptosi <2 x float> %a to <2 x i64>
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test47
+; FIXME: The code quality here looks pretty bad.
+; CHECK: blr
+}
+
+define <2 x double> @test50(double* %a) {
+ %v = load double* %a, align 8
+ %w = insertelement <2 x double> undef, double %v, i32 0
+ %x = insertelement <2 x double> %w, double %v, i32 1
+ ret <2 x double> %x
+
+; CHECK-LABEL: @test50
+; CHECK: lxvdsx 34, 0, 3
+; CHECK: blr
+}
+
+define <2 x double> @test51(<2 x double> %a, <2 x double> %b) {
+ %v = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test51
+; CHECK: xxpermdi 34, 34, 34, 0
+; CHECK: blr
+}
+
+define <2 x double> @test52(<2 x double> %a, <2 x double> %b) {
+ %v = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test52
+; CHECK: xxpermdi 34, 34, 35, 0
+; CHECK: blr
+}
+
+define <2 x double> @test53(<2 x double> %a, <2 x double> %b) {
+ %v = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 0>
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test53
+; CHECK: xxpermdi 34, 35, 34, 0
+; CHECK: blr
+}
+
+define <2 x double> @test54(<2 x double> %a, <2 x double> %b) {
+ %v = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test54
+; CHECK: xxpermdi 34, 34, 35, 2
+; CHECK: blr
+}
+
+define <2 x double> @test55(<2 x double> %a, <2 x double> %b) {
+ %v = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %v
+
+; CHECK-LABEL: @test55
+; CHECK: xxpermdi 34, 34, 35, 3
+; CHECK: blr
+}
+
+define <2 x i64> @test56(<2 x i64> %a, <2 x i64> %b) {
+ %v = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test56
+; CHECK: xxpermdi 34, 34, 35, 3
+; CHECK: blr
+}
+
+define <2 x i64> @test60(<2 x i64> %a, <2 x i64> %b) {
+ %v = shl <2 x i64> %a, %b
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test60
+; This should scalarize, and the current code quality is not good.
+; CHECK: stxvd2x
+; CHECK: stxvd2x
+; CHECK: sld
+; CHECK: sld
+; CHECK: lxvd2x
+; CHECK: blr
+}
+
+define <2 x i64> @test61(<2 x i64> %a, <2 x i64> %b) {
+ %v = lshr <2 x i64> %a, %b
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test61
+; This should scalarize, and the current code quality is not good.
+; CHECK: stxvd2x
+; CHECK: stxvd2x
+; CHECK: srd
+; CHECK: srd
+; CHECK: lxvd2x
+; CHECK: blr
+}
+
+define <2 x i64> @test62(<2 x i64> %a, <2 x i64> %b) {
+ %v = ashr <2 x i64> %a, %b
+ ret <2 x i64> %v
+
+; CHECK-LABEL: @test62
+; This should scalarize, and the current code quality is not good.
+; CHECK: stxvd2x
+; CHECK: stxvd2x
+; CHECK: srad
+; CHECK: srad
+; CHECK: lxvd2x
+; CHECK: blr
+}
+
+define double @test63(<2 x double> %a) {
+ %v = extractelement <2 x double> %a, i32 0
+ ret double %v
+
+; CHECK-LABEL: @test63
+; CHECK: xxlor 1, 34, 34
+; CHECK: blr
+}
+
+define double @test64(<2 x double> %a) {
+ %v = extractelement <2 x double> %a, i32 1
+ ret double %v
+
+; CHECK-LABEL: @test64
+; CHECK: xxpermdi 1, 34, 34, 2
+; CHECK: blr
+}
+
+define <2 x i1> @test65(<2 x i64> %a, <2 x i64> %b) {
+ %w = icmp eq <2 x i64> %a, %b
+ ret <2 x i1> %w
+
+; CHECK-LABEL: @test65
+; CHECK: vcmpequw 2, 2, 3
+; CHECK: blr
+}
+
+define <2 x i1> @test66(<2 x i64> %a, <2 x i64> %b) {
+ %w = icmp ne <2 x i64> %a, %b
+ ret <2 x i1> %w
+
+; CHECK-LABEL: @test66
+; CHECK: vcmpequw {{[0-9]+}}, 2, 3
+; CHECK: xxlnor 34, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: blr
+}
+
+define <2 x i1> @test67(<2 x i64> %a, <2 x i64> %b) {
+ %w = icmp ult <2 x i64> %a, %b
+ ret <2 x i1> %w
+
+; CHECK-LABEL: @test67
+; This should scalarize, and the current code quality is not good.
+; CHECK: stxvd2x
+; CHECK: stxvd2x
+; CHECK: cmpld
+; CHECK: cmpld
+; CHECK: lxvd2x
+; CHECK: blr
+}
+
+define <2 x double> @test68(<2 x i32> %a) {
+ %w = sitofp <2 x i32> %a to <2 x double>
+ ret <2 x double> %w
+
+; CHECK-LABEL: @test68
+; CHECK: xxsldwi [[V1:[0-9]+]], 34, 34, 1
+; CHECK: xvcvsxwdp 34, [[V1]]
+; CHECK: blr
+}
+
+define <2 x double> @test69(<2 x i16> %a) {
+ %w = sitofp <2 x i16> %a to <2 x double>
+ ret <2 x double> %w
+
+; CHECK-LABEL: @test69
+; CHECK: vspltisw [[V1:[0-9]+]], 8
+; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
+; CHECK: vslw [[V3:[0-9]+]], 2, [[V2]]
+; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]]
+; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK: xvcvsxwdp 34, [[V4]]
+; CHECK: blr
+}
+
+define <2 x double> @test70(<2 x i8> %a) {
+ %w = sitofp <2 x i8> %a to <2 x double>
+ ret <2 x double> %w
+
+; CHECK-LABEL: @test70
+; CHECK: vspltisw [[V1:[0-9]+]], 12
+; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
+; CHECK: vslw [[V3:[0-9]+]], 2, [[V2]]
+; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]]
+; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK: xvcvsxwdp 34, [[V4]]
+; CHECK: blr
+}
+
+define <2 x i32> @test80(i32 %v) {
+ %b1 = insertelement <2 x i32> undef, i32 %v, i32 0
+ %b2 = shufflevector <2 x i32> %b1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %i = add <2 x i32> %b2, <i32 2, i32 3>
+ ret <2 x i32> %i
+
+; CHECK-LABEL: @test80
+; CHECK-DAG: addi [[R1:[0-9]+]], 3, 3
+; CHECK-DAG: addi [[R2:[0-9]+]], 1, -16
+; CHECK-DAG: addi [[R3:[0-9]+]], 3, 2
+; CHECK: std [[R1]], 8([[R2]])
+; CHECK: std [[R3]], -16(1)
+; CHECK: lxvd2x 34, 0, [[R2]]
+; CHECK-NOT: stxvd2x
+; CHECK: blr
+}
+
+define <2 x double> @test81(<4 x float> %b) {
+ %w = bitcast <4 x float> %b to <2 x double>
+ ret <2 x double> %w
+
+; CHECK-LABEL: @test81
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/vtable-reloc.ll b/test/CodeGen/PowerPC/vtable-reloc.ll
new file mode 100644
index 0000000..995a5d0
--- /dev/null
+++ b/test/CodeGen/PowerPC/vtable-reloc.ll
@@ -0,0 +1,11 @@
+; RUN: llc -O0 < %s | FileCheck %s
+
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@_ZTV3foo = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast (void ()* @__cxa_pure_virtual to i8*)]
+declare void @__cxa_pure_virtual()
+
+; CHECK: .section .data.rel.ro
+; CHECK: .quad __cxa_pure_virtual
+
diff --git a/test/CodeGen/PowerPC/weak_def_can_be_hidden.ll b/test/CodeGen/PowerPC/weak_def_can_be_hidden.ll
new file mode 100644
index 0000000..e038b3f
--- /dev/null
+++ b/test/CodeGen/PowerPC/weak_def_can_be_hidden.ll
@@ -0,0 +1,50 @@
+; taken from X86 version of the same test
+; RUN: llc -mtriple=powerpc-apple-darwin10 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc-apple-darwin9 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
+; RUN: llc -mtriple=powerpc-apple-darwin8 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
+
+@v1 = linkonce_odr constant i32 32
+; CHECK: .globl _v1
+; CHECK: .weak_def_can_be_hidden _v1
+
+; CHECK-D89: .globl _v1
+; CHECK-D89: .weak_definition _v1
+
+define i32 @f1() {
+ %x = load i32 * @v1
+ ret i32 %x
+}
+
+@v2 = linkonce_odr constant i32 32
+; CHECK: .globl _v2
+; CHECK: .weak_definition _v2
+
+; CHECK-D89: .globl _v2
+; CHECK-D89: .weak_definition _v2
+
+define i32* @f2() {
+ ret i32* @v2
+}
+
+@v3 = linkonce_odr unnamed_addr global i32 32
+; CHECK: .globl _v3
+; CHECK: .weak_def_can_be_hidden _v3
+
+; CHECK-D89: .globl _v3
+; CHECK-D89: .weak_definition _v3
+
+define i32* @f3() {
+ ret i32* @v3
+}
+
+@v4 = linkonce_odr global i32 32
+; CHECK: .globl _v4
+; CHECK: .weak_definition _v4
+
+; CHECK-D89: .globl _v4
+; CHECK-D89: .weak_definition _v4
+
+define i32 @f4() {
+ %x = load i32 * @v4
+ ret i32 %x
+}
diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
index 7a12687..fffaefe 100644
--- a/test/CodeGen/R600/32-bit-local-address-space.ll
+++ b/test/CodeGen/R600/32-bit-local-address-space.ll
@@ -11,7 +11,7 @@
; CHECK-LABEL: @local_address_load
; CHECK: V_MOV_B32_e{{32|64}} [[PTR:v[0-9]]]
-; CHECK: DS_READ_B32 [[PTR]]
+; CHECK: DS_READ_B32 v{{[0-9]+}}, [[PTR]]
define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
entry:
%0 = load i32 addrspace(3)* %in
@@ -32,9 +32,8 @@ entry:
}
; CHECK-LABEL: @local_address_gep_const_offset
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]]
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; CHECK: DS_READ_B32 [[VPTR]]
+; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
+; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VPTR]], 4,
define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
entry:
%0 = getelementptr i32 addrspace(3)* %in, i32 1
@@ -43,6 +42,19 @@ entry:
ret void
}
+; Offset too large, can't fold into 16-bit immediate offset.
+; CHECK-LABEL: @local_address_gep_large_const_offset
+; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 65540
+; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; CHECK: DS_READ_B32 [[VPTR]]
+define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %0 = getelementptr i32 addrspace(3)* %in, i32 16385
+ %1 = load i32 addrspace(3)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
; CHECK-LABEL: @null_32bit_lds_ptr:
; CHECK: V_CMP_NE_I32
; CHECK-NOT: V_CMP_NE_I32
@@ -69,7 +81,7 @@ define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %
; CHECK-LABEL: @infer_ptr_alignment_global_offset:
; CHECK: V_MOV_B32_e32 [[REG:v[0-9]+]], 0
-; CHECK: DS_READ_B32 v{{[0-9]+}}, 0, [[REG]]
+; CHECK: DS_READ_B32 v{{[0-9]+}}, [[REG]]
define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
%val = load float addrspace(3)* @g_lds
store float %val, float addrspace(1)* %out
@@ -80,9 +92,47 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
@ptr = addrspace(3) global i32 addrspace(3)* null
@dst = addrspace(3) global [16384 x i32] zeroinitializer
-; SI-LABEL: @global_ptr:
-; SI-CHECK: DS_WRITE_B32
+; CHECK-LABEL: @global_ptr:
+; CHECK: DS_WRITE_B32
define void @global_ptr() nounwind {
store i32 addrspace(3)* getelementptr ([16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
ret void
}
+
+; CHECK-LABEL: @local_address_store
+; CHECK: DS_WRITE_B32
+define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+ store i32 %val, i32 addrspace(3)* %out
+ ret void
+}
+
+; CHECK-LABEL: @local_address_gep_store
+; CHECK: S_ADD_I32 [[SADDR:s[0-9]+]],
+; CHECK: V_MOV_B32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
+; CHECK: DS_WRITE_B32 [[ADDR]], v{{[0-9]+}},
+define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+ %gep = getelementptr i32 addrspace(3)* %out, i32 %offset
+ store i32 %val, i32 addrspace(3)* %gep, align 4
+ ret void
+}
+
+; CHECK-LABEL: @local_address_gep_const_offset_store
+; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
+; CHECK: V_MOV_B32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
+; CHECK: DS_WRITE_B32 [[VPTR]], [[VAL]], 4
+define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+ %gep = getelementptr i32 addrspace(3)* %out, i32 1
+ store i32 %val, i32 addrspace(3)* %gep, align 4
+ ret void
+}
+
+; Offset too large, can't fold into 16-bit immediate offset.
+; CHECK-LABEL: @local_address_gep_large_const_offset_store
+; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 65540
+; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; CHECK: DS_WRITE_B32 [[VPTR]], v{{[0-9]+}}, 0
+define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+ %gep = getelementptr i32 addrspace(3)* %out, i32 16385
+ store i32 %val, i32 addrspace(3)* %gep, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index 3d5506b..e9db52a 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll
@@ -1,10 +1,9 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
-;EG-CHECK-LABEL: @test1:
+;FUNC-LABEL: @test1:
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK-LABEL: @test1:
;SI-CHECK: V_ADD_I32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
;SI-CHECK-NOT: [[REG]]
;SI-CHECK: BUFFER_STORE_DWORD [[REG]],
@@ -17,11 +16,10 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
-;EG-CHECK-LABEL: @test2:
+;FUNC-LABEL: @test2:
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK-LABEL: @test2:
;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -34,13 +32,12 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
ret void
}
-;EG-CHECK-LABEL: @test4:
+;FUNC-LABEL: @test4:
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK-LABEL: @test4:
;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -54,3 +51,92 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @test8
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+entry:
+ %0 = add <8 x i32> %a, %b
+ store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @test16
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; EG-CHECK: ADD_INT
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADD_I32
+define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+entry:
+ %0 = add <16 x i32> %a, %b
+ store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @add64
+; SI-CHECK: S_ADD_I32
+; SI-CHECK: S_ADDC_U32
+define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = add i64 %a, %b
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
+
+; The V_ADDC_U32 and V_ADD_I32 instructions can't read SGPRs, because they
+; use VCC. The test is designed so that %a will be stored in an SGPR and
+; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
+; to a VGPR before doing the add.
+
+; FUNC-LABEL: @add64_sgpr_vgpr
+; SI-CHECK-NOT: V_ADDC_U32_e32 s
+define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+entry:
+ %0 = load i64 addrspace(1)* %in
+ %1 = add i64 %a, %0
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
index 303a1cb..7081b07 100644
--- a/test/CodeGen/R600/add_i64.ll
+++ b/test/CodeGen/R600/add_i64.ll
@@ -1,14 +1,13 @@
-; XFAIL: *
-; This will fail until i64 add is enabled
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI %s
-
-declare i32 @llvm.SI.tid() readnone
+declare i32 @llvm.r600.read.tidig.x() readnone
; SI-LABEL: @test_i64_vreg:
+; SI: V_ADD_I32
+; SI: V_ADDC_U32
define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.SI.tid() readnone
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
%a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr i64 addrspace(1)* %inB, i32 %tid
%a = load i64 addrspace(1)* %a_ptr
@@ -20,6 +19,8 @@ define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa
; Check that the SGPR add operand is correctly moved to a VGPR.
; SI-LABEL: @sgpr_operand:
+; SI: V_ADD_I32
+; SI: V_ADDC_U32
define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
%foo = load i64 addrspace(1)* %in, align 8
%result = add i64 %foo, %a
@@ -31,6 +32,8 @@ define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noal
; SGPR as other operand.
;
; SI-LABEL: @sgpr_operand_reversed:
+; SI: V_ADD_I32
+; SI: V_ADDC_U32
define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
%foo = load i64 addrspace(1)* %in, align 8
%result = add i64 %a, %foo
@@ -40,6 +43,10 @@ define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace
; SI-LABEL: @test_v2i64_sreg:
+; SI: S_ADD_I32
+; SI: S_ADDC_U32
+; SI: S_ADD_I32
+; SI: S_ADDC_U32
define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
%result = add <2 x i64> %a, %b
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
@@ -47,8 +54,12 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a,
}
; SI-LABEL: @test_v2i64_vreg:
+; SI: V_ADD_I32
+; SI: V_ADDC_U32
+; SI: V_ADD_I32
+; SI: V_ADDC_U32
define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.SI.tid() readnone
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
%a_ptr = getelementptr <2 x i64> addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr <2 x i64> addrspace(1)* %inB, i32 %tid
%a = load <2 x i64> addrspace(1)* %a_ptr
@@ -57,3 +68,17 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
ret void
}
+
+; SI-LABEL: @trunc_i64_add_to_i32
+; SI: S_LOAD_DWORD [[SREG0:s[0-9]+]],
+; SI: S_LOAD_DWORD [[SREG1:s[0-9]+]],
+; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], [[SREG1]], [[SREG0]]
+; SI-NOT: ADDC
+; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: BUFFER_STORE_DWORD [[VRESULT]],
+define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+ %add = add i64 %b, %a
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
index 1fc616a..15d2ed2 100644
--- a/test/CodeGen/R600/address-space.ll
+++ b/test/CodeGen/R600/address-space.ll
@@ -4,11 +4,14 @@
%struct.foo = type { [3 x float], [3 x float] }
+; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is
+; already in a VGPR after the first read.
+
; CHECK-LABEL: @do_as_ptr_calcs:
-; CHECK: S_ADD_I32 {{s[0-9]+}},
-; CHECK: S_ADD_I32 [[SREG1:s[0-9]+]],
+; CHECK: S_LOAD_DWORD [[SREG1:s[0-9]+]],
; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK: DS_READ_B32 [[VREG1]],
+; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 20
+; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 12
define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
entry:
%x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
diff --git a/test/CodeGen/R600/anyext.ll b/test/CodeGen/R600/anyext.ll
new file mode 100644
index 0000000..bbe5d0a
--- /dev/null
+++ b/test/CodeGen/R600/anyext.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: @anyext_i1_i32
+; CHECK: V_CNDMASK_B32_e64
+define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ %1 = zext i1 %0 to i8
+ %2 = xor i8 %1, -1
+ %3 = and i8 %2, 1
+ %4 = zext i8 %3 to i32
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll
new file mode 100644
index 0000000..cb2a1c8
--- /dev/null
+++ b/test/CodeGen/R600/array-ptr-calc-i32.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.SI.tid() nounwind readnone
+declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+
+; The required pointer calculations for the alloca'd value actually require
+; an add and won't be folded into the addressing, which fails with a
+; 64-bit pointer add. This should work since private pointers should
+; be 32-bits.
+
+; SI-LABEL: @test_private_array_ptr_calc:
+; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
+; SI: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
+define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+ %alloca = alloca [4 x i32], i32 4, align 16
+ %tid = call i32 @llvm.SI.tid() readnone
+ %a_ptr = getelementptr i32 addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr i32 addrspace(1)* %inB, i32 %tid
+ %a = load i32 addrspace(1)* %a_ptr
+ %b = load i32 addrspace(1)* %b_ptr
+ %result = add i32 %a, %b
+ %alloca_ptr = getelementptr [4 x i32]* %alloca, i32 1, i32 %b
+ store i32 %result, i32* %alloca_ptr, align 4
+ ; Dummy call
+ call void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+ %reload = load i32* %alloca_ptr, align 4
+ %out_ptr = getelementptr i32 addrspace(1)* %out, i32 %tid
+ store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
index 0bc48a3..cb0242c 100644
--- a/test/CodeGen/R600/atomic_load_add.ll
+++ b/test/CodeGen/R600/atomic_load_add.ll
@@ -4,7 +4,7 @@
; R600-CHECK-LABEL: @atomic_add_local
; R600-CHECK: LDS_ADD *
; SI-CHECK-LABEL: @atomic_add_local
-; SI-CHECK: DS_ADD_U32_RTN 0
+; SI-CHECK: DS_ADD_U32_RTN
define void @atomic_add_local(i32 addrspace(3)* %local) {
entry:
%0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
@@ -14,7 +14,7 @@ entry:
; R600-CHECK-LABEL: @atomic_add_ret_local
; R600-CHECK: LDS_ADD_RET *
; SI-CHECK-LABEL: @atomic_add_ret_local
-; SI-CHECK: DS_ADD_U32_RTN 0
+; SI-CHECK: DS_ADD_U32_RTN
define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
entry:
%0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll
index e4a6829..7c26e52 100644
--- a/test/CodeGen/R600/atomic_load_sub.ll
+++ b/test/CodeGen/R600/atomic_load_sub.ll
@@ -4,7 +4,7 @@
; R600-CHECK-LABEL: @atomic_sub_local
; R600-CHECK: LDS_SUB *
; SI-CHECK-LABEL: @atomic_sub_local
-; SI-CHECK: DS_SUB_U32_RTN 0
+; SI-CHECK: DS_SUB_U32_RTN
define void @atomic_sub_local(i32 addrspace(3)* %local) {
entry:
%0 = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
@@ -14,7 +14,7 @@ entry:
; R600-CHECK-LABEL: @atomic_sub_ret_local
; R600-CHECK: LDS_SUB_RET *
; SI-CHECK-LABEL: @atomic_sub_ret_local
-; SI-CHECK: DS_SUB_U32_RTN 0
+; SI-CHECK: DS_SUB_U32_RTN
define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
entry:
%0 = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
diff --git a/test/CodeGen/R600/basic-branch.ll b/test/CodeGen/R600/basic-branch.ll
new file mode 100644
index 0000000..d084132
--- /dev/null
+++ b/test/CodeGen/R600/basic-branch.ll
@@ -0,0 +1,15 @@
+; XFAIL: *
+; RUN: llc -O0 -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: @test_branch(
+define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+ %cmp = icmp ne i32 %val, 0
+ br i1 %cmp, label %store, label %end
+
+store:
+ store i32 222, i32 addrspace(1)* %out
+ ret void
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/R600/basic-loop.ll b/test/CodeGen/R600/basic-loop.ll
new file mode 100644
index 0000000..6d0ff07
--- /dev/null
+++ b/test/CodeGen/R600/basic-loop.ll
@@ -0,0 +1,18 @@
+; XFAIL: *
+; RUN: llc -O0 -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s
+
+; CHECK-LABEL: @test_loop:
+define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+entry:
+ br label %loop.body
+
+loop.body:
+ %i = phi i32 [0, %entry], [%i.inc, %loop.body]
+ store i32 222, i32 addrspace(1)* %out
+ %cmp = icmp ne i32 %i, %val
+ %i.inc = add i32 %i, 1
+ br i1 %cmp, label %loop.body, label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll
index 92570c3..fe466e6 100644
--- a/test/CodeGen/R600/bfe_uint.ll
+++ b/test/CodeGen/R600/bfe_uint.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; XFAIL: *
+
; CHECK: @bfe_def
; CHECK: BFE_UINT
define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll
index bccc416..5bfc008 100644
--- a/test/CodeGen/R600/bitcast.ll
+++ b/test/CodeGen/R600/bitcast.ll
@@ -19,3 +19,12 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
attributes #0 = { "ShaderType"="0" }
+; CHECK-LABEL: @i8ptr_v16i8ptr
+; CHECK: S_ENDPGM
+define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
+ %1 = load <16 x i8> addrspace(1)* %0
+ store <16 x i8> %1, <16 x i8> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/cayman-loop-bug.ll b/test/CodeGen/R600/cayman-loop-bug.ll
new file mode 100644
index 0000000..a873528
--- /dev/null
+++ b/test/CodeGen/R600/cayman-loop-bug.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+; CHECK-LABEL: @main
+; CHECK: LOOP_START_DX10
+; CHECK: ALU_PUSH_BEFORE
+; CHECK: LOOP_START_DX10
+; CHECK: PUSH
+; CHECK-NOT: ALU_PUSH_BEFORE
+; CHECK: END_LOOP
+; CHECK: END_LOOP
+define void @main (<4 x float> inreg %reg0) #0 {
+entry:
+ br label %outer_loop
+outer_loop:
+ %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop]
+ %cond = icmp eq i32 %cnt, 16
+ br i1 %cond, label %outer_loop_body, label %exit
+outer_loop_body:
+ %cnt_incr = add i32 %cnt, 1
+ br label %inner_loop
+inner_loop:
+ %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body]
+ %cond2 = icmp eq i32 %cnt2, 16
+ br i1 %cond2, label %inner_loop_body, label %outer_loop
+inner_loop_body:
+ %cnt2_incr = add i32 %cnt2, 1
+ br label %inner_loop
+exit:
+ ret void
+}
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll
new file mode 100644
index 0000000..c3a4612
--- /dev/null
+++ b/test/CodeGen/R600/cf-stack-bug.ll
@@ -0,0 +1,227 @@
+; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG32 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+
+; REQUIRES: asserts
+
+; We are currently allocating 2 extra sub-entries on Evergreen / NI for
+; non-WQM push instructions. If we change this to 1, then we will need to
+; add one level of depth to each of these tests.
+
+; BUG64-NOT: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested3
+define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.store.1
+
+if.store.1:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested4
+define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested7
+define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ %4 = icmp sgt i32 %cond, 40
+ br i1 %4, label %if.5, label %if.4.store
+
+if.4.store:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+if.5:
+ %5 = icmp sgt i32 %cond, 50
+ br i1 %5, label %if.6, label %if.5.store
+
+if.5.store:
+ store i32 5, i32 addrspace(1)* %out
+ br label %end
+
+if.6:
+ %6 = icmp sgt i32 %cond, 60
+ br i1 %6, label %if.7, label %if.6.store
+
+if.6.store:
+ store i32 6, i32 addrspace(1)* %out
+ br label %end
+
+if.7:
+ store i32 7, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested8
+define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ %4 = icmp sgt i32 %cond, 40
+ br i1 %4, label %if.5, label %if.4.store
+
+if.4.store:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+if.5:
+ %5 = icmp sgt i32 %cond, 50
+ br i1 %5, label %if.6, label %if.5.store
+
+if.5.store:
+ store i32 5, i32 addrspace(1)* %out
+ br label %end
+
+if.6:
+ %6 = icmp sgt i32 %cond, 60
+ br i1 %6, label %if.7, label %if.6.store
+
+if.6.store:
+ store i32 6, i32 addrspace(1)* %out
+ br label %end
+
+if.7:
+ %7 = icmp sgt i32 %cond, 70
+ br i1 %7, label %if.8, label %if.7.store
+
+if.7.store:
+ store i32 7, i32 addrspace(1)* %out
+ br label %end
+
+if.8:
+ store i32 8, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
new file mode 100644
index 0000000..f8b4a61
--- /dev/null
+++ b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
@@ -0,0 +1,19 @@
+; RUN: opt -codegenprepare -S -o - %s | FileCheck --check-prefix=OPT --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-LLC --check-prefix=FUNC %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "r600--"
+
+; FUNC-LABEL: @test
+; OPT: mul nsw i32
+; OPT-NEXT: sext
+; SI-LLC: V_MUL_LO_I32
+; SI-LLC-NOT: V_MUL_HI
+define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
+entry:
+ %0 = mul nsw i32 %a, 3
+ %1 = sext i32 %0 to i64
+ %2 = getelementptr i8 addrspace(1)* %in, i64 %1
+ store i8 %b, i8 addrspace(1)* %2
+ ret void
+}
diff --git a/test/CodeGen/R600/elf.r600.ll b/test/CodeGen/R600/elf.r600.ll
index 0590efb..4436c07 100644
--- a/test/CodeGen/R600/elf.r600.ll
+++ b/test/CodeGen/R600/elf.r600.ll
@@ -6,7 +6,7 @@
; CONFIG-CHECK: .section .AMDGPU.config
; CONFIG-CHECK-NEXT: .long 166100
-; CONFIG-CHECK-NEXT: .long 258
+; CONFIG-CHECK-NEXT: .long 2
; CONFIG-CHECK-NEXT: .long 165900
; CONFIG-CHECK-NEXT: .long 0
define void @test(float addrspace(1)* %out, i32 %p) {
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
index aa660b3..2e70d47 100644
--- a/test/CodeGen/R600/extload.ll
+++ b/test/CodeGen/R600/extload.ll
@@ -1,8 +1,9 @@
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; EG-LABEL: @anyext_load_i8:
+; FUNC-LABEL: @anyext_load_i8:
; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32 addrspace(1)* %cast, align 1
@@ -12,10 +13,11 @@ define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspac
ret void
}
-; EG-LABEL: @anyext_load_i16:
+; FUNC-LABEL: @anyext_load_i16:
; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32 addrspace(1)* %cast, align 1
@@ -25,9 +27,9 @@ define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrs
ret void
}
-; EG-LABEL: @anyext_load_lds_i8:
+; FUNC-LABEL: @anyext_load_lds_i8:
; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32 addrspace(3)* %cast, align 1
@@ -37,10 +39,11 @@ define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addr
ret void
}
-; EG-LABEL: @anyext_load_lds_i16:
+; FUNC-LABEL: @anyext_load_lds_i16:
+; EG: AND_INT
; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG-DAG: 65535
+; EG-DAG: -65536
define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32 addrspace(3)* %cast, align 1
@@ -49,3 +52,69 @@ define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 a
store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1
ret void
}
+
+; FUNC-LABEL: @sextload_global_i8_to_i64
+; SI: BUFFER_LOAD_SBYTE [[LOAD:v[0-9]+]],
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: BUFFER_STORE_DWORDX2
+define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %a = load i8 addrspace(1)* %in, align 8
+ %ext = sext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @sextload_global_i16_to_i64
+; SI: BUFFER_LOAD_SSHORT [[LOAD:v[0-9]+]],
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: BUFFER_STORE_DWORDX2
+define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+ %a = load i16 addrspace(1)* %in, align 8
+ %ext = sext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @sextload_global_i32_to_i64
+; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: BUFFER_STORE_DWORDX2
+define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %a = load i32 addrspace(1)* %in, align 8
+ %ext = sext i32 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @zextload_global_i8_to_i64
+; SI: BUFFER_LOAD_UBYTE [[LOAD:v[0-9]+]],
+; SI: V_MOV_B32_e32 {{v[0-9]+}}, 0
+; SI: BUFFER_STORE_DWORDX2
+define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %a = load i8 addrspace(1)* %in, align 8
+ %ext = zext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @zextload_global_i16_to_i64
+; SI: BUFFER_LOAD_USHORT [[LOAD:v[0-9]+]],
+; SI: V_MOV_B32_e32 {{v[0-9]+}}, 0
+; SI: BUFFER_STORE_DWORDX2
+define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+ %a = load i16 addrspace(1)* %in, align 8
+ %ext = zext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @zextload_global_i32_to_i64
+; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
+; SI: V_MOV_B32_e32 {{v[0-9]+}}, 0
+; SI: BUFFER_STORE_DWORDX2
+define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %a = load i32 addrspace(1)* %in, align 8
+ %ext = zext i32 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index a5f5df9..2cd3a4f 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll
@@ -9,7 +9,7 @@
; R600-CHECK-NOT: AND
; R600-CHECK: |PV.{{[XYZW]}}|
; SI-CHECK-LABEL: @fabs_free
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_AND_B32
define void @fabs_free(float addrspace(1)* %out, i32 %in) {
entry:
@@ -23,8 +23,8 @@ entry:
; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
; SI-CHECK-LABEL: @fabs_v2
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_AND_B32
+; SI-CHECK: V_AND_B32
define void @fabs_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
entry:
%0 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
@@ -38,10 +38,10 @@ entry:
; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
; SI-CHECK-LABEL: @fabs_v4
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_AND_B32
+; SI-CHECK: V_AND_B32
+; SI-CHECK: V_AND_B32
+; SI-CHECK: V_AND_B32
define void @fabs_v4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
entry:
%0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index f467bb7..5d2b806 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -1,9 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
-; R600-CHECK: @fadd_f32
+; FUNC-LABEL: @fadd_f32
; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK: @fadd_f32
; SI-CHECK: V_ADD_F32
define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
entry:
@@ -12,10 +11,9 @@ entry:
ret void
}
-; R600-CHECK: @fadd_v2f32
+; FUNC-LABEL: @fadd_v2f32
; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
-; SI-CHECK: @fadd_v2f32
; SI-CHECK: V_ADD_F32
; SI-CHECK: V_ADD_F32
define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
@@ -25,12 +23,11 @@ entry:
ret void
}
-; R600-CHECK: @fadd_v4f32
+; FUNC-LABEL: @fadd_v4f32
; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @fadd_v4f32
; SI-CHECK: V_ADD_F32
; SI-CHECK: V_ADD_F32
; SI-CHECK: V_ADD_F32
@@ -43,3 +40,27 @@ define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @fadd_v8f32
+; R600-CHECK: ADD
+; R600-CHECK: ADD
+; R600-CHECK: ADD
+; R600-CHECK: ADD
+; R600-CHECK: ADD
+; R600-CHECK: ADD
+; R600-CHECK: ADD
+; R600-CHECK: ADD
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) {
+entry:
+ %0 = fadd <8 x float> %a, %b
+ store <8 x float> %0, <8 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fceil.ll b/test/CodeGen/R600/fceil.ll
new file mode 100644
index 0000000..b8b945f
--- /dev/null
+++ b/test/CodeGen/R600/fceil.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI %s
+
+declare double @llvm.ceil.f64(double) nounwind readnone
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
+
+; CI-LABEL: @fceil_f64:
+; CI: V_CEIL_F64_e32
+define void @fceil_f64(double addrspace(1)* %out, double %x) {
+ %y = call double @llvm.ceil.f64(double %x) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @fceil_v2f64:
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+ %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
+ store <2 x double> %y, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FIXME-CI-LABEL: @fceil_v3f64:
+; FIXME-CI: V_CEIL_F64_e32
+; FIXME-CI: V_CEIL_F64_e32
+; FIXME-CI: V_CEIL_F64_e32
+; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
+; store <3 x double> %y, <3 x double> addrspace(1)* %out
+; ret void
+; }
+
+; CI-LABEL: @fceil_v4f64:
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+ %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
+ store <4 x double> %y, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @fceil_v8f64:
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+ %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
+ store <8 x double> %y, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @fceil_v16f64:
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+ %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
+ store <16 x double> %y, <16 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll
new file mode 100644
index 0000000..51d2b89
--- /dev/null
+++ b/test/CodeGen/R600/ffloor.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI %s
+
+declare double @llvm.floor.f64(double) nounwind readnone
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
+
+; CI-LABEL: @ffloor_f64:
+; CI: V_FLOOR_F64_e32
+define void @ffloor_f64(double addrspace(1)* %out, double %x) {
+ %y = call double @llvm.floor.f64(double %x) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @ffloor_v2f64:
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+ %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
+ store <2 x double> %y, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FIXME-CI-LABEL: @ffloor_v3f64:
+; FIXME-CI: V_FLOOR_F64_e32
+; FIXME-CI: V_FLOOR_F64_e32
+; FIXME-CI: V_FLOOR_F64_e32
+; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
+; store <3 x double> %y, <3 x double> addrspace(1)* %out
+; ret void
+; }
+
+; CI-LABEL: @ffloor_v4f64:
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+ %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
+ store <4 x double> %y, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @ffloor_v8f64:
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+ %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
+ store <8 x double> %y, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @ffloor_v16f64:
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+; CI: V_FLOOR_F64_e32
+define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+ %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
+ store <16 x double> %y, <16 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll
new file mode 100644
index 0000000..d95e131
--- /dev/null
+++ b/test/CodeGen/R600/fneg-fabs.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; DAGCombiner will transform:
+; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
+; unless isFabsFree returns true
+
+; R600-CHECK-LABEL: @fneg_fabs_free
+; R600-CHECK-NOT: AND
+; R600-CHECK: |PV.{{[XYZW]}}|
+; R600-CHECK: -PV
+; SI-CHECK-LABEL: @fneg_fabs_free
+; SI-CHECK: V_OR_B32
+
+define void @fneg_fabs_free(float addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = bitcast i32 %in to float
+ %1 = call float @fabs(float %0)
+ %2 = fsub float -0.000000e+00, %1
+ store float %2, float addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @fneg_fabs_v2
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: -PV
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: -PV
+; SI-CHECK-LABEL: @fneg_fabs_v2
+; SI-CHECK: V_OR_B32
+; SI-CHECK: V_OR_B32
+define void @fneg_fabs_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ %1 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %0
+ store <2 x float> %1, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; SI-CHECK-LABEL: @fneg_fabs_v4
+; SI-CHECK: V_OR_B32
+; SI-CHECK: V_OR_B32
+; SI-CHECK: V_OR_B32
+; SI-CHECK: V_OR_B32
+define void @fneg_fabs_v4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+ %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
+ store <4 x float> %1, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @fabs(float ) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float> ) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> ) readnone
diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
index 9446aa8..f4e6be6 100644
--- a/test/CodeGen/R600/fneg.ll
+++ b/test/CodeGen/R600/fneg.ll
@@ -4,7 +4,7 @@
; R600-CHECK-LABEL: @fneg
; R600-CHECK: -PV
; SI-CHECK-LABEL: @fneg
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_XOR_B32
define void @fneg(float addrspace(1)* %out, float %in) {
entry:
%0 = fsub float -0.000000e+00, %in
@@ -16,8 +16,8 @@ entry:
; R600-CHECK: -PV
; R600-CHECK: -PV
; SI-CHECK-LABEL: @fneg_v2
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_XOR_B32
+; SI-CHECK: V_XOR_B32
define void @fneg_v2(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
entry:
%0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
@@ -31,10 +31,10 @@ entry:
; R600-CHECK: -PV
; R600-CHECK: -PV
; SI-CHECK-LABEL: @fneg_v4
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
-; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_XOR_B32
+; SI-CHECK: V_XOR_B32
+; SI-CHECK: V_XOR_B32
+; SI-CHECK: V_XOR_B32
define void @fneg_v4(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
entry:
%0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
diff --git a/test/CodeGen/R600/ftrunc.ll b/test/CodeGen/R600/ftrunc.ll
new file mode 100644
index 0000000..6b235ff
--- /dev/null
+++ b/test/CodeGen/R600/ftrunc.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI %s
+
+declare double @llvm.trunc.f64(double) nounwind readnone
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
+
+; CI-LABEL: @ftrunc_f64:
+; CI: V_TRUNC_F64_e32
+define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
+ %y = call double @llvm.trunc.f64(double %x) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @ftrunc_v2f64:
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+ %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
+ store <2 x double> %y, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FIXME-CI-LABEL: @ftrunc_v3f64:
+; FIXME-CI: V_TRUNC_F64_e32
+; FIXME-CI: V_TRUNC_F64_e32
+; FIXME-CI: V_TRUNC_F64_e32
+; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
+; store <3 x double> %y, <3 x double> addrspace(1)* %out
+; ret void
+; }
+
+; CI-LABEL: @ftrunc_v4f64:
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+ %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
+ store <4 x double> %y, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @ftrunc_v8f64:
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+ %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
+ store <8 x double> %y, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; CI-LABEL: @ftrunc_v16f64:
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+; CI: V_TRUNC_F64_e32
+define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+ %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
+ store <16 x double> %y, <16 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
index 4ea21dd..ee914fa 100644
--- a/test/CodeGen/R600/gep-address-space.ll
+++ b/test/CodeGen/R600/gep-address-space.ll
@@ -1,13 +1,23 @@
; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
-; CHECK-LABEL @use_gep_address_space:
-; CHECK: S_ADD_I32
+; CHECK-LABEL: @use_gep_address_space:
+; CHECK: V_MOV_B32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
+; CHECK: DS_WRITE_B32 [[PTR]], v{{[0-9]+}}, 64
%p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16
store i32 99, i32 addrspace(3)* %p
ret void
}
+define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
+; CHECK-LABEL: @use_gep_address_space_large_offset:
+; CHECK: S_ADD_I32
+; CHECK: DS_WRITE_B32
+ %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
+ store i32 99, i32 addrspace(3)* %p
+ ret void
+}
+
define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
; CHECK-LABEL: @gep_as_vector_v4:
; CHECK: S_ADD_I32
diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
new file mode 100644
index 0000000..cda7ab1
--- /dev/null
+++ b/test/CodeGen/R600/gv-const-addrspace.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
+
+; XXX: Test on SI once 64-bit adds are supported.
+
+@float_gv = internal addrspace(2) unnamed_addr constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
+
+; FUNC-LABEL: @float
+
+; R600-DAG: MOV {{\** *}}T2.X
+; R600-DAG: MOV {{\** *}}T3.X
+; R600-DAG: MOV {{\** *}}T4.X
+; R600-DAG: MOV {{\** *}}T5.X
+; R600-DAG: MOV {{\** *}}T6.X
+; R600: MOVA_INT
+
+define void @float(float addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
+ %1 = load float addrspace(2)* %0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+@i32_gv = internal addrspace(2) unnamed_addr constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
+
+; FUNC-LABEL: @i32
+
+; R600-DAG: MOV {{\** *}}T2.X
+; R600-DAG: MOV {{\** *}}T3.X
+; R600-DAG: MOV {{\** *}}T4.X
+; R600-DAG: MOV {{\** *}}T5.X
+; R600-DAG: MOV {{\** *}}T6.X
+; R600: MOVA_INT
+
+define void @i32(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = getelementptr inbounds [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
+ %1 = load i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
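For readers skimming the IR above: both kernels simply index a five-entry constant table with a runtime index, which is why the R600 checks expect one MOV per table element followed by an indexed MOVA_INT read. A rough C equivalent of the @float case (a sketch only; the names below are hypothetical and not part of the patch):

/* Sketch of what the @float kernel above computes: float_gv mirrors the
 * constant-address-space global, and the runtime index is what forces the
 * indexed (MOVA_INT-style) access instead of a constant fold. */
static const float float_gv[5] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};

void float_kernel(float *out, int index) {
    *out = float_gv[index];
}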
diff --git a/test/CodeGen/R600/icmp64.ll b/test/CodeGen/R600/icmp64.ll
new file mode 100644
index 0000000..c9e62ff
--- /dev/null
+++ b/test/CodeGen/R600/icmp64.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @test_i64_eq:
+; SI: V_CMP_EQ_I64
+define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp eq i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_ne:
+; SI: V_CMP_NE_I64
+define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ne i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_slt:
+; SI: V_CMP_LT_I64
+define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp slt i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_ult:
+; SI: V_CMP_LT_U64
+define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ult i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_sle:
+; SI: V_CMP_LE_I64
+define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp sle i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_ule:
+; SI: V_CMP_LE_U64
+define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ule i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_sgt:
+; SI: V_CMP_GT_I64
+define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp sgt i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_ugt:
+; SI: V_CMP_GT_U64
+define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ugt i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_sge:
+; SI: V_CMP_GE_I64
+define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp sge i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_i64_uge:
+; SI: V_CMP_GE_U64
+define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp uge i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll
new file mode 100644
index 0000000..4d1f734
--- /dev/null
+++ b/test/CodeGen/R600/indirect-private-64.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+
+; SI-LABEL: @private_access_f64_alloca:
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELS_B32_e32
+; SI: V_MOVRELS_B32_e32
+define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
+ %val = load double addrspace(1)* %in, align 8
+ %array = alloca double, i32 16, align 8
+ %ptr = getelementptr double* %array, i32 %b
+ store double %val, double* %ptr, align 8
+ call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ %result = load double* %ptr, align 8
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @private_access_v2f64_alloca:
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELS_B32_e32
+; SI: V_MOVRELS_B32_e32
+; SI: V_MOVRELS_B32_e32
+; SI: V_MOVRELS_B32_e32
+define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
+ %val = load <2 x double> addrspace(1)* %in, align 16
+ %array = alloca <2 x double>, i32 16, align 16
+ %ptr = getelementptr <2 x double>* %array, i32 %b
+ store <2 x double> %val, <2 x double>* %ptr, align 16
+ call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ %result = load <2 x double>* %ptr, align 16
+ store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @private_access_i64_alloca:
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELS_B32_e32
+; SI: V_MOVRELS_B32_e32
+define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
+ %val = load i64 addrspace(1)* %in, align 8
+ %array = alloca i64, i32 16, align 8
+ %ptr = getelementptr i64* %array, i32 %b
+ store i64 %val, i64* %ptr, align 8
+ call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ %result = load i64* %ptr, align 8
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @private_access_v2i64_alloca:
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELD_B32_e32
+; SI: V_MOVRELS_B32_e32
+; SI: V_MOVRELS_B32_e32
+; SI: V_MOVRELS_B32_e32
+; SI: V_MOVRELS_B32_e32
+define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
+ %val = load <2 x i64> addrspace(1)* %in, align 16
+ %array = alloca <2 x i64>, i32 16, align 16
+ %ptr = getelementptr <2 x i64>* %array, i32 %b
+ store <2 x i64> %val, <2 x i64>* %ptr, align 16
+ call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ %result = load <2 x i64>* %ptr, align 16
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/infinite-loop-evergreen.ll b/test/CodeGen/R600/infinite-loop-evergreen.ll
new file mode 100644
index 0000000..f6e39b3
--- /dev/null
+++ b/test/CodeGen/R600/infinite-loop-evergreen.ll
@@ -0,0 +1,10 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
+
+define void @inf_loop_irreducible_cfg() nounwind {
+entry:
+ br label %block
+
+block:
+ br label %block
+}
diff --git a/test/CodeGen/R600/infinite-loop.ll b/test/CodeGen/R600/infinite-loop.ll
new file mode 100644
index 0000000..a60bc37
--- /dev/null
+++ b/test/CodeGen/R600/infinite-loop.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @infinite_loop:
+; SI: V_MOV_B32_e32 [[REG:v[0-9]+]], 999
+; SI: BB0_1:
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_WAITCNT vmcnt(0) expcnt(0)
+; SI: S_BRANCH BB0_1
+define void @infinite_loop(i32 addrspace(1)* %out) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ store i32 999, i32 addrspace(1)* %out, align 4
+ br label %for.body
+}
+
diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll
index 05aecce..530d1cc 100644
--- a/test/CodeGen/R600/insert_vector_elt.ll
+++ b/test/CodeGen/R600/insert_vector_elt.ll
@@ -1,16 +1,175 @@
-; XFAIL: *
-; RUN: llc < %s -march=r600 -mcpu=redwood -o %t
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-define void @var_insert(<4 x i32> addrspace(1)* %out, <4 x i32> %x, i32 %val, i32 %idx) nounwind {
-entry:
- %tmp3 = insertelement <4 x i32> %x, i32 %val, i32 %idx ; <<4 x i32>> [#uses=1]
- store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out
+; FIXME: Broken on evergreen
+; FIXME: For some reason the 8- and 16-element vectors are being stored as
+; individual elements instead of as 128-bit stores.
+
+
+; FIXME: Why is the constant moved into the intermediate register and
+; not just directly into the vector component?
+
+; SI-LABEL: @insertelement_v4f32_0:
+; S_LOAD_DWORDX4 s{{[}}[[LOW_REG:[0-9]+]]:
+; V_MOV_B32_e32
+; V_MOV_B32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
+; V_MOV_B32_e32 v[[LOW_REG]], [[CONSTREG]]
+; BUFFER_STORE_DWORDX4 v{{[}}[[LOW_REG]]:
+define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @insertelement_v4f32_1:
+define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @insertelement_v4f32_2:
+define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @insertelement_v4f32_3:
+define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @insertelement_v4i32_0:
+define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
+ %vecins = insertelement <4 x i32> %a, i32 999, i32 0
+ store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v2f32:
+; SI: V_MOV_B32_e32 [[CONST:v[0-9]+]], 5.000000e+00
+; SI: V_MOVRELD_B32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; SI: BUFFER_STORE_DWORDX2 {{v\[}}[[LOW_RESULT_REG]]:
+define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
+ store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v4f32:
+; SI: V_MOV_B32_e32 [[CONST:v[0-9]+]], 5.000000e+00
+; SI: V_MOVRELD_B32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; SI: BUFFER_STORE_DWORDX4 {{v\[}}[[LOW_RESULT_REG]]:
+define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v8f32:
+; FIXMESI: BUFFER_STORE_DWORDX4
+; FIXMESI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
+ store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v16f32:
+; FIXMESI: BUFFER_STORE_DWORDX4
+; FIXMESI: BUFFER_STORE_DWORDX4
+; FIXMESI: BUFFER_STORE_DWORDX4
+; FIXMESI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
+ store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v2i32:
+; SI: BUFFER_STORE_DWORDX2
+define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
+ store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v4i32:
+; SI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x i32> %a, i32 5, i32 %b
+ store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v8i32:
+; FIXMESI: BUFFER_STORE_DWORDX4
+; FIXMESI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
+ store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v16i32:
+; FIXMESI: BUFFER_STORE_DWORDX4
+; FIXMESI: BUFFER_STORE_DWORDX4
+; FIXMESI: BUFFER_STORE_DWORDX4
+; FIXMESI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
+ store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
+ ret void
+}
+
+
+; SI-LABEL: @dynamic_insertelement_v2i16:
+; FIXMESI: BUFFER_STORE_DWORDX2
+define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v4i16:
+; FIXMESI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
+ store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16
+ ret void
+}
+
+
+; SI-LABEL: @dynamic_insertelement_v2i8:
+; FIXMESI: BUFFER_STORE_USHORT
+define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
+ store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v4i8:
+; FIXMESI: BUFFER_STORE_DWORD
+define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
+ store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v8i8:
+; FIXMESI: BUFFER_STORE_DWORDX2
+define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
+ store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16
ret void
}
-define void @var_extract(i32 addrspace(1)* %out, <4 x i32> %x, i32 %idx) nounwind {
-entry:
- %tmp3 = extractelement <4 x i32> %x, i32 %idx ; <<i32>> [#uses=1]
- store i32 %tmp3, i32 addrspace(1)* %out
+; SI-LABEL: @dynamic_insertelement_v16i8:
+; FIXMESI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
+ store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
ret void
}
diff --git a/test/CodeGen/R600/insert_vector_elt_f64.ll b/test/CodeGen/R600/insert_vector_elt_f64.ll
new file mode 100644
index 0000000..e334be1
--- /dev/null
+++ b/test/CodeGen/R600/insert_vector_elt_f64.ll
@@ -0,0 +1,36 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+
+; SI-LABEL: @dynamic_insertelement_v2f64:
+; SI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
+ store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v2i64:
+; SI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
+ store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v4f64:
+; SI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
+ store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @dynamic_insertelement_v8f64:
+; SI: BUFFER_STORE_DWORDX4
+define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
+ %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
+ store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll
index ae9c8bb..a1cd388 100644
--- a/test/CodeGen/R600/jump-address.ll
+++ b/test/CodeGen/R600/jump-address.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-; CHECK: JUMP @3
+; CHECK: JUMP @6
; CHECK: EXPORT
; CHECK-NOT: EXPORT
diff --git a/test/CodeGen/R600/lds-oqap-crash.ll b/test/CodeGen/R600/lds-oqap-crash.ll
new file mode 100644
index 0000000..7959150
--- /dev/null
+++ b/test/CodeGen/R600/lds-oqap-crash.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+
+; The test is for a bug in R600EmitClauseMarkers.cpp where this pass
+; was searching for a use of the OQAP register in order to determine
+; if an LDS instruction could fit in the current clause, but never finding
+; one. This created an infinite loop and hung the compiler.
+;
+; The LDS instruction should not have been defining OQAP in the first place,
+; because the LDS instructions are pseudo instructions and the OQAP
+; reads and writes are bundled together in the same instruction.
+
+; CHECK: @lds_crash
+define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
+entry:
+ %0 = load i32 addrspace(3)* %in
+ ; This block needs to be > 115 ISA instructions to hit the bug,
+ ; so we'll use udiv instructions.
+ %div0 = udiv i32 %0, %b
+ %div1 = udiv i32 %div0, %a
+ %div2 = udiv i32 %div1, 11
+ %div3 = udiv i32 %div2, %a
+ %div4 = udiv i32 %div3, %b
+ %div5 = udiv i32 %div4, %c
+ %div6 = udiv i32 %div5, %div0
+ %div7 = udiv i32 %div6, %div1
+ store i32 %div7, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
index 63a4332..af0db0d 100644
--- a/test/CodeGen/R600/lds-output-queue.ll
+++ b/test/CodeGen/R600/lds-output-queue.ll
@@ -87,7 +87,7 @@ declare void @llvm.AMDGPU.barrier.local()
; CHECK-LABEL: @local_global_alias
; CHECK: LDS_READ_RET
; CHECK-NOT: ALU clause
-; CHECK MOV * T{{[0-9]\.[XYZW]}}, OQAP
+; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
new file mode 100644
index 0000000..c3f000a
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: @bfe_i32_arg_arg_arg
+; SI: V_BFE_I32
+; EG: BFE_INT
+define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_arg_arg_imm
+; SI: V_BFE_I32
+; EG: BFE_INT
+define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_arg_imm_arg
+; SI: V_BFE_I32
+; EG: BFE_INT
+define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_imm_arg_arg
+; SI: V_BFE_I32
+; EG: BFE_INT
+define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
new file mode 100644
index 0000000..0d47863
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: @bfe_u32_arg_arg_arg
+; SI: V_BFE_U32
+; EG: BFE_UINT
+define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_arg_arg_imm
+; SI: V_BFE_U32
+; EG: BFE_UINT
+define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_arg_imm_arg
+; SI: V_BFE_U32
+; EG: BFE_UINT
+define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_imm_arg_arg
+; SI: V_BFE_U32
+; EG: BFE_UINT
+define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
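The two intrinsics exercised in the last two files appear to follow the usual bitfield-extract semantics: take width bits of src0 starting at bit offset, then sign-extend (llvm.AMDGPU.bfe.i32) or zero-extend (llvm.AMDGPU.bfe.u32) the result. A minimal C sketch of that behaviour, under the assumption that offset and width are taken modulo 32 and that a zero width yields zero (an assumption, not something stated in the patch):

#include <stdint.h>

/* Hedged model of 32-bit bitfield extract; assumes offset/width wrap at 32
 * and that width == 0 returns 0. Not taken from the patch itself. */
static uint32_t bfe_u32(uint32_t src, uint32_t offset, uint32_t width) {
    offset &= 31;
    width &= 31;
    if (width == 0)
        return 0;
    return (src >> offset) & ((1u << width) - 1u);
}

static int32_t bfe_i32(int32_t src, uint32_t offset, uint32_t width) {
    width &= 31;
    if (width == 0)
        return 0;
    uint32_t bits = bfe_u32((uint32_t)src, offset, width);
    uint32_t sign = 1u << (width - 1);          /* top bit of the extracted field */
    return (int32_t)((bits ^ sign) - sign);     /* sign-extend from that bit */
}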
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
new file mode 100644
index 0000000..e1de45b
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: @bfi_arg_arg_arg
+; SI: V_BFI_B32
+; EG: BFI_INT
+define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
+ store i32 %bfi, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfi_arg_arg_imm
+; SI: V_BFI_B32
+; EG: BFI_INT
+define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone
+ store i32 %bfi, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfi_arg_imm_arg
+; SI: V_BFI_B32
+; EG: BFI_INT
+define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone
+ store i32 %bfi, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfi_imm_arg_arg
+; SI: V_BFI_B32
+; EG: BFI_INT
+define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %bfi, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
new file mode 100644
index 0000000..ef8721e
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone
+
+; FUNC-LABEL: @bfm_arg_arg
+; SI: V_BFM
+; EG: BFM_INT
+define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone
+ store i32 %bfm, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfm_arg_imm
+; SI: V_BFM
+; EG: BFM_INT
+define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
+ %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone
+ store i32 %bfm, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfm_imm_arg
+; SI: V_BFM
+; EG: BFM_INT
+define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
+ %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone
+ store i32 %bfm, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfm_imm_imm
+; SI: V_BFM
+; EG: BFM_INT
+define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind {
+ %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone
+ store i32 %bfm, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
index 1336f4e..01c9f43 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
@@ -1,12 +1,23 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-;CHECK: V_MAX_I32_e32
-
-define void @main(i32 %p0, i32 %p1) #0 {
+; SI-LABEL: @vector_imax
+; SI: V_MAX_I32_e32
+define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
main_body:
- %0 = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1)
- %1 = bitcast i32 %0 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %1, float %1, float %1, float %1)
+ %load = load i32 addrspace(1)* %in, align 4
+ %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load)
+ %bc = bitcast i32 %max to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: @scalar_imax
+; SI: S_MAX_I32
+define void @scalar_imax(i32 %p0, i32 %p1) #0 {
+entry:
+ %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1)
+ %bc = bitcast i32 %max to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
ret void
}
@@ -15,7 +26,7 @@ declare i32 @llvm.AMDGPU.imax(i32, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
index 3435ea4..565bf34 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
@@ -1,12 +1,23 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-;CHECK: V_MIN_I32_e32
-
-define void @main(i32 %p0, i32 %p1) #0 {
+; SI-LABEL: @vector_imin
+; SI: V_MIN_I32_e32
+define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
main_body:
- %0 = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1)
- %1 = bitcast i32 %0 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %1, float %1, float %1, float %1)
+ %load = load i32 addrspace(1)* %in, align 4
+ %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load)
+ %bc = bitcast i32 %min to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: @scalar_imin
+; SI: S_MIN_I32
+define void @scalar_imin(i32 %p0, i32 %p1) #0 {
+entry:
+ %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1)
+ %bc = bitcast i32 %min to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
ret void
}
@@ -15,7 +26,7 @@ declare i32 @llvm.AMDGPU.imin(i32, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
new file mode 100644
index 0000000..4ab6a8a
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+; SI-LABEL: @kill_gs_const
+; SI-NOT: V_CMPX_LE_F32
+; SI: S_MOV_B64 exec, 0
+
+define void @kill_gs_const() #0 {
+main_body:
+ %0 = icmp ule i32 0, 3
+ %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
+ call void @llvm.AMDGPU.kill(float %1)
+ %2 = icmp ule i32 3, 0
+ %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
+ call void @llvm.AMDGPU.kill(float %3)
+ ret void
+}
+
+declare void @llvm.AMDGPU.kill(float)
+
+attributes #0 = { "ShaderType"="2" }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
index 4cfa133..1b8da2e 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
@@ -1,12 +1,38 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-;CHECK: V_MAX_U32_e32
-
-define void @main(i32 %p0, i32 %p1) #0 {
+; SI-LABEL: @vector_umax
+; SI: V_MAX_U32_e32
+define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
main_body:
- %0 = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1)
- %1 = bitcast i32 %0 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %1, float %1, float %1, float %1)
+ %load = load i32 addrspace(1)* %in, align 4
+ %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load)
+ %bc = bitcast i32 %max to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: @scalar_umax
+; SI: S_MAX_U32
+define void @scalar_umax(i32 %p0, i32 %p1) #0 {
+entry:
+ %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1)
+ %bc = bitcast i32 %max to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: @trunc_zext_umax
+; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
+; SI: V_MAX_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
+; SI-NOT: AND
+; SI: BUFFER_STORE_SHORT [[RESULT]],
+define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
+ %tmp5 = load i8 addrspace(1)* %src, align 1
+ %tmp2 = zext i8 %tmp5 to i32
+ %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone
+ %tmp4 = trunc i32 %tmp3 to i8
+ %tmp6 = zext i8 %tmp4 to i16
+ store i16 %tmp6, i16 addrspace(1)* %out, align 2
ret void
}
@@ -15,7 +41,7 @@ declare i32 @llvm.AMDGPU.umax(i32, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
index 14af051..08397f8 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
@@ -1,12 +1,38 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-;CHECK: V_MIN_U32_e32
-
-define void @main(i32 %p0, i32 %p1) #0 {
+; SI-LABEL: @vector_umin
+; SI: V_MIN_U32_e32
+define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
main_body:
- %0 = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1)
- %1 = bitcast i32 %0 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %1, float %1, float %1, float %1)
+ %load = load i32 addrspace(1)* %in, align 4
+ %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load)
+ %bc = bitcast i32 %min to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: @scalar_umin
+; SI: S_MIN_U32
+define void @scalar_umin(i32 %p0, i32 %p1) #0 {
+entry:
+ %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1)
+ %bc = bitcast i32 %min to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: @trunc_zext_umin
+; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
+; SI: V_MIN_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
+; SI-NOT: AND
+; SI: BUFFER_STORE_SHORT [[RESULT]],
+define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
+ %tmp5 = load i8 addrspace(1)* %src, align 1
+ %tmp2 = zext i8 %tmp5 to i32
+ %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone
+ %tmp4 = trunc i32 %tmp3 to i8
+ %tmp6 = zext i8 %tmp4 to i16
+ store i16 %tmp6, i16 addrspace(1)* %out, align 2
ret void
}
@@ -15,7 +41,7 @@ declare i32 @llvm.AMDGPU.umin(i32, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.SI.load.dword.ll b/test/CodeGen/R600/llvm.SI.load.dword.ll
new file mode 100644
index 0000000..a622775
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.load.dword.ll
@@ -0,0 +1,40 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; Example of a simple geometry shader loading vertex attributes from the
+; ESGS ring buffer
+
+; CHECK-LABEL: @main
+; CHECK: BUFFER_LOAD_DWORD
+; CHECK: BUFFER_LOAD_DWORD
+; CHECK: BUFFER_LOAD_DWORD
+; CHECK: BUFFER_LOAD_DWORD
+
+define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, [2 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* inreg, [17 x <16 x i8>] addrspace(2)* inreg, i32, i32, i32, i32) #0 {
+main_body:
+ %10 = getelementptr [2 x <16 x i8>] addrspace(2)* %3, i64 0, i32 1
+ %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
+ %12 = shl i32 %6, 2
+ %13 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
+ %14 = bitcast i32 %13 to float
+ %15 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 %12, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0)
+ %16 = bitcast i32 %15 to float
+ %17 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %11, i32 %12, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0)
+ %18 = bitcast i32 %17 to float
+ %19 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %11, <2 x i32> <i32 0, i32 0>, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0)
+ %20 = bitcast i32 %19 to float
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %14, float %16, float %18, float %20)
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { nounwind readonly }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll
index e5e4ec4..445359a 100644
--- a/test/CodeGen/R600/llvm.SI.sample-masked.ll
+++ b/test/CodeGen/R600/llvm.SI.sample-masked.ll
@@ -2,7 +2,7 @@
; CHECK-LABEL: @v1
; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 13
-define void @v1(i32 %a1) {
+define void @v1(i32 %a1) #0 {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
%1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
@@ -15,7 +15,7 @@ entry:
; CHECK-LABEL: @v2
; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 11
-define void @v2(i32 %a1) {
+define void @v2(i32 %a1) #0 {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
%1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
@@ -28,7 +28,7 @@ entry:
; CHECK-LABEL: @v3
; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
-define void @v3(i32 %a1) {
+define void @v3(i32 %a1) #0 {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
%1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
@@ -41,7 +41,7 @@ entry:
; CHECK-LABEL: @v4
; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 7
-define void @v4(i32 %a1) {
+define void @v4(i32 %a1) #0 {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
%1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
@@ -54,7 +54,7 @@ entry:
; CHECK-LABEL: @v5
; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
-define void @v5(i32 %a1) {
+define void @v5(i32 %a1) #0 {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
%1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
@@ -66,7 +66,7 @@ entry:
; CHECK-LABEL: @v6
; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 6
-define void @v6(i32 %a1) {
+define void @v6(i32 %a1) #0 {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
%1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
@@ -78,7 +78,7 @@ entry:
; CHECK-LABEL: @v7
; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 9
-define void @v7(i32 %a1) {
+define void @v7(i32 %a1) #0 {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
%1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
@@ -91,3 +91,5 @@ entry:
declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index d41737c..24e8f64 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll
@@ -17,7 +17,7 @@
;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
%v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
%v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
%v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
@@ -137,7 +137,7 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
; CHECK: @v1
; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
-define void @v1(i32 %a1) {
+define void @v1(i32 %a1) #0 {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
%1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
@@ -155,3 +155,5 @@ declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32)
declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
index 21ac725..366456f 100644
--- a/test/CodeGen/R600/llvm.SI.sampled.ll
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll
@@ -17,7 +17,7 @@
;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 14
;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
%v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
%v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
%v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
@@ -138,3 +138,5 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.SI.sendmsg.ll b/test/CodeGen/R600/llvm.SI.sendmsg.ll
new file mode 100644
index 0000000..581d422
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.sendmsg.ll
@@ -0,0 +1,21 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: @main
+; CHECK: S_SENDMSG Gs(emit stream 0)
+; CHECK: S_SENDMSG Gs(cut stream 1)
+; CHECK: S_SENDMSG Gs(emit-cut stream 2)
+; CHECK: S_SENDMSG Gs_done(nop)
+
+define void @main() {
+main_body:
+ call void @llvm.SI.sendmsg(i32 34, i32 0);
+ call void @llvm.SI.sendmsg(i32 274, i32 0);
+ call void @llvm.SI.sendmsg(i32 562, i32 0);
+ call void @llvm.SI.sendmsg(i32 3, i32 0);
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.SI.sendmsg(i32, i32) #0
+
+attributes #0 = { nounwind }
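The immediates checked above line up with the commonly described S_SENDMSG encoding: message type in bits [3:0] (2 = GS, 3 = GS_DONE), GS op in bits [6:4] (0 = nop, 1 = cut, 2 = emit, 3 = emit-cut), and stream id in bits [9:8]. A small C sketch that reproduces the four constants under that assumed layout (the layout is an assumption here, not spelled out by the patch):

#include <stdio.h>

/* Assumed S_SENDMSG immediate layout (not taken from the patch):
 * bits [3:0] message type, bits [6:4] GS op, bits [9:8] stream id. */
static unsigned sendmsg_imm(unsigned msg, unsigned op, unsigned stream) {
    return (msg & 0xf) | ((op & 0x7) << 4) | ((stream & 0x3) << 8);
}

int main(void) {
    printf("%u\n", sendmsg_imm(2, 2, 0)); /* 34  -> Gs(emit stream 0)     */
    printf("%u\n", sendmsg_imm(2, 1, 1)); /* 274 -> Gs(cut stream 1)      */
    printf("%u\n", sendmsg_imm(2, 3, 2)); /* 562 -> Gs(emit-cut stream 2) */
    printf("%u\n", sendmsg_imm(3, 0, 0)); /* 3   -> Gs_done(nop)          */
    return 0;
}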
diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
index fa7c3ca..569efb6 100644
--- a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
@@ -1,8 +1,8 @@
;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK_LABEL: @test1
+;CHECK-LABEL: @test1
;CHECK: TBUFFER_STORE_FORMAT_XYZW {{v\[[0-9]+:[0-9]+\]}}, 32, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test1(i32 %a1, i32 %vaddr) {
+define void @test1(i32 %a1, i32 %vaddr) #0 {
%vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
@@ -10,9 +10,9 @@ define void @test1(i32 %a1, i32 %vaddr) {
ret void
}
-;CHECK_LABEL: @test2
+;CHECK-LABEL: @test2
;CHECK: TBUFFER_STORE_FORMAT_XYZ {{v\[[0-9]+:[0-9]+\]}}, 24, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test2(i32 %a1, i32 %vaddr) {
+define void @test2(i32 %a1, i32 %vaddr) #0 {
%vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
@@ -20,9 +20,9 @@ define void @test2(i32 %a1, i32 %vaddr) {
ret void
}
-;CHECK_LABEL: @test3
+;CHECK-LABEL: @test3
;CHECK: TBUFFER_STORE_FORMAT_XY {{v\[[0-9]+:[0-9]+\]}}, 16, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test3(i32 %a1, i32 %vaddr) {
+define void @test3(i32 %a1, i32 %vaddr) #0 {
%vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
@@ -30,9 +30,9 @@ define void @test3(i32 %a1, i32 %vaddr) {
ret void
}
-;CHECK_LABEL: @test4
+;CHECK-LABEL: @test4
;CHECK: TBUFFER_STORE_FORMAT_X {{v[0-9]+}}, 8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test4(i32 %vdata, i32 %vaddr) {
+define void @test4(i32 %vdata, i32 %vaddr) #0 {
call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
i32 1, i32 0)
@@ -42,3 +42,5 @@ define void @test4(i32 %vdata, i32 %vaddr) {
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/llvm.exp2.ll b/test/CodeGen/R600/llvm.exp2.ll
new file mode 100644
index 0000000..13bfbab
--- /dev/null
+++ b/test/CodeGen/R600/llvm.exp2.ll
@@ -0,0 +1,26 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
+
+;EG-CHECK-LABEL: @test
+;EG-CHECK: EXP_IEEE *
+;CM-CHECK-LABEL: @test
+;CM-CHECK: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
+;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
+;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
+;CM-CHECK: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = call float @llvm.fabs.f32(float %r0)
+ %r2 = fsub float -0.000000e+00, %r1
+ %r3 = call float @llvm.exp2.f32(float %r2)
+ %vec = insertelement <4 x float> undef, float %r3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.exp2.f32(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
index b587d2b..c4ae652 100644
--- a/test/CodeGen/R600/llvm.pow.ll
+++ b/test/CodeGen/R600/llvm.pow.ll
@@ -1,10 +1,11 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
-;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;CHECK-LABEL: test1:
+;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
-define void @test(<4 x float> inreg %reg0) #0 {
+define void @test1(<4 x float> inreg %reg0) #0 {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = call float @llvm.pow.f32( float %r0, float %r1)
@@ -13,7 +14,27 @@ define void @test(<4 x float> inreg %reg0) #0 {
ret void
}
+;CHECK-LABEL: test2:
+;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+ %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1)
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
declare float @llvm.pow.f32(float ,float ) readonly
+declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.trunc.ll b/test/CodeGen/R600/llvm.trunc.ll
new file mode 100644
index 0000000..fa6fb99
--- /dev/null
+++ b/test/CodeGen/R600/llvm.trunc.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK-LABEL: @trunc_f32
+; CHECK: TRUNC
+
+define void @trunc_f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.trunc.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.trunc.f32(float)
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index e4492d7..1486c4d 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,16 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
;===------------------------------------------------------------------------===;
; GLOBAL ADDRESS SPACE
;===------------------------------------------------------------------------===;
; Load an i8 value from the global address space.
-; R600-CHECK-LABEL: @load_i8
+; FUNC-LABEL: @load_i8
; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: @load_i8
; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8 addrspace(1)* %in
@@ -19,13 +18,12 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
ret void
}
-; R600-CHECK-LABEL: @load_i8_sext
+; FUNC-LABEL: @load_i8_sext
; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 24
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 24
-; SI-CHECK-LABEL: @load_i8_sext
; SI-CHECK: BUFFER_LOAD_SBYTE
define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
@@ -35,10 +33,9 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v2i8
+; FUNC-LABEL: @load_v2i8
; R600-CHECK: VTX_READ_8
; R600-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @load_v2i8
; SI-CHECK: BUFFER_LOAD_UBYTE
; SI-CHECK: BUFFER_LOAD_UBYTE
define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
@@ -49,7 +46,7 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v2i8_sext
+; FUNC-LABEL: @load_v2i8_sext
; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -60,7 +57,6 @@ entry:
; R600-CHECK-DAG: 24
; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
; R600-CHECK-DAG: 24
-; SI-CHECK-LABEL: @load_v2i8_sext
; SI-CHECK: BUFFER_LOAD_SBYTE
; SI-CHECK: BUFFER_LOAD_SBYTE
define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
@@ -71,12 +67,11 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v4i8
+; FUNC-LABEL: @load_v4i8
; R600-CHECK: VTX_READ_8
; R600-CHECK: VTX_READ_8
; R600-CHECK: VTX_READ_8
; R600-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @load_v4i8
; SI-CHECK: BUFFER_LOAD_UBYTE
; SI-CHECK: BUFFER_LOAD_UBYTE
; SI-CHECK: BUFFER_LOAD_UBYTE
@@ -89,7 +84,7 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v4i8_sext
+; FUNC-LABEL: @load_v4i8_sext
; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -110,7 +105,6 @@ entry:
; R600-CHECK-DAG: 24
; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
; R600-CHECK-DAG: 24
-; SI-CHECK-LABEL: @load_v4i8_sext
; SI-CHECK: BUFFER_LOAD_SBYTE
; SI-CHECK: BUFFER_LOAD_SBYTE
; SI-CHECK: BUFFER_LOAD_SBYTE
@@ -124,9 +118,8 @@ entry:
}
; Load an i16 value from the global address space.
-; R600-CHECK-LABEL: @load_i16
+; FUNC-LABEL: @load_i16
; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: @load_i16
; SI-CHECK: BUFFER_LOAD_USHORT
define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
@@ -136,13 +129,12 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_i16_sext
+; FUNC-LABEL: @load_i16_sext
; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 16
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 16
-; SI-CHECK-LABEL: @load_i16_sext
; SI-CHECK: BUFFER_LOAD_SSHORT
define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
@@ -152,10 +144,9 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v2i16
+; FUNC-LABEL: @load_v2i16
; R600-CHECK: VTX_READ_16
; R600-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @load_v2i16
; SI-CHECK: BUFFER_LOAD_USHORT
; SI-CHECK: BUFFER_LOAD_USHORT
define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
@@ -166,7 +157,7 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v2i16_sext
+; FUNC-LABEL: @load_v2i16_sext
; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -177,7 +168,6 @@ entry:
; R600-CHECK-DAG: 16
; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
; R600-CHECK-DAG: 16
-; SI-CHECK-LABEL: @load_v2i16_sext
; SI-CHECK: BUFFER_LOAD_SSHORT
; SI-CHECK: BUFFER_LOAD_SSHORT
define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
@@ -188,12 +178,11 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v4i16
+; FUNC-LABEL: @load_v4i16
; R600-CHECK: VTX_READ_16
; R600-CHECK: VTX_READ_16
; R600-CHECK: VTX_READ_16
; R600-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @load_v4i16
; SI-CHECK: BUFFER_LOAD_USHORT
; SI-CHECK: BUFFER_LOAD_USHORT
; SI-CHECK: BUFFER_LOAD_USHORT
@@ -206,7 +195,7 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v4i16_sext
+; FUNC-LABEL: @load_v4i16_sext
; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
; R600-CHECK-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -227,7 +216,6 @@ entry:
; R600-CHECK-DAG: 16
; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
; R600-CHECK-DAG: 16
-; SI-CHECK-LABEL: @load_v4i16_sext
; SI-CHECK: BUFFER_LOAD_SSHORT
; SI-CHECK: BUFFER_LOAD_SSHORT
; SI-CHECK: BUFFER_LOAD_SSHORT
@@ -241,10 +229,9 @@ entry:
}
; load an i32 value from the global address space.
-; R600-CHECK-LABEL: @load_i32
+; FUNC-LABEL: @load_i32
; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI-CHECK-LABEL: @load_i32
; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
@@ -254,10 +241,9 @@ entry:
}
; load a f32 value from the global address space.
-; R600-CHECK-LABEL: @load_f32
+; FUNC-LABEL: @load_f32
; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI-CHECK-LABEL: @load_f32
; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
entry:
@@ -267,10 +253,9 @@ entry:
}
; load a v2f32 value from the global address space
-; R600-CHECK-LABEL: @load_v2f32
+; FUNC-LABEL: @load_v2f32
; R600-CHECK: VTX_READ_64
-; SI-CHECK-LABEL: @load_v2f32
; SI-CHECK: BUFFER_LOAD_DWORDX2
define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
entry:
@@ -279,11 +264,10 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_i64
+; FUNC-LABEL: @load_i64
; R600-CHECK: MEM_RAT
; R600-CHECK: MEM_RAT
-; SI-CHECK-LABEL: @load_i64
; SI-CHECK: BUFFER_LOAD_DWORDX2
define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
entry:
@@ -292,13 +276,12 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_i64_sext
+; FUNC-LABEL: @load_i64_sext
; R600-CHECK: MEM_RAT
; R600-CHECK: MEM_RAT
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
; R600-CHECK: 31
-; SI-CHECK-LABEL: @load_i64_sext
-; SI-CHECK: BUFFER_LOAD_DWORDX2 [[VAL:v\[[0-9]:[0-9]\]]]
+; SI-CHECK: BUFFER_LOAD_DWORD
define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
@@ -308,7 +291,7 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_i64_zext
+; FUNC-LABEL: @load_i64_zext
; R600-CHECK: MEM_RAT
; R600-CHECK: MEM_RAT
define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -319,18 +302,65 @@ entry:
ret void
}
+; FUNC-LABEL: @load_v8i32
+; R600-CHECK: VTX_READ_128
+; R600-CHECK: VTX_READ_128
+; XXX: We should be using DWORDX4 instructions on SI.
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
+entry:
+ %0 = load <8 x i32> addrspace(1)* %in
+ store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @load_v16i32
+; R600-CHECK: VTX_READ_128
+; R600-CHECK: VTX_READ_128
+; R600-CHECK: VTX_READ_128
+; R600-CHECK: VTX_READ_128
+; XXX: We should be using DWORDX4 instructions on SI.
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: BUFFER_LOAD_DWORD
+define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
+entry:
+ %0 = load <16 x i32> addrspace(1)* %in
+ store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
;===------------------------------------------------------------------------===;
; CONSTANT ADDRESS SPACE
;===------------------------------------------------------------------------===;
; Load a sign-extended i8 value
-; R600-CHECK-LABEL: @load_const_i8_sext
+; FUNC-LABEL: @load_const_i8_sext
; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 24
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 24
-; SI-CHECK-LABEL: @load_const_i8_sext
; SI-CHECK: BUFFER_LOAD_SBYTE v{{[0-9]+}},
define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
@@ -341,9 +371,8 @@ entry:
}
; Load an aligned i8 value
-; R600-CHECK-LABEL: @load_const_i8_aligned
+; FUNC-LABEL: @load_const_i8_aligned
; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: @load_const_i8_aligned
; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
@@ -354,9 +383,8 @@ entry:
}
; Load an un-aligned i8 value
-; R600-CHECK-LABEL: @load_const_i8_unaligned
+; FUNC-LABEL: @load_const_i8_unaligned
; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: @load_const_i8_unaligned
; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
@@ -368,13 +396,12 @@ entry:
}
; Load a sign-extended i16 value
-; R600-CHECK-LABEL: @load_const_i16_sext
+; FUNC-LABEL: @load_const_i16_sext
; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 16
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 16
-; SI-CHECK-LABEL: @load_const_i16_sext
; SI-CHECK: BUFFER_LOAD_SSHORT
define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
@@ -385,9 +412,8 @@ entry:
}
; Load an aligned i16 value
-; R600-CHECK-LABEL: @load_const_i16_aligned
+; FUNC-LABEL: @load_const_i16_aligned
; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: @load_const_i16_aligned
; SI-CHECK: BUFFER_LOAD_USHORT
define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
@@ -398,9 +424,8 @@ entry:
}
; Load an un-aligned i16 value
-; R600-CHECK-LABEL: @load_const_i16_unaligned
+; FUNC-LABEL: @load_const_i16_unaligned
; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: @load_const_i16_unaligned
; SI-CHECK: BUFFER_LOAD_USHORT
define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
@@ -412,10 +437,9 @@ entry:
}
; Load an i32 value from the constant address space.
-; R600-CHECK-LABEL: @load_const_addrspace_i32
+; FUNC-LABEL: @load_const_addrspace_i32
; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI-CHECK-LABEL: @load_const_addrspace_i32
; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
@@ -425,10 +449,9 @@ entry:
}
; Load a f32 value from the constant address space.
-; R600-CHECK-LABEL: @load_const_addrspace_f32
+; FUNC-LABEL: @load_const_addrspace_f32
; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI-CHECK-LABEL: @load_const_addrspace_f32
; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
%1 = load float addrspace(2)* %in
@@ -441,10 +464,10 @@ define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(
;===------------------------------------------------------------------------===;
; Load an i8 value from the local address space.
-; R600-CHECK-LABEL: @load_i8_local
+; FUNC-LABEL: @load_i8_local
; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-LABEL: @load_i8_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_U8
define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
%1 = load i8 addrspace(3)* %in
@@ -453,11 +476,11 @@ define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
ret void
}
-; R600-CHECK-LABEL: @load_i8_sext_local
+; FUNC-LABEL: @load_i8_sext_local
; R600-CHECK: LDS_UBYTE_READ_RET
; R600-CHECK: ASHR
-; SI-CHECK-LABEL: @load_i8_sext_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_I8
define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
entry:
@@ -467,11 +490,11 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v2i8_local
+; FUNC-LABEL: @load_v2i8_local
; R600-CHECK: LDS_UBYTE_READ_RET
; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-LABEL: @load_v2i8_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_U8
; SI-CHECK: DS_READ_U8
define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
@@ -482,13 +505,13 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v2i8_sext_local
+; FUNC-LABEL: @load_v2i8_sext_local
; R600-CHECK-DAG: LDS_UBYTE_READ_RET
; R600-CHECK-DAG: LDS_UBYTE_READ_RET
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
-; SI-CHECK-LABEL: @load_v2i8_sext_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_I8
; SI-CHECK: DS_READ_I8
define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
@@ -499,13 +522,13 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v4i8_local
+; FUNC-LABEL: @load_v4i8_local
; R600-CHECK: LDS_UBYTE_READ_RET
; R600-CHECK: LDS_UBYTE_READ_RET
; R600-CHECK: LDS_UBYTE_READ_RET
; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-LABEL: @load_v4i8_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_U8
; SI-CHECK: DS_READ_U8
; SI-CHECK: DS_READ_U8
@@ -518,7 +541,7 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v4i8_sext_local
+; FUNC-LABEL: @load_v4i8_sext_local
; R600-CHECK-DAG: LDS_UBYTE_READ_RET
; R600-CHECK-DAG: LDS_UBYTE_READ_RET
; R600-CHECK-DAG: LDS_UBYTE_READ_RET
@@ -527,8 +550,8 @@ entry:
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
-; SI-CHECK-LABEL: @load_v4i8_sext_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_I8
; SI-CHECK: DS_READ_I8
; SI-CHECK: DS_READ_I8
@@ -542,10 +565,10 @@ entry:
}
; Load an i16 value from the local address space.
-; R600-CHECK-LABEL: @load_i16_local
+; FUNC-LABEL: @load_i16_local
; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-LABEL: @load_i16_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_U16
define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
entry:
@@ -555,11 +578,11 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_i16_sext_local
+; FUNC-LABEL: @load_i16_sext_local
; R600-CHECK: LDS_USHORT_READ_RET
; R600-CHECK: ASHR
-; SI-CHECK-LABEL: @load_i16_sext_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_I16
define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
entry:
@@ -569,11 +592,11 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v2i16_local
+; FUNC-LABEL: @load_v2i16_local
; R600-CHECK: LDS_USHORT_READ_RET
; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-LABEL: @load_v2i16_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_U16
; SI-CHECK: DS_READ_U16
define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
@@ -584,13 +607,13 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v2i16_sext_local
+; FUNC-LABEL: @load_v2i16_sext_local
; R600-CHECK-DAG: LDS_USHORT_READ_RET
; R600-CHECK-DAG: LDS_USHORT_READ_RET
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
-; SI-CHECK-LABEL: @load_v2i16_sext_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_I16
; SI-CHECK: DS_READ_I16
define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
@@ -601,13 +624,13 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v4i16_local
+; FUNC-LABEL: @load_v4i16_local
; R600-CHECK: LDS_USHORT_READ_RET
; R600-CHECK: LDS_USHORT_READ_RET
; R600-CHECK: LDS_USHORT_READ_RET
; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-LABEL: @load_v4i16_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_U16
; SI-CHECK: DS_READ_U16
; SI-CHECK: DS_READ_U16
@@ -620,7 +643,7 @@ entry:
ret void
}
-; R600-CHECK-LABEL: @load_v4i16_sext_local
+; FUNC-LABEL: @load_v4i16_sext_local
; R600-CHECK-DAG: LDS_USHORT_READ_RET
; R600-CHECK-DAG: LDS_USHORT_READ_RET
; R600-CHECK-DAG: LDS_USHORT_READ_RET
@@ -629,8 +652,8 @@ entry:
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
; R600-CHECK-DAG: ASHR
-; SI-CHECK-LABEL: @load_v4i16_sext_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_I16
; SI-CHECK: DS_READ_I16
; SI-CHECK: DS_READ_I16
@@ -643,11 +666,11 @@ entry:
ret void
}
-; load an i32 value from the glocal address space.
-; R600-CHECK-LABEL: @load_i32_local
+; load an i32 value from the local address space.
+; FUNC-LABEL: @load_i32_local
; R600-CHECK: LDS_READ_RET
-; SI-CHECK-LABEL: @load_i32_local
; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_B32
define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
entry:
@@ -656,10 +679,10 @@ entry:
ret void
}
-; load a f32 value from the global address space.
-; R600-CHECK-LABEL: @load_f32_local
+; load a f32 value from the local address space.
+; FUNC-LABEL: @load_f32_local
; R600-CHECK: LDS_READ_RET
-; SI-CHECK-LABEL: @load_f32_local
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_B32
define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
entry:
@@ -669,10 +692,10 @@ entry:
}
; load a v2f32 value from the local address space
-; R600-CHECK-LABEL: @load_v2f32_local
+; FUNC-LABEL: @load_v2f32_local
; R600-CHECK: LDS_READ_RET
; R600-CHECK: LDS_READ_RET
-; SI-CHECK-LABEL: @load_v2f32_local
+; SI-CHECK: S_MOV_B32 m0
; SI-CHECK: DS_READ_B32
; SI-CHECK: DS_READ_B32
define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll
index e351e41..a117557 100644
--- a/test/CodeGen/R600/load64.ll
+++ b/test/CodeGen/R600/load64.ll
@@ -1,18 +1,28 @@
; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; load a f64 value from the global address space.
-; CHECK: @load_f64
+; CHECK-LABEL: @load_f64:
; CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
-entry:
- %0 = load double addrspace(1)* %in
- store double %0, double addrspace(1)* %out
+ %1 = load double addrspace(1)* %in
+ store double %1, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @load_i64:
+; CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tmp = load i64 addrspace(1)* %in
+ store i64 %tmp, i64 addrspace(1)* %out, align 8
ret void
}
; Load a f64 value from the constant address space.
-; CHECK: @load_const_addrspace_f64
+; CHECK-LABEL: @load_const_addrspace_f64:
; CHECK: S_LOAD_DWORDX2 s[{{[0-9]+:[0-9]+}}]
+; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
%1 = load double addrspace(2)* %in
store double %1, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/local-64.ll b/test/CodeGen/R600/local-64.ll
new file mode 100644
index 0000000..87f18ae
--- /dev/null
+++ b/test/CodeGen/R600/local-64.ll
@@ -0,0 +1,158 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @local_i32_load
+; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 28, [M0]
+; SI: BUFFER_STORE_DWORD [[REG]],
+define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %in, i32 7
+ %val = load i32 addrspace(3)* %gep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @local_i32_load_0_offset
+; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 0, [M0]
+; SI: BUFFER_STORE_DWORD [[REG]],
+define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+ %val = load i32 addrspace(3)* %in, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @local_i8_load_i16_max_offset
+; SI-NOT: ADD
+; SI: DS_READ_U8 [[REG:v[0-9]+]], {{v[0-9]+}}, -1, [M0]
+; SI: BUFFER_STORE_BYTE [[REG]],
+define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+ %gep = getelementptr i8 addrspace(3)* %in, i32 65535
+ %val = load i8 addrspace(3)* %gep, align 4
+ store i8 %val, i8 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @local_i8_load_over_i16_max_offset
+; SI: S_ADD_I32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 65536
+; SI: V_MOV_B32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
+; SI: DS_READ_U8 [[REG:v[0-9]+]], [[VREGADDR]], 0, [M0]
+; SI: BUFFER_STORE_BYTE [[REG]],
+define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+ %gep = getelementptr i8 addrspace(3)* %in, i32 65536
+ %val = load i8 addrspace(3)* %gep, align 4
+ store i8 %val, i8 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @local_i64_load
+; SI-NOT: ADD
+; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 56, [M0]
+; SI: BUFFER_STORE_DWORDX2 [[REG]],
+define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %in, i32 7
+ %val = load i64 addrspace(3)* %gep, align 8
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @local_i64_load_0_offset
+; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0, [M0]
+; SI: BUFFER_STORE_DWORDX2 [[REG]],
+define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+ %val = load i64 addrspace(3)* %in, align 8
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @local_f64_load
+; SI-NOT: ADD
+; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 56, [M0]
+; SI: BUFFER_STORE_DWORDX2 [[REG]],
+define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+ %gep = getelementptr double addrspace(3)* %in, i32 7
+ %val = load double addrspace(3)* %gep, align 8
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @local_f64_load_0_offset
+; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0, [M0]
+; SI: BUFFER_STORE_DWORDX2 [[REG]],
+define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+ %val = load double addrspace(3)* %in, align 8
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @local_i64_store
+; SI-NOT: ADD
+; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 56 [M0]
+define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %out, i32 7
+ store i64 5678, i64 addrspace(3)* %gep, align 8
+ ret void
+}
+
+; SI-LABEL: @local_i64_store_0_offset
+; SI-NOT: ADD
+; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0 [M0]
+define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
+ store i64 1234, i64 addrspace(3)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @local_f64_store
+; SI-NOT: ADD
+; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 56 [M0]
+define void @local_f64_store(double addrspace(3)* %out) nounwind {
+ %gep = getelementptr double addrspace(3)* %out, i32 7
+ store double 16.0, double addrspace(3)* %gep, align 8
+ ret void
+}
+
+; SI-LABEL: @local_f64_store_0_offset
+; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0 [M0]
+define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
+ store double 20.0, double addrspace(3)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @local_v2i64_store
+; SI-NOT: ADD
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 120 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 112 [M0]
+define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
+ %gep = getelementptr <2 x i64> addrspace(3)* %out, i32 7
+ store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
+ ret void
+}
+
+; SI-LABEL: @local_v2i64_store_0_offset
+; SI-NOT: ADD
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 8 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0 [M0]
+define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
+ store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @local_v4i64_store
+; SI-NOT: ADD
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 248 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 240 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 232 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 224 [M0]
+define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
+ %gep = getelementptr <4 x i64> addrspace(3)* %out, i32 7
+ store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
+ ret void
+}
+
+; SI-LABEL: @local_v4i64_store_0_offset
+; SI-NOT: ADD
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 24 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 16 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 8 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0 [M0]
+define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
+ store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index e2d8406..616000d 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -17,18 +17,19 @@
; this consistently on evergreen GPUs.
; EG-CHECK: LDS_WRITE
; EG-CHECK: LDS_WRITE
-; SI-CHECK: DS_WRITE_B32 0, {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
-; SI-CHECK-NOT: DS_WRITE_B32 0, {{v[0-9]*}}, v[[ADDRW]]
+; SI-CHECK: DS_WRITE_B32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; SI-CHECK-NOT: DS_WRITE_B32 {{v[0-9]*}}, v[[ADDRW]]
; GROUP_BARRIER must be the last instruction in a clause
; EG-CHECK: GROUP_BARRIER
; EG-CHECK-NEXT: ALU clause
-; Make sure the lds reads are using different addresses.
+; Make sure the lds reads are using different addresses, at different
+; constant offsets.
; EG-CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, 0, [[ADDRR:v[0-9]+]]
-; SI-CHECK-NOT: DS_READ_B32 {{v[0-9]+}}, 0, [[ADDRR]]
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]], 16
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR]], 0,
define void @local_memory_two_objects(i32 addrspace(1)* %out) {
entry:
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 2168a3d..6ebe41d 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll
@@ -17,8 +17,8 @@
; CI-CHECK-NEXT: .long 32768
; EG-CHECK: LDS_WRITE
-; SI-CHECK_NOT: S_WQM_B64
-; SI-CHECK: DS_WRITE_B32 0
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_WRITE_B32
; GROUP_BARRIER must be the last instruction in a clause
; EG-CHECK: GROUP_BARRIER
@@ -26,7 +26,7 @@
; SI-CHECK: S_BARRIER
; EG-CHECK: LDS_READ_RET
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, 0
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}},
define void @local_memory(i32 addrspace(1)* %out) {
entry:
diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll
new file mode 100644
index 0000000..8a9cba2
--- /dev/null
+++ b/test/CodeGen/R600/loop-idiom.ll
@@ -0,0 +1,54 @@
+; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "r600--"
+
+
+
+; Make sure loop-idiom doesn't create memcpy or memset. There are no library
+; implementations of these for R600.
+
+; FUNC: @no_memcpy
+; R600-NOT: @llvm.memcpy
+; SI-NOT: @llvm.memcpy
+define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
+entry:
+ %dest = alloca i8, i32 32
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [0, %entry], [%4, %for.body]
+ %1 = getelementptr i8 addrspace(3)* %in, i32 %0
+ %2 = getelementptr i8* %dest, i32 %0
+ %3 = load i8 addrspace(3)* %1
+ store i8 %3, i8* %2
+ %4 = add i32 %0, 1
+ %5 = icmp eq i32 %4, %size
+ br i1 %5, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; FUNC: @no_memset
+; R600-NOT: @llvm.memset
+; R600-NOT: @memset_pattern16
+; SI-NOT: @llvm.memset
+; SI-NOT: @memset_pattern16
+define void @no_memset(i32 %size) {
+entry:
+ %dest = alloca i8, i32 32
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [0, %entry], [%2, %for.body]
+ %1 = getelementptr i8* %dest, i32 %0
+ store i8 0, i8* %1
+ %2 = add i32 %0, 1
+ %3 = icmp eq i32 %2, %size
+ br i1 %3, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
index 66a070e..3dcadc9 100644
--- a/test/CodeGen/R600/mad_uint24.ll
+++ b/test/CodeGen/R600/mad_uint24.ll
@@ -26,14 +26,11 @@ entry:
; The order of A and B does not matter.
; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
; The result must be sign-extended
-; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MAD_CHAN]], literal.x
-; EG-CHECK: 16
-; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
+; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
; EG-CHECK: 16
; SI-CHECK-LABEL: @i16_mad24
; SI-CHECK: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 16, [[MAD]]
-; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 16, [[LSHL]]
+; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
entry:
@@ -51,14 +48,11 @@ entry:
; The order of A and B does not matter.
; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
; The result must be sign-extended
-; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MAD_CHAN]], literal.x
-; EG-CHECK: 24
-; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
-; EG-CHECK: 24
+; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
+; EG-CHECK: 8
; SI-CHECK-LABEL: @i8_mad24
; SI-CHECK: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 24, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 24, [[LSHL]]
+; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
entry:
diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll
new file mode 100644
index 0000000..2d5ddeb
--- /dev/null
+++ b/test/CodeGen/R600/mubuf.ll
@@ -0,0 +1,98 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
+
+;;;==========================================================================;;;
+;;; MUBUF LOAD TESTS
+;;;==========================================================================;;;
+
+; MUBUF load with an immediate byte offset that fits into 12 bits
+; CHECK-LABEL: @mubuf_load0
+; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4 ; encoding: [0x04,0x80
+define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = getelementptr i32 addrspace(1)* %in, i64 1
+ %1 = load i32 addrspace(1)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; MUBUF load with the largest possible immediate offset
+; CHECK-LABEL: @mubuf_load1
+; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4095 ; encoding: [0xff,0x8f
+define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = getelementptr i8 addrspace(1)* %in, i64 4095
+ %1 = load i8 addrspace(1)* %0
+ store i8 %1, i8 addrspace(1)* %out
+ ret void
+}
+
+; MUBUF load with an immediate byte offset that doesn't fit into 12 bits
+; CHECK-LABEL: @mubuf_load2
+; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0 ; encoding: [0x00,0x80
+define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = getelementptr i32 addrspace(1)* %in, i64 1024
+ %1 = load i32 addrspace(1)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; MUBUF load with a 12-bit immediate offset and a register offset
+; CHECK-LABEL: @mubuf_load3
+; CHECK-NOT: ADD
+; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4 ; encoding: [0x04,0x80
+define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
+entry:
+ %0 = getelementptr i32 addrspace(1)* %in, i64 %offset
+ %1 = getelementptr i32 addrspace(1)* %0, i64 1
+ %2 = load i32 addrspace(1)* %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+;;;==========================================================================;;;
+;;; MUBUF STORE TESTS
+;;;==========================================================================;;;
+
+; MUBUF store with an immediate byte offset that fits into 12 bits
+; CHECK-LABEL: @mubuf_store0
+; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4 ; encoding: [0x04,0x80
+define void @mubuf_store0(i32 addrspace(1)* %out) {
+entry:
+ %0 = getelementptr i32 addrspace(1)* %out, i64 1
+ store i32 0, i32 addrspace(1)* %0
+ ret void
+}
+
+; MUBUF store with the largest possible immediate offset
+; CHECK-LABEL: @mubuf_store1
+; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4095 ; encoding: [0xff,0x8f
+
+define void @mubuf_store1(i8 addrspace(1)* %out) {
+entry:
+ %0 = getelementptr i8 addrspace(1)* %out, i64 4095
+ store i8 0, i8 addrspace(1)* %0
+ ret void
+}
+
+; MUBUF store with an immediate byte offset that doesn't fit into 12 bits
+; CHECK-LABEL: @mubuf_store2
+; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0 ; encoding: [0x00,0x80
+define void @mubuf_store2(i32 addrspace(1)* %out) {
+entry:
+ %0 = getelementptr i32 addrspace(1)* %out, i64 1024
+ store i32 0, i32 addrspace(1)* %0
+ ret void
+}
+
+; MUBUF store with a 12-bit immediate offset and a register offset
+; CHECK-LABEL: @mubuf_store3
+; CHECK-NOT: ADD
+; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4 ; encoding: [0x04,0x80
+define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
+entry:
+ %0 = getelementptr i32 addrspace(1)* %out, i64 %offset
+ %1 = getelementptr i32 addrspace(1)* %0, i64 1
+ store i32 0, i32 addrspace(1)* %1
+ ret void
+}
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index 8c27e28..e176148 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -40,3 +40,15 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
+
+; SI-CHECK-LABEL: @trunc_i64_mul_to_i32
+; SI-CHECK: S_LOAD_DWORD
+; SI-CHECK: S_LOAD_DWORD
+; SI-CHECK: V_MUL_LO_I32
+; SI-CHECK: BUFFER_STORE_DWORD
+define void @trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+ %mul = mul i64 %b, %a
+ %trunc = trunc i64 %mul to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
index 6e6d549..a413961 100644
--- a/test/CodeGen/R600/mul_uint24.ll
+++ b/test/CodeGen/R600/mul_uint24.ll
@@ -24,15 +24,11 @@ entry:
; The order of A and B does not matter.
; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
; The result must be sign-extended
-; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MUL_CHAN]], literal.x
-; EG-CHECK: 16
-; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
+; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
; EG-CHECK: 16
; SI-CHECK-LABEL: @i16_mul24
; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 16, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 16, [[LSHL]]
-
+; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
entry:
%0 = mul i16 %a, %b
@@ -47,14 +43,10 @@ entry:
; The order of A and B does not matter.
; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
; The result must be sign-extended
-; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MUL_CHAN]], literal.x
-; EG-CHECK: 24
-; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
-; EG-CHECK: 24
+; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
; SI-CHECK-LABEL: @i8_mul24
; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 24, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 24, [[LSHL]]
+; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
entry:
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 35d23b3..2cc991e 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -1,13 +1,13 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; EG-CHECK-LABEL: @or_v2i32
-; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-LABEL: @or_v2i32
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK-LABEL: @or_v2i32
-;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI-LABEL: @or_v2i32
+; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in)
ret void
}
-; EG-CHECK-LABEL: @or_v4i32
-; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-LABEL: @or_v4i32
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK-LABEL: @or_v4i32
-;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI-LABEL: @or_v4i32
+; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,15 +39,91 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in)
ret void
}
-; EG-CHECK-LABEL: @or_i64
-; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
-; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; SI-CHECK-LABEL: @or_i64
-; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
-; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
-define void @or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
-entry:
- %0 = or i64 %a, %b
- store i64 %0, i64 addrspace(1)* %out
- ret void
+; SI-LABEL: @scalar_or_i32
+; SI: S_OR_B32
+define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %or = or i32 %a, %b
+ store i32 %or, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: @vector_or_i32
+; SI: V_OR_B32_e32 v{{[0-9]}}
+define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
+ %loada = load i32 addrspace(1)* %a
+ %or = or i32 %loada, %b
+ store i32 %or, i32 addrspace(1)* %out
+ ret void
+}
+
+; EG-LABEL: @scalar_or_i64
+; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; SI-LABEL: @scalar_or_i64
+; SI: S_OR_B64
+define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %or = or i64 %a, %b
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: @vector_or_i64
+; SI: V_OR_B32_e32 v{{[0-9]}}
+; SI: V_OR_B32_e32 v{{[0-9]}}
+define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64 addrspace(1)* %a, align 8
+ %loadb = load i64 addrspace(1)* %b, align 8
+ %or = or i64 %loada, %loadb
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: @scalar_vector_or_i64
+; SI: V_OR_B32_e32 v{{[0-9]}}
+; SI: V_OR_B32_e32 v{{[0-9]}}
+define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
+ %loada = load i64 addrspace(1)* %a
+ %or = or i64 %loada, %b
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: @vector_or_i64_loadimm
+; SI-DAG: S_MOV_B32 [[LO_S_IMM:s[0-9]+]], -545810305
+; SI-DAG: S_MOV_B32 [[HI_S_IMM:s[0-9]+]], 5231
+; SI-DAG: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: S_ENDPGM
+define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64 addrspace(1)* %a, align 8
+ %or = or i64 %loada, 22470723082367
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: The or 0 should really be removed.
+; SI-LABEL: @vector_or_i64_imm
+; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI: V_OR_B32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
+; SI: V_OR_B32_e32 {{v[0-9]+}}, 0, {{.*}}
+; SI: S_ENDPGM
+define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64 addrspace(1)* %a, align 8
+ %or = or i64 %loada, 8
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: @trunc_i64_or_to_i32
+; SI: S_LOAD_DWORD [[SREG0:s[0-9]+]],
+; SI: S_LOAD_DWORD [[SREG1:s[0-9]+]],
+; SI: S_OR_B32 [[SRESULT:s[0-9]+]], [[SREG1]], [[SREG0]]
+; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: BUFFER_STORE_DWORD [[VRESULT]],
+define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+ %add = or i64 %b, %a
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 8
+ ret void
}
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 48a013c..4920320 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,10 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
; This test checks that uses and defs of the AR register happen in the same
; instruction clause.
-; R600-CHECK-LABEL: @mova_same_clause
+; FUNC-LABEL: @mova_same_clause
+
; R600-CHECK: MOVA_INT
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
@@ -12,11 +13,10 @@
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
-; SI-CHECK-LABEL: @mova_same_clause
-; SI-CHECK: V_READFIRSTLANE
+; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
; SI-CHECK: V_MOVRELD
; SI-CHECK: S_CBRANCH
-; SI-CHECK: V_READFIRSTLANE
+; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
; SI-CHECK: V_MOVRELD
; SI-CHECK: S_CBRANCH
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
@@ -46,9 +46,8 @@ entry:
; XXX: This generated code has unnecessary MOVs, we should be able to optimize
; this.
-; R600-CHECK-LABEL: @multiple_structs
+; FUNC-LABEL: @multiple_structs
; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @multiple_structs
; SI-CHECK-NOT: V_MOVREL
%struct.point = type { i32, i32 }
@@ -77,9 +76,8 @@ entry:
; loads and stores should be lowered to copies, so there shouldn't be any
; MOVA instructions.
-; R600-CHECK-LABLE: @direct_loop
+; FUNC-LABEL: @direct_loop
; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @direct_loop
; SI-CHECK-NOT: V_MOVREL
define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -113,3 +111,106 @@ for.end:
store i32 %value, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @short_array
+
+; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
+; R600-CHECK: 65536
+; R600-CHECK: *
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 65536
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @char_array
+
+; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
+; R600-CHECK: 256
+; R600-CHECK: *
+; R600-CHECK-NEXT: MOVA_INT
+
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 256
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+
+}
+
+; Make sure we don't overwrite workitem information with private memory
+
+; FUNC-LABEL: @work_item_info
+; R600-CHECK-NOT: MOV T0.X
+; Additional check in case the move ends up in the last slot
+; R600-CHECK-NOT: MOV * T0.X
+
+; SI-CHECK-NOT: V_MOV_B32_e{{(32|64)}} v0
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32* %3
+ %5 = call i32 @llvm.r600.read.tidig.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test that two stack objects are not stored in the same register
+; The second stack object should be in T3.X
+; FUNC-LABEL: @no_overlap
+; R600-CHECK: MOV {{\** *}}T3.X
+; SI-CHECK: V_MOV_B32_e32 v3
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [3 x i8], align 1
+ %1 = alloca [2 x i8], align 1
+ %2 = getelementptr [3 x i8]* %0, i32 0, i32 0
+ %3 = getelementptr [3 x i8]* %0, i32 0, i32 1
+ %4 = getelementptr [3 x i8]* %0, i32 0, i32 2
+ %5 = getelementptr [2 x i8]* %1, i32 0, i32 0
+ %6 = getelementptr [2 x i8]* %1, i32 0, i32 1
+ store i8 0, i8* %2
+ store i8 1, i8* %3
+ store i8 2, i8* %4
+ store i8 1, i8* %5
+ store i8 0, i8* %6
+ %7 = getelementptr [3 x i8]* %0, i32 0, i32 %in
+ %8 = getelementptr [2 x i8]* %1, i32 0, i32 %in
+ %9 = load i8* %7
+ %10 = load i8* %8
+ %11 = add i8 %9, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ ret void
+}
+
+
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
diff --git a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll
new file mode 100644
index 0000000..c89398f
--- /dev/null
+++ b/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll
@@ -0,0 +1,59 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman
+;REQUIRES: asserts
+
+define void @main(<4 x float> inreg, <4 x float> inreg) #0 {
+main_body:
+ %2 = extractelement <4 x float> %0, i32 0
+ %3 = extractelement <4 x float> %0, i32 1
+ %4 = extractelement <4 x float> %0, i32 2
+ %5 = extractelement <4 x float> %0, i32 3
+ %6 = insertelement <4 x float> undef, float %2, i32 0
+ %7 = insertelement <4 x float> %6, float %3, i32 1
+ %8 = insertelement <4 x float> %7, float %4, i32 2
+ %9 = insertelement <4 x float> %8, float %5, i32 3
+ %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9)
+ %11 = extractelement <4 x float> %10, i32 0
+ %12 = extractelement <4 x float> %10, i32 1
+ %13 = extractelement <4 x float> %10, i32 2
+ %14 = extractelement <4 x float> %10, i32 3
+ %15 = call float @fabs(float %13)
+ %16 = fdiv float 1.000000e+00, %15
+ %17 = fmul float %11, %16
+ %18 = fadd float %17, 1.500000e+00
+ %19 = fmul float %12, %16
+ %20 = fadd float %19, 1.500000e+00
+ %21 = insertelement <4 x float> undef, float %20, i32 0
+ %22 = insertelement <4 x float> %21, float %18, i32 1
+ %23 = insertelement <4 x float> %22, float %14, i32 2
+ %24 = insertelement <4 x float> %23, float %5, i32 3
+ %25 = extractelement <4 x float> %24, i32 0
+ %26 = extractelement <4 x float> %24, i32 1
+ %27 = extractelement <4 x float> %24, i32 2
+ %28 = extractelement <4 x float> %24, i32 3
+ %29 = insertelement <4 x float> undef, float %25, i32 0
+ %30 = insertelement <4 x float> %29, float %26, i32 1
+ %31 = insertelement <4 x float> %30, float %27, i32 2
+ %32 = insertelement <4 x float> %31, float %28, i32 3
+ %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13)
+ %34 = extractelement <4 x float> %33, i32 0
+ %35 = insertelement <4 x float> undef, float %34, i32 0
+ %36 = insertelement <4 x float> %35, float %34, i32 1
+ %37 = insertelement <4 x float> %36, float %34, i32 2
+ %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0)
+ ret void
+}
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+
+; Function Attrs: readnone
+declare float @fabs(float) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/register-count-comments.ll b/test/CodeGen/R600/register-count-comments.ll
new file mode 100644
index 0000000..a64b280
--- /dev/null
+++ b/test/CodeGen/R600/register-count-comments.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.SI.tid() nounwind readnone
+
+; SI-LABEL: @foo:
+; SI: .section .AMDGPU.csdata
+; SI: ; Kernel info:
+; SI: ; NumSgprs: {{[0-9]+}}
+; SI: ; NumVgprs: {{[0-9]+}}
+define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
+ %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %aptr = getelementptr i32 addrspace(1)* %abase, i32 %tid
+ %bptr = getelementptr i32 addrspace(1)* %bbase, i32 %tid
+ %outptr = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %a = load i32 addrspace(1)* %aptr, align 4
+ %b = load i32 addrspace(1)* %bptr, align 4
+ %result = add i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %outptr, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll
new file mode 100644
index 0000000..e461bf9
--- /dev/null
+++ b/test/CodeGen/R600/salu-to-valu.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; In this test both the pointer and the offset operands to the
+; BUFFER_LOAD instructions end up being stored in vgprs. This
+; requires us to add the pointer and offset together, store the
+; result in the offset operand (vaddr), and then store 0 in an
+; sgpr register pair and use that for the pointer operand
+; (low 64-bits of srsrc).
+
+; CHECK-LABEL: @mubuf
+
+; Make sure we aren't using VGPRs for the source operand of S_MOV_B64
+; CHECK-NOT: S_MOV_B64 s[{{[0-9]+:[0-9]+}}], v
+
+; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
+; instructions
+; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
+; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
+define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() #1
+ %1 = call i32 @llvm.r600.read.tidig.y() #1
+ %2 = sext i32 %0 to i64
+ %3 = sext i32 %1 to i64
+ br label %loop
+
+loop:
+ %4 = phi i64 [0, %entry], [%5, %loop]
+ %5 = add i64 %2, %4
+ %6 = getelementptr i8 addrspace(1)* %in, i64 %5
+ %7 = load i8 addrspace(1)* %6, align 1
+ %8 = or i64 %5, 1
+ %9 = getelementptr i8 addrspace(1)* %in, i64 %8
+ %10 = load i8 addrspace(1)* %9, align 1
+ %11 = add i8 %7, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ %13 = icmp slt i64 %5, 10
+ br i1 %13, label %loop, label %done
+
+done:
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.r600.read.tidig.y() #1
+
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
new file mode 100644
index 0000000..2a286d1
--- /dev/null
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
@@ -0,0 +1,162 @@
+; XFAIL: *
+; REQUIRES: asserts
+; RUN: llc -O0 -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI
+
+declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+
+
+; SI-LABEL: @main(
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 2
+ %2 = fcmp ult float %0, 0.000000e+00
+ %3 = select i1 %2, float 1.000000e+00, float 0.000000e+00
+ %4 = fsub float -0.000000e+00, %3
+ %5 = fptosi float %4 to i32
+ %6 = bitcast i32 %5 to float
+ %7 = bitcast float %6 to i32
+ %8 = icmp ne i32 %7, 0
+ br i1 %8, label %LOOP, label %ENDIF
+
+Flow1: ; preds = %ENDIF19, %ENDIF16
+ %9 = phi float [ %115, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %10 = phi float [ %114, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %11 = phi float [ %113, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %12 = phi float [ %112, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %13 = phi float [ %111, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %14 = phi i1 [ false, %ENDIF19 ], [ true, %ENDIF16 ]
+ br label %Flow
+
+Flow2: ; preds = %Flow
+ br label %ENDIF
+
+ENDIF: ; preds = %main_body, %Flow2
+ %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %104, %Flow2 ]
+ %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %103, %Flow2 ]
+ %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %102, %Flow2 ]
+ %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
+ %15 = extractelement <4 x float> %reg1, i32 1
+ %16 = extractelement <4 x float> %reg1, i32 3
+ %17 = load <4 x float> addrspace(9)* null
+ %18 = extractelement <4 x float> %17, i32 0
+ %19 = fmul float %18, %0
+ %20 = load <4 x float> addrspace(9)* null
+ %21 = extractelement <4 x float> %20, i32 1
+ %22 = fmul float %21, %0
+ %23 = load <4 x float> addrspace(9)* null
+ %24 = extractelement <4 x float> %23, i32 2
+ %25 = fmul float %24, %0
+ %26 = load <4 x float> addrspace(9)* null
+ %27 = extractelement <4 x float> %26, i32 3
+ %28 = fmul float %27, %0
+ %29 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %30 = extractelement <4 x float> %29, i32 0
+ %31 = fmul float %30, %15
+ %32 = fadd float %31, %19
+ %33 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %34 = extractelement <4 x float> %33, i32 1
+ %35 = fmul float %34, %15
+ %36 = fadd float %35, %22
+ %37 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %38 = extractelement <4 x float> %37, i32 2
+ %39 = fmul float %38, %15
+ %40 = fadd float %39, %25
+ %41 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %42 = extractelement <4 x float> %41, i32 3
+ %43 = fmul float %42, %15
+ %44 = fadd float %43, %28
+ %45 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %46 = extractelement <4 x float> %45, i32 0
+ %47 = fmul float %46, %1
+ %48 = fadd float %47, %32
+ %49 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %50 = extractelement <4 x float> %49, i32 1
+ %51 = fmul float %50, %1
+ %52 = fadd float %51, %36
+ %53 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %54 = extractelement <4 x float> %53, i32 2
+ %55 = fmul float %54, %1
+ %56 = fadd float %55, %40
+ %57 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %58 = extractelement <4 x float> %57, i32 3
+ %59 = fmul float %58, %1
+ %60 = fadd float %59, %44
+ %61 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %62 = extractelement <4 x float> %61, i32 0
+ %63 = fmul float %62, %16
+ %64 = fadd float %63, %48
+ %65 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %66 = extractelement <4 x float> %65, i32 1
+ %67 = fmul float %66, %16
+ %68 = fadd float %67, %52
+ %69 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %70 = extractelement <4 x float> %69, i32 2
+ %71 = fmul float %70, %16
+ %72 = fadd float %71, %56
+ %73 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %74 = extractelement <4 x float> %73, i32 3
+ %75 = fmul float %74, %16
+ %76 = fadd float %75, %60
+ %77 = insertelement <4 x float> undef, float %64, i32 0
+ %78 = insertelement <4 x float> %77, float %68, i32 1
+ %79 = insertelement <4 x float> %78, float %72, i32 2
+ %80 = insertelement <4 x float> %79, float %76, i32 3
+ call void @llvm.AMDGPU.barrier.local()
+ %81 = insertelement <4 x float> undef, float %temp.0, i32 0
+ %82 = insertelement <4 x float> %81, float %temp1.0, i32 1
+ %83 = insertelement <4 x float> %82, float %temp2.0, i32 2
+ %84 = insertelement <4 x float> %83, float %temp3.0, i32 3
+ call void @llvm.AMDGPU.barrier.local()
+ ret void
+
+LOOP: ; preds = %main_body, %Flow
+ %temp.1 = phi float [ %109, %Flow ], [ 0.000000e+00, %main_body ]
+ %temp1.1 = phi float [ %108, %Flow ], [ 1.000000e+00, %main_body ]
+ %temp2.1 = phi float [ %107, %Flow ], [ 0.000000e+00, %main_body ]
+ %temp3.1 = phi float [ %106, %Flow ], [ 0.000000e+00, %main_body ]
+ %temp4.0 = phi float [ %105, %Flow ], [ -2.000000e+00, %main_body ]
+ %85 = fcmp uge float %temp4.0, %0
+ %86 = select i1 %85, float 1.000000e+00, float 0.000000e+00
+ %87 = fsub float -0.000000e+00, %86
+ %88 = fptosi float %87 to i32
+ %89 = bitcast i32 %88 to float
+ %90 = bitcast float %89 to i32
+ %91 = icmp ne i32 %90, 0
+ %92 = xor i1 %91, true
+ br i1 %92, label %ENDIF16, label %Flow
+
+ENDIF16: ; preds = %LOOP
+ %93 = fcmp une float %1, %temp4.0
+ %94 = select i1 %93, float 1.000000e+00, float 0.000000e+00
+ %95 = fsub float -0.000000e+00, %94
+ %96 = fptosi float %95 to i32
+ %97 = bitcast i32 %96 to float
+ %98 = bitcast float %97 to i32
+ %99 = icmp ne i32 %98, 0
+ %100 = xor i1 %99, true
+ br i1 %100, label %ENDIF19, label %Flow1
+
+Flow: ; preds = %Flow1, %LOOP
+ %101 = phi float [ %temp3.1, %Flow1 ], [ %temp3.1, %LOOP ]
+ %102 = phi float [ %temp2.1, %Flow1 ], [ %temp2.1, %LOOP ]
+ %103 = phi float [ %temp1.1, %Flow1 ], [ %temp1.1, %LOOP ]
+ %104 = phi float [ %temp.1, %Flow1 ], [ %temp.1, %LOOP ]
+ %105 = phi float [ %9, %Flow1 ], [ undef, %LOOP ]
+ %106 = phi float [ %10, %Flow1 ], [ undef, %LOOP ]
+ %107 = phi float [ %11, %Flow1 ], [ undef, %LOOP ]
+ %108 = phi float [ %12, %Flow1 ], [ undef, %LOOP ]
+ %109 = phi float [ %13, %Flow1 ], [ undef, %LOOP ]
+ %110 = phi i1 [ %14, %Flow1 ], [ true, %LOOP ]
+ br i1 %110, label %Flow2, label %LOOP
+
+ENDIF19: ; preds = %ENDIF16
+ %111 = fadd float %temp.1, 1.000000e+00
+ %112 = fadd float %temp1.1, 0.000000e+00
+ %113 = fadd float %temp2.1, 0.000000e+00
+ %114 = fadd float %temp3.1, 0.000000e+00
+ %115 = fadd float %temp4.0, 1.000000e+00
+ br label %Flow1
+}
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/select-vectors.ll b/test/CodeGen/R600/select-vectors.ll
new file mode 100644
index 0000000..94605fe
--- /dev/null
+++ b/test/CodeGen/R600/select-vectors.ll
@@ -0,0 +1,155 @@
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; Test expansion of scalar selects on vectors.
+; Evergreen is not enabled since it seems to have problems with doubles.
+
+
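+; A sketch of the expansion these tests exercise (illustrative only, not
+; checked output): with a single i1 condition there is no per-lane mask, so
+; legalization is expected to scalarize the vector select into one 32-bit
+; select (V_CNDMASK_B32 on SI) per element, roughly:
+;   %a0 = extractelement <2 x i32> %a, i32 0
+;   %b0 = extractelement <2 x i32> %b, i32 0
+;   %r0 = select i1 %cmp, i32 %a0, i32 %b0
+; repeated per element, with the results reinserted and stored.
+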
+; FUNC-LABEL: @select_v4i8
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
+ %cmp = icmp eq i8 %c, 0
+ %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
+ store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @select_v4i16
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
+ store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @select_v2i32
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: BUFFER_STORE_DWORDX2
+define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
+ store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @select_v4i32
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: BUFFER_STORE_DWORDX4
+define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
+ store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @select_v8i32
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
+ store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @select_v2f32
+; SI: BUFFER_STORE_DWORDX2
+define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
+ store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @select_v4f32
+; SI: BUFFER_STORE_DWORDX4
+define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
+ store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @select_v8f32
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
+ store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @select_v2f64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
+ store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @select_v4f64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
+ store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @select_v8f64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
+ store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/select64.ll b/test/CodeGen/R600/select64.ll
new file mode 100644
index 0000000..6b87d98
--- /dev/null
+++ b/test/CodeGen/R600/select64.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: @select0
+; i64 select should be split into two i32 selects, and we shouldn't need
+; to use a shift to extract the hi dword of the input.
+; CHECK-NOT: S_LSHR_B64
+; CHECK: V_CNDMASK
+; CHECK: V_CNDMASK
+define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
+entry:
+ %0 = icmp ugt i32 %cond, 5
+ %1 = select i1 %0, i64 0, i64 %in
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
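+
+; Illustrative sketch of the intended lowering (not checked output; %in.lo and
+; %in.hi stand for the two dword halves of %in): the i64 select above should
+; become two i32 selects,
+;   %lo = select i1 %0, i32 0, i32 %in.lo
+;   %hi = select i1 %0, i32 0, i32 %in.hi
+; so no 64-bit shift (S_LSHR_B64) is needed to extract the high dword.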
diff --git a/test/CodeGen/R600/setcc-equivalent.ll b/test/CodeGen/R600/setcc-equivalent.ll
new file mode 100644
index 0000000..4c50aa3
--- /dev/null
+++ b/test/CodeGen/R600/setcc-equivalent.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+; EG-LABEL: @and_setcc_setcc_i32
+; EG: AND_INT
+; EG-NEXT: SETE_INT
+define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %cmp1 = icmp eq i32 %a, -1
+ %cmp2 = icmp eq i32 %b, -1
+ %and = and i1 %cmp1, %cmp2
+ %ext = sext i1 %and to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
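+
+; The equivalence being relied on: (%a == -1) && (%b == -1) holds exactly when
+; ((%a & %b) == -1), since -1 is all ones. An illustrative sketch of the
+; combined form the checks above expect (not part of the test input):
+;   %ab  = and i32 %a, %b          ; -> AND_INT
+;   %cmp = icmp eq i32 %ab, -1     ; -> SETE_INT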
+
+; EG-LABEL: @and_setcc_setcc_v4i32
+; EG: AND_INT
+; EG: AND_INT
+; EG: SETE_INT
+; EG: AND_INT
+; EG: SETE_INT
+; EG: AND_INT
+; EG: SETE_INT
+define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+ %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %and = and <4 x i1> %cmp1, %cmp2
+ %ext = sext <4 x i1> %and to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out, align 4
+ ret void
+}
\ No newline at end of file
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
new file mode 100644
index 0000000..eef3f07
--- /dev/null
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -0,0 +1,271 @@
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
+
+
+; FUNC-LABEL: @sext_in_reg_i1_i32
+; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
+; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+
+; EG: BFE_INT
+define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
+ %shl = shl i32 %in, 31
+ %sext = ashr i32 %shl, 31
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
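+
+; The shl/ashr pair above is the usual sign_extend_inreg idiom: shifting left
+; and then arithmetic-shifting right by (32 - width) sign-extends the low
+; `width` bits. An illustrative 8-bit example (not checked output):
+;   %shl  = shl i32 %x, 24
+;   %sext = ashr i32 %shl, 24     ; expected to select to V_BFE_I32 %x, 0, 8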
+
+; FUNC-LABEL: @sext_in_reg_i8_to_i32
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 8
+; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+
+; EG: BFE_INT
+define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %shl = shl i32 %c, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i16_to_i32
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 16
+; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+
+; EG: BFE_INT
+define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %shl = shl i32 %c, 16
+ %ashr = ashr i32 %shl, 16
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i8_to_v1i32
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 8
+; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+
+; EG: BFE_INT
+define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+ %c = add <1 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <1 x i32> %c, <i32 24>
+ %ashr = ashr <1 x i32> %shl, <i32 24>
+ store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i8_to_i64
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; SI: BUFFER_STORE_DWORD
+
+; EG: BFE_INT
+; EG: ASHR
+define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = add i64 %a, %b
+ %shl = shl i64 %c, 56
+ %ashr = ashr i64 %shl, 56
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
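+
+; For the i64 variants the checks above expect only 32-bit operations: a
+; V_BFE_I32 sign-extends within the low dword, and the high dword is just that
+; result shifted arithmetically right by 31. Roughly (illustrative pseudo-ops):
+;   lo = bfe_i32(c.lo, 0, 8)      ; sign-extend the low 8 bits
+;   hi = ashr i32 lo, 31          ; replicate the sign bit into the high dword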
+
+; FUNC-LABEL: @sext_in_reg_i16_to_i64
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 16
+; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; SI: BUFFER_STORE_DWORD
+
+; EG: BFE_INT
+; EG: ASHR
+define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = add i64 %a, %b
+ %shl = shl i64 %c, 48
+ %ashr = ashr i64 %shl, 48
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i32_to_i64
+; SI: S_LOAD_DWORD
+; SI: S_LOAD_DWORD
+; SI: S_ADD_I32 [[ADD:s[0-9]+]],
+; SI: S_ASHR_I32 s{{[0-9]+}}, [[ADD]], 31
+; SI: BUFFER_STORE_DWORDX2
+define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = add i64 %a, %b
+ %shl = shl i64 %c, 32
+ %ashr = ashr i64 %shl, 32
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
+; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
+; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; XSI: BUFFER_STORE_DWORD
+; XEG: BFE_INT
+; XEG: ASHR
+; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
+; %c = add <1 x i64> %a, %b
+; %shl = shl <1 x i64> %c, <i64 56>
+; %ashr = ashr <1 x i64> %shl, <i64 56>
+; store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8
+; ret void
+; }
+
+; FUNC-LABEL: @sext_in_reg_i1_in_i32_other_amount
+; SI-NOT: BFE
+; SI: S_LSHL_B32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6
+; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG]], 7
+; EG-NOT: BFE
+define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b
+ %x = shl i32 %c, 6
+ %y = ashr i32 %x, 7
+ store i32 %y, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v2i1_in_v2i32_other_amount
+; SI: S_LSHL_B32 [[REG0:s[0-9]+]], {{s[0-9]}}, 6
+; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG0]], 7
+; SI: S_LSHL_B32 [[REG1:s[0-9]+]], {{s[0-9]}}, 6
+; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG1]], 7
+; EG-NOT: BFE
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b
+ %x = shl <2 x i32> %c, <i32 6, i32 6>
+ %y = ashr <2 x i32> %x, <i32 7, i32 7>
+ store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+ ret void
+}
+
+
+; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: BUFFER_STORE_DWORDX2
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 31, i32 31>
+ %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: BUFFER_STORE_DWORDX4
+
+; EG: BFE
+; EG: BFE
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
+ %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v2i8_to_v2i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: BUFFER_STORE_DWORDX2
+
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 24, i32 24>
+ %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v4i8_to_v4i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: BUFFER_STORE_DWORDX4
+
+; EG: BFE
+; EG: BFE
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v2i16_to_v2i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 16
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 16
+; SI: BUFFER_STORE_DWORDX2
+
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 16, i32 16>
+ %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @testcase
+define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
+ %and_a_1 = and i8 %a, 1
+ %cmp_eq = icmp eq i8 %and_a_1, 0
+ %cmp_slt = icmp slt i8 %a, 0
+ %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+ %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+ %xor = xor i8 %sel0, %sel1
+ store i8 %xor, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @testcase_3
+define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
+ %and_a_1 = and i8 %a, 1
+ %cmp_eq = icmp eq i8 %and_a_1, 0
+ %cmp_slt = icmp slt i8 %a, 0
+ %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+ %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+ %xor = xor i8 %sel0, %sel1
+ store i8 %xor, i8 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: The BFE should really be eliminated. I think it should happen
+; when computeMaskedBitsForTargetNode is implemented for imax.
+
+; FUNC-LABEL: @sext_in_reg_to_illegal_type
+; SI: BUFFER_LOAD_SBYTE
+; SI: V_MAX_I32
+; SI: V_BFE_I32
+; SI: BUFFER_STORE_SHORT
+define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
+ %tmp5 = load i8 addrspace(1)* %src, align 1
+ %tmp2 = sext i8 %tmp5 to i32
+ %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone
+ %tmp4 = trunc i32 %tmp3 to i8
+ %tmp6 = sext i8 %tmp4 to i16
+ store i16 %tmp6, i16 addrspace(1)* %out, align 2
+ ret void
+}
diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll
index 9886fe9..cd3ba2b 100644
--- a/test/CodeGen/R600/si-annotate-cf-assertion.ll
+++ b/test/CodeGen/R600/si-annotate-cf-assertion.ll
@@ -1,3 +1,4 @@
+; REQUIRES: asserts
; XFAIL: *
; RUN: llc -march=r600 -mcpu=SI -asm-verbose=false < %s | FileCheck %s
diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll
index 05c5e31..b34a757 100644
--- a/test/CodeGen/R600/si-sgpr-spill.ll
+++ b/test/CodeGen/R600/si-sgpr-spill.ll
@@ -1,8 +1,5 @@
; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
-; XXX: Enable when spilling is supported
-; XFAIL: *
-
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
@@ -690,3 +687,880 @@ attributes #3 = { readonly }
attributes #4 = { nounwind readonly }
!0 = metadata !{metadata !"const", null, i32 1}
+
+; CHECK-LABEL: @main1
+; CHECK: S_ENDPGM
+define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+ %22 = load <16 x i8> addrspace(2)* %21, !tbaa !0
+ %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8)
+ %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12)
+ %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28)
+ %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48)
+ %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52)
+ %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56)
+ %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64)
+ %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68)
+ %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72)
+ %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76)
+ %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
+ %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
+ %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
+ %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148)
+ %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152)
+ %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
+ %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 164)
+ %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168)
+ %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172)
+ %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
+ %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
+ %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
+ %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
+ %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
+ %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
+ %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
+ %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
+ %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
+ %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220)
+ %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236)
+ %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
+ %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
+ %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
+ %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252)
+ %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
+ %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260)
+ %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264)
+ %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268)
+ %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
+ %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
+ %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
+ %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284)
+ %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
+ %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
+ %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464)
+ %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468)
+ %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472)
+ %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496)
+ %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500)
+ %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504)
+ %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512)
+ %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516)
+ %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524)
+ %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532)
+ %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536)
+ %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540)
+ %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544)
+ %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548)
+ %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552)
+ %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556)
+ %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560)
+ %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564)
+ %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568)
+ %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572)
+ %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576)
+ %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580)
+ %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584)
+ %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588)
+ %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592)
+ %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596)
+ %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600)
+ %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604)
+ %97 = call float @llvm.SI.load.const(<16 x i8> %22, i32 608)
+ %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612)
+ %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616)
+ %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624)
+ %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628)
+ %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632)
+ %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636)
+ %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640)
+ %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644)
+ %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648)
+ %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652)
+ %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656)
+ %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660)
+ %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664)
+ %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668)
+ %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672)
+ %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676)
+ %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680)
+ %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684)
+ %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688)
+ %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692)
+ %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696)
+ %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700)
+ %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704)
+ %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708)
+ %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712)
+ %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716)
+ %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864)
+ %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868)
+ %126 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+ %127 = load <32 x i8> addrspace(2)* %126, !tbaa !0
+ %128 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+ %129 = load <16 x i8> addrspace(2)* %128, !tbaa !0
+ %130 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
+ %131 = load <32 x i8> addrspace(2)* %130, !tbaa !0
+ %132 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
+ %133 = load <16 x i8> addrspace(2)* %132, !tbaa !0
+ %134 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
+ %135 = load <32 x i8> addrspace(2)* %134, !tbaa !0
+ %136 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
+ %137 = load <16 x i8> addrspace(2)* %136, !tbaa !0
+ %138 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
+ %139 = load <32 x i8> addrspace(2)* %138, !tbaa !0
+ %140 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
+ %141 = load <16 x i8> addrspace(2)* %140, !tbaa !0
+ %142 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
+ %143 = load <32 x i8> addrspace(2)* %142, !tbaa !0
+ %144 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
+ %145 = load <16 x i8> addrspace(2)* %144, !tbaa !0
+ %146 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
+ %147 = load <32 x i8> addrspace(2)* %146, !tbaa !0
+ %148 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
+ %149 = load <16 x i8> addrspace(2)* %148, !tbaa !0
+ %150 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
+ %151 = load <32 x i8> addrspace(2)* %150, !tbaa !0
+ %152 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
+ %153 = load <16 x i8> addrspace(2)* %152, !tbaa !0
+ %154 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
+ %155 = load <32 x i8> addrspace(2)* %154, !tbaa !0
+ %156 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
+ %157 = load <16 x i8> addrspace(2)* %156, !tbaa !0
+ %158 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8
+ %159 = load <32 x i8> addrspace(2)* %158, !tbaa !0
+ %160 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8
+ %161 = load <16 x i8> addrspace(2)* %160, !tbaa !0
+ %162 = fcmp ugt float %17, 0.000000e+00
+ %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00
+ %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
+ %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
+ %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6)
+ %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6)
+ %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
+ %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
+ %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
+ %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6)
+ %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
+ %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
+ %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
+ %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6)
+ %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
+ %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
+ %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
+ %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6)
+ %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
+ %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
+ %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
+ %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6)
+ %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
+ %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
+ %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
+ %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6)
+ %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6)
+ %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6)
+ %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6)
+ %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6)
+ %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6)
+ %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6)
+ %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6)
+ %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6)
+ %196 = fmul float %14, %124
+ %197 = fadd float %196, %125
+ %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00)
+ %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %202 = bitcast float %198 to i32
+ %203 = icmp ne i32 %202, 0
+ %. = select i1 %203, float -1.000000e+00, float 1.000000e+00
+ %204 = fsub float -0.000000e+00, %164
+ %205 = fadd float %44, %204
+ %206 = fsub float -0.000000e+00, %165
+ %207 = fadd float %45, %206
+ %208 = fsub float -0.000000e+00, %166
+ %209 = fadd float %46, %208
+ %210 = fmul float %205, %205
+ %211 = fmul float %207, %207
+ %212 = fadd float %211, %210
+ %213 = fmul float %209, %209
+ %214 = fadd float %212, %213
+ %215 = call float @llvm.AMDGPU.rsq(float %214)
+ %216 = fmul float %205, %215
+ %217 = fmul float %207, %215
+ %218 = fmul float %209, %215
+ %219 = fmul float %., %54
+ %220 = fmul float %13, %47
+ %221 = fmul float %197, %48
+ %222 = bitcast float %174 to i32
+ %223 = bitcast float %175 to i32
+ %224 = insertelement <2 x i32> undef, i32 %222, i32 0
+ %225 = insertelement <2 x i32> %224, i32 %223, i32 1
+ %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2)
+ %227 = extractelement <4 x float> %226, i32 0
+ %228 = extractelement <4 x float> %226, i32 1
+ %229 = extractelement <4 x float> %226, i32 2
+ %230 = extractelement <4 x float> %226, i32 3
+ %231 = fmul float %227, 0x4012611180000000
+ %232 = fmul float %228, 0x4012611180000000
+ %233 = fmul float %229, 0x4012611180000000
+ %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00)
+ %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00)
+ %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00)
+ %237 = fmul float %216, %184
+ %238 = fmul float %217, %185
+ %239 = fadd float %238, %237
+ %240 = fmul float %218, %186
+ %241 = fadd float %239, %240
+ %242 = fmul float %216, %187
+ %243 = fmul float %217, %188
+ %244 = fadd float %243, %242
+ %245 = fmul float %218, %189
+ %246 = fadd float %244, %245
+ %247 = fmul float %216, %190
+ %248 = fmul float %217, %191
+ %249 = fadd float %248, %247
+ %250 = fmul float %218, %192
+ %251 = fadd float %249, %250
+ %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00)
+ %253 = fmul float %214, 0x3F5A36E2E0000000
+ %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00)
+ %255 = fsub float -0.000000e+00, %254
+ %256 = fadd float 1.000000e+00, %255
+ %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01)
+ %258 = fmul float %39, %257
+ %259 = fmul float %241, %258
+ %260 = fmul float %246, %258
+ %261 = fmul float %259, %230
+ %262 = fmul float %260, %230
+ %263 = fadd float %252, 0x3EE4F8B580000000
+ %264 = fsub float -0.000000e+00, %252
+ %265 = fadd float 1.000000e+00, %264
+ %266 = fmul float 1.200000e+01, %265
+ %267 = fadd float %266, 4.000000e+00
+ %268 = fsub float -0.000000e+00, %267
+ %269 = fmul float %268, %263
+ %270 = fsub float -0.000000e+00, %267
+ %271 = fmul float %270, %263
+ %272 = fsub float -0.000000e+00, %267
+ %273 = fmul float %272, %263
+ %274 = fdiv float 1.000000e+00, %269
+ %275 = fdiv float 1.000000e+00, %271
+ %276 = fdiv float 1.000000e+00, %273
+ %277 = fmul float %261, %274
+ %278 = fmul float %262, %275
+ %279 = fmul float %263, %276
+ br label %LOOP
+
+LOOP: ; preds = %LOOP, %main_body
+ %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ]
+ %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ]
+ %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ]
+ %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ]
+ %280 = bitcast float %temp168.0 to i32
+ %281 = bitcast float %temp169.0 to i32
+ %282 = insertelement <4 x i32> undef, i32 %280, i32 0
+ %283 = insertelement <4 x i32> %282, i32 %281, i32 1
+ %284 = insertelement <4 x i32> %283, i32 0, i32 2
+ %285 = insertelement <4 x i32> %284, i32 undef, i32 3
+ %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2)
+ %287 = extractelement <4 x float> %286, i32 3
+ %288 = fadd float %temp168.0, %277
+ %289 = fadd float %temp169.0, %278
+ %290 = fadd float %temp170.0, %279
+ %291 = fsub float -0.000000e+00, %287
+ %292 = fadd float %290, %291
+ %293 = fcmp oge float 0.000000e+00, %292
+ %294 = sext i1 %293 to i32
+ %295 = bitcast i32 %294 to float
+ %296 = bitcast float %295 to i32
+ %297 = icmp ne i32 %296, 0
+ br i1 %297, label %IF189, label %LOOP
+
+IF189: ; preds = %LOOP
+ %298 = extractelement <4 x float> %286, i32 0
+ %299 = extractelement <4 x float> %286, i32 1
+ %300 = extractelement <4 x float> %286, i32 2
+ %301 = fsub float -0.000000e+00, %292
+ %302 = fadd float %temp144.0, %301
+ %303 = fdiv float 1.000000e+00, %302
+ %304 = fmul float %292, %303
+ %305 = fadd float %304, -1.000000e+00
+ %306 = fmul float %305, %277
+ %307 = fadd float %306, %288
+ %308 = fmul float %305, %278
+ %309 = fadd float %308, %289
+ %310 = fsub float -0.000000e+00, %176
+ %311 = fadd float %307, %310
+ %312 = fsub float -0.000000e+00, %177
+ %313 = fadd float %309, %312
+ %314 = fadd float %176, %311
+ %315 = fadd float %177, %313
+ %316 = fmul float %311, %67
+ %317 = fmul float %313, %68
+ %318 = fmul float %316, %55
+ %319 = fmul float %316, %56
+ %320 = fmul float %317, %57
+ %321 = fadd float %320, %318
+ %322 = fmul float %317, %58
+ %323 = fadd float %322, %319
+ %324 = fadd float %178, %321
+ %325 = fadd float %179, %323
+ %326 = fmul float %316, %59
+ %327 = fmul float %316, %60
+ %328 = fmul float %316, %61
+ %329 = fmul float %316, %62
+ %330 = fmul float %317, %63
+ %331 = fadd float %330, %326
+ %332 = fmul float %317, %64
+ %333 = fadd float %332, %327
+ %334 = fmul float %317, %65
+ %335 = fadd float %334, %328
+ %336 = fmul float %317, %66
+ %337 = fadd float %336, %329
+ %338 = fadd float %168, %331
+ %339 = fadd float %169, %333
+ %340 = fadd float %170, %335
+ %341 = fadd float %171, %337
+ %342 = bitcast float %338 to i32
+ %343 = bitcast float %339 to i32
+ %344 = insertelement <2 x i32> undef, i32 %342, i32 0
+ %345 = insertelement <2 x i32> %344, i32 %343, i32 1
+ %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2)
+ %347 = extractelement <4 x float> %346, i32 0
+ %348 = extractelement <4 x float> %346, i32 1
+ %349 = extractelement <4 x float> %346, i32 2
+ %350 = extractelement <4 x float> %346, i32 3
+ %351 = fmul float %347, %23
+ %352 = fmul float %348, %24
+ %353 = fmul float %349, %25
+ %354 = fmul float %350, %26
+ %355 = fmul float %351, %180
+ %356 = fmul float %352, %181
+ %357 = fmul float %353, %182
+ %358 = fmul float %354, %183
+ %359 = fsub float -0.000000e+00, %350
+ %360 = fadd float 1.000000e+00, %359
+ %361 = fmul float %360, %49
+ %362 = call float @llvm.AMDGPU.lrp(float %361, float %347, float %355)
+ %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356)
+ %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357)
+ %365 = bitcast float %340 to i32
+ %366 = bitcast float %341 to i32
+ %367 = insertelement <2 x i32> undef, i32 %365, i32 0
+ %368 = insertelement <2 x i32> %367, i32 %366, i32 1
+ %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2)
+ %370 = extractelement <4 x float> %369, i32 2
+ %371 = fmul float %362, %234
+ %372 = fmul float %363, %235
+ %373 = fmul float %364, %236
+ %374 = fmul float %358, %230
+ %375 = bitcast float %314 to i32
+ %376 = bitcast float %315 to i32
+ %377 = insertelement <2 x i32> undef, i32 %375, i32 0
+ %378 = insertelement <2 x i32> %377, i32 %376, i32 1
+ %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2)
+ %380 = extractelement <4 x float> %379, i32 0
+ %381 = extractelement <4 x float> %379, i32 1
+ %382 = extractelement <4 x float> %379, i32 2
+ %383 = extractelement <4 x float> %379, i32 3
+ %384 = fcmp olt float 0.000000e+00, %382
+ %385 = sext i1 %384 to i32
+ %386 = bitcast i32 %385 to float
+ %387 = bitcast float %386 to i32
+ %388 = icmp ne i32 %387, 0
+ %.224 = select i1 %388, float %381, float %380
+ %.225 = select i1 %388, float %383, float %381
+ %389 = bitcast float %324 to i32
+ %390 = bitcast float %325 to i32
+ %391 = insertelement <2 x i32> undef, i32 %389, i32 0
+ %392 = insertelement <2 x i32> %391, i32 %390, i32 1
+ %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2)
+ %394 = extractelement <4 x float> %393, i32 0
+ %395 = extractelement <4 x float> %393, i32 1
+ %396 = extractelement <4 x float> %393, i32 2
+ %397 = extractelement <4 x float> %393, i32 3
+ %398 = fcmp olt float 0.000000e+00, %396
+ %399 = sext i1 %398 to i32
+ %400 = bitcast i32 %399 to float
+ %401 = bitcast float %400 to i32
+ %402 = icmp ne i32 %401, 0
+ %temp112.1 = select i1 %402, float %395, float %394
+ %temp113.1 = select i1 %402, float %397, float %395
+ %403 = fmul float %.224, 2.000000e+00
+ %404 = fadd float %403, -1.000000e+00
+ %405 = fmul float %.225, 2.000000e+00
+ %406 = fadd float %405, -1.000000e+00
+ %407 = fmul float %temp112.1, 2.000000e+00
+ %408 = fadd float %407, -1.000000e+00
+ %409 = fmul float %temp113.1, 2.000000e+00
+ %410 = fadd float %409, -1.000000e+00
+ %411 = fsub float -0.000000e+00, %404
+ %412 = fmul float %411, %35
+ %413 = fsub float -0.000000e+00, %406
+ %414 = fmul float %413, %35
+ %415 = fsub float -0.000000e+00, %408
+ %416 = fmul float %415, %36
+ %417 = fsub float -0.000000e+00, %410
+ %418 = fmul float %417, %36
+ %419 = fmul float %416, %370
+ %420 = fmul float %418, %370
+ %421 = call float @fabs(float %412)
+ %422 = call float @fabs(float %414)
+ %423 = fsub float -0.000000e+00, %421
+ %424 = fadd float 1.000000e+00, %423
+ %425 = fsub float -0.000000e+00, %422
+ %426 = fadd float 1.000000e+00, %425
+ %427 = fmul float %424, %419
+ %428 = fadd float %427, %412
+ %429 = fmul float %426, %420
+ %430 = fadd float %429, %414
+ %431 = fmul float %428, %428
+ %432 = fmul float %430, %430
+ %433 = fadd float %431, %432
+ %434 = fsub float -0.000000e+00, %433
+ %435 = fadd float 0x3FF00068E0000000, %434
+ %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00)
+ %437 = call float @llvm.AMDGPU.rsq(float %436)
+ %438 = fmul float %437, %436
+ %439 = fsub float -0.000000e+00, %436
+ %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00)
+ %441 = fmul float %184, %428
+ %442 = fmul float %185, %428
+ %443 = fmul float %186, %428
+ %444 = fmul float %187, %430
+ %445 = fadd float %444, %441
+ %446 = fmul float %188, %430
+ %447 = fadd float %446, %442
+ %448 = fmul float %189, %430
+ %449 = fadd float %448, %443
+ %450 = fmul float %190, %440
+ %451 = fadd float %450, %445
+ %452 = fmul float %191, %440
+ %453 = fadd float %452, %447
+ %454 = fmul float %192, %440
+ %455 = fadd float %454, %449
+ %456 = fmul float %451, %451
+ %457 = fmul float %453, %453
+ %458 = fadd float %457, %456
+ %459 = fmul float %455, %455
+ %460 = fadd float %458, %459
+ %461 = call float @llvm.AMDGPU.rsq(float %460)
+ %462 = fmul float %451, %461
+ %463 = fmul float %453, %461
+ %464 = fmul float %455, %461
+ %465 = fcmp olt float 0.000000e+00, %219
+ %466 = sext i1 %465 to i32
+ %467 = bitcast i32 %466 to float
+ %468 = bitcast float %467 to i32
+ %469 = icmp ne i32 %468, 0
+ br i1 %469, label %IF198, label %ENDIF197
+
+IF198: ; preds = %IF189
+ %470 = fsub float -0.000000e+00, %462
+ %471 = fsub float -0.000000e+00, %463
+ %472 = fsub float -0.000000e+00, %464
+ br label %ENDIF197
+
+ENDIF197: ; preds = %IF189, %IF198
+ %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ]
+ %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ]
+ %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ]
+ %473 = bitcast float %220 to i32
+ %474 = bitcast float %221 to i32
+ %475 = insertelement <2 x i32> undef, i32 %473, i32 0
+ %476 = insertelement <2 x i32> %475, i32 %474, i32 1
+ %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2)
+ %478 = extractelement <4 x float> %477, i32 0
+ %479 = extractelement <4 x float> %477, i32 1
+ %480 = extractelement <4 x float> %477, i32 2
+ %481 = extractelement <4 x float> %477, i32 3
+ %482 = fmul float %478, %40
+ %483 = fadd float %482, %41
+ %484 = fmul float %479, %40
+ %485 = fadd float %484, %41
+ %486 = fmul float %480, %40
+ %487 = fadd float %486, %41
+ %488 = fmul float %481, %42
+ %489 = fadd float %488, %43
+ %490 = bitcast float %172 to i32
+ %491 = bitcast float %173 to i32
+ %492 = insertelement <2 x i32> undef, i32 %490, i32 0
+ %493 = insertelement <2 x i32> %492, i32 %491, i32 1
+ %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2)
+ %495 = extractelement <4 x float> %494, i32 0
+ %496 = extractelement <4 x float> %494, i32 1
+ %497 = extractelement <4 x float> %494, i32 2
+ %498 = extractelement <4 x float> %494, i32 3
+ %499 = fmul float %498, 3.200000e+01
+ %500 = fadd float %499, -1.600000e+01
+ %501 = call float @llvm.AMDIL.exp.(float %500)
+ %502 = fmul float %495, %501
+ %503 = fmul float %496, %501
+ %504 = fmul float %497, %501
+ %505 = fmul float %28, %502
+ %506 = fadd float %505, %193
+ %507 = fmul float %29, %503
+ %508 = fadd float %507, %194
+ %509 = fmul float %30, %504
+ %510 = fadd float %509, %195
+ %511 = fmul float %506, %489
+ %512 = fmul float %508, %489
+ %513 = fmul float %510, %489
+ %514 = fmul float %489, 5.000000e-01
+ %515 = fadd float %514, 5.000000e-01
+ %516 = fmul float %483, %515
+ %517 = fadd float %516, %511
+ %518 = fmul float %485, %515
+ %519 = fadd float %518, %512
+ %520 = fmul float %487, %515
+ %521 = fadd float %520, %513
+ %522 = fmul float %517, %371
+ %523 = fmul float %519, %372
+ %524 = fmul float %521, %373
+ %525 = fmul float %428, 0x3FDB272440000000
+ %526 = fmul float %430, 0xBFDB272440000000
+ %527 = fadd float %526, %525
+ %528 = fmul float %440, 0x3FE99999A0000000
+ %529 = fadd float %527, %528
+ %530 = fmul float %529, 5.000000e-01
+ %531 = fadd float %530, 0x3FE3333340000000
+ %532 = fmul float %531, %531
+ %533 = fmul float %522, %532
+ %534 = fmul float %523, %532
+ %535 = fmul float %524, %532
+ %536 = fsub float -0.000000e+00, %72
+ %537 = fsub float -0.000000e+00, %73
+ %538 = fsub float -0.000000e+00, %74
+ %539 = fmul float %temp12.0, %536
+ %540 = fmul float %temp13.0, %537
+ %541 = fadd float %540, %539
+ %542 = fmul float %temp14.0, %538
+ %543 = fadd float %541, %542
+ %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00)
+ %545 = fmul float %371, %544
+ %546 = fmul float %372, %544
+ %547 = fmul float %373, %544
+ %548 = fmul float %545, %69
+ %549 = fmul float %546, %70
+ %550 = fmul float %547, %71
+ %551 = fsub float -0.000000e+00, %164
+ %552 = fadd float %97, %551
+ %553 = fsub float -0.000000e+00, %165
+ %554 = fadd float %98, %553
+ %555 = fsub float -0.000000e+00, %166
+ %556 = fadd float %99, %555
+ %557 = fmul float %552, %552
+ %558 = fmul float %554, %554
+ %559 = fadd float %558, %557
+ %560 = fmul float %556, %556
+ %561 = fadd float %559, %560
+ %562 = call float @llvm.AMDGPU.rsq(float %561)
+ %563 = fmul float %562, %561
+ %564 = fsub float -0.000000e+00, %561
+ %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00)
+ %566 = fsub float -0.000000e+00, %84
+ %567 = fadd float %565, %566
+ %568 = fsub float -0.000000e+00, %83
+ %569 = fadd float %565, %568
+ %570 = fsub float -0.000000e+00, %82
+ %571 = fadd float %565, %570
+ %572 = fsub float -0.000000e+00, %84
+ %573 = fadd float %83, %572
+ %574 = fsub float -0.000000e+00, %83
+ %575 = fadd float %82, %574
+ %576 = fsub float -0.000000e+00, %82
+ %577 = fadd float %81, %576
+ %578 = fdiv float 1.000000e+00, %573
+ %579 = fdiv float 1.000000e+00, %575
+ %580 = fdiv float 1.000000e+00, %577
+ %581 = fmul float %567, %578
+ %582 = fmul float %569, %579
+ %583 = fmul float %571, %580
+ %584 = fcmp olt float %565, %83
+ %585 = sext i1 %584 to i32
+ %586 = bitcast i32 %585 to float
+ %587 = bitcast float %586 to i32
+ %588 = icmp ne i32 %587, 0
+ br i1 %588, label %ENDIF200, label %ELSE202
+
+ELSE202: ; preds = %ENDIF197
+ %589 = fcmp olt float %565, %82
+ %590 = sext i1 %589 to i32
+ %591 = bitcast i32 %590 to float
+ %592 = bitcast float %591 to i32
+ %593 = icmp ne i32 %592, 0
+ br i1 %593, label %ENDIF200, label %ELSE205
+
+ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197
+ %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ]
+ %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ]
+ %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ]
+ %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ]
+ %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ]
+ %594 = fcmp olt float %565, %83
+ %595 = sext i1 %594 to i32
+ %596 = bitcast i32 %595 to float
+ %597 = bitcast float %596 to i32
+ %598 = icmp ne i32 %597, 0
+ br i1 %598, label %ENDIF209, label %ELSE211
+
+ELSE205: ; preds = %ELSE202
+ %599 = fcmp olt float %565, %81
+ %600 = sext i1 %599 to i32
+ %601 = bitcast i32 %600 to float
+ %602 = bitcast float %601 to i32
+ %603 = icmp ne i32 %602, 0
+ %.226 = select i1 %603, float %583, float 1.000000e+00
+ %.227 = select i1 %603, float %118, float %116
+ %.228 = select i1 %603, float %119, float %117
+ br label %ENDIF200
+
+ELSE211: ; preds = %ENDIF200
+ %604 = fcmp olt float %565, %82
+ %605 = sext i1 %604 to i32
+ %606 = bitcast i32 %605 to float
+ %607 = bitcast float %606 to i32
+ %608 = icmp ne i32 %607, 0
+ br i1 %608, label %ENDIF209, label %ELSE214
+
+ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200
+ %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ]
+ %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ]
+ %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ]
+ %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ]
+ %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ]
+ %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ]
+ %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ]
+ %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ]
+ %609 = fmul float %164, %85
+ %610 = fmul float %165, %86
+ %611 = fadd float %609, %610
+ %612 = fmul float %166, %87
+ %613 = fadd float %611, %612
+ %614 = fmul float %167, %88
+ %615 = fadd float %613, %614
+ %616 = fmul float %164, %89
+ %617 = fmul float %165, %90
+ %618 = fadd float %616, %617
+ %619 = fmul float %166, %91
+ %620 = fadd float %618, %619
+ %621 = fmul float %167, %92
+ %622 = fadd float %620, %621
+ %623 = fmul float %164, %93
+ %624 = fmul float %165, %94
+ %625 = fadd float %623, %624
+ %626 = fmul float %166, %95
+ %627 = fadd float %625, %626
+ %628 = fmul float %167, %96
+ %629 = fadd float %627, %628
+ %630 = fsub float -0.000000e+00, %78
+ %631 = fadd float 1.000000e+00, %630
+ %632 = call float @fabs(float %615)
+ %633 = call float @fabs(float %622)
+ %634 = fcmp oge float %631, %632
+ %635 = sext i1 %634 to i32
+ %636 = bitcast i32 %635 to float
+ %637 = bitcast float %636 to i32
+ %638 = and i32 %637, 1065353216
+ %639 = bitcast i32 %638 to float
+ %640 = fcmp oge float %631, %633
+ %641 = sext i1 %640 to i32
+ %642 = bitcast i32 %641 to float
+ %643 = bitcast float %642 to i32
+ %644 = and i32 %643, 1065353216
+ %645 = bitcast i32 %644 to float
+ %646 = fmul float %639, %645
+ %647 = fmul float %629, %646
+ %648 = fmul float %615, %temp68.0
+ %649 = fadd float %648, %temp70.0
+ %650 = fmul float %622, %temp69.0
+ %651 = fadd float %650, %temp71.0
+ %652 = fmul float %615, %temp52.0
+ %653 = fadd float %652, %temp54.0
+ %654 = fmul float %622, %temp53.0
+ %655 = fadd float %654, %temp55.0
+ %656 = fadd float %temp80.0, -1.000000e+00
+ %657 = fmul float %656, %77
+ %658 = fadd float %657, 1.000000e+00
+ %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00)
+ %660 = bitcast float %649 to i32
+ %661 = bitcast float %651 to i32
+ %662 = bitcast float 0.000000e+00 to i32
+ %663 = insertelement <4 x i32> undef, i32 %660, i32 0
+ %664 = insertelement <4 x i32> %663, i32 %661, i32 1
+ %665 = insertelement <4 x i32> %664, i32 %662, i32 2
+ %666 = insertelement <4 x i32> %665, i32 undef, i32 3
+ %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2)
+ %668 = extractelement <4 x float> %667, i32 0
+ %669 = extractelement <4 x float> %667, i32 1
+ %670 = bitcast float %653 to i32
+ %671 = bitcast float %655 to i32
+ %672 = bitcast float 0.000000e+00 to i32
+ %673 = insertelement <4 x i32> undef, i32 %670, i32 0
+ %674 = insertelement <4 x i32> %673, i32 %671, i32 1
+ %675 = insertelement <4 x i32> %674, i32 %672, i32 2
+ %676 = insertelement <4 x i32> %675, i32 undef, i32 3
+ %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2)
+ %678 = extractelement <4 x float> %677, i32 0
+ %679 = extractelement <4 x float> %677, i32 1
+ %680 = fsub float -0.000000e+00, %669
+ %681 = fadd float 1.000000e+00, %680
+ %682 = fsub float -0.000000e+00, %679
+ %683 = fadd float 1.000000e+00, %682
+ %684 = fmul float %681, 2.500000e-01
+ %685 = fmul float %683, 2.500000e-01
+ %686 = fsub float -0.000000e+00, %684
+ %687 = fadd float %668, %686
+ %688 = fsub float -0.000000e+00, %685
+ %689 = fadd float %678, %688
+ %690 = fmul float %647, %temp88.0
+ %691 = fadd float %690, %temp89.0
+ %692 = fmul float %647, %temp90.0
+ %693 = fadd float %692, %temp91.0
+ %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00)
+ %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00)
+ %696 = fsub float -0.000000e+00, %694
+ %697 = fadd float %668, %696
+ %698 = fsub float -0.000000e+00, %695
+ %699 = fadd float %678, %698
+ %700 = fmul float %668, %668
+ %701 = fmul float %678, %678
+ %702 = fsub float -0.000000e+00, %700
+ %703 = fadd float %687, %702
+ %704 = fsub float -0.000000e+00, %701
+ %705 = fadd float %689, %704
+ %706 = fcmp uge float %703, %75
+ %707 = select i1 %706, float %703, float %75
+ %708 = fcmp uge float %705, %75
+ %709 = select i1 %708, float %705, float %75
+ %710 = fmul float %697, %697
+ %711 = fadd float %710, %707
+ %712 = fmul float %699, %699
+ %713 = fadd float %712, %709
+ %714 = fdiv float 1.000000e+00, %711
+ %715 = fdiv float 1.000000e+00, %713
+ %716 = fmul float %707, %714
+ %717 = fmul float %709, %715
+ %718 = fcmp oge float %697, 0.000000e+00
+ %719 = sext i1 %718 to i32
+ %720 = bitcast i32 %719 to float
+ %721 = bitcast float %720 to i32
+ %722 = icmp ne i32 %721, 0
+ %.229 = select i1 %722, float 1.000000e+00, float %716
+ %723 = fcmp oge float %699, 0.000000e+00
+ %724 = sext i1 %723 to i32
+ %725 = bitcast i32 %724 to float
+ %726 = bitcast float %725 to i32
+ %727 = icmp ne i32 %726, 0
+ %temp28.0 = select i1 %727, float 1.000000e+00, float %717
+ %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229)
+ %729 = call float @llvm.pow.f32(float %728, float %76)
+ %730 = fmul float %729, %79
+ %731 = fadd float %730, %80
+ %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00)
+ %733 = fmul float %732, %732
+ %734 = fmul float 2.000000e+00, %732
+ %735 = fsub float -0.000000e+00, %734
+ %736 = fadd float 3.000000e+00, %735
+ %737 = fmul float %733, %736
+ %738 = fmul float %548, %737
+ %739 = fmul float %549, %737
+ %740 = fmul float %550, %737
+ %741 = fmul float %738, %515
+ %742 = fadd float %741, %533
+ %743 = fmul float %739, %515
+ %744 = fadd float %743, %534
+ %745 = fmul float %740, %515
+ %746 = fadd float %745, %535
+ %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00)
+ %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00)
+ %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00)
+ %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00)
+ %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00)
+ %752 = fmul float %748, %751
+ %753 = fmul float %749, %751
+ %754 = fmul float %750, %751
+ %755 = fmul float %742, %752
+ %756 = fmul float %744, %753
+ %757 = fmul float %746, %754
+ %758 = fmul float %temp12.0, %216
+ %759 = fmul float %temp13.0, %217
+ %760 = fadd float %759, %758
+ %761 = fmul float %temp14.0, %218
+ %762 = fadd float %760, %761
+ %763 = call float @fabs(float %762)
+ %764 = fmul float %763, %763
+ %765 = fmul float %764, %50
+ %766 = fadd float %765, %51
+ %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00)
+ %768 = fsub float -0.000000e+00, %767
+ %769 = fadd float 1.000000e+00, %768
+ %770 = fmul float %33, %769
+ %771 = fmul float %33, %769
+ %772 = fmul float %33, %769
+ %773 = fmul float %34, %769
+ %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755)
+ %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756)
+ %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757)
+ %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374)
+ %778 = fcmp uge float %774, 0x3E6FFFFE60000000
+ %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000
+ %780 = fcmp uge float %775, 0x3E6FFFFE60000000
+ %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000
+ %782 = fcmp uge float %776, 0x3E6FFFFE60000000
+ %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000
+ %784 = fcmp uge float %779, 6.550400e+04
+ %785 = select i1 %784, float 6.550400e+04, float %779
+ %786 = fcmp uge float %781, 6.550400e+04
+ %787 = select i1 %786, float 6.550400e+04, float %781
+ %788 = fcmp uge float %783, 6.550400e+04
+ %789 = select i1 %788, float 6.550400e+04, float %783
+ %790 = fmul float %777, %52
+ %791 = fadd float %790, %53
+ %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00)
+ %793 = call i32 @llvm.SI.packf16(float %785, float %787)
+ %794 = bitcast i32 %793 to float
+ %795 = call i32 @llvm.SI.packf16(float %789, float %792)
+ %796 = bitcast i32 %795 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796)
+ ret void
+
+ELSE214: ; preds = %ELSE211
+ %797 = fcmp olt float %565, %81
+ %798 = sext i1 %797 to i32
+ %799 = bitcast i32 %798 to float
+ %800 = bitcast float %799 to i32
+ %801 = icmp ne i32 %800, 0
+ %.230 = select i1 %801, float %104, float %100
+ %.231 = select i1 %801, float %105, float %101
+ %.232 = select i1 %801, float %106, float %102
+ %.233 = select i1 %801, float %107, float %103
+ br label %ENDIF209
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.clamp.(float, float, float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.lrp(float, float, float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.cndlt(float, float, float) #2
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #2
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
+attributes #3 = { nounwind readonly }
+attributes #4 = { readonly }
diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
new file mode 100644
index 0000000..43231df
--- /dev/null
+++ b/test/CodeGen/R600/smrd.ll
@@ -0,0 +1,80 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
+
+; SMRD load with an immediate offset.
+; CHECK-LABEL: @smrd0
+; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 1 ; encoding: [0x01
+define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32 addrspace(2)* %ptr, i64 1
+ %1 = load i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with the largest possible immediate offset.
+; CHECK-LABEL: @smrd1
+; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 255 ; encoding: [0xff
+define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32 addrspace(2)* %ptr, i64 255
+ %1 = load i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with an offset greater than the largest possible immediate.
+; CHECK-LABEL: @smrd2
+; CHECK: S_MOV_B32 s[[OFFSET:[0-9]]], 1024
+; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32 addrspace(2)* %ptr, i64 256
+ %1 = load i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load using the load.const intrinsic with an immediate offset
+; CHECK-LABEL: @smrd_load_const0
+; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 4 ; encoding: [0x04
+define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8> addrspace(2)* %20
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ ret void
+}
+
+; SMRD load using the load.const intrinsic with the largest possible
+; immediate offset.
+; CHECK-LABEL: @smrd_load_const1
+; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 255 ; encoding: [0xff
+define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8> addrspace(2)* %20
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ ret void
+}
+; SMRD load using the load.const intrinsic with an offset greater than the
+; largest possible immediate offset.
+; CHECK-LABEL: @smrd_load_const2
+; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8> addrspace(2)* %20
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/store-v3i32.ll b/test/CodeGen/R600/store-v3i32.ll
new file mode 100644
index 0000000..3357803
--- /dev/null
+++ b/test/CodeGen/R600/store-v3i32.ll
@@ -0,0 +1,12 @@
+; XFAIL: *
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; 3-element vectors have the same size and alignment as 4-element vectors, so
+; this should be done in a single store.
+
+; SI-LABEL: @store_v3i32:
+; SI: BUFFER_STORE_DWORDX4
+define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
+ store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/store-v3i64.ll b/test/CodeGen/R600/store-v3i64.ll
new file mode 100644
index 0000000..58229f6
--- /dev/null
+++ b/test/CodeGen/R600/store-v3i64.ll
@@ -0,0 +1,28 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @global_store_v3i64:
+; SI: BUFFER_STORE_DWORDX4
+; SI: BUFFER_STORE_DWORDX4
+define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
+ ret void
+}
+
+; SI-LABEL: @global_store_v3i64_unaligned:
+define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
+ ret void
+}
+
+; SI-LABEL: @local_store_v3i64:
+define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
+ ret void
+}
+
+; SI-LABEL: @local_store_v3i64_unaligned:
+define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+  store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1
+ ret void
+}
diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll
index 01210ce..3af7d91 100644
--- a/test/CodeGen/R600/store-vector-ptrs.ll
+++ b/test/CodeGen/R600/store-vector-ptrs.ll
@@ -1,3 +1,4 @@
+; REQUIRES: asserts
; XFAIL: *
; RUN: llc -march=r600 -mcpu=SI < %s
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index 5e51d56..a3c5331 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -1,10 +1,18 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
;===------------------------------------------------------------------------===;
; Global Address Space
;===------------------------------------------------------------------------===;
+; FUNC-LABEL: @store_i1
+; EG-CHECK: MEM_RAT MSKOR
+; SI-CHECK: BUFFER_STORE_BYTE
+define void @store_i1(i1 addrspace(1)* %out) {
+entry:
+ store i1 true, i1 addrspace(1)* %out
+ ret void
+}
; i8 store
; EG-CHECK-LABEL: @store_i8
@@ -173,6 +181,15 @@ entry:
; Local Address Space
;===------------------------------------------------------------------------===;
+; FUNC-LABEL: @store_local_i1
+; EG-CHECK: LDS_BYTE_WRITE
+; SI-CHECK: DS_WRITE_B8
+define void @store_local_i1(i1 addrspace(3)* %out) {
+entry:
+ store i1 true, i1 addrspace(3)* %out
+ ret void
+}
+
; EG-CHECK-LABEL: @store_local_i8
; EG-CHECK: LDS_BYTE_WRITE
; SI-CHECK-LABEL: @store_local_i8
diff --git a/test/CodeGen/R600/trunc-store-i1.ll b/test/CodeGen/R600/trunc-store-i1.ll
new file mode 100644
index 0000000..a888943
--- /dev/null
+++ b/test/CodeGen/R600/trunc-store-i1.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+
+; SI-LABEL: @global_truncstore_i32_to_i1
+; SI: S_LOAD_DWORD [[LOAD:s[0-9]+]],
+; SI: S_AND_B32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; SI: BUFFER_STORE_BYTE [[VREG]],
+define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
+ %trunc = trunc i32 %val to i1
+ store i1 %trunc, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; SI-LABEL: @global_truncstore_i64_to_i1
+; SI: BUFFER_STORE_BYTE
+define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
+ %trunc = trunc i64 %val to i1
+ store i1 %trunc, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; SI-LABEL: @global_truncstore_i16_to_i1
+; SI: S_LOAD_DWORD [[LOAD:s[0-9]+]],
+; SI: S_AND_B32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; SI: BUFFER_STORE_BYTE [[VREG]],
+define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
+ %trunc = trunc i16 %val to i1
+ store i1 %trunc, i1 addrspace(1)* %out, align 1
+ ret void
+}
diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
index 0bd320a..8a759dc 100644
--- a/test/CodeGen/R600/trunc.ll
+++ b/test/CodeGen/R600/trunc.ll
@@ -16,15 +16,39 @@ define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
ret void
}
+; SI-LABEL: @trunc_load_shl_i64:
+; SI-DAG: S_LOAD_DWORDX2
+; SI-DAG: S_LOAD_DWORD [[SREG:s[0-9]+]],
+; SI: S_LSHL_B32 [[SHL:s[0-9]+]], [[SREG]], 2
+; SI: V_MOV_B32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI: BUFFER_STORE_DWORD [[VSHL]],
+define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+ %b = shl i64 %a, 2
+ %result = trunc i64 %b to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
; SI-LABEL: @trunc_shl_i64:
-; SI: S_LOAD_DWORDX2
-; SI: S_LOAD_DWORDX2 [[SREG:s\[[0-9]+:[0-9]+\]]]
-; SI: S_LSHL_B64 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, [[SREG]], 2
-; SI: MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}},
+; SI: V_ADD_I32_e32 v[[LO_ADD:[0-9]+]], s[[LO_SREG]],
+; SI: V_LSHL_B64 v{{\[}}[[LO_VREG:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2
; SI: BUFFER_STORE_DWORD v[[LO_VREG]],
-define void @trunc_shl_i64(i32 addrspace(1)* %out, i64 %a) {
- %b = shl i64 %a, 2
+define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+ %aa = add i64 %a, 234 ; Prevent shrinking store.
+ %b = shl i64 %aa, 2
%result = trunc i64 %b to i32
store i32 %result, i32 addrspace(1)* %out, align 4
+ store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits
+ ret void
+}
+
+; SI-LABEL: @trunc_i32_to_i1:
+; SI: V_AND_B32
+; SI: V_CMP_EQ_I32
+define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+ %trunc = trunc i32 %a to i1
+ %result = select i1 %trunc, i32 1, i32 0
+ store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
diff --git a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
new file mode 100644
index 0000000..e4129c5
--- /dev/null
+++ b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
@@ -0,0 +1,114 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
+
+; SI hits an assertion at -O0; Evergreen hits a "not implemented" unreachable.
+
+; COMMON-LABEL: @branch_true:
+define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+entry:
+ br i1 true, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %add.ptr.sum = shl i32 %main_stride, 1
+ %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride
+ %add.ptr4.sum = shl i32 %main_stride, 2
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+ %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %main_stride
+ %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %3 = load i32 addrspace(1)* %2, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+ %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ %5 = load i32 addrspace(1)* %4, align 4
+ %add.ptr2 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+ %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
+ %7 = load i32 addrspace(1)* %6, align 4
+ %add.ptr3 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+ %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
+ %9 = load i32 addrspace(1)* %8, align 4
+ %add.ptr6 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 undef
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; COMMON-LABEL: @branch_false:
+; SI: .text
+; SI-NEXT: S_ENDPGM
+define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+entry:
+ br i1 false, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %add.ptr.sum = shl i32 %main_stride, 1
+ %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride
+ %add.ptr4.sum = shl i32 %main_stride, 2
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+ %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %main_stride
+ %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %3 = load i32 addrspace(1)* %2, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+ %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ %5 = load i32 addrspace(1)* %4, align 4
+ %add.ptr2 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+ %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
+ %7 = load i32 addrspace(1)* %6, align 4
+ %add.ptr3 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+ %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
+ %9 = load i32 addrspace(1)* %8, align 4
+ %add.ptr6 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 undef
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; COMMON-LABEL: @branch_undef:
+; SI: .text
+; SI-NEXT: S_ENDPGM
+define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+entry:
+ br i1 undef, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %add.ptr.sum = shl i32 %main_stride, 1
+ %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride
+ %add.ptr4.sum = shl i32 %main_stride, 2
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+ %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %main_stride
+ %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %3 = load i32 addrspace(1)* %2, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+ %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ %5 = load i32 addrspace(1)* %4, align 4
+ %add.ptr2 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+ %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
+ %7 = load i32 addrspace(1)* %6, align 4
+ %add.ptr3 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+ %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
+ %9 = load i32 addrspace(1)* %8, align 4
+ %add.ptr6 = getelementptr inbounds i8 addrspace(1)* %main.addr.011, i32 undef
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/R600/unroll.ll b/test/CodeGen/R600/unroll.ll
new file mode 100644
index 0000000..e0035ea
--- /dev/null
+++ b/test/CodeGen/R600/unroll.ll
@@ -0,0 +1,37 @@
+; RUN: opt -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; This test contains a simple loop that initializes an array declared in
+; private memory. We want to make sure these kinds of loops are always
+; unrolled, because private memory is slow.
+
+; CHECK-LABEL: @test
+; CHECK-NOT: alloca
+; CHECK: store i32 5, i32 addrspace(1)* %out
+define void @test(i32 addrspace(1)* %out) {
+entry:
+ %0 = alloca [32 x i32]
+ br label %loop.header
+
+loop.header:
+ %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+ br label %loop.body
+
+loop.body:
+ %ptr = getelementptr [32 x i32]* %0, i32 0, i32 %counter
+ store i32 %counter, i32* %ptr
+ br label %loop.inc
+
+loop.inc:
+ %inc = add i32 %counter, 1
+ %1 = icmp sge i32 %counter, 32
+ br i1 %1, label %exit, label %loop.header
+
+exit:
+ %2 = getelementptr [32 x i32]* %0, i32 0, i32 5
+ %3 = load i32* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/v1i64-kernel-arg.ll b/test/CodeGen/R600/v1i64-kernel-arg.ll
new file mode 100644
index 0000000..2aa1221
--- /dev/null
+++ b/test/CodeGen/R600/v1i64-kernel-arg.ll
@@ -0,0 +1,17 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
+
+; CHECK-LABEL: @kernel_arg_i64
+define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+ store i64 %a, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; i64 arg works, v1i64 arg does not.
+; CHECK-LABEL: @kernel_arg_v1i64
+define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+ store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/R600/v_cndmask.ll b/test/CodeGen/R600/v_cndmask.ll
new file mode 100644
index 0000000..f8e9655
--- /dev/null
+++ b/test/CodeGen/R600/v_cndmask.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+; SI: @v_cnd_nan
+; SI: V_CNDMASK_B32_e64 v{{[0-9]}},
+; SI-DAG: v{{[0-9]}}
+; SI-DAG: {{nan|#QNAN}}
+define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) {
+entry:
+ %0 = icmp ne i32 %c, 0
+ %1 = select i1 %0, float 0xFFFFFFFFE0000000, float %f
+ store float %1, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/vtx-fetch-branch.ll b/test/CodeGen/R600/vtx-fetch-branch.ll
new file mode 100644
index 0000000..0fc99de
--- /dev/null
+++ b/test/CodeGen/R600/vtx-fetch-branch.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=redwood %s -o - | FileCheck %s
+
+; This tests for a bug where vertex fetch clauses right before an ENDIF
+; instruction were being emitted after the ENDIF. We were using ALU_POP_AFTER
+; for the ALU clause before the vertex fetch instead of emitting a POP
+; instruction after the fetch clause.
+
+
+; CHECK-LABEL: @test
+; CHECK-NOT: ALU_POP_AFTER
+; CHECK: TEX
+; CHECK-NEXT: POP
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %endif, label %if
+
+if:
+ %1 = load i32 addrspace(1)* %in
+ br label %endif
+
+endif:
+ %x = phi i32 [ %1, %if], [ 0, %entry]
+ store i32 %x, i32 addrspace(1)* %out
+ br label %done
+
+done:
+ ret void
+}
diff --git a/test/CodeGen/R600/vtx-schedule.ll b/test/CodeGen/R600/vtx-schedule.ll
index 97d37ed..ce852c5 100644
--- a/test/CodeGen/R600/vtx-schedule.ll
+++ b/test/CodeGen/R600/vtx-schedule.ll
@@ -6,9 +6,9 @@
; CHECK: @test
; CHECK: Fetch clause
-; CHECK_VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
+; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
; CHECK: Fetch clause
-; CHECK_VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
+; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
entry:
%0 = load i32 addrspace(1)* addrspace(1)* %in0
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index c12b0c1..49ed12d 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -54,3 +54,21 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
store float %result, float addrspace(1)* %out
ret void
}
+
+; SI-CHECK-LABEL: @vector_xor_i32
+; SI-CHECK: V_XOR_B32_e32
+define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %a = load i32 addrspace(1)* %in0
+ %b = load i32 addrspace(1)* %in1
+ %result = xor i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-CHECK-LABEL: @scalar_xor_i32
+; SI-CHECK: S_XOR_B32
+define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %result = xor i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index 481b3b3..a114bfc 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -16,3 +16,13 @@ entry:
store i64 %2, i64 addrspace(1)* %out
ret void
}
+
+; SI-CHECK-LABEL: @testi1toi32
+; SI-CHECK: V_CNDMASK_B32
+define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp eq i32 %a, %b
+ %1 = zext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/SPARC/2009-08-28-PIC.ll b/test/CodeGen/SPARC/2009-08-28-PIC.ll
index a2ba0d0..b004b11 100644
--- a/test/CodeGen/SPARC/2009-08-28-PIC.ll
+++ b/test/CodeGen/SPARC/2009-08-28-PIC.ll
@@ -1,9 +1,45 @@
-; RUN: llc -march=sparc --relocation-model=pic < %s | grep _GLOBAL_OFFSET_TABLE_
+; RUN: llc -march=sparc --relocation-model=pic < %s | FileCheck %s --check-prefix=V8
+; RUN: llc -march=sparcv9 --relocation-model=pic < %s | FileCheck %s --check-prefix=V9
+; RUN: llc -march=sparc --relocation-model=pic < %s -O0 | FileCheck %s --check-prefix=V8UNOPT
+; RUN: llc -march=sparcv9 --relocation-model=pic < %s -O0 | FileCheck %s --check-prefix=V9UNOPT
+
+
+; V8-LABEL: func
+; V8: _GLOBAL_OFFSET_TABLE_
+
+; V9-LABEL: func
+; V9: _GLOBAL_OFFSET_TABLE_
@foo = global i32 0 ; <i32*> [#uses=1]
-define i32 @func() nounwind readonly {
+define i32 @func(i32 %a) nounwind readonly {
entry:
%0 = load i32* @foo, align 4 ; <i32> [#uses=1]
ret i32 %0
}
+
+; V8UNOPT-LABEL: test_spill
+; V8UNOPT: sethi %hi(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[R:%[goli][0-7]]]
+; V8UNOPT: or [[R]], %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[R]]
+; V8UNOPT: add [[R]], %o7, [[R]]
+; V8UNOPT: st [[R]], [%fp+{{.+}}]
+
+; V9UNOPT-LABEL: test_spill
+; V9UNOPT: sethi %hi(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[R:%[goli][0-7]]]
+; V9UNOPT: or [[R]], %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[R]]
+; V9UNOPT: add [[R]], %o7, [[R]]
+; V9UNOPT: stx [[R]], [%fp+{{.+}}]
+
+define i32 @test_spill(i32 %a, i32 %b) {
+entry:
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %ret = load i32* @foo, align 4
+ ret i32 %ret
+
+if.end:
+ %add = add nsw i32 %b, %a
+ ret i32 %add
+}
diff --git a/test/CodeGen/SPARC/2011-01-11-Call.ll b/test/CodeGen/SPARC/2011-01-11-Call.ll
index a0f478e..067bade 100644
--- a/test/CodeGen/SPARC/2011-01-11-Call.ll
+++ b/test/CodeGen/SPARC/2011-01-11-Call.ll
@@ -8,7 +8,7 @@
; V8-NEXT: nop
; V8: call bar
; V8-NEXT: nop
-; V8: jmp %i7+8
+; V8: ret
; V8-NEXT: restore
; V9-LABEL: test
@@ -17,7 +17,7 @@
; V9-NEXT: nop
; V9: call bar
; V9-NEXT: nop
-; V9: jmp %i7+8
+; V9: ret
; V9-NEXT: restore
define void @test() nounwind {
@@ -36,14 +36,14 @@ declare void @bar(...)
; V8: save %sp
; V8: call foo
; V8-NEXT: nop
-; V8: jmp %i7+8
+; V8: ret
; V8-NEXT: restore %g0, %o0, %o0
; V9-LABEL: test_tail_call_with_return
; V9: save %sp
; V9: call foo
; V9-NEXT: nop
-; V9: jmp %i7+8
+; V9: ret
; V9-NEXT: restore %g0, %o0, %o0
define i32 @test_tail_call_with_return() nounwind {
diff --git a/test/CodeGen/SPARC/2011-01-11-FrameAddr.ll b/test/CodeGen/SPARC/2011-01-11-FrameAddr.ll
index 7cc7868..050b76d 100644
--- a/test/CodeGen/SPARC/2011-01-11-FrameAddr.ll
+++ b/test/CodeGen/SPARC/2011-01-11-FrameAddr.ll
@@ -2,19 +2,28 @@
;RUN: llc -march=sparc -mattr=v9 < %s | FileCheck %s -check-prefix=V9
;RUN: llc -march=sparc -regalloc=basic < %s | FileCheck %s -check-prefix=V8
;RUN: llc -march=sparc -regalloc=basic -mattr=v9 < %s | FileCheck %s -check-prefix=V9
+;RUN: llc -march=sparcv9 < %s | FileCheck %s -check-prefix=SPARC64
define i8* @frameaddr() nounwind readnone {
entry:
;V8-LABEL: frameaddr:
;V8: save %sp, -96, %sp
-;V8: jmp %i7+8
+;V8: ret
;V8: restore %g0, %fp, %o0
;V9-LABEL: frameaddr:
;V9: save %sp, -96, %sp
-;V9: jmp %i7+8
+;V9: ret
;V9: restore %g0, %fp, %o0
+
+;SPARC64-LABEL: frameaddr
+;SPARC64: save %sp, -128, %sp
+;SPARC64: add %fp, 2047, %i0
+;SPARC64: ret
+;SPARC64-NOT: restore %g0, %g0, %g0
+;SPARC64: restore
+
%0 = tail call i8* @llvm.frameaddress(i32 0)
ret i8* %0
}
@@ -32,6 +41,14 @@ entry:
;V9: ld [%fp+56], {{.+}}
;V9: ld [{{.+}}+56], {{.+}}
;V9: ld [{{.+}}+56], {{.+}}
+
+;SPARC64-LABEL: frameaddr2
+;SPARC64: flushw
+;SPARC64: ldx [%fp+2159], %[[R0:[goli][0-7]]]
+;SPARC64: ldx [%[[R0]]+2159], %[[R1:[goli][0-7]]]
+;SPARC64: ldx [%[[R1]]+2159], %[[R2:[goli][0-7]]]
+;SPARC64: add %[[R2]], 2047, {{.+}}
+
%0 = tail call i8* @llvm.frameaddress(i32 3)
ret i8* %0
}
@@ -48,6 +65,9 @@ entry:
;V9-LABEL: retaddr:
;V9: or %g0, %o7, {{.+}}
+;SPARC64-LABEL: retaddr
+;SPARC64: or %g0, %o7, {{.+}}
+
%0 = tail call i8* @llvm.returnaddress(i32 0)
ret i8* %0
}
@@ -66,17 +86,11 @@ entry:
;V9: ld [{{.+}}+56], {{.+}}
;V9: ld [{{.+}}+60], {{.+}}
-;V8LEAF-LABEL: retaddr2:
-;V8LEAF: ta 3
-;V8LEAF: ld [%fp+56], %[[R:[goli][0-7]]]
-;V8LEAF: ld [%[[R]]+56], %[[R1:[goli][0-7]]]
-;V8LEAF: ld [%[[R1]]+60], {{.+}}
-
-;V9LEAF-LABEL: retaddr2:
-;V9LEAF: flushw
-;V9LEAF: ld [%fp+56], %[[R:[goli][0-7]]]
-;V9LEAF: ld [%[[R]]+56], %[[R1:[goli][0-7]]]
-;V9LEAF: ld [%[[R1]]+60], {{.+}}
+;SPARC64-LABEL: retaddr2
+;SPARC64: flushw
+;SPARC64: ldx [%fp+2159], %[[R0:[goli][0-7]]]
+;SPARC64: ldx [%[[R0]]+2159], %[[R1:[goli][0-7]]]
+;SPARC64: ldx [%[[R1]]+2167], {{.+}}
%0 = tail call i8* @llvm.returnaddress(i32 3)
ret i8* %0
diff --git a/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll b/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
index c71e7c0..60bdf06 100644
--- a/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
+++ b/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
@@ -1,5 +1,5 @@
-;RUN: llc -march=sparc < %s | FileCheck %s
-;RUN: llc -march=sparc -O0 < %s | FileCheck %s -check-prefix=UNOPT
+;RUN: llc -march=sparc < %s -verify-machineinstrs | FileCheck %s
+;RUN: llc -march=sparc -O0 < %s -verify-machineinstrs | FileCheck %s -check-prefix=UNOPT
define i32 @test(i32 %a) nounwind {
@@ -7,7 +7,7 @@ entry:
; CHECK: test
; CHECK: call bar
; CHECK-NOT: nop
-; CHECK: jmp
+; CHECK: ret
; CHECK-NEXT: restore
%0 = tail call i32 @bar(i32 %a) nounwind
ret i32 %0
@@ -18,7 +18,7 @@ entry:
; CHECK: test_jmpl
; CHECK: call
; CHECK-NOT: nop
-; CHECK: jmp
+; CHECK: ret
; CHECK-NEXT: restore
%0 = tail call i32 %f(i32 %a, i32 %b) nounwind
ret i32 %0
@@ -47,7 +47,7 @@ bb: ; preds = %entry, %bb
bb5: ; preds = %bb, %entry
%a_addr.1.lcssa = phi i32 [ %a, %entry ], [ %a_addr.0, %bb ]
-;CHECK: jmp
+;CHECK: retl
;CHECK-NOT: restore
ret i32 %a_addr.1.lcssa
}
@@ -110,7 +110,7 @@ declare i32 @func(i32*)
define i32 @restore_add(i32 %a, i32 %b) {
entry:
;CHECK-LABEL: restore_add:
-;CHECK: jmp %i7+8
+;CHECK: ret
;CHECK: restore %o0, %i1, %o0
%0 = tail call i32 @bar(i32 %a) nounwind
%1 = add nsw i32 %0, %b
@@ -120,7 +120,7 @@ entry:
define i32 @restore_add_imm(i32 %a) {
entry:
;CHECK-LABEL: restore_add_imm:
-;CHECK: jmp %i7+8
+;CHECK: ret
;CHECK: restore %o0, 20, %o0
%0 = tail call i32 @bar(i32 %a) nounwind
%1 = add nsw i32 %0, 20
@@ -130,7 +130,7 @@ entry:
define i32 @restore_or(i32 %a) {
entry:
;CHECK-LABEL: restore_or:
-;CHECK: jmp %i7+8
+;CHECK: ret
;CHECK: restore %g0, %o0, %o0
%0 = tail call i32 @bar(i32 %a) nounwind
ret i32 %0
@@ -140,8 +140,9 @@ define i32 @restore_or_imm(i32 %a) {
entry:
;CHECK-LABEL: restore_or_imm:
;CHECK: or %o0, 20, %i0
-;CHECK: jmp %i7+8
-;CHECK: restore %g0, %g0, %g0
+;CHECK: ret
+;CHECK-NOT: restore %g0, %g0, %g0
+;CHECK: restore
%0 = tail call i32 @bar(i32 %a) nounwind
%1 = or i32 %0, 20
ret i32 %1
@@ -174,7 +175,8 @@ define i32 @restore_sethi_large(i32 %a) {
entry:
;CHECK-LABEL: restore_sethi_large:
;CHECK: sethi 4000, %i0
-;CHECK: restore %g0, %g0, %g0
+;CHECK-NOT: restore %g0, %g0, %g0
+;CHECK: restore
%0 = tail call i32 @bar(i32 %a) nounwind
%1 = icmp ne i32 %0, 0
%2 = select i1 %1, i32 4096000, i32 0
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index 8b752a1..3771888 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -180,7 +180,7 @@ define void @call_inreg_fi(i32* %p, i32 %i1, float %f5) {
}
; CHECK: inreg_ff
-; CHECK: fsubs %f0, %f1, %f1
+; CHECK: fsubs %f0, %f1, %f0
define float @inreg_ff(float inreg %a0, ; %f0
float inreg %a1) { ; %f1
%rv = fsub float %a0, %a1
@@ -262,10 +262,10 @@ define void @call_ret_i64_pair(i64* %i0) {
ret void
}
-; This is not a C struct, each member uses 8 bytes.
+; This is not a C struct, the i32 member uses 8 bytes, but the float only 4.
; CHECK: ret_i32_float_pair
; CHECK: ld [%i2], %i0
-; CHECK: ld [%i3], %f3
+; CHECK: ld [%i3], %f2
define { i32, float } @ret_i32_float_pair(i32 %a0, i32 %a1,
i32* %p, float* %q) {
%r1 = load i32* %p
@@ -279,7 +279,7 @@ define { i32, float } @ret_i32_float_pair(i32 %a0, i32 %a1,
; CHECK: call_ret_i32_float_pair
; CHECK: call ret_i32_float_pair
; CHECK: st %o0, [%i0]
-; CHECK: st %f3, [%i1]
+; CHECK: st %f2, [%i1]
define void @call_ret_i32_float_pair(i32* %i0, float* %i1) {
%rv = call { i32, float } @ret_i32_float_pair(i32 undef, i32 undef,
i32* undef, float* undef)
@@ -411,3 +411,54 @@ entry:
}
declare i32 @use_buf(i32, i8*)
+
+; CHECK-LABEL: test_fp128_args
+; CHECK-DAG: std %f0, [%fp+{{.+}}]
+; CHECK-DAG: std %f2, [%fp+{{.+}}]
+; CHECK-DAG: std %f6, [%fp+{{.+}}]
+; CHECK-DAG: std %f4, [%fp+{{.+}}]
+; CHECK: add %fp, [[Offset:[0-9]+]], %o0
+; CHECK: call _Qp_add
+; CHECK: ldd [%fp+[[Offset]]], %f0
+define fp128 @test_fp128_args(fp128 %a, fp128 %b) {
+entry:
+ %0 = fadd fp128 %a, %b
+ ret fp128 %0
+}
+
+declare i64 @receive_fp128(i64 %a, ...)
+
+; CHECK-LABEL: test_fp128_variable_args
+; CHECK-DAG: std %f4, [%sp+[[Offset0:[0-9]+]]]
+; CHECK-DAG: std %f6, [%sp+[[Offset1:[0-9]+]]]
+; CHECK-DAG: ldx [%sp+[[Offset0]]], %o2
+; CHECK-DAG: ldx [%sp+[[Offset1]]], %o3
+; CHECK: call receive_fp128
+define i64 @test_fp128_variable_args(i64 %a, fp128 %b) {
+entry:
+ %0 = call i64 (i64, ...)* @receive_fp128(i64 %a, fp128 %b)
+ ret i64 %0
+}
+
+; CHECK-LABEL: test_call_libfunc
+; CHECK: st %f1, [%fp+[[Offset0:[0-9]+]]]
+; CHECK: fmovs %f3, %f1
+; CHECK: call cosf
+; CHECK: st %f0, [%fp+[[Offset1:[0-9]+]]]
+; CHECK: ld [%fp+[[Offset0]]], %f1
+; CHECK: call sinf
+; CHECK: ld [%fp+[[Offset1]]], %f1
+; CHECK: fmuls %f1, %f0, %f0
+
+define inreg float @test_call_libfunc(float %arg0, float %arg1) {
+entry:
+ %0 = tail call inreg float @cosf(float %arg1)
+ %1 = tail call inreg float @sinf(float %arg0)
+ %2 = fmul float %0, %1
+ ret float %2
+}
+
+declare inreg float @cosf(float %arg) readnone nounwind
+declare inreg float @sinf(float %arg) readnone nounwind
+
+
diff --git a/test/CodeGen/SPARC/64bit.ll b/test/CodeGen/SPARC/64bit.ll
index f5ed047..7ab19f3 100644
--- a/test/CodeGen/SPARC/64bit.ll
+++ b/test/CodeGen/SPARC/64bit.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=sparcv9 -disable-sparc-delay-filler -disable-sparc-leaf-proc | FileCheck %s
-; RUN: llc < %s -march=sparcv9 | FileCheck %s -check-prefix=OPT
+; RUN: llc < %s -march=sparcv9 -mattr=+popc -disable-sparc-delay-filler -disable-sparc-leaf-proc | FileCheck %s
+; RUN: llc < %s -march=sparcv9 -mattr=+popc | FileCheck %s -check-prefix=OPT
; CHECK-LABEL: ret2:
; CHECK: or %g0, %i1, %i0
; OPT-LABEL: ret2:
-; OPT: jmp %o7+8
+; OPT: retl
; OPT: or %g0, %o1, %o0
define i64 @ret2(i64 %a, i64 %b) {
ret i64 %b
@@ -15,7 +15,7 @@ define i64 @ret2(i64 %a, i64 %b) {
; CHECK: sllx %i0, 7, %i0
; OPT-LABEL: shl_imm:
-; OPT: jmp %o7+8
+; OPT: retl
; OPT: sllx %o0, 7, %o0
define i64 @shl_imm(i64 %a) {
%x = shl i64 %a, 7
@@ -26,7 +26,7 @@ define i64 @shl_imm(i64 %a) {
; CHECK: srax %i0, %i1, %i0
; OPT-LABEL: sra_reg:
-; OPT: jmp %o7+8
+; OPT: retl
; OPT: srax %o0, %o1, %o0
define i64 @sra_reg(i64 %a, i64 %b) {
%x = ashr i64 %a, %b
@@ -42,7 +42,7 @@ define i64 @sra_reg(i64 %a, i64 %b) {
; CHECK: or %g0, 0, %i0
; OPT: ret_imm0
-; OPT: jmp %o7+8
+; OPT: retl
; OPT: or %g0, 0, %o0
define i64 @ret_imm0() {
ret i64 0
@@ -52,7 +52,7 @@ define i64 @ret_imm0() {
; CHECK: or %g0, -4096, %i0
; OPT: ret_simm13
-; OPT: jmp %o7+8
+; OPT: retl
; OPT: or %g0, -4096, %o0
define i64 @ret_simm13() {
ret i64 -4096
@@ -64,7 +64,7 @@ define i64 @ret_simm13() {
; CHECK: restore
; OPT: ret_sethi
-; OPT: jmp %o7+8
+; OPT: retl
; OPT: sethi 4, %o0
define i64 @ret_sethi() {
ret i64 4096
@@ -76,7 +76,7 @@ define i64 @ret_sethi() {
; OPT: ret_sethi_or
; OPT: sethi 4, [[R:%[go][0-7]]]
-; OPT: jmp %o7+8
+; OPT: retl
; OPT: or [[R]], 1, %o0
define i64 @ret_sethi_or() {
@@ -89,7 +89,7 @@ define i64 @ret_sethi_or() {
; OPT: ret_nimm33
; OPT: sethi 4, [[R:%[go][0-7]]]
-; OPT: jmp %o7+8
+; OPT: retl
; OPT: xor [[R]], -4, %o0
define i64 @ret_nimm33() {
diff --git a/test/CodeGen/SPARC/64cond.ll b/test/CodeGen/SPARC/64cond.ll
index 7451b04..1bd17a4 100644
--- a/test/CodeGen/SPARC/64cond.ll
+++ b/test/CodeGen/SPARC/64cond.ll
@@ -80,7 +80,7 @@ entry:
; CHECK: selectf32_xcc
; CHECK: cmp %i0, %i1
; CHECK: fmovsg %xcc, %f5, %f7
-; CHECK: fmovs %f7, %f1
+; CHECK: fmovs %f7, %f0
define float @selectf32_xcc(i64 %x, i64 %y, float %a, float %b) {
entry:
%tobool = icmp sgt i64 %x, %y
@@ -111,6 +111,11 @@ entry:
}
; CHECK-LABEL: setcc_resultty
+; CHECK-DAG: srax %i0, 63, %o0
+; CHECK-DAG: or %g0, %i0, %o1
+; CHECK-DAG: or %g0, 0, %o2
+; CHECK-DAG: or %g0, 32, %o3
+; CHECK-DAG: call __multi3
; CHECK: cmp
; CHECK: movne %xcc, 1, [[R:%[gilo][0-7]]]
; CHECK: or [[R]], %i1, %i0
diff --git a/test/CodeGen/SPARC/64spill.ll b/test/CodeGen/SPARC/64spill.ll
new file mode 100644
index 0000000..ab08d6b
--- /dev/null
+++ b/test/CodeGen/SPARC/64spill.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -march=sparcv9 | FileCheck %s
+
+target datalayout = "E-i64:64-n32:64-S128"
+target triple = "sparc64-sun-sparc"
+
+; CHECK-LABEL: test_and_spill
+; CHECK: and %i0, %i1, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_and_spill(i64 %a, i64 %b) {
+entry:
+ %r0 = and i64 %a, %b
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+; CHECK-LABEL: test_or_spill
+; CHECK: or %i0, %i1, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_or_spill(i64 %a, i64 %b) {
+entry:
+ %r0 = or i64 %a, %b
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+; CHECK-LABEL: test_xor_spill
+; CHECK: xor %i0, %i1, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_xor_spill(i64 %a, i64 %b) {
+entry:
+ %r0 = xor i64 %a, %b
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+
+; CHECK-LABEL: test_add_spill
+; CHECK: add %i0, %i1, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_add_spill(i64 %a, i64 %b) {
+entry:
+ %r0 = add i64 %a, %b
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+; CHECK-LABEL: test_sub_spill
+; CHECK: sub %i0, %i1, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_sub_spill(i64 %a, i64 %b) {
+entry:
+ %r0 = sub i64 %a, %b
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+; CHECK-LABEL: test_andi_spill
+; CHECK: and %i0, 1729, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_andi_spill(i64 %a) {
+entry:
+ %r0 = and i64 %a, 1729
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+; CHECK-LABEL: test_ori_spill
+; CHECK: or %i0, 1729, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_ori_spill(i64 %a) {
+entry:
+ %r0 = or i64 %a, 1729
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+; CHECK-LABEL: test_xori_spill
+; CHECK: xor %i0, 1729, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_xori_spill(i64 %a) {
+entry:
+ %r0 = xor i64 %a, 1729
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+; CHECK-LABEL: test_addi_spill
+; CHECK: add %i0, 1729, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_addi_spill(i64 %a) {
+entry:
+ %r0 = add i64 %a, 1729
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
+; CHECK-LABEL: test_subi_spill
+; CHECK: add %i0, -1729, [[R:%[gilo][0-7]]]
+; CHECK: stx [[R]], [%fp+{{.+}}]
+; CHECK: ldx [%fp+{{.+}}, %i0
+define i64 @test_subi_spill(i64 %a) {
+entry:
+ %r0 = sub i64 %a, 1729
+ %0 = tail call i64 asm sideeffect "#$0 $1", "=r,r,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6}"(i64 %r0)
+ ret i64 %r0
+}
+
diff --git a/test/CodeGen/SPARC/atomics.ll b/test/CodeGen/SPARC/atomics.ll
new file mode 100644
index 0000000..4e3e7ae
--- /dev/null
+++ b/test/CodeGen/SPARC/atomics.ll
@@ -0,0 +1,153 @@
+; RUN: llc < %s -march=sparcv9 -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: test_atomic_i32
+; CHECK: ld [%o0]
+; CHECK: membar
+; CHECK: ld [%o1]
+; CHECK: membar
+; CHECK: membar
+; CHECK: st {{.+}}, [%o2]
+define i32 @test_atomic_i32(i32* %ptr1, i32* %ptr2, i32* %ptr3) {
+entry:
+ %0 = load atomic i32* %ptr1 acquire, align 8
+ %1 = load atomic i32* %ptr2 acquire, align 8
+ %2 = add i32 %0, %1
+ store atomic i32 %2, i32* %ptr3 release, align 8
+ ret i32 %2
+}
+
+; CHECK-LABEL: test_atomic_i64
+; CHECK: ldx [%o0]
+; CHECK: membar
+; CHECK: ldx [%o1]
+; CHECK: membar
+; CHECK: membar
+; CHECK: stx {{.+}}, [%o2]
+define i64 @test_atomic_i64(i64* %ptr1, i64* %ptr2, i64* %ptr3) {
+entry:
+ %0 = load atomic i64* %ptr1 acquire, align 8
+ %1 = load atomic i64* %ptr2 acquire, align 8
+ %2 = add i64 %0, %1
+ store atomic i64 %2, i64* %ptr3 release, align 8
+ ret i64 %2
+}
+
+; CHECK-LABEL: test_cmpxchg_i32
+; CHECK: or %g0, 123, [[R:%[gilo][0-7]]]
+; CHECK: cas [%o1], %o0, [[R]]
+
+define i32 @test_cmpxchg_i32(i32 %a, i32* %ptr) {
+entry:
+ %b = cmpxchg i32* %ptr, i32 %a, i32 123 monotonic monotonic
+ ret i32 %b
+}
+
+; CHECK-LABEL: test_cmpxchg_i64
+; CHECK: or %g0, 123, [[R:%[gilo][0-7]]]
+; CHECK: casx [%o1], %o0, [[R]]
+
+define i64 @test_cmpxchg_i64(i64 %a, i64* %ptr) {
+entry:
+ %b = cmpxchg i64* %ptr, i64 %a, i64 123 monotonic monotonic
+ ret i64 %b
+}
+
+; CHECK-LABEL: test_swap_i32
+; CHECK: or %g0, 42, [[R:%[gilo][0-7]]]
+; CHECK: swap [%o1], [[R]]
+
+define i32 @test_swap_i32(i32 %a, i32* %ptr) {
+entry:
+ %b = atomicrmw xchg i32* %ptr, i32 42 monotonic
+ ret i32 %b
+}
+
+; CHECK-LABEL: test_swap_i64
+; CHECK: casx [%o1],
+
+define i64 @test_swap_i64(i64 %a, i64* %ptr) {
+entry:
+ %b = atomicrmw xchg i64* %ptr, i64 42 monotonic
+ ret i64 %b
+}
+
+; CHECK-LABEL: test_load_add_32
+; CHECK: membar
+; CHECK: add [[V:%[gilo][0-7]]], %o1, [[U:%[gilo][0-7]]]
+; CHECK: cas [%o0], [[V]], [[U]]
+; CHECK: membar
+define zeroext i32 @test_load_add_32(i32* %p, i32 zeroext %v) {
+entry:
+ %0 = atomicrmw add i32* %p, i32 %v seq_cst
+ ret i32 %0
+}
+
+; CHECK-LABEL: test_load_sub_64
+; CHECK: membar
+; CHECK: sub
+; CHECK: casx [%o0]
+; CHECK: membar
+define zeroext i64 @test_load_sub_64(i64* %p, i64 zeroext %v) {
+entry:
+ %0 = atomicrmw sub i64* %p, i64 %v seq_cst
+ ret i64 %0
+}
+
+; CHECK-LABEL: test_load_xor_32
+; CHECK: membar
+; CHECK: xor
+; CHECK: cas [%o0]
+; CHECK: membar
+define zeroext i32 @test_load_xor_32(i32* %p, i32 zeroext %v) {
+entry:
+ %0 = atomicrmw xor i32* %p, i32 %v seq_cst
+ ret i32 %0
+}
+
+; CHECK-LABEL: test_load_and_32
+; CHECK: membar
+; CHECK: and
+; CHECK-NOT: xor
+; CHECK: cas [%o0]
+; CHECK: membar
+define zeroext i32 @test_load_and_32(i32* %p, i32 zeroext %v) {
+entry:
+ %0 = atomicrmw and i32* %p, i32 %v seq_cst
+ ret i32 %0
+}
+
+; CHECK-LABEL: test_load_nand_32
+; CHECK: membar
+; CHECK: and
+; CHECK: xor
+; CHECK: cas [%o0]
+; CHECK: membar
+define zeroext i32 @test_load_nand_32(i32* %p, i32 zeroext %v) {
+entry:
+ %0 = atomicrmw nand i32* %p, i32 %v seq_cst
+ ret i32 %0
+}
+
+; CHECK-LABEL: test_load_max_64
+; CHECK: membar
+; CHECK: cmp
+; CHECK: movg %xcc
+; CHECK: casx [%o0]
+; CHECK: membar
+define zeroext i64 @test_load_max_64(i64* %p, i64 zeroext %v) {
+entry:
+ %0 = atomicrmw max i64* %p, i64 %v seq_cst
+ ret i64 %0
+}
+
+; CHECK-LABEL: test_load_umin_32
+; CHECK: membar
+; CHECK: cmp
+; CHECK: movleu %icc
+; CHECK: cas [%o0]
+; CHECK: membar
+define zeroext i32 @test_load_umin_32(i32* %p, i32 zeroext %v) {
+entry:
+ %0 = atomicrmw umin i32* %p, i32 %v seq_cst
+ ret i32 %0
+}
diff --git a/test/CodeGen/SPARC/constpool.ll b/test/CodeGen/SPARC/constpool.ll
index b861676..8b0d1d9 100644
--- a/test/CodeGen/SPARC/constpool.ll
+++ b/test/CodeGen/SPARC/constpool.ll
@@ -12,7 +12,7 @@ entry:
; abs32: floatCP
; abs32: sethi %hi(.LCPI0_0), %[[R:[gilo][0-7]]]
-; abs32: jmp %o7+8
+; abs32: retl
; abs32: ld [%[[R]]+%lo(.LCPI0_0)], %f
@@ -20,8 +20,8 @@ entry:
; abs44: sethi %h44(.LCPI0_0), %[[R1:[gilo][0-7]]]
; abs44: add %[[R1]], %m44(.LCPI0_0), %[[R2:[gilo][0-7]]]
; abs44: sllx %[[R2]], 12, %[[R3:[gilo][0-7]]]
-; abs44: jmp %o7+8
-; abs44: ld [%[[R3]]+%l44(.LCPI0_0)], %f1
+; abs44: retl
+; abs44: ld [%[[R3]]+%l44(.LCPI0_0)], %f0
; abs64: floatCP
@@ -30,8 +30,8 @@ entry:
; abs64: sethi %hh(.LCPI0_0), %[[R3:[gilo][0-7]]]
; abs64: add %[[R3]], %hm(.LCPI0_0), %[[R4:[gilo][0-7]]]
; abs64: sllx %[[R4]], 32, %[[R5:[gilo][0-7]]]
-; abs64: jmp %o7+8
-; abs64: ld [%[[R5]]+%[[R2]]], %f1
+; abs64: retl
+; abs64: ld [%[[R5]]+%[[R2]]], %f0
; v8pic32: floatCP
@@ -40,7 +40,7 @@ entry:
; v8pic32: add %[[R1]], %lo(.LCPI0_0), %[[Goffs:[gilo][0-7]]]
; v8pic32: ld [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
; v8pic32: ld [%[[Gaddr]]], %f0
-; v8pic32: jmp %i7+8
+; v8pic32: ret
; v8pic32: restore
@@ -50,8 +50,8 @@ entry:
; v9pic32: sethi %hi(.LCPI0_0), %[[R1:[gilo][0-7]]]
; v9pic32: add %[[R1]], %lo(.LCPI0_0), %[[Goffs:[gilo][0-7]]]
; v9pic32: ldx [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v9pic32: ld [%[[Gaddr]]], %f1
-; v9pic32: jmp %i7+8
+; v9pic32: ld [%[[Gaddr]]], %f0
+; v9pic32: ret
; v9pic32: restore
diff --git a/test/CodeGen/SPARC/ctpop.ll b/test/CodeGen/SPARC/ctpop.ll
index 916a414..3a37340 100644
--- a/test/CodeGen/SPARC/ctpop.ll
+++ b/test/CodeGen/SPARC/ctpop.ll
@@ -1,8 +1,29 @@
-; RUN: llc < %s -march=sparc -mattr=-v9 | not grep popc
-; RUN: llc < %s -march=sparc -mattr=+v9 | grep popc
+; RUN: llc < %s -march=sparc -mattr=-v9 | FileCheck %s -check-prefix=V8
+; RUN: llc < %s -march=sparc -mattr=+v9,+popc | FileCheck %s -check-prefix=V9
+; RUN: llc < %s -march=sparc -mcpu=v9 | FileCheck %s -check-prefix=V8
+; RUN: llc < %s -march=sparc -mcpu=ultrasparc | FileCheck %s -check-prefix=V8
+; RUN: llc < %s -march=sparc -mcpu=ultrasparc3 | FileCheck %s -check-prefix=V8
+; RUN: llc < %s -march=sparc -mcpu=niagara | FileCheck %s -check-prefix=V8
+; RUN: llc < %s -march=sparc -mcpu=niagara2 | FileCheck %s -check-prefix=V9
+; RUN: llc < %s -march=sparc -mcpu=niagara3 | FileCheck %s -check-prefix=V9
+; RUN: llc < %s -march=sparc -mcpu=niagara4 | FileCheck %s -check-prefix=V9
+; RUN: llc < %s -march=sparcv9 -mattr=+popc | FileCheck %s -check-prefix=SPARC64
declare i32 @llvm.ctpop.i32(i32)
+; V8-LABEL: test
+; V8-NOT: popc
+
+; V9-LABEL: test
+; V9: srl %o0, 0, %o0
+; V9-NEXT: retl
+; V9-NEXT: popc %o0, %o0
+
+; SPARC64-LABEL: test
+; SPARC64: srl %o0, 0, %o0
+; SPARC64: retl
+; SPARC64: popc %o0, %o0
+
define i32 @test(i32 %X) {
%Y = call i32 @llvm.ctpop.i32( i32 %X ) ; <i32> [#uses=1]
ret i32 %Y
diff --git a/test/CodeGen/SPARC/exception.ll b/test/CodeGen/SPARC/exception.ll
index cb5b6e5..3a3f59f 100644
--- a/test/CodeGen/SPARC/exception.ll
+++ b/test/CodeGen/SPARC/exception.ll
@@ -1,4 +1,9 @@
-; RUN: llc < %s -march=sparc | FileCheck %s
+; RUN: llc < %s -march=sparc -relocation-model=static | FileCheck -check-prefix=V8ABS %s
+; RUN: llc < %s -march=sparc -relocation-model=pic | FileCheck -check-prefix=V8PIC %s
+; RUN: llc < %s -march=sparc -relocation-model=pic -disable-cfi | FileCheck -check-prefix=V8PIC_NOCFI %s
+; RUN: llc < %s -march=sparcv9 -relocation-model=static | FileCheck -check-prefix=V9ABS %s
+; RUN: llc < %s -march=sparcv9 -relocation-model=pic | FileCheck -check-prefix=V9PIC %s
+; RUN: llc < %s -march=sparcv9 -relocation-model=pic -disable-cfi | FileCheck -check-prefix=V9PIC_NOCFI %s
%struct.__fundamental_type_info_pseudo = type { %struct.__type_info_pseudo }
@@ -6,25 +11,99 @@
@_ZTIi = external constant %struct.__fundamental_type_info_pseudo
@_ZTIf = external constant %struct.__fundamental_type_info_pseudo
-@.cst = linker_private unnamed_addr constant [12 x i8] c"catched int\00", align 64
-@.cst1 = linker_private unnamed_addr constant [14 x i8] c"catched float\00", align 64
-
-; CHECK-LABEL: main:
-; CHECK: .cfi_startproc
-; CHECK: .cfi_def_cfa_register 30
-; CHECK: .cfi_window_save
-; CHECK: .cfi_register 15, 31
-
-; CHECK: call __cxa_throw
-; CHECK: call __cxa_throw
-
-; CHECK: call __cxa_begin_catch
-; CHECK: call __cxa_end_catch
-
-; CHECK: call __cxa_begin_catch
-; CHECK: call __cxa_end_catch
-
-; CHECK: .cfi_endproc
+@.cst = private unnamed_addr constant [12 x i8] c"catched int\00", align 64
+@.cst1 = private unnamed_addr constant [14 x i8] c"catched float\00", align 64
+
+; V8ABS-LABEL: main:
+; V8ABS: .cfi_startproc
+; V8ABS: .cfi_personality 0, __gxx_personality_v0
+; V8ABS: .cfi_lsda 0,
+; V8ABS: .cfi_def_cfa_register {{30|%fp}}
+; V8ABS: .cfi_window_save
+; V8ABS: .cfi_register 15, 31
+
+; V8ABS: call __cxa_throw
+; V8ABS: call __cxa_throw
+
+; V8ABS: call __cxa_begin_catch
+; V8ABS: call __cxa_end_catch
+
+; V8ABS: call __cxa_begin_catch
+; V8ABS: call __cxa_end_catch
+
+; V8ABS: .cfi_endproc
+
+; V8PIC-LABEL: main:
+; V8PIC: .cfi_startproc
+; V8PIC: .cfi_personality 155, DW.ref.__gxx_personality_v0
+; V8PIC: .cfi_lsda 27,
+; V8PIC: .cfi_def_cfa_register {{30|%fp}}
+; V8PIC: .cfi_window_save
+; V8PIC: .cfi_register 15, 31
+; V8PIC: .section .gcc_except_table
+; V8PIC-NOT: .section
+; V8PIC: .word %r_disp32(.L_ZTIi.DW.stub)
+; V8PIC: .data
+; V8PIC: .L_ZTIi.DW.stub:
+; V8PIC-NEXT: .word _ZTIi
+
+; V8PIC_NOCFI-LABEL: main:
+; V8PIC_NOCFI: .section .gcc_except_table
+; V8PIC_NOCFI-NOT: .section
+; V8PIC_NOCFI: .word %r_disp32(.L_ZTIi.DW.stub)
+; V8PIC_NOCFI: .data
+; V8PIC_NOCFI: .L_ZTIi.DW.stub:
+; V8PIC_NOCFI-NEXT: .word _ZTIi
+; V8PIC_NOCFI: .section .eh_frame
+; V8PIC_NOCFI-NOT: .section
+; V8PIC_NOCFI: .byte 15 ! CIE Return Address Column
+; V8PIC_NOCFI: .word %r_disp32(DW.ref.__gxx_personality_v0)
+; V8PIC_NOCFI: .byte 12 ! DW_CFA_def_cfa
+; V8PIC_NOCFI: .byte 14 ! Reg 14
+; V8PIC_NOCFI-NEXT: .byte 0 ! Offset 0
+; V8PIC_NOCFI: .word %r_disp32(.Ltmp{{.+}}) ! FDE initial location
+
+
+; V9ABS-LABEL: main:
+; V9ABS: .cfi_startproc
+; V9ABS: .cfi_personality 0, __gxx_personality_v0
+; V9ABS: .cfi_lsda 27,
+; V9ABS: .cfi_def_cfa_register {{30|%fp}}
+; V9ABS: .cfi_window_save
+; V9ABS: .cfi_register 15, 31
+; V9ABS: .section .gcc_except_table
+; V9ABS-NOT: .section
+; V9ABS: .xword _ZTIi
+
+; V9PIC-LABEL: main:
+; V9PIC: .cfi_startproc
+; V9PIC: .cfi_personality 155, DW.ref.__gxx_personality_v0
+; V9PIC: .cfi_lsda 27,
+; V9PIC: .cfi_def_cfa_register {{30|%fp}}
+; V9PIC: .cfi_window_save
+; V9PIC: .cfi_register 15, 31
+; V9PIC: .section .gcc_except_table
+; V9PIC-NOT: .section
+; V9PIC: .word %r_disp32(.L_ZTIi.DW.stub)
+; V9PIC: .data
+; V9PIC: .L_ZTIi.DW.stub:
+; V9PIC-NEXT: .xword _ZTIi
+
+; V9PIC_NOCFI-LABEL: main:
+; V9PIC_NOCFI: .section .gcc_except_table
+; V9PIC_NOCFI-NOT: .section
+; V9PIC_NOCFI: .word %r_disp32(.L_ZTIi.DW.stub)
+; V9PIC_NOCFI: .data
+; V9PIC_NOCFI: .L_ZTIi.DW.stub:
+; V9PIC_NOCFI-NEXT: .xword _ZTIi
+; V9PIC_NOCFI: .section .eh_frame
+; V9PIC_NOCFI-NOT: .section
+; V9PIC_NOCFI: .byte 15 ! CIE Return Address Column
+; V9PIC_NOCFI: .word %r_disp32(DW.ref.__gxx_personality_v0)
+; V9PIC_NOCFI: .byte 12 ! DW_CFA_def_cfa
+; V9PIC_NOCFI-NEXT: .byte 14 ! Reg 14
+; V9PIC_NOCFI: .ascii "\377\017" ! Offset 2047
+; V9PIC_NOCFI: .word %r_disp32(.Ltmp{{.+}}) ! FDE initial location
define i32 @main(i32 %argc, i8** nocapture readnone %argv) unnamed_addr #0 {
entry:
diff --git a/test/CodeGen/SPARC/fp128.ll b/test/CodeGen/SPARC/fp128.ll
index c761361..abd89bf 100644
--- a/test/CodeGen/SPARC/fp128.ll
+++ b/test/CodeGen/SPARC/fp128.ll
@@ -45,14 +45,14 @@ entry:
; HARD: std %f{{.+}}, [%[[S1:.+]]]
; HARD-DAG: ldd [%[[S0]]], %f{{.+}}
; HARD-DAG: ldd [%[[S1]]], %f{{.+}}
-; HARD: jmp
+; HARD: jmp %o7+12
; SOFT-LABEL: f128_spill
; SOFT: std %f{{.+}}, [%[[S0:.+]]]
; SOFT: std %f{{.+}}, [%[[S1:.+]]]
; SOFT-DAG: ldd [%[[S0]]], %f{{.+}}
; SOFT-DAG: ldd [%[[S1]]], %f{{.+}}
-; SOFT: jmp
+; SOFT: jmp %o7+12
define void @f128_spill(fp128* noalias sret %scalar.result, fp128* byval %a) {
entry:
@@ -132,13 +132,13 @@ entry:
; HARD: ldub
; HARD: faddq
; HARD: stb
-; HARD: jmp
+; HARD: ret
; SOFT-LABEL: fp128_unaligned
; SOFT: ldub
; SOFT: call _Q_add
; SOFT: stb
-; SOFT: jmp
+; SOFT: ret
define void @fp128_unaligned(fp128* %a, fp128* %b, fp128* %c) {
entry:
@@ -232,3 +232,14 @@ entry:
store i32 %3, i32* %4, align 8
ret void
}
+
+; SOFT-LABEL: f128_neg
+; SOFT: fnegs
+
+define void @f128_neg(fp128* noalias sret %scalar.result, fp128* byval %a) {
+entry:
+ %0 = load fp128* %a, align 8
+ %1 = fsub fp128 0xL00000000000000008000000000000000, %0
+ store fp128 %1, fp128* %scalar.result, align 8
+ ret void
+}
diff --git a/test/CodeGen/SPARC/globals.ll b/test/CodeGen/SPARC/globals.ll
index 7e3effe..3d3eba2 100644
--- a/test/CodeGen/SPARC/globals.ll
+++ b/test/CodeGen/SPARC/globals.ll
@@ -14,7 +14,7 @@ define zeroext i8 @loadG() {
; abs32: loadG
; abs32: sethi %hi(G), %[[R:[gilo][0-7]]]
-; abs32: jmp %o7+8
+; abs32: retl
; abs32: ldub [%[[R]]+%lo(G)], %o0
@@ -22,7 +22,7 @@ define zeroext i8 @loadG() {
; abs44: sethi %h44(G), %[[R1:[gilo][0-7]]]
; abs44: add %[[R1]], %m44(G), %[[R2:[gilo][0-7]]]
; abs44: sllx %[[R2]], 12, %[[R3:[gilo][0-7]]]
-; abs44: jmp %o7+8
+; abs44: retl
; abs44: ldub [%[[R3]]+%l44(G)], %o0
@@ -32,7 +32,7 @@ define zeroext i8 @loadG() {
; abs64: sethi %hh(G), %[[R3:[gilo][0-7]]]
; abs64: add %[[R3]], %hm(G), %[[R4:[gilo][0-7]]]
; abs64: sllx %[[R4]], 32, %[[R5:[gilo][0-7]]]
-; abs64: jmp %o7+8
+; abs64: retl
; abs64: ldub [%[[R5]]+%[[R2]]], %o0
@@ -42,7 +42,7 @@ define zeroext i8 @loadG() {
; v8pic32: add %[[R1]], %lo(G), %[[Goffs:[gilo][0-7]]]
; v8pic32: ld [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
; v8pic32: ldub [%[[Gaddr]]], %i0
-; v8pic32: jmp %i7+8
+; v8pic32: ret
; v8pic32: restore
@@ -52,6 +52,6 @@ define zeroext i8 @loadG() {
; v9pic32: add %[[R1]], %lo(G), %[[Goffs:[gilo][0-7]]]
; v9pic32: ldx [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
; v9pic32: ldub [%[[Gaddr]]], %i0
-; v9pic32: jmp %i7+8
+; v9pic32: ret
; v9pic32: restore
diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll
new file mode 100644
index 0000000..2650533
--- /dev/null
+++ b/test/CodeGen/SPARC/inlineasm.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=sparc <%s | FileCheck %s
+
+; CHECK-LABEL: test_constraint_r
+; CHECK: add %o1, %o0, %o0
+define i32 @test_constraint_r(i32 %a, i32 %b) {
+entry:
+ %0 = tail call i32 asm sideeffect "add $2, $1, $0", "=r,r,r"(i32 %a, i32 %b)
+ ret i32 %0
+}
+
+; CHECK-LABEL: test_constraint_I
+; CHECK: add %o0, 1023, %o0
+define i32 @test_constraint_I(i32 %a) {
+entry:
+ %0 = tail call i32 asm sideeffect "add $1, $2, $0", "=r,r,rI"(i32 %a, i32 1023)
+ ret i32 %0
+}
+
+; CHECK-LABEL: test_constraint_I_neg
+; CHECK: add %o0, -4096, %o0
+define i32 @test_constraint_I_neg(i32 %a) {
+entry:
+ %0 = tail call i32 asm sideeffect "add $1, $2, $0", "=r,r,rI"(i32 %a, i32 -4096)
+ ret i32 %0
+}
+
+; CHECK-LABEL: test_constraint_I_largeimm
+; CHECK: sethi 9, [[R0:%[gilo][0-7]]]
+; CHECK: or [[R0]], 784, [[R1:%[gilo][0-7]]]
+; CHECK: add %o0, [[R1]], %o0
+define i32 @test_constraint_I_largeimm(i32 %a) {
+entry:
+ %0 = tail call i32 asm sideeffect "add $1, $2, $0", "=r,r,rI"(i32 %a, i32 10000)
+ ret i32 %0
+}
+
+; CHECK-LABEL: test_constraint_reg
+; CHECK: ldda [%o1] 43, %g2
+; CHECK: ldda [%o1] 43, %g3
+define void @test_constraint_reg(i32 %s, i32* %ptr) {
+entry:
+ %0 = tail call i64 asm sideeffect "ldda [$1] $2, $0", "={r2},r,n"(i32* %ptr, i32 43)
+ %1 = tail call i64 asm sideeffect "ldda [$1] $2, $0", "={g3},r,n"(i32* %ptr, i32 43)
+ ret void
+}
diff --git a/test/CodeGen/SPARC/leafproc.ll b/test/CodeGen/SPARC/leafproc.ll
index 0a7ae08..963fac0 100644
--- a/test/CodeGen/SPARC/leafproc.ll
+++ b/test/CodeGen/SPARC/leafproc.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=sparc -disable-sparc-leaf-proc=0 < %s | FileCheck %s
; CHECK-LABEL: func_nobody:
-; CHECK: jmp %o7+8
+; CHECK: retl
; CHECK-NEXT: nop
define void @func_nobody() {
entry:
@@ -10,7 +10,7 @@ entry:
; CHECK-LABEL: return_int_const:
-; CHECK: jmp %o7+8
+; CHECK: retl
; CHECK-NEXT: or %g0, 1729, %o0
define i32 @return_int_const() {
entry:
@@ -19,7 +19,7 @@ entry:
; CHECK-LABEL: return_double_const:
; CHECK: sethi
-; CHECK: jmp %o7+8
+; CHECK: retl
; CHECK-NEXT: ldd {{.*}}, %f0
define double @return_double_const() {
@@ -29,7 +29,7 @@ entry:
; CHECK-LABEL: leaf_proc_with_args:
; CHECK: add {{%o[0-1]}}, {{%o[0-1]}}, [[R:%[go][0-7]]]
-; CHECK: jmp %o7+8
+; CHECK: retl
; CHECK-NEXT: add [[R]], %o2, %o0
define i32 @leaf_proc_with_args(i32 %a, i32 %b, i32 %c) {
@@ -42,7 +42,7 @@ entry:
; CHECK-LABEL: leaf_proc_with_args_in_stack:
; CHECK-DAG: ld [%sp+92], {{%[go][0-7]}}
; CHECK-DAG: ld [%sp+96], {{%[go][0-7]}}
-; CHECK: jmp %o7+8
+; CHECK: retl
; CHECK-NEXT: add {{.*}}, %o0
define i32 @leaf_proc_with_args_in_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) {
entry:
@@ -63,7 +63,7 @@ entry:
; CHECK: or %g0, 2, [[R2:%[go][0-7]]]
; CHECK: st [[R2]], [%sp+100]
; CHECK: ld {{.+}}, %o0
-; CHECK: jmp %o7+8
+; CHECK: retl
; CHECK-NEXT: add %sp, 104, %sp
define i32 @leaf_proc_with_local_array(i32 %a, i32 %b, i32 %c) {
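; The jmp->retl/ret updates in the SPARC tests in this change track how returns are now
; printed: the backend emits the standard synthetic return aliases instead of the raw
; jmpl forms the old checks matched. For reference (standard SPARC synthetic forms):
;
;   retl            ! leaf return,     alias for: jmpl %o7+8, %g0
;   ret             ! non-leaf return, alias for: jmpl %i7+8, %g0  (followed by restore)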
diff --git a/test/CodeGen/SPARC/mature-mc-support.ll b/test/CodeGen/SPARC/mature-mc-support.ll
new file mode 100644
index 0000000..4ed3309
--- /dev/null
+++ b/test/CodeGen/SPARC/mature-mc-support.ll
@@ -0,0 +1,20 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+; FIXME: SPARC doesn't use the integrated assembler by default in all cases
+; so we only test that -filetype=obj tries to parse the assembly.
+
+; SKIP: not llc -march=sparc < %s > /dev/null 2> %t1
+; SKIP: FileCheck %s < %t1
+
+; RUN: not llc -march=sparc -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+; SKIP: not llc -march=sparcv9 < %s > /dev/null 2> %t3
+; SKIP: FileCheck %s < %t3
+
+; RUN: not llc -march=sparcv9 -filetype=obj < %s > /dev/null 2> %t4
+; RUN: FileCheck %s < %t4
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/SPARC/missinglabel.ll b/test/CodeGen/SPARC/missinglabel.ll
new file mode 100644
index 0000000..bcf384b
--- /dev/null
+++ b/test/CodeGen/SPARC/missinglabel.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64-S128"
+target triple = "sparc64-unknown-linux-gnu"
+
+define void @f() align 2 {
+entry:
+; CHECK: %xcc, .LBB0_1
+ %cmp = icmp eq i64 undef, 0
+ br i1 %cmp, label %targetblock, label %cond.false
+
+cond.false:
+ unreachable
+
+; CHECK: .LBB0_1: ! %targetblock
+targetblock:
+ br i1 undef, label %cond.false.i83, label %exit.i85
+
+cond.false.i83:
+ unreachable
+
+exit.i85:
+ unreachable
+}
diff --git a/test/CodeGen/SPARC/obj-relocs.ll b/test/CodeGen/SPARC/obj-relocs.ll
new file mode 100644
index 0000000..6d57598
--- /dev/null
+++ b/test/CodeGen/SPARC/obj-relocs.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=sparcv9 -filetype=obj --relocation-model=static | llvm-readobj -r | FileCheck %s --check-prefix=CHECK-ABS
+; RUN: llc < %s -march=sparcv9 -filetype=obj --relocation-model=pic | llvm-readobj -r | FileCheck %s --check-prefix=CHECK-PIC
+
+;CHECK-ABS: Relocations [
+;CHECK-ABS: 0x{{[0-9,A-F]+}} R_SPARC_H44 AGlobalVar 0x0
+;CHECK-ABS: 0x{{[0-9,A-F]+}} R_SPARC_M44 AGlobalVar 0x0
+;CHECK-ABS: 0x{{[0-9,A-F]+}} R_SPARC_L44 AGlobalVar 0x0
+;CHECK-ABS: 0x{{[0-9,A-F]+}} R_SPARC_WDISP30 bar 0x0
+;CHECK-ABS:]
+
+; CHECK-PIC: Relocations [
+; CHECK-PIC: 0x{{[0-9,A-F]+}} R_SPARC_PC22 _GLOBAL_OFFSET_TABLE_ 0x4
+; CHECK-PIC: 0x{{[0-9,A-F]+}} R_SPARC_PC10 _GLOBAL_OFFSET_TABLE_ 0x8
+; CHECK-PIC: 0x{{[0-9,A-F]+}} R_SPARC_GOT22 AGlobalVar 0x0
+; CHECK-PIC: 0x{{[0-9,A-F]+}} R_SPARC_GOT10 AGlobalVar 0x0
+; CHECK-PIC: 0x{{[0-9,A-F]+}} R_SPARC_WPLT30 bar 0x0
+; CHECK-PIC: ]
+
+
+@AGlobalVar = global i64 0, align 8
+
+define i64 @foo(i64 %a) {
+entry:
+ %0 = load i64* @AGlobalVar, align 4
+ %1 = add i64 %a, %0
+ %2 = call i64 @bar(i64 %1)
+ ret i64 %2
+}
+
+
+declare i64 @bar(i64)
diff --git a/test/CodeGen/SPARC/parts.ll b/test/CodeGen/SPARC/parts.ll
new file mode 100644
index 0000000..57add49
--- /dev/null
+++ b/test/CodeGen/SPARC/parts.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=sparcv9 | FileCheck %s
+
+; CHECK-LABEL: test
+; CHECK: srl %i1, 0, %o2
+; CHECK-NEXT: or %g0, %i2, %o0
+; CHECK-NEXT: call __ashlti3
+; CHECK-NEXT: or %g0, %i3, %o1
+; CHECK-NEXT: or %g0, %o0, %i0
+
+define i128 @test(i128 %a, i128 %b) {
+entry:
+ %tmp = shl i128 %b, %a
+ ret i128 %tmp
+}
diff --git a/test/CodeGen/SPARC/rem.ll b/test/CodeGen/SPARC/rem.ll
index abef1fc..3b01a55 100644
--- a/test/CodeGen/SPARC/rem.ll
+++ b/test/CodeGen/SPARC/rem.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: test1:
; CHECK: sdivx %o0, %o1, %o2
; CHECK-NEXT: mulx %o2, %o1, %o1
-; CHECK-NEXT: jmp %o7+8
+; CHECK-NEXT: retl
; CHECK-NEXT: sub %o0, %o1, %o0
define i64 @test1(i64 %X, i64 %Y) {
@@ -14,7 +14,7 @@ define i64 @test1(i64 %X, i64 %Y) {
; CHECK-LABEL: test2:
; CHECK: udivx %o0, %o1, %o2
; CHECK-NEXT: mulx %o2, %o1, %o1
-; CHECK-NEXT: jmp %o7+8
+; CHECK-NEXT: retl
; CHECK-NEXT: sub %o0, %o1, %o0
define i64 @test2(i64 %X, i64 %Y) {
diff --git a/test/CodeGen/SPARC/setjmp.ll b/test/CodeGen/SPARC/setjmp.ll
index 39984fb..a31cd70 100644
--- a/test/CodeGen/SPARC/setjmp.ll
+++ b/test/CodeGen/SPARC/setjmp.ll
@@ -7,7 +7,7 @@
%struct.__jmp_buf_tag = type { [3 x i32], i32, %0 }
@jenv = common unnamed_addr global %struct.jmpbuf_env* null
-@.cst = linker_private unnamed_addr constant [30 x i8] c"in bar with jmp_buf's id: %d\0A\00", align 64
+@.cst = private unnamed_addr constant [30 x i8] c"in bar with jmp_buf's id: %d\0A\00", align 64
; CHECK-LABEL: foo
; CHECK-DAG: st {{.+}}, [%i0]
diff --git a/test/CodeGen/SPARC/spillsize.ll b/test/CodeGen/SPARC/spillsize.ll
new file mode 100644
index 0000000..64f63f9
--- /dev/null
+++ b/test/CodeGen/SPARC/spillsize.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64-S128"
+target triple = "sparcv9"
+
+; CHECK-LABEL: spill4
+; This function spills two values: %p and the materialized large constant.
+; Both must use 8-byte spill and fill instructions.
+; CHECK: stx %{{..}}, [%fp+
+; CHECK: stx %{{..}}, [%fp+
+; CHECK: ldx [%fp+
+; CHECK: ldx [%fp+
+define void @spill4(i64* nocapture %p) {
+entry:
+ %val0 = load i64* %p
+ %cmp0 = icmp ult i64 %val0, 385672958347594845
+ %cm80 = zext i1 %cmp0 to i64
+ store i64 %cm80, i64* %p, align 8
+ tail call void asm sideeffect "", "~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{g2},~{g3},~{g4},~{g5},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o7}"()
+ %arrayidx1 = getelementptr inbounds i64* %p, i64 1
+ %val = load i64* %arrayidx1
+ %cmp = icmp ult i64 %val, 385672958347594845
+ %cm8 = select i1 %cmp, i64 10, i64 20
+ store i64 %cm8, i64* %arrayidx1, align 8
+ ret void
+}
diff --git a/test/CodeGen/SPARC/tls.ll b/test/CodeGen/SPARC/tls.ll
index 660ddff..ce3e005 100644
--- a/test/CodeGen/SPARC/tls.ll
+++ b/test/CodeGen/SPARC/tls.ll
@@ -3,6 +3,10 @@
; RUN: llc <%s -march=sparc -relocation-model=pic | FileCheck %s --check-prefix=pic
; RUN: llc <%s -march=sparcv9 -relocation-model=pic | FileCheck %s --check-prefix=pic
+; RUN: llc <%s -march=sparc -relocation-model=static -filetype=obj | llvm-readobj -r | FileCheck %s --check-prefix=v8abs-obj
+; RUN: llc <%s -march=sparcv9 -relocation-model=static -filetype=obj | llvm-readobj -r | FileCheck %s --check-prefix=v9abs-obj
+; RUN: llc <%s -march=sparc -relocation-model=pic -filetype=obj | llvm-readobj -r | FileCheck %s --check-prefix=pic-obj
+; RUN: llc <%s -march=sparcv9 -relocation-model=pic -filetype=obj | llvm-readobj -r | FileCheck %s --check-prefix=pic-obj
@local_symbol = internal thread_local global i32 0
@extern_symbol = external thread_local global i32
@@ -38,8 +42,7 @@ entry:
; v8abs-LABEL: test_tls_extern
-; v8abs: or {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
-; v8abs: add [[PC]], %o7, %[[GOTBASE:[goli][0-7]]]
+; v8abs: or {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_), %[[GOTBASE:[goli][0-7]]]
; v8abs: sethi %tie_hi22(extern_symbol), [[R1:%[goli][0-7]]]
; v8abs: add [[R1]], %tie_lo10(extern_symbol), %[[R2:[goli][0-7]]]
; v8abs: ld [%[[GOTBASE]]+%[[R2]]], [[R3:%[goli][0-7]]], %tie_ld(extern_symbol)
@@ -47,8 +50,7 @@ entry:
; v8abs: ld [%[[R4]]]
; v9abs-LABEL: test_tls_extern
-; v9abs: or {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
-; v9abs: add [[PC]], %o7, %[[GOTBASE:[goli][0-7]]]
+; v9abs: or {{%[goli][0-7]}}, %l44(_GLOBAL_OFFSET_TABLE_), %[[GOTBASE:[goli][0-7]]]
; v9abs: sethi %tie_hi22(extern_symbol), [[R1:%[goli][0-7]]]
; v9abs: add [[R1]], %tie_lo10(extern_symbol), %[[R2:[goli][0-7]]]
; v9abs: ldx [%[[GOTBASE]]+%[[R2]]], [[R3:%[goli][0-7]]], %tie_ldx(extern_symbol)
@@ -71,3 +73,47 @@ entry:
store i32 %1, i32* @extern_symbol, align 4
ret i32 %1
}
+
+
+; v8abs-obj: Relocations [
+; v8abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LE_HIX22 local_symbol 0x0
+; v8abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LE_LOX10 local_symbol 0x0
+; v8abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_HI22 _GLOBAL_OFFSET_TABLE_ 0x0
+; v8abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_LO10 _GLOBAL_OFFSET_TABLE_ 0x0
+; v8abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_IE_HI22 extern_symbol 0x0
+; v8abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_IE_LO10 extern_symbol 0x0
+; v8abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_IE_LD extern_symbol 0x0
+; v8abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_IE_ADD extern_symbol 0x0
+; v8abs-obj: ]
+
+; v9abs-obj: Relocations [
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LE_HIX22 local_symbol 0x0
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LE_LOX10 local_symbol 0x0
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_H44 _GLOBAL_OFFSET_TABLE_ 0x0
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_M44 _GLOBAL_OFFSET_TABLE_ 0x0
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_L44 _GLOBAL_OFFSET_TABLE_ 0x0
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_IE_HI22 extern_symbol 0x0
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_IE_LO10 extern_symbol 0x0
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_IE_LDX extern_symbol 0x0
+; v9abs-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_IE_ADD extern_symbol 0x0
+; v9abs-obj: ]
+
+; pic-obj: Relocations [
+; pic-obj: Section (2) .rela.text {
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_PC22 _GLOBAL_OFFSET_TABLE_ 0x4
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_PC10 _GLOBAL_OFFSET_TABLE_ 0x8
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LDO_HIX22 local_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LDO_LOX10 local_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LDM_HI22 local_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LDM_LO10 local_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LDM_ADD local_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LDM_CALL local_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_LDO_ADD local_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_PC22 _GLOBAL_OFFSET_TABLE_ 0x4
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_PC10 _GLOBAL_OFFSET_TABLE_ 0x8
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_GD_HI22 extern_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_GD_LO10 extern_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_GD_ADD extern_symbol 0x0
+; pic-obj: 0x{{[0-9,A-F]+}} R_SPARC_TLS_GD_CALL extern_symbol 0x0
+; pic-obj: ]
+
diff --git a/test/CodeGen/SPARC/trap.ll b/test/CodeGen/SPARC/trap.ll
new file mode 100644
index 0000000..b72a63c
--- /dev/null
+++ b/test/CodeGen/SPARC/trap.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=sparc-linux-gnu < %s -show-mc-encoding | FileCheck %s
+
+define void @test1() {
+ tail call void @llvm.trap()
+ unreachable
+
+; CHECK-LABEL: test1:
+; CHECK: ta 5 ! encoding: [0x91,0xd0,0x20,0x05]
+}
+
+declare void @llvm.trap()
diff --git a/test/CodeGen/SystemZ/Large/branch-range-01.py b/test/CodeGen/SystemZ/Large/branch-range-01.py
index 552c9ca..edb631d 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-01.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-01.py
@@ -79,7 +79,7 @@ for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
print ' %%bstop%d = getelementptr i32 *%%stop, i64 %d' % (i, i)
- print ' %%bcur%d = load volatile i32 *%%bstop%d' % (i, i)
+ print ' %%bcur%d = load i32 *%%bstop%d' % (i, i)
print ' %%btest%d = icmp eq i32 %%limit, %%bcur%d' % (i, i)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
print ''
@@ -95,7 +95,7 @@ for i in xrange(0, main_size, 6):
for i in xrange(branch_blocks):
print ' %%astop%d = getelementptr i32 *%%stop, i64 %d' % (i, i + 25)
- print ' %%acur%d = load volatile i32 *%%astop%d' % (i, i)
+ print ' %%acur%d = load i32 *%%astop%d' % (i, i)
print ' %%atest%d = icmp eq i32 %%limit, %%acur%d' % (i, i)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
print ''
diff --git a/test/CodeGen/SystemZ/Large/branch-range-02.py b/test/CodeGen/SystemZ/Large/branch-range-02.py
index 0b21ced..743e12d 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-02.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-02.py
@@ -72,7 +72,7 @@ for i in xrange(blocks):
print 'b%d:' % i
print ' store volatile i8 %d, i8 *%%base' % value
print ' %%astop%d = getelementptr i32 *%%stop, i64 %d' % (i, i)
- print ' %%acur%d = load volatile i32 *%%astop%d' % (i, i)
+ print ' %%acur%d = load i32 *%%astop%d' % (i, i)
print ' %%atest%d = icmp eq i32 %%limit, %%acur%d' % (i, i)
print ' br i1 %%atest%d, label %%%s, label %%%s' % (i, other, next)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-03.py b/test/CodeGen/SystemZ/Large/branch-range-03.py
index 75cdf24..5c9a93b 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-03.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-03.py
@@ -79,7 +79,7 @@ for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
print ' %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
- print ' %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
+ print ' %%bcur%d = load i8 *%%bstop%d' % (i, i)
print ' %%bext%d = sext i8 %%bcur%d to i32' % (i, i)
print ' %%btest%d = icmp eq i32 %%limit, %%bext%d' % (i, i)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
@@ -96,7 +96,7 @@ for i in xrange(0, main_size, 6):
for i in xrange(branch_blocks):
print ' %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
- print ' %%acur%d = load volatile i8 *%%astop%d' % (i, i)
+ print ' %%acur%d = load i8 *%%astop%d' % (i, i)
print ' %%aext%d = sext i8 %%acur%d to i32' % (i, i)
print ' %%atest%d = icmp eq i32 %%limit, %%aext%d' % (i, i)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-04.py b/test/CodeGen/SystemZ/Large/branch-range-04.py
index 3ae3ae9..2c9090f 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-04.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-04.py
@@ -83,7 +83,7 @@ for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
print ' %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
- print ' %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
+ print ' %%bcur%d = load i8 *%%bstop%d' % (i, i)
print ' %%bext%d = sext i8 %%bcur%d to i64' % (i, i)
print ' %%btest%d = icmp eq i64 %%limit, %%bext%d' % (i, i)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
@@ -100,7 +100,7 @@ for i in xrange(0, main_size, 6):
for i in xrange(branch_blocks):
print ' %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
- print ' %%acur%d = load volatile i8 *%%astop%d' % (i, i)
+ print ' %%acur%d = load i8 *%%astop%d' % (i, i)
print ' %%aext%d = sext i8 %%acur%d to i64' % (i, i)
print ' %%atest%d = icmp eq i64 %%limit, %%aext%d' % (i, i)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-05.py b/test/CodeGen/SystemZ/Large/branch-range-05.py
index 6928b8f..52f4a96 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-05.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-05.py
@@ -82,7 +82,7 @@ print ''
for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
- print ' %%bcur%d = load volatile i8 *%%stop' % i
+ print ' %%bcur%d = load i8 *%%stop' % i
print ' %%bext%d = sext i8 %%bcur%d to i32' % (i, i)
print ' %%btest%d = icmp slt i32 %%bext%d, %d' % (i, i, i + 50)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
@@ -98,7 +98,7 @@ for i in xrange(0, main_size, 6):
print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
for i in xrange(branch_blocks):
- print ' %%acur%d = load volatile i8 *%%stop' % i
+ print ' %%acur%d = load i8 *%%stop' % i
print ' %%aext%d = sext i8 %%acur%d to i32' % (i, i)
print ' %%atest%d = icmp slt i32 %%aext%d, %d' % (i, i, i + 100)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-06.py b/test/CodeGen/SystemZ/Large/branch-range-06.py
index aabc72f..c34ebac 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-06.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-06.py
@@ -82,7 +82,7 @@ print ''
for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
- print ' %%bcur%d = load volatile i8 *%%stop' % i
+ print ' %%bcur%d = load i8 *%%stop' % i
print ' %%bext%d = sext i8 %%bcur%d to i64' % (i, i)
print ' %%btest%d = icmp slt i64 %%bext%d, %d' % (i, i, i + 50)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
@@ -98,7 +98,7 @@ for i in xrange(0, main_size, 6):
print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
for i in xrange(branch_blocks):
- print ' %%acur%d = load volatile i8 *%%stop' % i
+ print ' %%acur%d = load i8 *%%stop' % i
print ' %%aext%d = sext i8 %%acur%d to i64' % (i, i)
print ' %%atest%d = icmp slt i64 %%aext%d, %d' % (i, i, i + 100)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-09.py b/test/CodeGen/SystemZ/Large/branch-range-09.py
index b3fd813..bc712cb 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-09.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-09.py
@@ -79,7 +79,7 @@ for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
print ' %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
- print ' %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
+ print ' %%bcur%d = load i8 *%%bstop%d' % (i, i)
print ' %%bext%d = sext i8 %%bcur%d to i32' % (i, i)
print ' %%btest%d = icmp ult i32 %%limit, %%bext%d' % (i, i)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
@@ -96,7 +96,7 @@ for i in xrange(0, main_size, 6):
for i in xrange(branch_blocks):
print ' %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
- print ' %%acur%d = load volatile i8 *%%astop%d' % (i, i)
+ print ' %%acur%d = load i8 *%%astop%d' % (i, i)
print ' %%aext%d = sext i8 %%acur%d to i32' % (i, i)
print ' %%atest%d = icmp ult i32 %%limit, %%aext%d' % (i, i)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-10.py b/test/CodeGen/SystemZ/Large/branch-range-10.py
index 3aeea3e..8c483c3 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-10.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-10.py
@@ -83,7 +83,7 @@ for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
print ' %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
- print ' %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
+ print ' %%bcur%d = load i8 *%%bstop%d' % (i, i)
print ' %%bext%d = sext i8 %%bcur%d to i64' % (i, i)
print ' %%btest%d = icmp ult i64 %%limit, %%bext%d' % (i, i)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
@@ -100,7 +100,7 @@ for i in xrange(0, main_size, 6):
for i in xrange(branch_blocks):
print ' %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
- print ' %%acur%d = load volatile i8 *%%astop%d' % (i, i)
+ print ' %%acur%d = load i8 *%%astop%d' % (i, i)
print ' %%aext%d = sext i8 %%acur%d to i64' % (i, i)
print ' %%atest%d = icmp ult i64 %%limit, %%aext%d' % (i, i)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-11.py b/test/CodeGen/SystemZ/Large/branch-range-11.py
index 034902c..0546103 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-11.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-11.py
@@ -98,8 +98,8 @@ print ''
for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
- print ' %%bcur%da = load volatile i32 *%%stopa' % i
- print ' %%bcur%db = load volatile i32 *%%stopb' % i
+ print ' %%bcur%da = load i32 *%%stopa' % i
+ print ' %%bcur%db = load i32 *%%stopb' % i
print ' %%bsub%d = sub i32 %%bcur%da, %%bcur%db' % (i, i, i)
print ' %%btest%d = icmp ult i32 %%bsub%d, %d' % (i, i, i + 50)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
@@ -115,8 +115,8 @@ for i in xrange(0, main_size, 6):
print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
for i in xrange(branch_blocks):
- print ' %%acur%da = load volatile i32 *%%stopa' % i
- print ' %%acur%db = load volatile i32 *%%stopb' % i
+ print ' %%acur%da = load i32 *%%stopa' % i
+ print ' %%acur%db = load i32 *%%stopb' % i
print ' %%asub%d = sub i32 %%acur%da, %%acur%db' % (i, i, i)
print ' %%atest%d = icmp ult i32 %%asub%d, %d' % (i, i, i + 100)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-12.py b/test/CodeGen/SystemZ/Large/branch-range-12.py
index 007d477..626c899 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-12.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-12.py
@@ -98,8 +98,8 @@ print ''
for i in xrange(branch_blocks):
next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
print 'before%d:' % i
- print ' %%bcur%da = load volatile i64 *%%stopa' % i
- print ' %%bcur%db = load volatile i64 *%%stopb' % i
+ print ' %%bcur%da = load i64 *%%stopa' % i
+ print ' %%bcur%db = load i64 *%%stopb' % i
print ' %%bsub%d = sub i64 %%bcur%da, %%bcur%db' % (i, i, i)
print ' %%btest%d = icmp ult i64 %%bsub%d, %d' % (i, i, i + 50)
print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
@@ -115,8 +115,8 @@ for i in xrange(0, main_size, 6):
print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
for i in xrange(branch_blocks):
- print ' %%acur%da = load volatile i64 *%%stopa' % i
- print ' %%acur%db = load volatile i64 *%%stopb' % i
+ print ' %%acur%da = load i64 *%%stopa' % i
+ print ' %%acur%db = load i64 *%%stopb' % i
print ' %%asub%d = sub i64 %%acur%da, %%acur%db' % (i, i, i)
print ' %%atest%d = icmp ult i64 %%asub%d, %d' % (i, i, i + 100)
print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
diff --git a/test/CodeGen/SystemZ/alias-01.ll b/test/CodeGen/SystemZ/alias-01.ll
index 8839aad..dc90481 100644
--- a/test/CodeGen/SystemZ/alias-01.ll
+++ b/test/CodeGen/SystemZ/alias-01.ll
@@ -2,6 +2,9 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; The use of TBAA in CodeGen has been temporarily disabled pending correctness fixes.
+; XFAIL: *
+
; Check that there are no spills.
define void @f1(<16 x i32> *%src1, <16 x float> *%dest) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/SystemZ/atomic-load-01.ll b/test/CodeGen/SystemZ/atomic-load-01.ll
index a5bc883..f3acd60 100644
--- a/test/CodeGen/SystemZ/atomic-load-01.ll
+++ b/test/CodeGen/SystemZ/atomic-load-01.ll
@@ -2,11 +2,10 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; This is just a placeholder to make sure that loads are handled.
-; The CS-based sequence is probably far too conservative.
define i8 @f1(i8 *%src) {
; CHECK-LABEL: f1:
-; CHECK: cs
+; CHECK: bcr 1{{[45]}}, %r0
+; CHECK: lb %r2, 0(%r2)
; CHECK: br %r14
%val = load atomic i8 *%src seq_cst, align 1
ret i8 %val
diff --git a/test/CodeGen/SystemZ/atomic-load-02.ll b/test/CodeGen/SystemZ/atomic-load-02.ll
index 2c9bbdb..d9bec60 100644
--- a/test/CodeGen/SystemZ/atomic-load-02.ll
+++ b/test/CodeGen/SystemZ/atomic-load-02.ll
@@ -2,11 +2,10 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; This is just a placeholder to make sure that loads are handled.
-; The CS-based sequence is probably far too conservative.
define i16 @f1(i16 *%src) {
; CHECK-LABEL: f1:
-; CHECK: cs
+; CHECK: bcr 1{{[45]}}, %r0
+; CHECK: lh %r2, 0(%r2)
; CHECK: br %r14
%val = load atomic i16 *%src seq_cst, align 2
ret i16 %val
diff --git a/test/CodeGen/SystemZ/atomic-load-03.ll b/test/CodeGen/SystemZ/atomic-load-03.ll
index 1fb41f5..7e5eb92 100644
--- a/test/CodeGen/SystemZ/atomic-load-03.ll
+++ b/test/CodeGen/SystemZ/atomic-load-03.ll
@@ -2,12 +2,10 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; This is just a placeholder to make sure that loads are handled.
-; Using CS is probably too conservative.
-define i32 @f1(i32 %dummy, i32 *%src) {
+define i32 @f1(i32 *%src) {
; CHECK-LABEL: f1:
-; CHECK: lhi %r2, 0
-; CHECK: cs %r2, %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
+; CHECK: l %r2, 0(%r2)
; CHECK: br %r14
%val = load atomic i32 *%src seq_cst, align 4
ret i32 %val
diff --git a/test/CodeGen/SystemZ/atomic-load-04.ll b/test/CodeGen/SystemZ/atomic-load-04.ll
index 92cac40..c7a9a98 100644
--- a/test/CodeGen/SystemZ/atomic-load-04.ll
+++ b/test/CodeGen/SystemZ/atomic-load-04.ll
@@ -2,12 +2,10 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; This is just a placeholder to make sure that loads are handled.
-; Using CSG is probably too conservative.
-define i64 @f1(i64 %dummy, i64 *%src) {
+define i64 @f1(i64 *%src) {
; CHECK-LABEL: f1:
-; CHECK: lghi %r2, 0
-; CHECK: csg %r2, %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
+; CHECK: lg %r2, 0(%r2)
; CHECK: br %r14
%val = load atomic i64 *%src seq_cst, align 8
ret i64 %val
diff --git a/test/CodeGen/SystemZ/atomic-store-01.ll b/test/CodeGen/SystemZ/atomic-store-01.ll
index 53ed24f..952e1a9 100644
--- a/test/CodeGen/SystemZ/atomic-store-01.ll
+++ b/test/CodeGen/SystemZ/atomic-store-01.ll
@@ -2,11 +2,10 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; This is just a placeholder to make sure that stores are handled.
-; The CS-based sequence is probably far too conservative.
define void @f1(i8 %val, i8 *%src) {
; CHECK-LABEL: f1:
-; CHECK: cs
+; CHECK: stc %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
; CHECK: br %r14
store atomic i8 %val, i8 *%src seq_cst, align 1
ret void
diff --git a/test/CodeGen/SystemZ/atomic-store-02.ll b/test/CodeGen/SystemZ/atomic-store-02.ll
index 42d6695..c9576e5 100644
--- a/test/CodeGen/SystemZ/atomic-store-02.ll
+++ b/test/CodeGen/SystemZ/atomic-store-02.ll
@@ -2,11 +2,10 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; This is just a placeholder to make sure that stores are handled.
-; The CS-based sequence is probably far too conservative.
define void @f1(i16 %val, i16 *%src) {
; CHECK-LABEL: f1:
-; CHECK: cs
+; CHECK: sth %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
; CHECK: br %r14
store atomic i16 %val, i16 *%src seq_cst, align 2
ret void
diff --git a/test/CodeGen/SystemZ/atomic-store-03.ll b/test/CodeGen/SystemZ/atomic-store-03.ll
index 846c86f..459cb6a 100644
--- a/test/CodeGen/SystemZ/atomic-store-03.ll
+++ b/test/CodeGen/SystemZ/atomic-store-03.ll
@@ -2,14 +2,10 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; This is just a placeholder to make sure that stores are handled.
-; Using CS is probably too conservative.
define void @f1(i32 %val, i32 *%src) {
; CHECK-LABEL: f1:
-; CHECK: l %r0, 0(%r3)
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: cs %r0, %r2, 0(%r3)
-; CHECK: jl [[LABEL]]
+; CHECK: st %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
; CHECK: br %r14
store atomic i32 %val, i32 *%src seq_cst, align 4
ret void
diff --git a/test/CodeGen/SystemZ/atomic-store-04.ll b/test/CodeGen/SystemZ/atomic-store-04.ll
index 24615b1..7f2406e 100644
--- a/test/CodeGen/SystemZ/atomic-store-04.ll
+++ b/test/CodeGen/SystemZ/atomic-store-04.ll
@@ -2,14 +2,10 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; This is just a placeholder to make sure that stores are handled.
-; Using CS is probably too conservative.
define void @f1(i64 %val, i64 *%src) {
; CHECK-LABEL: f1:
-; CHECK: lg %r0, 0(%r3)
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: csg %r0, %r2, 0(%r3)
-; CHECK: jl [[LABEL]]
+; CHECK: stg %r2, 0(%r3)
+; CHECK: bcr 1{{[45]}}, %r0
; CHECK: br %r14
store atomic i64 %val, i64 *%src seq_cst, align 8
ret void
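; The atomic-load and atomic-store updates above reflect the new SystemZ lowering for
; sequentially-consistent accesses: instead of a compare-and-swap (CS/CSG) loop, a
; seq_cst load is now a serialization (bcr 14 or 15, %r0) followed by a normal load,
; and a seq_cst store is a normal store followed by a serialization. A minimal sketch,
; taken from the f1 tests above:
;
;   %val = load atomic i32 *%src seq_cst, align 4       ; => bcr 1{{[45]}}, %r0 ; l %r2, 0(%r2)
;   store atomic i32 %val, i32 *%src seq_cst, align 4   ; => st %r2, 0(%r3) ; bcr 1{{[45]}}, %r0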
diff --git a/test/CodeGen/SystemZ/atomicrmw-add-05.ll b/test/CodeGen/SystemZ/atomicrmw-add-05.ll
new file mode 100644
index 0000000..956c0d9
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-add-05.ll
@@ -0,0 +1,64 @@
+; Test 32-bit atomic additions, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check addition of a variable.
+define i32 @f1(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK: laa %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw add i32 *%src, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check addition of 1, which needs a temporary.
+define i32 @f2(i32 %dummy, i32 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lhi [[TMP:%r[0-5]]], 1
+; CHECK: laa %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw add i32 *%src, i32 1 seq_cst
+ ret i32 %res
+}
+
+; Check the high end of the LAA range.
+define i32 @f3(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f3:
+; CHECK: laa %r2, %r4, 524284(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131071
+ %res = atomicrmw add i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word up, which needs separate address logic.
+define i32 @f4(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r3, 524288
+; CHECK: laa %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131072
+ %res = atomicrmw add i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the low end of the LAA range.
+define i32 @f5(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f5:
+; CHECK: laa %r2, %r4, -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131072
+ %res = atomicrmw add i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word down, which needs separate address logic.
+define i32 @f6(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524292
+; CHECK: laa %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131073
+ %res = atomicrmw add i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-add-06.ll b/test/CodeGen/SystemZ/atomicrmw-add-06.ll
new file mode 100644
index 0000000..f508858
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-add-06.ll
@@ -0,0 +1,64 @@
+; Test 64-bit atomic additions, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check addition of a variable.
+define i64 @f1(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK: laag %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw add i64 *%src, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check addition of 1, which needs a temporary.
+define i64 @f2(i64 %dummy, i64 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lghi [[TMP:%r[0-5]]], 1
+; CHECK: laag %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw add i64 *%src, i64 1 seq_cst
+ ret i64 %res
+}
+
+; Check the high end of the LAAG range.
+define i64 @f3(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f3:
+; CHECK: laag %r2, %r4, 524280(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65535
+ %res = atomicrmw add i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword up, which needs separate address logic.
+define i64 @f4(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r3, 524288
+; CHECK: laag %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65536
+ %res = atomicrmw add i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the low end of the LAAG range.
+define i64 @f5(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f5:
+; CHECK: laag %r2, %r4, -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65536
+ %res = atomicrmw add i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword down, which needs separate address logic.
+define i64 @f6(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524296
+; CHECK: laag %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65537
+ %res = atomicrmw add i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-and-05.ll b/test/CodeGen/SystemZ/atomicrmw-and-05.ll
new file mode 100644
index 0000000..f0b999c
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-and-05.ll
@@ -0,0 +1,64 @@
+; Test 32-bit atomic ANDs, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check AND of a variable.
+define i32 @f1(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK: lan %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw and i32 *%src, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check AND of 1, which needs a temporary.
+define i32 @f2(i32 %dummy, i32 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lhi [[TMP:%r[0-5]]], 1
+; CHECK: lan %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw and i32 *%src, i32 1 seq_cst
+ ret i32 %res
+}
+
+; Check the high end of the LAN range.
+define i32 @f3(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f3:
+; CHECK: lan %r2, %r4, 524284(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131071
+ %res = atomicrmw and i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word up, which needs separate address logic.
+define i32 @f4(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r3, 524288
+; CHECK: lan %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131072
+ %res = atomicrmw and i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the low end of the LAN range.
+define i32 @f5(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f5:
+; CHECK: lan %r2, %r4, -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131072
+ %res = atomicrmw and i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word down, which needs separate address logic.
+define i32 @f6(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524292
+; CHECK: lan %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131073
+ %res = atomicrmw and i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-and-06.ll b/test/CodeGen/SystemZ/atomicrmw-and-06.ll
new file mode 100644
index 0000000..e5b7194
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-and-06.ll
@@ -0,0 +1,64 @@
+; Test 64-bit atomic ANDs, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check AND of a variable.
+define i64 @f1(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK: lang %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw and i64 *%src, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check AND of -2, which needs a temporary.
+define i64 @f2(i64 %dummy, i64 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lghi [[TMP:%r[0-5]]], -2
+; CHECK: lang %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw and i64 *%src, i64 -2 seq_cst
+ ret i64 %res
+}
+
+; Check the high end of the LANG range.
+define i64 @f3(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f3:
+; CHECK: lang %r2, %r4, 524280(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65535
+ %res = atomicrmw and i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword up, which needs separate address logic.
+define i64 @f4(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r3, 524288
+; CHECK: lang %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65536
+ %res = atomicrmw and i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the low end of the LANG range.
+define i64 @f5(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f5:
+; CHECK: lang %r2, %r4, -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65536
+ %res = atomicrmw and i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword down, which needs separate address logic.
+define i64 @f6(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524296
+; CHECK: lang %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65537
+ %res = atomicrmw and i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-or-05.ll b/test/CodeGen/SystemZ/atomicrmw-or-05.ll
new file mode 100644
index 0000000..b38654c
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-or-05.ll
@@ -0,0 +1,64 @@
+; Test 32-bit atomic ORs, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check OR of a variable.
+define i32 @f1(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK: lao %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw or i32 *%src, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check OR of 1, which needs a temporary.
+define i32 @f2(i32 %dummy, i32 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lhi [[TMP:%r[0-5]]], 1
+; CHECK: lao %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw or i32 *%src, i32 1 seq_cst
+ ret i32 %res
+}
+
+; Check the high end of the LAO range.
+define i32 @f3(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f3:
+; CHECK: lao %r2, %r4, 524284(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131071
+ %res = atomicrmw or i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word up, which needs separate address logic.
+define i32 @f4(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r3, 524288
+; CHECK: lao %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131072
+ %res = atomicrmw or i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the low end of the LAO range.
+define i32 @f5(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f5:
+; CHECK: lao %r2, %r4, -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131072
+ %res = atomicrmw or i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word down, which needs separate address logic.
+define i32 @f6(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524292
+; CHECK: lao %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131073
+ %res = atomicrmw or i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-or-06.ll b/test/CodeGen/SystemZ/atomicrmw-or-06.ll
new file mode 100644
index 0000000..30874ab
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-or-06.ll
@@ -0,0 +1,64 @@
+; Test 64-bit atomic ORs, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check OR of a variable.
+define i64 @f1(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK: laog %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw or i64 *%src, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check OR of 1, which needs a temporary.
+define i64 @f2(i64 %dummy, i64 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lghi [[TMP:%r[0-5]]], 1
+; CHECK: laog %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw or i64 *%src, i64 1 seq_cst
+ ret i64 %res
+}
+
+; Check the high end of the LAOG range.
+define i64 @f3(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f3:
+; CHECK: laog %r2, %r4, 524280(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65535
+ %res = atomicrmw or i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword up, which needs separate address logic.
+define i64 @f4(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r3, 524288
+; CHECK: laog %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65536
+ %res = atomicrmw or i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the low end of the LAOG range.
+define i64 @f5(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f5:
+; CHECK: laog %r2, %r4, -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65536
+ %res = atomicrmw or i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword down, which needs separate address logic.
+define i64 @f6(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524296
+; CHECK: laog %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65537
+ %res = atomicrmw or i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-sub-05.ll b/test/CodeGen/SystemZ/atomicrmw-sub-05.ll
new file mode 100644
index 0000000..7668f0e
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-sub-05.ll
@@ -0,0 +1,69 @@
+; Test 32-bit atomic subtractions, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check subtraction of a variable.
+define i32 @f1(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK: lcr [[NEG:%r[0-5]]], %r4
+; CHECK: laa %r2, [[NEG]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw sub i32 *%src, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check subtraction of 1, which needs a temporary.
+define i32 @f2(i32 %dummy, i32 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lhi [[TMP:%r[0-5]]], -1
+; CHECK: laa %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw sub i32 *%src, i32 1 seq_cst
+ ret i32 %res
+}
+
+; Check the high end of the LAA range.
+define i32 @f3(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f3:
+; CHECK: lcr [[NEG:%r[0-5]]], %r4
+; CHECK: laa %r2, [[NEG]], 524284(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131071
+ %res = atomicrmw sub i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word up, which needs separate address logic.
+define i32 @f4(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f4:
+; CHECK-DAG: lcr [[NEG:%r[0-5]]], %r4
+; CHECK-DAG: agfi %r3, 524288
+; CHECK: laa %r2, [[NEG]], 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131072
+ %res = atomicrmw sub i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the low end of the LAA range.
+define i32 @f5(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f5:
+; CHECK: lcr [[NEG:%r[0-5]]], %r4
+; CHECK: laa %r2, [[NEG]], -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131072
+ %res = atomicrmw sub i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word down, which needs separate address logic.
+define i32 @f6(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f6:
+; CHECK-DAG: lcr [[NEG:%r[0-5]]], %r4
+; CHECK-DAG: agfi %r3, -524292
+; CHECK: laa %r2, [[NEG]], 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131073
+ %res = atomicrmw sub i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-sub-06.ll b/test/CodeGen/SystemZ/atomicrmw-sub-06.ll
new file mode 100644
index 0000000..5d11bdf
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-sub-06.ll
@@ -0,0 +1,69 @@
+; Test 64-bit atomic subtractions, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check subtraction of a variable.
+define i64 @f1(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK: lcgr [[NEG:%r[0-5]]], %r4
+; CHECK: laag %r2, [[NEG]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw sub i64 *%src, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check subtraction of 1, which needs a temporary.
+define i64 @f2(i64 %dummy, i64 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lghi [[TMP:%r[0-5]]], -1
+; CHECK: laag %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw sub i64 *%src, i64 1 seq_cst
+ ret i64 %res
+}
+
+; Check the high end of the LAAG range.
+define i64 @f3(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f3:
+; CHECK: lcgr [[NEG:%r[0-5]]], %r4
+; CHECK: laag %r2, [[NEG]], 524280(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65535
+ %res = atomicrmw sub i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword up, which needs separate address logic.
+define i64 @f4(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f4:
+; CHECK-DAG: lcgr [[NEG:%r[0-5]]], %r4
+; CHECK-DAG: agfi %r3, 524288
+; CHECK: laag %r2, [[NEG]], 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65536
+ %res = atomicrmw sub i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the low end of the LAAG range.
+define i64 @f5(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f5:
+; CHECK: lcgr [[NEG:%r[0-5]]], %r4
+; CHECK: laag %r2, [[NEG]], -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65536
+ %res = atomicrmw sub i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword down, which needs separate address logic.
+define i64 @f6(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f6:
+; CHECK-DAG: lcgr [[NEG:%r[0-5]]], %r4
+; CHECK-DAG: agfi %r3, -524296
+; CHECK: laag %r2, [[NEG]], 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65537
+ %res = atomicrmw sub i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-xor-05.ll b/test/CodeGen/SystemZ/atomicrmw-xor-05.ll
new file mode 100644
index 0000000..e9e7d30
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-xor-05.ll
@@ -0,0 +1,64 @@
+; Test 32-bit atomic XORs, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check XOR of a variable.
+define i32 @f1(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK: lax %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw xor i32 *%src, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check XOR of 1, which needs a temporary.
+define i32 @f2(i32 %dummy, i32 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lhi [[TMP:%r[0-5]]], 1
+; CHECK: lax %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw xor i32 *%src, i32 1 seq_cst
+ ret i32 %res
+}
+
+; Check the high end of the LAX range.
+define i32 @f3(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f3:
+; CHECK: lax %r2, %r4, 524284(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131071
+ %res = atomicrmw xor i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word up, which needs separate address logic.
+define i32 @f4(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r3, 524288
+; CHECK: lax %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 131072
+ %res = atomicrmw xor i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the low end of the LAX range.
+define i32 @f5(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f5:
+; CHECK: lax %r2, %r4, -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131072
+ %res = atomicrmw xor i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
+
+; Check the next word down, which needs separate address logic.
+define i32 @f6(i32 %dummy, i32 *%src, i32 %b) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524292
+; CHECK: lax %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i32 *%src, i32 -131073
+ %res = atomicrmw xor i32 *%ptr, i32 %b seq_cst
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-xor-06.ll b/test/CodeGen/SystemZ/atomicrmw-xor-06.ll
new file mode 100644
index 0000000..0870c64
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomicrmw-xor-06.ll
@@ -0,0 +1,64 @@
+; Test 64-bit atomic XORs, z196 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check XOR of a variable.
+define i64 @f1(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK: laxg %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw xor i64 *%src, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check XOR of 1, which needs a temporary.
+define i64 @f2(i64 %dummy, i64 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lghi [[TMP:%r[0-5]]], 1
+; CHECK: laxg %r2, [[TMP]], 0(%r3)
+; CHECK: br %r14
+ %res = atomicrmw xor i64 *%src, i64 1 seq_cst
+ ret i64 %res
+}
+
+; Check the high end of the LAXG range.
+define i64 @f3(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f3:
+; CHECK: laxg %r2, %r4, 524280(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65535
+ %res = atomicrmw xor i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword up, which needs separate address logic.
+define i64 @f4(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r3, 524288
+; CHECK: laxg %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 65536
+ %res = atomicrmw xor i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the low end of the LAXG range.
+define i64 @f5(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f5:
+; CHECK: laxg %r2, %r4, -524288(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65536
+ %res = atomicrmw xor i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
+
+; Check the next doubleword down, which needs separate address logic.
+define i64 @f6(i64 %dummy, i64 *%src, i64 %b) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524296
+; CHECK: laxg %r2, %r4, 0(%r3)
+; CHECK: br %r14
+ %ptr = getelementptr i64 *%src, i64 -65537
+ %res = atomicrmw xor i64 *%ptr, i64 %b seq_cst
+ ret i64 %res
+}
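; Summary of the z196 atomicrmw tests above: with the z196 interlocked-access facility
; (-mcpu=z196), atomicrmw add/and/or/xor lowers to a single load-and-<op> instruction
; (LAA/LAAG, LAN/LANG, LAO/LAOG, LAX/LAXG) instead of a compare-and-swap loop, and
; subtraction is handled by negating the operand (LCR/LCGR) and reusing LAA/LAAG.
; A minimal sketch, using the same operands as the f1 tests above:
;
;   %res = atomicrmw add i32 *%src, i32 %b seq_cst    ; => laa  %r2, %r4, 0(%r3)
;   %res = atomicrmw xor i64 *%src, i64 %b seq_cst    ; => laxg %r2, %r4, 0(%r3)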
diff --git a/test/CodeGen/SystemZ/cmpxchg-01.ll b/test/CodeGen/SystemZ/cmpxchg-01.ll
index d5ea977..bb0b18a 100644
--- a/test/CodeGen/SystemZ/cmpxchg-01.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-01.ll
@@ -32,7 +32,7 @@ define i8 @f1(i8 %dummy, i8 *%src, i8 %cmp, i8 %swap) {
; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT: rll
; CHECK-SHIFT: rll {{%r[0-9]+}}, %r5, -8([[NEGSHIFT]])
- %res = cmpxchg i8 *%src, i8 %cmp, i8 %swap seq_cst
+ %res = cmpxchg i8 *%src, i8 %cmp, i8 %swap seq_cst seq_cst
ret i8 %res
}
@@ -50,6 +50,6 @@ define i8 @f2(i8 *%src) {
; CHECK-SHIFT: risbg
; CHECK-SHIFT: risbg [[SWAP]], {{%r[0-9]+}}, 32, 55, 0
; CHECK-SHIFT: br %r14
- %res = cmpxchg i8 *%src, i8 42, i8 88 seq_cst
+ %res = cmpxchg i8 *%src, i8 42, i8 88 seq_cst seq_cst
ret i8 %res
}
diff --git a/test/CodeGen/SystemZ/cmpxchg-02.ll b/test/CodeGen/SystemZ/cmpxchg-02.ll
index 08c79d7..8d46a8c 100644
--- a/test/CodeGen/SystemZ/cmpxchg-02.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-02.ll
@@ -32,7 +32,7 @@ define i16 @f1(i16 %dummy, i16 *%src, i16 %cmp, i16 %swap) {
; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT: rll
; CHECK-SHIFT: rll {{%r[0-9]+}}, %r5, -16([[NEGSHIFT]])
- %res = cmpxchg i16 *%src, i16 %cmp, i16 %swap seq_cst
+ %res = cmpxchg i16 *%src, i16 %cmp, i16 %swap seq_cst seq_cst
ret i16 %res
}
@@ -50,6 +50,6 @@ define i16 @f2(i16 *%src) {
; CHECK-SHIFT: risbg
; CHECK-SHIFT: risbg [[SWAP]], {{%r[0-9]+}}, 32, 47, 0
; CHECK-SHIFT: br %r14
- %res = cmpxchg i16 *%src, i16 42, i16 88 seq_cst
+ %res = cmpxchg i16 *%src, i16 42, i16 88 seq_cst seq_cst
ret i16 %res
}
diff --git a/test/CodeGen/SystemZ/cmpxchg-03.ll b/test/CodeGen/SystemZ/cmpxchg-03.ll
index 3917979..f6a2ad0 100644
--- a/test/CodeGen/SystemZ/cmpxchg-03.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-03.ll
@@ -7,7 +7,7 @@ define i32 @f1(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK-LABEL: f1:
; CHECK: cs %r2, %r3, 0(%r4)
; CHECK: br %r14
- %val = cmpxchg i32 *%src, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%src, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -17,7 +17,7 @@ define i32 @f2(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: cs %r2, %r3, 4092(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 1023
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -27,7 +27,7 @@ define i32 @f3(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: csy %r2, %r3, 4096(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 1024
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -37,7 +37,7 @@ define i32 @f4(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: csy %r2, %r3, 524284(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 131071
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -49,7 +49,7 @@ define i32 @f5(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: cs %r2, %r3, 0(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 131072
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -59,7 +59,7 @@ define i32 @f6(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: csy %r2, %r3, -4(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 -1
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -69,7 +69,7 @@ define i32 @f7(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: csy %r2, %r3, -524288(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 -131072
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -81,7 +81,7 @@ define i32 @f8(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: cs %r2, %r3, 0(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 -131073
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -93,7 +93,7 @@ define i32 @f9(i32 %cmp, i32 %swap, i64 %src, i64 %index) {
; CHECK: br %r14
%add1 = add i64 %src, %index
%ptr = inttoptr i64 %add1 to i32 *
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -106,7 +106,7 @@ define i32 @f10(i32 %cmp, i32 %swap, i64 %src, i64 %index) {
%add1 = add i64 %src, %index
%add2 = add i64 %add1, 4096
%ptr = inttoptr i64 %add2 to i32 *
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -116,7 +116,7 @@ define i32 @f11(i32 %dummy, i32 %swap, i32 *%ptr) {
; CHECK: lhi %r2, 1001
; CHECK: cs %r2, %r3, 0(%r4)
; CHECK: br %r14
- %val = cmpxchg i32 *%ptr, i32 1001, i32 %swap seq_cst
+ %val = cmpxchg i32 *%ptr, i32 1001, i32 %swap seq_cst seq_cst
ret i32 %val
}
@@ -126,6 +126,6 @@ define i32 @f12(i32 %cmp, i32 *%ptr) {
; CHECK: lhi [[SWAP:%r[0-9]+]], 1002
; CHECK: cs %r2, [[SWAP]], 0(%r3)
; CHECK: br %r14
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 1002 seq_cst
+ %val = cmpxchg i32 *%ptr, i32 %cmp, i32 1002 seq_cst seq_cst
ret i32 %val
}
diff --git a/test/CodeGen/SystemZ/cmpxchg-04.ll b/test/CodeGen/SystemZ/cmpxchg-04.ll
index f58868f..069bad6 100644
--- a/test/CodeGen/SystemZ/cmpxchg-04.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-04.ll
@@ -7,7 +7,7 @@ define i64 @f1(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK-LABEL: f1:
; CHECK: csg %r2, %r3, 0(%r4)
; CHECK: br %r14
- %val = cmpxchg i64 *%src, i64 %cmp, i64 %swap seq_cst
+ %val = cmpxchg i64 *%src, i64 %cmp, i64 %swap seq_cst seq_cst
ret i64 %val
}
@@ -17,7 +17,7 @@ define i64 @f2(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, 524280(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 65535
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst
+ %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
ret i64 %val
}
@@ -29,7 +29,7 @@ define i64 @f3(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, 0(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 65536
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst
+ %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
ret i64 %val
}
@@ -39,7 +39,7 @@ define i64 @f4(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, -8(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 -1
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst
+ %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
ret i64 %val
}
@@ -49,7 +49,7 @@ define i64 @f5(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, -524288(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 -65536
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst
+ %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
ret i64 %val
}
@@ -61,7 +61,7 @@ define i64 @f6(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, 0(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 -65537
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst
+ %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
ret i64 %val
}
@@ -73,7 +73,7 @@ define i64 @f7(i64 %cmp, i64 %swap, i64 %src, i64 %index) {
; CHECK: br %r14
%add1 = add i64 %src, %index
%ptr = inttoptr i64 %add1 to i64 *
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst
+ %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
ret i64 %val
}
@@ -83,7 +83,7 @@ define i64 @f8(i64 %dummy, i64 %swap, i64 *%ptr) {
; CHECK: lghi %r2, 1001
; CHECK: csg %r2, %r3, 0(%r4)
; CHECK: br %r14
- %val = cmpxchg i64 *%ptr, i64 1001, i64 %swap seq_cst
+ %val = cmpxchg i64 *%ptr, i64 1001, i64 %swap seq_cst seq_cst
ret i64 %val
}
@@ -93,6 +93,6 @@ define i64 @f9(i64 %cmp, i64 *%ptr) {
; CHECK: lghi [[SWAP:%r[0-9]+]], 1002
; CHECK: csg %r2, [[SWAP]], 0(%r3)
; CHECK: br %r14
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 1002 seq_cst
+ %val = cmpxchg i64 *%ptr, i64 %cmp, i64 1002 seq_cst seq_cst
ret i64 %val
}
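; The cmpxchg updates above follow an IR syntax change in this release: cmpxchg now
; takes both a success ordering and a failure ordering, so the former single
; "seq_cst" becomes "seq_cst seq_cst". For example, from f1 above:
;
;   old: %val = cmpxchg i32 *%src, i32 %cmp, i32 %swap seq_cst
;   new: %val = cmpxchg i32 *%src, i32 %cmp, i32 %swap seq_cst seq_cst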
diff --git a/test/CodeGen/SystemZ/cond-store-01.ll b/test/CodeGen/SystemZ/cond-store-01.ll
index d55ea21..62e9796 100644
--- a/test/CodeGen/SystemZ/cond-store-01.ll
+++ b/test/CodeGen/SystemZ/cond-store-01.ll
@@ -347,11 +347,10 @@ define void @f19(i8 *%ptr, i8 %alt, i32 %limit) {
define void @f20(i8 *%ptr, i8 %alt, i32 %limit) {
; FIXME: should use a normal load instead of CS.
; CHECK-LABEL: f20:
-; CHECK: cs {{%r[0-9]+}},
-; CHECK: jl
+; CHECK: lb {{%r[0-9]+}}, 0(%r2)
; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
; CHECK: [[LABEL]]:
-; CHECK: stc {{%r[0-9]+}},
+; CHECK: stc {{%r[0-9]+}}, 0(%r2)
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load atomic i8 *%ptr unordered, align 1
@@ -367,7 +366,7 @@ define void @f21(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK: jhe [[LABEL:[^ ]*]]
; CHECK: lb %r3, 0(%r2)
; CHECK: [[LABEL]]:
-; CHECK: cs {{%r[0-9]+}},
+; CHECK: stc %r3, 0(%r2)
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-02.ll b/test/CodeGen/SystemZ/cond-store-02.ll
index 91bc486..4fbcdab 100644
--- a/test/CodeGen/SystemZ/cond-store-02.ll
+++ b/test/CodeGen/SystemZ/cond-store-02.ll
@@ -347,11 +347,10 @@ define void @f19(i16 *%ptr, i16 %alt, i32 %limit) {
define void @f20(i16 *%ptr, i16 %alt, i32 %limit) {
; FIXME: should use a normal load instead of CS.
; CHECK-LABEL: f20:
-; CHECK: cs {{%r[0-9]+}},
-; CHECK: jl
+; CHECK: lh {{%r[0-9]+}}, 0(%r2)
; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
; CHECK: [[LABEL]]:
-; CHECK: sth {{%r[0-9]+}},
+; CHECK: sth {{%r[0-9]+}}, 0(%r2)
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load atomic i16 *%ptr unordered, align 2
@@ -367,7 +366,7 @@ define void @f21(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK: jhe [[LABEL:[^ ]*]]
; CHECK: lh %r3, 0(%r2)
; CHECK: [[LABEL]]:
-; CHECK: cs {{%r[0-9]+}},
+; CHECK: sth %r3, 0(%r2)
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-03.ll b/test/CodeGen/SystemZ/cond-store-03.ll
index d4fd48d..4b22555 100644
--- a/test/CodeGen/SystemZ/cond-store-03.ll
+++ b/test/CodeGen/SystemZ/cond-store-03.ll
@@ -272,7 +272,7 @@ define void @f15(i32 *%ptr, i32 %alt, i32 %limit) {
define void @f16(i32 *%ptr, i32 %alt, i32 %limit) {
; FIXME: should use a normal load instead of CS.
; CHECK-LABEL: f16:
-; CHECK: cs {{%r[0-5]}}, {{%r[0-5]}}, 0(%r2)
+; CHECK: l {{%r[0-5]}}, 0(%r2)
; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
; CHECK: [[LABEL]]:
; CHECK: st {{%r[0-5]}}, 0(%r2)
@@ -291,7 +291,7 @@ define void @f17(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK: jhe [[LABEL:[^ ]*]]
; CHECK: l %r3, 0(%r2)
; CHECK: [[LABEL]]:
-; CHECK: cs {{%r[0-5]}}, %r3, 0(%r2)
+; CHECK: st %r3, 0(%r2)
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-04.ll b/test/CodeGen/SystemZ/cond-store-04.ll
index fc565c4..346b51a 100644
--- a/test/CodeGen/SystemZ/cond-store-04.ll
+++ b/test/CodeGen/SystemZ/cond-store-04.ll
@@ -164,7 +164,7 @@ define void @f9(i64 *%ptr, i64 %alt, i32 %limit) {
define void @f10(i64 *%ptr, i64 %alt, i32 %limit) {
; FIXME: should use a normal load instead of CSG.
; CHECK-LABEL: f10:
-; CHECK: csg {{%r[0-5]}}, {{%r[0-5]}}, 0(%r2)
+; CHECK: lg {{%r[0-5]}}, 0(%r2)
; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
; CHECK: [[LABEL]]:
; CHECK: stg {{%r[0-5]}}, 0(%r2)
@@ -183,7 +183,7 @@ define void @f11(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK: jhe [[LABEL:[^ ]*]]
; CHECK: lg %r3, 0(%r2)
; CHECK: [[LABEL]]:
-; CHECK: csg {{%r[0-5]}}, %r3, 0(%r2)
+; CHECK: stg %r3, 0(%r2)
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
diff --git a/test/CodeGen/SystemZ/fp-cmp-04.ll b/test/CodeGen/SystemZ/fp-cmp-04.ll
index 8d84216..781a3be 100644
--- a/test/CodeGen/SystemZ/fp-cmp-04.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-04.ll
@@ -1,4 +1,4 @@
-; Test that floating-point compares are ommitted if CC already has the
+; Test that floating-point compares are omitted if CC already has the
; right value.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
@@ -346,3 +346,62 @@ store:
exit:
ret double %val
}
+
+; Repeat f2 with a comparison against -0.
+define float @f17(float %a, float %b, float *%dest) {
+; CHECK-LABEL: f17:
+; CHECK: aebr %f0, %f2
+; CHECK-NEXT: jl .L{{.*}}
+; CHECK: br %r14
+entry:
+ %res = fadd float %a, %b
+ %cmp = fcmp olt float %res, -0.0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store float %b, float *%dest
+ br label %exit
+
+exit:
+ ret float %res
+}
+
+; Test another form of f7 in which the condition is based on the unnegated
+; result. This is what InstCombine would produce.
+define float @f18(float %dummy, float %a, float *%dest) {
+; CHECK-LABEL: f18:
+; CHECK: lnebr %f0, %f2
+; CHECK-NEXT: jl .L{{.*}}
+; CHECK: br %r14
+entry:
+ %abs = call float @llvm.fabs.f32(float %a)
+ %res = fsub float -0.0, %abs
+ %cmp = fcmp ogt float %abs, 0.0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store float %res, float *%dest
+ br label %exit
+
+exit:
+ ret float %res
+}
+
+; Similarly for f8.
+define float @f19(float %dummy, float %a, float *%dest) {
+; CHECK-LABEL: f19:
+; CHECK: lcebr %f0, %f2
+; CHECK-NEXT: jle .L{{.*}}
+; CHECK: br %r14
+entry:
+ %res = fsub float -0.0, %a
+ %cmp = fcmp oge float %a, 0.0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store float %res, float *%dest
+ br label %exit
+
+exit:
+ ret float %res
+}
diff --git a/test/CodeGen/SystemZ/fp-conv-06.ll b/test/CodeGen/SystemZ/fp-conv-06.ll
index 466c145..8a3971a 100644
--- a/test/CodeGen/SystemZ/fp-conv-06.ll
+++ b/test/CodeGen/SystemZ/fp-conv-06.ll
@@ -1,6 +1,6 @@
-; Test conversions of unsigned i32s to floating-point values.
+; Test conversions of unsigned i32s to floating-point values (z10 only).
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; Check i32->f32. There is no native instruction, so we must promote
; to i64 first.
diff --git a/test/CodeGen/SystemZ/fp-conv-08.ll b/test/CodeGen/SystemZ/fp-conv-08.ll
index 69b2d13..295ce8b 100644
--- a/test/CodeGen/SystemZ/fp-conv-08.ll
+++ b/test/CodeGen/SystemZ/fp-conv-08.ll
@@ -1,6 +1,6 @@
-; Test conversions of unsigned i64s to floating-point values.
+; Test conversions of unsigned i64s to floating-point values (z10 only).
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; Test i64->f32. There's no native support for unsigned i64-to-fp conversions,
; but we should be able to implement them using signed i64-to-fp conversions.
diff --git a/test/CodeGen/SystemZ/fp-conv-10.ll b/test/CodeGen/SystemZ/fp-conv-10.ll
index 723d19d..b8155ed 100644
--- a/test/CodeGen/SystemZ/fp-conv-10.ll
+++ b/test/CodeGen/SystemZ/fp-conv-10.ll
@@ -1,6 +1,6 @@
-; Test conversion of floating-point values to unsigned i32s.
+; Test conversion of floating-point values to unsigned i32s (z10 only).
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; z10 doesn't have native support for unsigned fp-to-i32 conversions;
; they were added in z196 as the Convert to Logical family of instructions.
diff --git a/test/CodeGen/SystemZ/fp-conv-12.ll b/test/CodeGen/SystemZ/fp-conv-12.ll
index 6cc343a..770c940 100644
--- a/test/CodeGen/SystemZ/fp-conv-12.ll
+++ b/test/CodeGen/SystemZ/fp-conv-12.ll
@@ -1,6 +1,6 @@
-; Test conversion of floating-point values to unsigned i64s.
+; Test conversion of floating-point values to unsigned i64s (z10 only).
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; z10 doesn't have native support for unsigned fp-to-i64 conversions;
; they were added in z196 as the Convert to Logical family of instructions.
diff --git a/test/CodeGen/SystemZ/fp-conv-13.ll b/test/CodeGen/SystemZ/fp-conv-13.ll
new file mode 100644
index 0000000..96293bc
--- /dev/null
+++ b/test/CodeGen/SystemZ/fp-conv-13.ll
@@ -0,0 +1,64 @@
+; Test conversions of unsigned integers to floating-point values
+; (z196 and above).
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check i32->f32.
+define float @f1(i32 %i) {
+; CHECK-LABEL: f1:
+; CHECK: celfbr %f0, 0, %r2, 0
+; CHECK: br %r14
+ %conv = uitofp i32 %i to float
+ ret float %conv
+}
+
+; Check i32->f64.
+define double @f2(i32 %i) {
+; CHECK-LABEL: f2:
+; CHECK: cdlfbr %f0, 0, %r2, 0
+; CHECK: br %r14
+ %conv = uitofp i32 %i to double
+ ret double %conv
+}
+
+; Check i32->f128.
+define void @f3(i32 %i, fp128 *%dst) {
+; CHECK-LABEL: f3:
+; CHECK: cxlfbr %f0, 0, %r2, 0
+; CHECK-DAG: std %f0, 0(%r3)
+; CHECK-DAG: std %f2, 8(%r3)
+; CHECK: br %r14
+ %conv = uitofp i32 %i to fp128
+ store fp128 %conv, fp128 *%dst
+ ret void
+}
+
+; Check i64->f32.
+define float @f4(i64 %i) {
+; CHECK-LABEL: f4:
+; CHECK: celgbr %f0, 0, %r2, 0
+; CHECK: br %r14
+ %conv = uitofp i64 %i to float
+ ret float %conv
+}
+
+; Check i64->f64.
+define double @f5(i64 %i) {
+; CHECK-LABEL: f5:
+; CHECK: cdlgbr %f0, 0, %r2, 0
+; CHECK: br %r14
+ %conv = uitofp i64 %i to double
+ ret double %conv
+}
+
+; Check i64->f128.
+define void @f6(i64 %i, fp128 *%dst) {
+; CHECK-LABEL: f6:
+; CHECK: cxlgbr %f0, 0, %r2, 0
+; CHECK-DAG: std %f0, 0(%r3)
+; CHECK-DAG: std %f2, 8(%r3)
+; CHECK: br %r14
+ %conv = uitofp i64 %i to fp128
+ store fp128 %conv, fp128 *%dst
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/fp-conv-14.ll b/test/CodeGen/SystemZ/fp-conv-14.ll
new file mode 100644
index 0000000..e926e9b
--- /dev/null
+++ b/test/CodeGen/SystemZ/fp-conv-14.ll
@@ -0,0 +1,63 @@
+; Test conversion of floating-point values to unsigned integers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Test f32->i32.
+define i32 @f1(float %f) {
+; CHECK-LABEL: f1:
+; CHECK: clfebr %r2, 5, %f0, 0
+; CHECK: br %r14
+ %conv = fptoui float %f to i32
+ ret i32 %conv
+}
+
+; Test f64->i32.
+define i32 @f2(double %f) {
+; CHECK-LABEL: f2:
+; CHECK: clfdbr %r2, 5, %f0, 0
+; CHECK: br %r14
+ %conv = fptoui double %f to i32
+ ret i32 %conv
+}
+
+; Test f128->i32.
+define i32 @f3(fp128 *%src) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: ld %f0, 0(%r2)
+; CHECK-DAG: ld %f2, 8(%r2)
+; CHECK: clfxbr %r2, 5, %f0, 0
+; CHECK: br %r14
+ %f = load fp128 *%src
+ %conv = fptoui fp128 %f to i32
+ ret i32 %conv
+}
+
+; Test f32->i64.
+define i64 @f4(float %f) {
+; CHECK-LABEL: f4:
+; CHECK: clgebr %r2, 5, %f0, 0
+; CHECK: br %r14
+ %conv = fptoui float %f to i64
+ ret i64 %conv
+}
+
+; Test f64->i64.
+define i64 @f5(double %f) {
+; CHECK-LABEL: f5:
+; CHECK: clgdbr %r2, 5, %f0, 0
+; CHECK: br %r14
+ %conv = fptoui double %f to i64
+ ret i64 %conv
+}
+
+; Test f128->i64.
+define i64 @f6(fp128 *%src) {
+; CHECK-LABEL: f6:
+; CHECK-DAG: ld %f0, 0(%r2)
+; CHECK-DAG: ld %f2, 8(%r2)
+; CHECK: clgxbr %r2, 5, %f0, 0
+; CHECK: br %r14
+ %f = load fp128 *%src
+ %conv = fptoui fp128 %f to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/SystemZ/frame-08.ll b/test/CodeGen/SystemZ/frame-08.ll
index da2a614..aa4e3f4 100644
--- a/test/CodeGen/SystemZ/frame-08.ll
+++ b/test/CodeGen/SystemZ/frame-08.ll
@@ -208,7 +208,7 @@ define void @f4(i32 *%ptr, i64 %x) {
ret void
}
-; This is the largest frame size for which the prepatory increment for
+; This is the largest frame size for which the preparatory increment for
; "lmg %r14, %r15, ..." can be done using AGHI.
define void @f5(i32 *%ptr, i64 %x) {
; CHECK-LABEL: f5:
@@ -242,7 +242,7 @@ define void @f5(i32 *%ptr, i64 %x) {
ret void
}
-; This is the smallest frame size for which the prepatory increment for
+; This is the smallest frame size for which the preparatory increment for
; "lmg %r14, %r15, ..." needs to be done using AGFI.
define void @f6(i32 *%ptr, i64 %x) {
; CHECK-LABEL: f6:
diff --git a/test/CodeGen/SystemZ/frame-11.ll b/test/CodeGen/SystemZ/frame-11.ll
index 5145b4d..575a433 100644
--- a/test/CodeGen/SystemZ/frame-11.ll
+++ b/test/CodeGen/SystemZ/frame-11.ll
@@ -2,17 +2,24 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+declare i8 *@llvm.stacksave()
declare void @llvm.stackrestore(i8 *)
; we should use a frame pointer and tear down the frame based on %r11
; rather than %r15.
-define void @f1(i8 *%src) {
+define void @f1(i32 %count1, i32 %count2) {
; CHECK-LABEL: f1:
; CHECK: stmg %r11, %r15, 88(%r15)
+; CHECK: aghi %r15, -160
; CHECK: lgr %r11, %r15
-; CHECK: lgr %r15, %r2
-; CHECK: lmg %r11, %r15, 88(%r11)
+; CHECK: lgr %r15, %r{{[0-5]}}
+; CHECK: lmg %r11, %r15, 248(%r11)
; CHECK: br %r14
+ %src = call i8 *@llvm.stacksave()
+ %array1 = alloca i8, i32 %count1
+ store volatile i8 0, i8 *%array1
call void @llvm.stackrestore(i8 *%src)
+ %array2 = alloca i8, i32 %count2
+ store volatile i8 0, i8 *%array2
ret void
}
diff --git a/test/CodeGen/SystemZ/frame-13.ll b/test/CodeGen/SystemZ/frame-13.ll
index 393850f..58dee1d 100644
--- a/test/CodeGen/SystemZ/frame-13.ll
+++ b/test/CodeGen/SystemZ/frame-13.ll
@@ -243,8 +243,8 @@ define void @f10(i32 *%vptr) {
; And again with maximum register pressure. The only spill slots that the
; NOFP case needs are the emergency ones, so the offsets are the same as for f2.
-; However, the FP case uses %r11 as the frame pointer and must therefore
-; spill a second register. This leads to an extra displacement of 8.
+; The FP case needs to spill an extra register and is too dependent on
+; register allocation heuristics for a stable test.
define void @f11(i32 *%vptr) {
; CHECK-NOFP-LABEL: f11:
; CHECK-NOFP: stmg %r6, %r15,
@@ -254,15 +254,6 @@ define void @f11(i32 *%vptr) {
; CHECK-NOFP: lg [[REGISTER]], [[OFFSET]](%r15)
; CHECK-NOFP: lmg %r6, %r15,
; CHECK-NOFP: br %r14
-;
-; CHECK-FP-LABEL: f11:
-; CHECK-FP: stmg %r6, %r15,
-; CHECK-FP: stg [[REGISTER:%r[1-9][0-4]?]], [[OFFSET:160|168]](%r11)
-; CHECK-FP: lay [[REGISTER]], 4096(%r11)
-; CHECK-FP: mvhi 8([[REGISTER]]), 42
-; CHECK-FP: lg [[REGISTER]], [[OFFSET]](%r11)
-; CHECK-FP: lmg %r6, %r15,
-; CHECK-FP: br %r14
%i0 = load volatile i32 *%vptr
%i1 = load volatile i32 *%vptr
%i3 = load volatile i32 *%vptr
diff --git a/test/CodeGen/SystemZ/frame-14.ll b/test/CodeGen/SystemZ/frame-14.ll
index 3b48179..24169cf 100644
--- a/test/CodeGen/SystemZ/frame-14.ll
+++ b/test/CodeGen/SystemZ/frame-14.ll
@@ -266,8 +266,8 @@ define void @f10(i32 *%vptr) {
; And again with maximum register pressure. The only spill slots that the
; NOFP case needs are the emergency ones, so the offsets are the same as for f4.
-; However, the FP case uses %r11 as the frame pointer and must therefore
-; spill a second register. This leads to an extra displacement of 8.
+; The FP case needs to spill an extra register and is too dependent on
+; register allocation heuristics for a stable test.
define void @f11(i32 *%vptr) {
; CHECK-NOFP-LABEL: f11:
; CHECK-NOFP: stmg %r6, %r15,
@@ -278,16 +278,6 @@ define void @f11(i32 *%vptr) {
; CHECK-NOFP: lg [[REGISTER]], [[OFFSET]](%r15)
; CHECK-NOFP: lmg %r6, %r15,
; CHECK-NOFP: br %r14
-;
-; CHECK-FP-LABEL: f11:
-; CHECK-FP: stmg %r6, %r15,
-; CHECK-FP: stg [[REGISTER:%r[1-9][0-4]?]], [[OFFSET:160|168]](%r11)
-; CHECK-FP: llilh [[REGISTER]], 8
-; CHECK-FP: agr [[REGISTER]], %r11
-; CHECK-FP: mvi 8([[REGISTER]]), 42
-; CHECK-FP: lg [[REGISTER]], [[OFFSET]](%r11)
-; CHECK-FP: lmg %r6, %r15,
-; CHECK-FP: br %r14
%i0 = load volatile i32 *%vptr
%i1 = load volatile i32 *%vptr
%i3 = load volatile i32 *%vptr
diff --git a/test/CodeGen/SystemZ/insert-06.ll b/test/CodeGen/SystemZ/insert-06.ll
index edcd0c5..81a9c87 100644
--- a/test/CodeGen/SystemZ/insert-06.ll
+++ b/test/CodeGen/SystemZ/insert-06.ll
@@ -178,3 +178,17 @@ define i64 @f14(i64 %a, i64 %b) {
%ext = sext i1 %res to i64
ret i64 %ext
}
+
+; Check another representation of f8.
+define i64 @f15(i64 %a, i8 *%src) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: {{%r[23]}}
+; CHECK: lb %r2, 0(%r3)
+; CHECK: br %r14
+ %byte = load i8 *%src
+ %b = sext i8 %byte to i64
+ %low = and i64 %b, 4294967295
+ %high = and i64 %a, -4294967296
+ %res = or i64 %high, %low
+ ret i64 %res
+}
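
Editorial note, not part of the patch: the AND masks in the new f15 test are easier to read in hex. A minimal Python sketch (helper names here are illustrative, nothing beyond the constants quoted above is assumed) checks that they split the i64 into its low and high 32-bit halves:

# Not part of the patch: sanity-check the f15 masks from insert-06.ll.
LOW_MASK = 4294967295                              # %low  = and i64 %b, 4294967295
HIGH_MASK = -4294967296 & 0xFFFFFFFFFFFFFFFF       # %high = and i64 %a, -4294967296 (two's complement)
assert LOW_MASK == 0xFFFFFFFF                      # selects the low 32 bits
assert HIGH_MASK == 0xFFFFFFFF00000000             # selects the high 32 bits
assert LOW_MASK ^ HIGH_MASK == (1 << 64) - 1       # together they cover the whole i64
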
diff --git a/test/CodeGen/SystemZ/int-abs-01.ll b/test/CodeGen/SystemZ/int-abs-01.ll
index 40fb611..053c347 100644
--- a/test/CodeGen/SystemZ/int-abs-01.ll
+++ b/test/CodeGen/SystemZ/int-abs-01.ll
@@ -81,3 +81,67 @@ define i64 @f7(i64 %val) {
%res = select i1 %cmp, i64 %neg, i64 %val
ret i64 %res
}
+
+; Test another form of f6, which is that produced by InstCombine.
+define i64 @f8(i64 %val) {
+; CHECK-LABEL: f8:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp slt i64 %shl, 0
+ %abs = select i1 %cmp, i64 %neg, i64 %ashr
+ ret i64 %abs
+}
+
+; Try again with sle rather than slt.
+define i64 @f9(i64 %val) {
+; CHECK-LABEL: f9:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sle i64 %shl, 0
+ %abs = select i1 %cmp, i64 %neg, i64 %ashr
+ ret i64 %abs
+}
+
+; Repeat f8 with the operands reversed.
+define i64 @f10(i64 %val) {
+; CHECK-LABEL: f10:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sgt i64 %shl, 0
+ %abs = select i1 %cmp, i64 %ashr, i64 %neg
+ ret i64 %abs
+}
+
+; Try again with sge rather than sgt.
+define i64 @f11(i64 %val) {
+; CHECK-LABEL: f11:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sge i64 %shl, 0
+ %abs = select i1 %cmp, i64 %ashr, i64 %neg
+ ret i64 %abs
+}
+
+; Repeat f5 with the comparison on the unextended value.
+define i64 @f12(i32 %val) {
+; CHECK-LABEL: f12:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+ %ext = sext i32 %val to i64
+ %cmp = icmp slt i32 %val, 0
+ %neg = sub i64 0, %ext
+ %abs = select i1 %cmp, i64 %neg, i64 %ext
+ ret i64 %abs
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-05.ll b/test/CodeGen/SystemZ/int-cmp-05.ll
index f15b76b..0be43a3 100644
--- a/test/CodeGen/SystemZ/int-cmp-05.ll
+++ b/test/CodeGen/SystemZ/int-cmp-05.ll
@@ -291,9 +291,22 @@ define i64 @f15(i32 *%ptr0) {
ret i64 %sel9
}
-; Check the comparison can be reversed if that allows CGF to be used.
-define double @f16(double %a, double %b, i64 %i2, i32 *%ptr) {
+; Check the comparison can be reversed if that allows CGFR to be used.
+define double @f16(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f16:
+; CHECK: cgfr %r2, %r3
+; CHECK-NEXT: jh
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %i2 = sext i32 %unext to i64
+ %cond = icmp slt i64 %i2, %i1
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
+
+; Likewise CGF.
+define double @f17(double %a, double %b, i64 %i2, i32 *%ptr) {
+; CHECK-LABEL: f17:
; CHECK: cgf %r2, 0(%r3)
; CHECK-NEXT: jh {{\.L.*}}
; CHECK: ldr %f0, %f2
diff --git a/test/CodeGen/SystemZ/int-cmp-06.ll b/test/CodeGen/SystemZ/int-cmp-06.ll
index 8ab62e8..82007e2 100644
--- a/test/CodeGen/SystemZ/int-cmp-06.ll
+++ b/test/CodeGen/SystemZ/int-cmp-06.ll
@@ -341,9 +341,35 @@ define i64 @f19(i32 *%ptr0) {
ret i64 %sel9
}
-; Check the comparison can be reversed if that allows CLGF to be used.
-define double @f20(double %a, double %b, i64 %i2, i32 *%ptr) {
+; Check the comparison can be reversed if that allows CLGFR to be used.
+define double @f20(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f20:
+; CHECK: clgfr %r2, %r3
+; CHECK-NEXT: jh
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %i2 = zext i32 %unext to i64
+ %cond = icmp ult i64 %i2, %i1
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
+
+; ...and again with the AND representation.
+define double @f21(double %a, double %b, i64 %i1, i64 %unext) {
+; CHECK-LABEL: f21:
+; CHECK: clgfr %r2, %r3
+; CHECK-NEXT: jh
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %i2 = and i64 %unext, 4294967295
+ %cond = icmp ult i64 %i2, %i1
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
+
+; Check the comparison can be reversed if that allows CLGF to be used.
+define double @f22(double %a, double %b, i64 %i2, i32 *%ptr) {
+; CHECK-LABEL: f22:
; CHECK: clgf %r2, 0(%r3)
; CHECK-NEXT: jh {{\.L.*}}
; CHECK: ldr %f0, %f2
diff --git a/test/CodeGen/SystemZ/int-cmp-44.ll b/test/CodeGen/SystemZ/int-cmp-44.ll
index ae0133f..f065e64 100644
--- a/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -1,4 +1,4 @@
-; Test that compares are ommitted if CC already has the right value
+; Test that compares are omitted if CC already has the right value
; (z10 version).
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
@@ -797,3 +797,93 @@ store:
exit:
ret i32 %val
}
+
+; Test f35 for in-register extensions.
+define i64 @f39(i64 %dummy, i64 %a, i64 *%dest) {
+; CHECK-LABEL: f39:
+; CHECK: ltgfr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jh .L{{.*}}
+; CHECK: br %r14
+entry:
+ %val = trunc i64 %a to i32
+ %ext = sext i32 %val to i64
+ call void asm sideeffect "blah $0", "{r2}"(i64 %ext)
+ %cmp = icmp sgt i64 %ext, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 %ext, i64 *%dest
+ br label %exit
+
+exit:
+ ret i64 %ext
+}
+
+; ...and again with what InstCombine would produce for f40.
+define i64 @f40(i64 %dummy, i64 %a, i64 *%dest) {
+; CHECK-LABEL: f40:
+; CHECK: ltgfr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jh .L{{.*}}
+; CHECK: br %r14
+entry:
+ %shl = shl i64 %a, 32
+ %ext = ashr i64 %shl, 32
+ call void asm sideeffect "blah $0", "{r2}"(i64 %ext)
+ %cmp = icmp sgt i64 %shl, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 %ext, i64 *%dest
+ br label %exit
+
+exit:
+ ret i64 %ext
+}
+
+; Try a form of f7 in which the subtraction operands are compared directly.
+define i32 @f41(i32 %a, i32 %b, i32 *%dest) {
+; CHECK-LABEL: f41:
+; CHECK: s %r2, 0(%r4)
+; CHECK-NEXT: jne .L{{.*}}
+; CHECK: br %r14
+entry:
+ %cur = load i32 *%dest
+ %res = sub i32 %a, %cur
+ %cmp = icmp ne i32 %a, %cur
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 %b, i32 *%dest
+ br label %exit
+
+exit:
+ ret i32 %res
+}
+
+; A version of f32 that tests the unextended value.
+define i64 @f42(i64 %base, i64 %index, i64 *%dest) {
+; CHECK-LABEL: f42:
+; CHECK: ltgf %r2, 0({{%r2,%r3|%r3,%r2}})
+; CHECK-NEXT: jh .L{{.*}}
+; CHECK: br %r14
+entry:
+ %add = add i64 %base, %index
+ %ptr = inttoptr i64 %add to i32 *
+ %val = load i32 *%ptr
+ %res = sext i32 %val to i64
+ %cmp = icmp sgt i32 %val, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 %res, i64 *%dest
+ br label %exit
+
+exit:
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-45.ll b/test/CodeGen/SystemZ/int-cmp-45.ll
index 753a528..9c9c49c 100644
--- a/test/CodeGen/SystemZ/int-cmp-45.ll
+++ b/test/CodeGen/SystemZ/int-cmp-45.ll
@@ -1,4 +1,4 @@
-; Test that compares are ommitted if CC already has the right value
+; Test that compares are omitted if CC already has the right value
; (z196 version).
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
diff --git a/test/CodeGen/SystemZ/int-cmp-47.ll b/test/CodeGen/SystemZ/int-cmp-47.ll
index 9ebcbfe..038a25b 100644
--- a/test/CodeGen/SystemZ/int-cmp-47.ll
+++ b/test/CodeGen/SystemZ/int-cmp-47.ll
@@ -232,3 +232,112 @@ store:
exit:
ret void
}
+
+; Check a case where TMHH can be used to implement a ult comparison.
+define void @f13(i64 %a) {
+; CHECK-LABEL: f13:
+; CHECK: tmhh %r2, 49152
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %cmp = icmp ult i64 %a, 13835058055282163712
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; And again with ule.
+define void @f14(i64 %a) {
+; CHECK-LABEL: f14:
+; CHECK: tmhh %r2, 49152
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %cmp = icmp ule i64 %a, 13835058055282163711
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; And again with ugt.
+define void @f15(i64 %a) {
+; CHECK-LABEL: f15:
+; CHECK: tmhh %r2, 49152
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %cmp = icmp ugt i64 %a, 13835058055282163711
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; And again with uge.
+define void @f16(i64 %a) {
+; CHECK-LABEL: f16:
+; CHECK: tmhh %r2, 49152
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %cmp = icmp uge i64 %a, 13835058055282163712
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Decrease the constant from f13 to make TMHH invalid.
+define void @f17(i64 %a) {
+; CHECK-LABEL: f17:
+; CHECK-NOT: tmhh
+; CHECK: llihh {{%r[0-5]}}, 49151
+; CHECK-NOT: tmhh
+; CHECK: br %r14
+entry:
+ %cmp = icmp ult i64 %a, 13834776580305453056
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we don't use TMHH just to test the top bit.
+define void @f18(i64 %a) {
+; CHECK-LABEL: f18:
+; CHECK-NOT: tmhh
+; CHECK: cgijhe %r2, 0,
+; CHECK: br %r14
+entry:
+ %cmp = icmp ult i64 %a, 9223372036854775808
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
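
Editorial note, not part of the patch: the unsigned comparisons in the new f13-f17 tests hinge on the top 16 bits of the i64, which is what makes TMHH applicable (or not). A small Python check, using only the constants quoted above, shows where the 49152 mask and the 49151 llihh operand come from:

# Not part of the patch: decode the thresholds used by f13-f17 in int-cmp-47.ll.
assert 13835058055282163712 == 0xC000 << 48        # f13/f16 threshold, top halfword 0xC000
assert 13835058055282163711 == (0xC000 << 48) - 1  # f14/f15 threshold
assert 0xC000 == 49152                             # the TMHH mask in the CHECK lines
assert 13834776580305453056 == 49151 << 48         # f17 threshold, materialized with llihh instead
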
diff --git a/test/CodeGen/SystemZ/int-neg-02.ll b/test/CodeGen/SystemZ/int-neg-02.ll
index e26194c..7f3f637 100644
--- a/test/CodeGen/SystemZ/int-neg-02.ll
+++ b/test/CodeGen/SystemZ/int-neg-02.ll
@@ -89,3 +89,136 @@ define i64 @f7(i64 %val) {
%res = sub i64 0, %abs
ret i64 %res
}
+
+; Test another form of f6, which is that produced by InstCombine.
+define i64 @f8(i64 %val) {
+; CHECK-LABEL: f8:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp slt i64 %shl, 0
+ %abs = select i1 %cmp, i64 %neg, i64 %ashr
+ %res = sub i64 0, %abs
+ ret i64 %res
+}
+
+; Try again with sle rather than slt.
+define i64 @f9(i64 %val) {
+; CHECK-LABEL: f9:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sle i64 %shl, 0
+ %abs = select i1 %cmp, i64 %neg, i64 %ashr
+ %res = sub i64 0, %abs
+ ret i64 %res
+}
+
+; Repeat f8 with the operands reversed.
+define i64 @f10(i64 %val) {
+; CHECK-LABEL: f10:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sgt i64 %shl, 0
+ %abs = select i1 %cmp, i64 %ashr, i64 %neg
+ %res = sub i64 0, %abs
+ ret i64 %res
+}
+
+; Try again with sge rather than sgt.
+define i64 @f11(i64 %val) {
+; CHECK-LABEL: f11:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sge i64 %shl, 0
+ %abs = select i1 %cmp, i64 %ashr, i64 %neg
+ %res = sub i64 0, %abs
+ ret i64 %res
+}
+
+; Repeat f8 with the negation coming from swapped operands.
+define i64 @f12(i64 %val) {
+; CHECK-LABEL: f12:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp slt i64 %shl, 0
+ %negabs = select i1 %cmp, i64 %ashr, i64 %neg
+ ret i64 %negabs
+}
+
+; Likewise f9.
+define i64 @f13(i64 %val) {
+; CHECK-LABEL: f13:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sle i64 %shl, 0
+ %negabs = select i1 %cmp, i64 %ashr, i64 %neg
+ ret i64 %negabs
+}
+
+; Likewise f10.
+define i64 @f14(i64 %val) {
+; CHECK-LABEL: f14:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sgt i64 %shl, 0
+ %negabs = select i1 %cmp, i64 %neg, i64 %ashr
+ ret i64 %negabs
+}
+
+; Likewise f11.
+define i64 @f15(i64 %val) {
+; CHECK-LABEL: f15:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %shl = shl i64 %val, 32
+ %ashr = ashr i64 %shl, 32
+ %neg = sub i64 0, %ashr
+ %cmp = icmp sge i64 %shl, 0
+ %negabs = select i1 %cmp, i64 %neg, i64 %ashr
+ ret i64 %negabs
+}
+
+; Repeat f5 with the comparison on the unextended value.
+define i64 @f16(i32 %val) {
+; CHECK-LABEL: f16:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %ext = sext i32 %val to i64
+ %cmp = icmp slt i32 %val, 0
+ %neg = sub i64 0, %ext
+ %abs = select i1 %cmp, i64 %neg, i64 %ext
+ %res = sub i64 0, %abs
+ ret i64 %res
+}
+
+; And again with the negation coming from swapped operands.
+define i64 @f17(i32 %val) {
+; CHECK-LABEL: f17:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %ext = sext i32 %val to i64
+ %cmp = icmp slt i32 %val, 0
+ %neg = sub i64 0, %ext
+ %abs = select i1 %cmp, i64 %ext, i64 %neg
+ ret i64 %abs
+}
diff --git a/test/CodeGen/SystemZ/mature-mc-support.ll b/test/CodeGen/SystemZ/mature-mc-support.ll
new file mode 100644
index 0000000..5520f55
--- /dev/null
+++ b/test/CodeGen/SystemZ/mature-mc-support.ll
@@ -0,0 +1,15 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+; FIXME: SystemZ doesn't use the integrated assembler by default so we only test
+; that -filetype=obj tries to parse the assembly.
+
+; SKIP: not llc -march=systemz < %s > /dev/null 2> %t1
+; SKIP: FileCheck %s < %t1
+
+; RUN: not llc -march=systemz -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/SystemZ/risbg-01.ll b/test/CodeGen/SystemZ/risbg-01.ll
index a4d11fd..d75e8e4 100644
--- a/test/CodeGen/SystemZ/risbg-01.ll
+++ b/test/CodeGen/SystemZ/risbg-01.ll
@@ -269,12 +269,12 @@ define i64 @f23(i64 %foo) {
; mask and rotate.
define i32 @f24(i32 %foo) {
; CHECK-LABEL: f24:
-; CHECK: nilf %r2, 14
-; CHECK: rll %r2, %r2, 3
+; CHECK: nilf %r2, 254
+; CHECK: rll %r2, %r2, 29
; CHECK: br %r14
- %and = and i32 %foo, 14
- %parta = shl i32 %and, 3
- %partb = lshr i32 %and, 29
+ %and = and i32 %foo, 254
+ %parta = lshr i32 %and, 3
+ %partb = shl i32 %and, 29
%rotl = or i32 %parta, %partb
ret i32 %rotl
}
@@ -295,7 +295,6 @@ define i64 @f25(i64 %foo) {
; This again needs a separate mask and rotate.
define i32 @f26(i32 %foo) {
; CHECK-LABEL: f26:
-; CHECK: nill %r2, 65487
; CHECK: rll %r2, %r2, 5
; CHECK: br %r14
%and = and i32 %foo, -49
@@ -457,11 +456,22 @@ define i64 @f40(i64 %foo, i64 *%dest) {
ret i64 %and
}
+; Check a case where the result is zero-extended.
+define i64 @f41(i32 %a) {
+; CHECK-LABEL: f41
+; CHECK: risbg %r2, %r2, 36, 191, 62
+; CHECK: br %r14
+ %shl = shl i32 %a, 2
+ %shr = lshr i32 %shl, 4
+ %ext = zext i32 %shr to i64
+ ret i64 %ext
+}
+
; In this case the sign extension is converted to a pair of 32-bit shifts,
; which is then extended to 64 bits. We previously used the wrong bit size
; when testing whether the shifted-in bits of the shift right were significant.
-define i64 @f41(i1 %x) {
-; CHECK-LABEL: f41:
+define i64 @f42(i1 %x) {
+; CHECK-LABEL: f42:
; CHECK: sll %r2, 31
; CHECK: sra %r2, 31
; CHECK: llgcr %r2, %r2
diff --git a/test/CodeGen/SystemZ/rnsbg-01.ll b/test/CodeGen/SystemZ/rnsbg-01.ll
index 666aeb2..282810a 100644
--- a/test/CodeGen/SystemZ/rnsbg-01.ll
+++ b/test/CodeGen/SystemZ/rnsbg-01.ll
@@ -255,3 +255,14 @@ define i64 @f22(i64 %a, i64 %b) {
%and = and i64 %a, %rotlorb
ret i64 %and
}
+
+; Check the handling of zext and AND, which isn't suitable for RNSBG.
+define i64 @f23(i64 %a, i32 %b) {
+; CHECK-LABEL: f23:
+; CHECK-NOT: rnsbg
+; CHECK: br %r14
+ %add = add i32 %b, 1
+ %ext = zext i32 %add to i64
+ %and = and i64 %a, %ext
+ ret i64 %and
+}
diff --git a/test/CodeGen/SystemZ/rosbg-01.ll b/test/CodeGen/SystemZ/rosbg-01.ll
index 0abaccc..96ee870 100644
--- a/test/CodeGen/SystemZ/rosbg-01.ll
+++ b/test/CodeGen/SystemZ/rosbg-01.ll
@@ -108,3 +108,14 @@ define i64 @f11(i64 %a, i64 %b) {
%or = or i64 %a, %andb
ret i64 %or
}
+
+; Check the handling of zext and OR, which can use ROSBG.
+define i64 @f12(i64 %a, i32 %b) {
+; CHECK-LABEL: f12:
+; CHECK: rosbg %r2, %r3, 32, 63, 0
+; CHECK: br %r14
+ %add = add i32 %b, 1
+ %ext = zext i32 %add to i64
+ %or = or i64 %a, %ext
+ ret i64 %or
+}
diff --git a/test/CodeGen/SystemZ/rxsbg-01.ll b/test/CodeGen/SystemZ/rxsbg-01.ll
index 5491bff..339fe2a 100644
--- a/test/CodeGen/SystemZ/rxsbg-01.ll
+++ b/test/CodeGen/SystemZ/rxsbg-01.ll
@@ -110,3 +110,14 @@ define i64 @f11(i64 %a, i64 %b) {
%xor = xor i64 %a, %andb
ret i64 %xor
}
+
+; Check the handling of zext and XOR, which can use RXSBG.
+define i64 @f12(i64 %a, i32 %b) {
+; CHECK-LABEL: f12:
+; CHECK: rxsbg %r2, %r3, 32, 63, 0
+; CHECK: br %r14
+ %add = add i32 %b, 1
+ %ext = zext i32 %add to i64
+ %xor = xor i64 %a, %ext
+ ret i64 %xor
+}
diff --git a/test/CodeGen/SystemZ/selectcc-01.ll b/test/CodeGen/SystemZ/selectcc-01.ll
new file mode 100644
index 0000000..a57444c
--- /dev/null
+++ b/test/CodeGen/SystemZ/selectcc-01.ll
@@ -0,0 +1,178 @@
+; Test an i32 0/-1 SELECTCC for every floating-point condition.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test CC in { 0 }
+define i32 @f1(float %a, float %b) {
+; CHECK-LABEL: f1:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp oeq float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 1 }
+define i32 @f2(float %a, float %b) {
+; CHECK-LABEL: f2:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp olt float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 0, 1 }
+define i32 @f3(float %a, float %b) {
+; CHECK-LABEL: f3:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -536870912
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ole float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 2 }
+define i32 @f4(float %a, float %b) {
+; CHECK-LABEL: f4:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, 1342177280
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ogt float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 0, 2 }
+define i32 @f5(float %a, float %b) {
+; CHECK-LABEL: f5:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 4294967295
+; CHECK-NEXT: sll %r2, 3
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp oge float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 1, 2 }
+define i32 @f6(float %a, float %b) {
+; CHECK-LABEL: f6:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 268435456
+; CHECK-NEXT: sll %r2, 2
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp one float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 0, 1, 2 }
+define i32 @f7(float %a, float %b) {
+; CHECK-LABEL: f7:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -805306368
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ord float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 3 }
+define i32 @f8(float %a, float %b) {
+; CHECK-LABEL: f8:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1342177280
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp uno float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 0, 3 }
+define i32 @f9(float %a, float %b) {
+; CHECK-LABEL: f9:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: sll %r2, 2
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ueq float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 1, 3 }
+define i32 @f10(float %a, float %b) {
+; CHECK-LABEL: f10:
+; CHECK: ipm %r2
+; CHECK-NEXT: sll %r2, 3
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ult float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 0, 1, 3 }
+define i32 @f11(float %a, float %b) {
+; CHECK-LABEL: f11:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, -805306368
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ule float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 2, 3 }
+define i32 @f12(float %a, float %b) {
+; CHECK-LABEL: f12:
+; CHECK: ipm %r2
+; CHECK-NEXT: sll %r2, 2
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ugt float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 0, 2, 3 }
+define i32 @f13(float %a, float %b) {
+; CHECK-LABEL: f13:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp uge float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
+
+; Test CC in { 1, 2, 3 }
+define i32 @f14(float %a, float %b) {
+; CHECK-LABEL: f14:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp une float %a, %b
+ %res = select i1 %cond, i32 -1, i32 0
+ ret i32 %res
+}
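
Editorial note, not part of the patch: the new selectcc tests all build on IPM placing the condition code in bits 28-29 of the 32-bit result. A rough Python model of the f1 sequence above (the function name is illustrative, the program-mask bits are ignored) reproduces the 0/-1 select for CC in { 0 }:

# Not part of the patch: rough model of the ipm/afi/sra sequence checked in f1.
def f1_model(cc):
    ipm = cc << 28                           # ipm %r2 (program-mask bits ignored)
    val = (ipm - 2**28) & 0xFFFFFFFF         # afi %r2, -268435456
    return -1 if val & 0x80000000 else 0     # sra %r2, 31 broadcasts the sign bit

# Only CC = 0 (operands equal) yields -1, matching the oeq select of -1/0.
assert [f1_model(cc) for cc in range(4)] == [-1, 0, 0, 0]
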
diff --git a/test/CodeGen/SystemZ/selectcc-02.ll b/test/CodeGen/SystemZ/selectcc-02.ll
new file mode 100644
index 0000000..b1081a0
--- /dev/null
+++ b/test/CodeGen/SystemZ/selectcc-02.ll
@@ -0,0 +1,178 @@
+; Test an i32 0/-1 SELECTCC for every floating-point condition.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test CC in { 1, 2, 3 }
+define i32 @f1(float %a, float %b) {
+; CHECK-LABEL: f1:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp oeq float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 0, 2, 3 }
+define i32 @f2(float %a, float %b) {
+; CHECK-LABEL: f2:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp olt float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 2, 3 }
+define i32 @f3(float %a, float %b) {
+; CHECK-LABEL: f3:
+; CHECK: ipm %r2
+; CHECK-NEXT: sll %r2, 2
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ole float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 0, 1, 3 }
+define i32 @f4(float %a, float %b) {
+; CHECK-LABEL: f4:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, -805306368
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ogt float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 1, 3 }
+define i32 @f5(float %a, float %b) {
+; CHECK-LABEL: f5:
+; CHECK: ipm %r2
+; CHECK-NEXT: sll %r2, 3
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp oge float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 0, 3 }
+define i32 @f6(float %a, float %b) {
+; CHECK-LABEL: f6:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: sll %r2, 2
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp one float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 3 }
+define i32 @f7(float %a, float %b) {
+; CHECK-LABEL: f7:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1342177280
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ord float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 0, 1, 2 }
+define i32 @f8(float %a, float %b) {
+; CHECK-LABEL: f8:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -805306368
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp uno float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 1, 2 }
+define i32 @f9(float %a, float %b) {
+; CHECK-LABEL: f9:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 268435456
+; CHECK-NEXT: sll %r2, 2
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ueq float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 0, 2 }
+define i32 @f10(float %a, float %b) {
+; CHECK-LABEL: f10:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 4294967295
+; CHECK-NEXT: sll %r2, 3
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ult float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 2 }
+define i32 @f11(float %a, float %b) {
+; CHECK-LABEL: f11:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, 1342177280
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ule float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 0, 1 }
+define i32 @f12(float %a, float %b) {
+; CHECK-LABEL: f12:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -536870912
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ugt float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 1 }
+define i32 @f13(float %a, float %b) {
+; CHECK-LABEL: f13:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp uge float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
+
+; Test CC in { 0 }
+define i32 @f14(float %a, float %b) {
+; CHECK-LABEL: f14:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: sra %r2, 31
+; CHECK: br %r14
+ %cond = fcmp une float %a, %b
+ %res = select i1 %cond, i32 0, i32 -1
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/selectcc-03.ll b/test/CodeGen/SystemZ/selectcc-03.ll
new file mode 100644
index 0000000..cafb4a2
--- /dev/null
+++ b/test/CodeGen/SystemZ/selectcc-03.ll
@@ -0,0 +1,187 @@
+; Test an i64 0/-1 SELECTCC for every floating-point condition.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test CC in { 0 }
+define i64 @f1(float %a, float %b) {
+; CHECK-LABEL: f1:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], -268435456
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp oeq float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 1 }
+define i64 @f2(float %a, float %b) {
+; CHECK-LABEL: f2:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 268435456
+; CHECK-NEXT: afi [[REG]], -268435456
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp olt float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 0, 1 }
+define i64 @f3(float %a, float %b) {
+; CHECK-LABEL: f3:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], -536870912
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp ole float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 2 }
+define i64 @f4(float %a, float %b) {
+; CHECK-LABEL: f4:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 268435456
+; CHECK-NEXT: afi [[REG]], 1342177280
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp ogt float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 0, 2 }
+define i64 @f5(float %a, float %b) {
+; CHECK-LABEL: f5:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 4294967295
+; CHECK-NEXT: sllg [[REG]], [[REG]], 35
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp oge float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 1, 2 }
+define i64 @f6(float %a, float %b) {
+; CHECK-LABEL: f6:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], 268435456
+; CHECK-NEXT: sllg [[REG]], [[REG]], 34
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp one float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 0, 1, 2 }
+define i64 @f7(float %a, float %b) {
+; CHECK-LABEL: f7:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], -805306368
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp ord float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 3 }
+define i64 @f8(float %a, float %b) {
+; CHECK-LABEL: f8:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], 1342177280
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp uno float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 0, 3 }
+define i64 @f9(float %a, float %b) {
+; CHECK-LABEL: f9:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], -268435456
+; CHECK-NEXT: sllg [[REG]], [[REG]], 34
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp ueq float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 1, 3 }
+define i64 @f10(float %a, float %b) {
+; CHECK-LABEL: f10:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: sllg [[REG]], [[REG]], 35
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp ult float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 0, 1, 3 }
+define i64 @f11(float %a, float %b) {
+; CHECK-LABEL: f11:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 268435456
+; CHECK-NEXT: afi [[REG]], -805306368
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp ule float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 2, 3 }
+define i64 @f12(float %a, float %b) {
+; CHECK-LABEL: f12:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: sllg [[REG]], [[REG]], 34
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp ugt float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 0, 2, 3 }
+define i64 @f13(float %a, float %b) {
+; CHECK-LABEL: f13:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 268435456
+; CHECK-NEXT: afi [[REG]], 1879048192
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp uge float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
+
+; Test CC in { 1, 2, 3 }
+define i64 @f14(float %a, float %b) {
+; CHECK-LABEL: f14:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], 1879048192
+; CHECK-NEXT: sllg [[REG]], [[REG]], 32
+; CHECK-NEXT: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %cond = fcmp une float %a, %b
+ %res = select i1 %cond, i64 -1, i64 0
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/serialize-01.ll b/test/CodeGen/SystemZ/serialize-01.ll
new file mode 100644
index 0000000..7801fac
--- /dev/null
+++ b/test/CodeGen/SystemZ/serialize-01.ll
@@ -0,0 +1,21 @@
+; Test serialization instructions.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN: FileCheck %s -check-prefix=CHECK-FULL
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | \
+; RUN: FileCheck %s -check-prefix=CHECK-FAST
+
+; Check that volatile loads produce a serialisation.
+define i32 @f1(i32 *%src) {
+; CHECK-FULL-LABEL: f1:
+; CHECK-FULL: bcr 15, %r0
+; CHECK-FULL: l %r2, 0(%r2)
+; CHECK-FULL: br %r14
+;
+; CHECK-FAST-LABEL: f1:
+; CHECK-FAST: bcr 14, %r0
+; CHECK-FAST: l %r2, 0(%r2)
+; CHECK-FAST: br %r14
+ %val = load volatile i32 *%src
+ ret i32 %val
+}
diff --git a/test/CodeGen/SystemZ/shift-04.ll b/test/CodeGen/SystemZ/shift-04.ll
index 04b39d0..de2d74f 100644
--- a/test/CodeGen/SystemZ/shift-04.ll
+++ b/test/CodeGen/SystemZ/shift-04.ll
@@ -187,3 +187,104 @@ define i32 @f14(i32 %a, i32 *%ptr) {
%or = or i32 %parta, %partb
ret i32 %or
}
+
+; Check another form of f5, which is the one produced by running f5 through
+; instcombine.
+define i32 @f15(i32 %a, i32 %amt) {
+; CHECK-LABEL: f15:
+; CHECK: rll %r2, %r2, 10(%r3)
+; CHECK: br %r14
+ %add = add i32 %amt, 10
+ %sub = sub i32 22, %amt
+ %parta = shl i32 %a, %add
+ %partb = lshr i32 %a, %sub
+ %or = or i32 %parta, %partb
+ ret i32 %or
+}
+
+; Likewise for f7.
+define i32 @f16(i32 %a, i64 %amt) {
+; CHECK-LABEL: f16:
+; CHECK: rll %r2, %r2, 10(%r3)
+; CHECK: br %r14
+ %add = add i64 %amt, 10
+ %sub = sub i64 22, %amt
+ %addtrunc = trunc i64 %add to i32
+ %subtrunc = trunc i64 %sub to i32
+ %parta = shl i32 %a, %addtrunc
+ %partb = lshr i32 %a, %subtrunc
+ %or = or i32 %parta, %partb
+ ret i32 %or
+}
+
+; Check cases where (-x & 31) is used instead of 32 - x.
+define i32 @f17(i32 %x, i32 %y) {
+; CHECK-LABEL: f17:
+; CHECK: rll %r2, %r2, 0(%r3)
+; CHECK: br %r14
+entry:
+ %shl = shl i32 %x, %y
+ %sub = sub i32 0, %y
+ %and = and i32 %sub, 31
+ %shr = lshr i32 %x, %and
+ %or = or i32 %shr, %shl
+ ret i32 %or
+}
+
+; ...and again with ((32 - x) & 31).
+define i32 @f18(i32 %x, i32 %y) {
+; CHECK-LABEL: f18:
+; CHECK: rll %r2, %r2, 0(%r3)
+; CHECK: br %r14
+entry:
+ %shl = shl i32 %x, %y
+ %sub = sub i32 32, %y
+ %and = and i32 %sub, 31
+ %shr = lshr i32 %x, %and
+ %or = or i32 %shr, %shl
+ ret i32 %or
+}
+
+; This is not a rotation.
+define i32 @f19(i32 %x, i32 %y) {
+; CHECK-LABEL: f19:
+; CHECK-NOT: rll
+; CHECK: br %r14
+entry:
+ %shl = shl i32 %x, %y
+ %sub = sub i32 16, %y
+ %and = and i32 %sub, 31
+ %shr = lshr i32 %x, %and
+ %or = or i32 %shr, %shl
+ ret i32 %or
+}
+
+; Repeat f17 with an addition on the shift count.
+define i32 @f20(i32 %x, i32 %y) {
+; CHECK-LABEL: f20:
+; CHECK: rll %r2, %r2, 199(%r3)
+; CHECK: br %r14
+entry:
+ %add = add i32 %y, 199
+ %shl = shl i32 %x, %add
+ %sub = sub i32 0, %add
+ %and = and i32 %sub, 31
+ %shr = lshr i32 %x, %and
+ %or = or i32 %shr, %shl
+ ret i32 %or
+}
+
+; ...and again with the InstCombine version.
+define i32 @f21(i32 %x, i32 %y) {
+; CHECK-LABEL: f21:
+; CHECK: rll %r2, %r2, 199(%r3)
+; CHECK: br %r14
+entry:
+ %add = add i32 %y, 199
+ %shl = shl i32 %x, %add
+ %sub = sub i32 -199, %y
+ %and = and i32 %sub, 31
+ %shr = lshr i32 %x, %and
+ %or = or i32 %shr, %shl
+ ret i32 %or
+}
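
Editorial note, not part of the patch: the new f17-f21 tests rely on the shift amount of a 32-bit rotate being taken modulo 32. A short Python check (helper name is illustrative, only the IR above is assumed) confirms that the (-y & 31) and ((32 - y) & 31) forms describe the same rotate as the plain 32 - y form, which is why they fold into a single rll:

# Not part of the patch: the masked shift amounts in f17/f18 match 32 - y mod 32.
def rotl32(x, y):
    y &= 31                                  # rll only uses the low bits of the amount
    return ((x << y) | (x >> ((32 - y) & 31))) & 0xFFFFFFFF

for y in range(32):
    assert (0 - y) & 31 == (32 - y) & 31     # f17's -y form equals f18's 32 - y form
assert rotl32(0x12345678, 5) == 0x468ACF02   # sanity check of the model itself
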
diff --git a/test/CodeGen/SystemZ/shift-10.ll b/test/CodeGen/SystemZ/shift-10.ll
index 46ed218..bf2f0f1 100644
--- a/test/CodeGen/SystemZ/shift-10.ll
+++ b/test/CodeGen/SystemZ/shift-10.ll
@@ -14,13 +14,14 @@ define i64 @f1(i32 %a) {
ret i64 %ext
}
-; ...and again with the highest shift count.
+; ...and again with the highest shift count that doesn't reduce to an
+; ashr/sext pair.
define i64 @f2(i32 %a) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[REG:%r[0-5]]], %r2, 32
+; CHECK: sllg [[REG:%r[0-5]]], %r2, 33
; CHECK: srag %r2, [[REG]], 63
; CHECK: br %r14
- %shr = lshr i32 %a, 31
+ %shr = lshr i32 %a, 30
%trunc = trunc i32 %shr to i1
%ext = sext i1 %trunc to i64
ret i64 %ext
@@ -76,3 +77,15 @@ define i64 @f6(i64 %a) {
%and = and i64 %shr, 256
ret i64 %and
}
+
+; Test another form of f1.
+define i64 @f7(i32 %a) {
+; CHECK-LABEL: f7:
+; CHECK: sllg [[REG:%r[0-5]]], %r2, 62
+; CHECK: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %1 = shl i32 %a, 30
+ %sext = ashr i32 %1, 31
+ %ext = sext i32 %sext to i64
+ ret i64 %ext
+}
diff --git a/test/CodeGen/SystemZ/spill-01.ll b/test/CodeGen/SystemZ/spill-01.ll
index ca64a88..c1f780c 100644
--- a/test/CodeGen/SystemZ/spill-01.ll
+++ b/test/CodeGen/SystemZ/spill-01.ll
@@ -400,6 +400,7 @@ define void @f10() {
; CHECK: stgrl [[REG]], h8
; CHECK: br %r14
entry:
+ %val8 = load volatile i64 *@h8
%val0 = load volatile i64 *@h0
%val1 = load volatile i64 *@h1
%val2 = load volatile i64 *@h2
@@ -408,7 +409,6 @@ entry:
%val5 = load volatile i64 *@h5
%val6 = load volatile i64 *@h6
%val7 = load volatile i64 *@h7
- %val8 = load volatile i64 *@h8
%val9 = load volatile i64 *@h9
call void @foo()
diff --git a/test/CodeGen/Thumb/cortex-m0-unaligned-access.ll b/test/CodeGen/Thumb/cortex-m0-unaligned-access.ll
new file mode 100644
index 0000000..c4403fe
--- /dev/null
+++ b/test/CodeGen/Thumb/cortex-m0-unaligned-access.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=thumbv6m-apple-unknown-macho < %s | FileCheck --check-prefix=V6M %s
+; RUN: llc -mtriple=thumbv7m-apple-unknown-macho < %s | FileCheck --check-prefix=V7M %s
+
+define i32 @split_load(i32* %p) nounwind {
+; V6M-LABEL: split_load
+; V6M: ldrh
+; V6M: ldrh
+; V7M-LABEL: split_load
+; V7M-NOT: ldrh
+; V7M: bx lr
+ %val = load i32* %p, align 2
+ ret i32 %val
+}
diff --git a/test/CodeGen/Thumb/inlineasm-imm-thumb.ll b/test/CodeGen/Thumb/inlineasm-imm-thumb.ll
index 5c8a52a..d557b9d 100644
--- a/test/CodeGen/Thumb/inlineasm-imm-thumb.ll
+++ b/test/CodeGen/Thumb/inlineasm-imm-thumb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb
+; RUN: llc < %s -march=thumb -no-integrated-as
; Test Thumb-mode "I" constraint, for ADD immediate.
define i32 @testI(i32 %x) {
diff --git a/test/CodeGen/Thumb/mature-mc-support.ll b/test/CodeGen/Thumb/mature-mc-support.ll
new file mode 100644
index 0000000..d7f8ae6
--- /dev/null
+++ b/test/CodeGen/Thumb/mature-mc-support.ll
@@ -0,0 +1,12 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+
+; RUN: not llc -mtriple=thumb-pc-linux < %s > /dev/null 2> %t1
+; RUN: FileCheck %s < %t1
+
+; RUN: not llc -mtriple=thumb-pc-linux -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/Thumb/segmented-stacks-dynamic.ll b/test/CodeGen/Thumb/segmented-stacks-dynamic.ll
new file mode 100644
index 0000000..067c07b
--- /dev/null
+++ b/test/CodeGen/Thumb/segmented-stacks-dynamic.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-linux
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
+; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -segmented-stacks -filetype=obj
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define i32 @test_basic(i32 %l) {
+ %mem = alloca i32, i32 %l
+ call void @dummy_use (i32* %mem, i32 %l)
+ %terminate = icmp eq i32 %l, 0
+ br i1 %terminate, label %true, label %false
+
+true:
+ ret i32 0
+
+false:
+ %newlen = sub i32 %l, 1
+ %retvalue = call i32 @test_basic(i32 %newlen)
+ ret i32 %retvalue
+
+; Thumb-linux: test_basic:
+
+; Thumb-linux: push {r4, r5}
+; Thumb-linux: mov r5, sp
+; Thumb-linux-NEXT: ldr r4, .LCPI0_0
+; Thumb-linux-NEXT: ldr r4, [r4]
+; Thumb-linux-NEXT: cmp r4, r5
+; Thumb-linux-NEXT: blo .LBB0_2
+
+; Thumb-linux: mov r4, #16
+; Thumb-linux-NEXT: mov r5, #0
+; Thumb-linux-NEXT: push {lr}
+; Thumb-linux-NEXT: bl __morestack
+; Thumb-linux-NEXT: pop {r4}
+; Thumb-linux-NEXT: mov lr, r4
+; Thumb-linux-NEXT: pop {r4, r5}
+; Thumb-linux-NEXT: bx lr
+
+; Thumb-linux: pop {r4, r5}
+
+; Thumb-android: test_basic:
+
+; Thumb-android: push {r4, r5}
+; Thumb-android: mov r5, sp
+; Thumb-android-NEXT: ldr r4, .LCPI0_0
+; Thumb-android-NEXT: ldr r4, [r4]
+; Thumb-android-NEXT: cmp r4, r5
+; Thumb-android-NEXT: blo .LBB0_2
+
+; Thumb-android: mov r4, #16
+; Thumb-android-NEXT: mov r5, #0
+; Thumb-android-NEXT: push {lr}
+; Thumb-android-NEXT: bl __morestack
+; Thumb-android-NEXT: pop {r4}
+; Thumb-android-NEXT: mov lr, r4
+; Thumb-android-NEXT: pop {r4, r5}
+; Thumb-android-NEXT: bx lr
+
+; Thumb-android: pop {r4, r5}
+
+}
diff --git a/test/CodeGen/Thumb/segmented-stacks.ll b/test/CodeGen/Thumb/segmented-stacks.ll
new file mode 100644
index 0000000..5649b00
--- /dev/null
+++ b/test/CodeGen/Thumb/segmented-stacks.ll
@@ -0,0 +1,247 @@
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
+; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-linux
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define void @test_basic() {
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
+ ret void
+
+; Thumb-android: test_basic:
+
+; Thumb-android: push {r4, r5}
+; Thumb-android-NEXT: mov r5, sp
+; Thumb-android-NEXT: ldr r4, .LCPI0_0
+; Thumb-android-NEXT: ldr r4, [r4]
+; Thumb-android-NEXT: cmp r4, r5
+; Thumb-android-NEXT: blo .LBB0_2
+
+; Thumb-android: mov r4, #48
+; Thumb-android-NEXT: mov r5, #0
+; Thumb-android-NEXT: push {lr}
+; Thumb-android-NEXT: bl __morestack
+; Thumb-android-NEXT: pop {r4}
+; Thumb-android-NEXT: mov lr, r4
+; Thumb-android-NEXT: pop {r4, r5}
+; Thumb-android-NEXT: bx lr
+
+; Thumb-android: pop {r4, r5}
+
+; Thumb-linux: test_basic:
+
+; Thumb-linux: push {r4, r5}
+; Thumb-linux-NEXT: mov r5, sp
+; Thumb-linux-NEXT: ldr r4, .LCPI0_0
+; Thumb-linux-NEXT: ldr r4, [r4]
+; Thumb-linux-NEXT: cmp r4, r5
+; Thumb-linux-NEXT: blo .LBB0_2
+
+; Thumb-linux: mov r4, #48
+; Thumb-linux-NEXT: mov r5, #0
+; Thumb-linux-NEXT: push {lr}
+; Thumb-linux-NEXT: bl __morestack
+; Thumb-linux-NEXT: pop {r4}
+; Thumb-linux-NEXT: mov lr, r4
+; Thumb-linux-NEXT: pop {r4, r5}
+; Thumb-linux-NEXT: bx lr
+
+; Thumb-linux: pop {r4, r5}
+
+}
+
+define i32 @test_nested(i32 * nest %closure, i32 %other) {
+ %addend = load i32 * %closure
+ %result = add i32 %other, %addend
+ ret i32 %result
+
+; Thumb-android: test_nested:
+
+; Thumb-android: push {r4, r5}
+; Thumb-android-NEXT: mov r5, sp
+; Thumb-android-NEXT: ldr r4, .LCPI1_0
+; Thumb-android-NEXT: ldr r4, [r4]
+; Thumb-android-NEXT: cmp r4, r5
+; Thumb-android-NEXT: blo .LBB1_2
+
+; Thumb-android: mov r4, #0
+; Thumb-android-NEXT: mov r5, #0
+; Thumb-android-NEXT: push {lr}
+; Thumb-android-NEXT: bl __morestack
+; Thumb-android-NEXT: pop {r4}
+; Thumb-android-NEXT: mov lr, r4
+; Thumb-android-NEXT: pop {r4, r5}
+; Thumb-android-NEXT: bx lr
+
+; Thumb-android: pop {r4, r5}
+
+; Thumb-linux: test_nested:
+
+; Thumb-linux: push {r4, r5}
+; Thumb-linux-NEXT: mov r5, sp
+; Thumb-linux-NEXT: ldr r4, .LCPI1_0
+; Thumb-linux-NEXT: ldr r4, [r4]
+; Thumb-linux-NEXT: cmp r4, r5
+; Thumb-linux-NEXT: blo .LBB1_2
+
+; Thumb-linux: mov r4, #0
+; Thumb-linux-NEXT: mov r5, #0
+; Thumb-linux-NEXT: push {lr}
+; Thumb-linux-NEXT: bl __morestack
+; Thumb-linux-NEXT: pop {r4}
+; Thumb-linux-NEXT: mov lr, r4
+; Thumb-linux-NEXT: pop {r4, r5}
+; Thumb-linux-NEXT: bx lr
+
+; Thumb-linux: pop {r4, r5}
+
+}
+
+define void @test_large() {
+ %mem = alloca i32, i32 10000
+ call void @dummy_use (i32* %mem, i32 0)
+ ret void
+
+; Thumb-android: test_large:
+
+; Thumb-android: push {r4, r5}
+; Thumb-android-NEXT: mov r5, sp
+; Thumb-android-NEXT: sub r5, #40192
+; Thumb-android-NEXT: ldr r4, .LCPI2_2
+; Thumb-android-NEXT: ldr r4, [r4]
+; Thumb-android-NEXT: cmp r4, r5
+; Thumb-android-NEXT: blo .LBB2_2
+
+; Thumb-android: mov r4, #40192
+; Thumb-android-NEXT: mov r5, #0
+; Thumb-android-NEXT: push {lr}
+; Thumb-android-NEXT: bl __morestack
+; Thumb-android-NEXT: pop {r4}
+; Thumb-android-NEXT: mov lr, r4
+; Thumb-android-NEXT: pop {r4, r5}
+; Thumb-android-NEXT: bx lr
+
+; Thumb-android: pop {r4, r5}
+
+; Thumb-linux: test_large:
+
+; Thumb-linux: push {r4, r5}
+; Thumb-linux-NEXT: mov r5, sp
+; Thumb-linux-NEXT: sub r5, #40192
+; Thumb-linux-NEXT: ldr r4, .LCPI2_2
+; Thumb-linux-NEXT: ldr r4, [r4]
+; Thumb-linux-NEXT: cmp r4, r5
+; Thumb-linux-NEXT: blo .LBB2_2
+
+; Thumb-linux: mov r4, #40192
+; Thumb-linux-NEXT: mov r5, #0
+; Thumb-linux-NEXT: push {lr}
+; Thumb-linux-NEXT: bl __morestack
+; Thumb-linux-NEXT: pop {r4}
+; Thumb-linux-NEXT: mov lr, r4
+; Thumb-linux-NEXT: pop {r4, r5}
+; Thumb-linux-NEXT: bx lr
+
+; Thumb-linux: pop {r4, r5}
+
+}
+
+define fastcc void @test_fastcc() {
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
+ ret void
+
+; Thumb-android: test_fastcc:
+
+; Thumb-android: push {r4, r5}
+; Thumb-android-NEXT: mov r5, sp
+; Thumb-android-NEXT: ldr r4, .LCPI3_0
+; Thumb-android-NEXT: ldr r4, [r4]
+; Thumb-android-NEXT: cmp r4, r5
+; Thumb-android-NEXT: blo .LBB3_2
+
+; Thumb-android: mov r4, #48
+; Thumb-android-NEXT: mov r5, #0
+; Thumb-android-NEXT: push {lr}
+; Thumb-android-NEXT: bl __morestack
+; Thumb-android-NEXT: pop {r4}
+; Thumb-android-NEXT: mov lr, r4
+; Thumb-android-NEXT: pop {r4, r5}
+; Thumb-android-NEXT: bx lr
+
+; Thumb-android: pop {r4, r5}
+
+; Thumb-linux: test_fastcc:
+
+; Thumb-linux: push {r4, r5}
+; Thumb-linux-NEXT: mov r5, sp
+; Thumb-linux-NEXT: ldr r4, .LCPI3_0
+; Thumb-linux-NEXT: ldr r4, [r4]
+; Thumb-linux-NEXT: cmp r4, r5
+; Thumb-linux-NEXT: blo .LBB3_2
+
+; Thumb-linux: mov r4, #48
+; Thumb-linux-NEXT: mov r5, #0
+; Thumb-linux-NEXT: push {lr}
+; Thumb-linux-NEXT: bl __morestack
+; Thumb-linux-NEXT: pop {r4}
+; Thumb-linux-NEXT: mov lr, r4
+; Thumb-linux-NEXT: pop {r4, r5}
+; Thumb-linux-NEXT: bx lr
+
+; Thumb-linux: pop {r4, r5}
+
+}
+
+define fastcc void @test_fastcc_large() {
+ %mem = alloca i32, i32 10000
+ call void @dummy_use (i32* %mem, i32 0)
+ ret void
+
+; Thumb-android: test_fastcc_large:
+
+; Thumb-android: push {r4, r5}
+; Thumb-android-NEXT: mov r5, sp
+; Thumb-android-NEXT: sub r5, #40192
+; Thumb-android-NEXT: ldr r4, .LCPI4_2
+; Thumb-android-NEXT: ldr r4, [r4]
+; Thumb-android-NEXT: cmp r4, r5
+; Thumb-android-NEXT: blo .LBB4_2
+
+; Thumb-android: mov r4, #40192
+; Thumb-android-NEXT: mov r5, #0
+; Thumb-android-NEXT: push {lr}
+; Thumb-android-NEXT: bl __morestack
+; Thumb-android-NEXT: pop {r4}
+; Thumb-android-NEXT: mov lr, r4
+; Thumb-android-NEXT: pop {r4, r5}
+; Thumb-android-NEXT: bx lr
+
+; Thumb-android: pop {r4, r5}
+
+; Thumb-linux: test_fastcc_large:
+
+; Thumb-linux: push {r4, r5}
+; Thumb-linux-NEXT: mov r5, sp
+; Thumb-linux-NEXT: sub r5, #40192
+; Thumb-linux-NEXT: ldr r4, .LCPI4_2
+; Thumb-linux-NEXT: ldr r4, [r4]
+; Thumb-linux-NEXT: cmp r4, r5
+; Thumb-linux-NEXT: blo .LBB4_2
+
+; Thumb-linux: mov r4, #40192
+; Thumb-linux-NEXT: mov r5, #0
+; Thumb-linux-NEXT: push {lr}
+; Thumb-linux-NEXT: bl __morestack
+; Thumb-linux-NEXT: pop {r4}
+; Thumb-linux-NEXT: mov lr, r4
+; Thumb-linux-NEXT: pop {r4, r5}
+; Thumb-linux-NEXT: bx lr
+
+; Thumb-linux: pop {r4, r5}
+
+}
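
All of the Thumb segmented-stack tests above encode the same prologue protocol: the stack limit is loaded (here through a literal-pool global), compared against sp, and when the frame would not fit, __morestack is called with the frame size in r4 and the argument size in r5 before the function body runs. A minimal sketch of that shape, assuming the same gnueabi triple and flags as the RUN lines above; it is hypothetical and not part of this commit, and the exact frame-size immediate and literal-pool label are deliberately left unchecked:

; Hypothetical reduction, not part of this commit.
; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks | FileCheck %s

; Keeps the alloca alive, as dummy_use does in the real tests.
declare void @use(i32*)

define void @sketch() {
  %buf = alloca i32, i32 10
  call void @use(i32* %buf)
  ret void
; CHECK: push {r4, r5}
; CHECK: cmp r4, r5
; CHECK: blo
; CHECK: bl __morestack
; CHECK: pop {r4, r5}
}
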
diff --git a/test/CodeGen/Thumb/sjljehprepare-lower-vector.ll b/test/CodeGen/Thumb/sjljehprepare-lower-vector.ll
new file mode 100644
index 0000000..ab082c7
--- /dev/null
+++ b/test/CodeGen/Thumb/sjljehprepare-lower-vector.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=thumbv7-apple-ios < %s
+; SjLjEHPrepare shouldn't crash when lowering vectors.
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios"
+
+define i8* @foo(<4 x i32> %c) {
+entry:
+ invoke void @bar ()
+ to label %unreachable unwind label %handler
+
+unreachable:
+ unreachable
+
+handler:
+ %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @baz to i8*)
+ cleanup
+ resume { i8*, i32 } undef
+}
+
+declare void @bar()
+declare i32 @baz(...)
+
diff --git a/test/CodeGen/Thumb/triple.ll b/test/CodeGen/Thumb/triple.ll
new file mode 100644
index 0000000..0a1759f
--- /dev/null
+++ b/test/CodeGen/Thumb/triple.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -mtriple=thumb | FileCheck %s
+
+; CHECK: .code 16
+
+define void @f() {
+ ret void
+}
diff --git a/test/CodeGen/Thumb/unord.ll b/test/CodeGen/Thumb/unord.ll
index 39458ae..3cf9ebf 100644
--- a/test/CodeGen/Thumb/unord.ll
+++ b/test/CodeGen/Thumb/unord.ll
@@ -1,13 +1,20 @@
-; RUN: llc < %s -march=thumb | grep bne | count 1
-; RUN: llc < %s -march=thumb | grep beq | count 1
+; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s
define i32 @f1(float %X, float %Y) {
+; CHECK-LABEL: _f1:
+; CHECK: bne
+; CHECK: .data_region
+; CHECK: .long ___unordsf2
%tmp = fcmp uno float %X, %Y
%retval = select i1 %tmp, i32 1, i32 -1
ret i32 %retval
}
define i32 @f2(float %X, float %Y) {
+; CHECK-LABEL: _f2:
+; CHECK: beq
+; CHECK: .data_region
+; CHECK: .long ___unordsf2
%tmp = fcmp ord float %X, %Y
%retval = select i1 %tmp, i32 1, i32 -1
ret i32 %retval
diff --git a/test/CodeGen/Thumb2/2011-12-16-T2SizeReduceAssert.ll b/test/CodeGen/Thumb2/2011-12-16-T2SizeReduceAssert.ll
index dadbdc5..ea8d233 100644
--- a/test/CodeGen/Thumb2/2011-12-16-T2SizeReduceAssert.ll
+++ b/test/CodeGen/Thumb2/2011-12-16-T2SizeReduceAssert.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=pic -disable-fp-elim -mcpu=cortex-a8
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=pic -disable-fp-elim -mcpu=cortex-a8
+; RUN: llc < %s -mtriple=thumbv8-none-linux-gnueabi
%struct.LIST_NODE.0.16 = type { %struct.LIST_NODE.0.16*, i8* }
@@ -26,3 +27,23 @@ bb3: ; preds = %bb2, %entry
bb5: ; preds = %bb3, %bb
ret %struct.LIST_NODE.0.16* null
}
+
+declare void @use(i32)
+define double @find_max_double(i32 %n, double* nocapture readonly %aa) {
+entry:
+ br i1 undef, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %0 = load double* null, align 8
+ %cmp2.6 = fcmp ogt double %0, 0.000000e+00
+ %idx.1.6 = select i1 %cmp2.6, i32 undef, i32 0
+ %idx.1.7 = select i1 undef, i32 undef, i32 %idx.1.6
+ %max.1.7 = select i1 undef, double 0.000000e+00, double undef
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %max.0.lcssa = phi double [ undef, %entry ], [ %max.1.7, %for.body ]
+ %idx.0.lcssa = phi i32 [ 0, %entry ], [ %idx.1.7, %for.body ]
+ tail call void @use(i32 %idx.0.lcssa)
+ ret double %max.0.lcssa
+}
diff --git a/test/CodeGen/Thumb2/bfx.ll b/test/CodeGen/Thumb2/bfx.ll
index 489349d..e380b8f 100644
--- a/test/CodeGen/Thumb2/bfx.ll
+++ b/test/CodeGen/Thumb2/bfx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @sbfx1(i32 %a) {
; CHECK: sbfx1
diff --git a/test/CodeGen/Thumb2/carry.ll b/test/CodeGen/Thumb2/carry.ll
index da1902b..48fba4e 100644
--- a/test/CodeGen/Thumb2/carry.ll
+++ b/test/CodeGen/Thumb2/carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
entry:
diff --git a/test/CodeGen/Thumb2/cortex-fp.ll b/test/CodeGen/Thumb2/cortex-fp.ll
index f6cea72..e63970a 100644
--- a/test/CodeGen/Thumb2/cortex-fp.ll
+++ b/test/CodeGen/Thumb2/cortex-fp.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m3 | FileCheck %s -check-prefix=CORTEXM3
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m4 | FileCheck %s -check-prefix=CORTEXM4
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXM3
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXM4
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXA8
define float @foo(float %a, float %b) {
entry:
-; CHECK: foo
+; CHECK-LABEL: foo:
; CORTEXM3: blx ___mulsf3
; CORTEXM4: vmul.f32 s
; CORTEXA8: vmul.f32 d
@@ -15,7 +15,7 @@ entry:
define double @bar(double %a, double %b) {
entry:
-; CHECK: bar
+; CHECK-LABEL: bar:
%0 = fmul double %a, %b
; CORTEXM3: blx ___muldf3
; CORTEXM4: blx ___muldf3
diff --git a/test/CodeGen/Thumb2/div.ll b/test/CodeGen/Thumb2/div.ll
index 003d717..e783c88 100644
--- a/test/CodeGen/Thumb2/div.ll
+++ b/test/CodeGen/Thumb2/div.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin -mattr=+thumb2 \
+; RUN: llc < %s -mtriple=thumb-apple-darwin -mcpu=arm1156t2-s -mattr=+thumb2 \
; RUN: | FileCheck %s -check-prefix=CHECK-THUMB
; RUN: llc < %s -march=thumb -mcpu=cortex-m3 -mattr=+thumb2 \
; RUN: | FileCheck %s -check-prefix=CHECK-THUMBV7M
diff --git a/test/CodeGen/Thumb2/large-stack.ll b/test/CodeGen/Thumb2/large-stack.ll
index 36f3ce2..8d79da7 100644
--- a/test/CodeGen/Thumb2/large-stack.ll
+++ b/test/CodeGen/Thumb2/large-stack.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 \
+; RUN: -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 \
+; RUN: -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=LINUX
define void @test1() {
; DARWIN-LABEL: test1:
diff --git a/test/CodeGen/Thumb2/longMACt.ll b/test/CodeGen/Thumb2/longMACt.ll
index a457333..abe65f2 100644
--- a/test/CodeGen/Thumb2/longMACt.ll
+++ b/test/CodeGen/Thumb2/longMACt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; Check generated signed and unsigned multiply accumulate long.
define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
diff --git a/test/CodeGen/Thumb2/mul_const.ll b/test/CodeGen/Thumb2/mul_const.ll
index 488f4d1..41de477 100644
--- a/test/CodeGen/Thumb2/mul_const.ll
+++ b/test/CodeGen/Thumb2/mul_const.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; rdar://7069502
define i32 @t1(i32 %v) nounwind readnone {
diff --git a/test/CodeGen/Thumb2/segmented-stacks.ll b/test/CodeGen/Thumb2/segmented-stacks.ll
new file mode 100644
index 0000000..602fc84
--- /dev/null
+++ b/test/CodeGen/Thumb2/segmented-stacks.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -segmented-stacks -filetype=obj
+
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define void @test_basic() {
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
+ ret void
+
+; Thumb-android: test_basic:
+
+; Thumb-android: push {r4, r5}
+; Thumb-android-NEXT: mrc p15, #0, r4, c13, c0, #3
+; Thumb-android-NEXT: mov r5, sp
+; Thumb-android-NEXT: ldr r4, [r4, #252]
+; Thumb-android-NEXT: cmp r4, r5
+; Thumb-android-NEXT: blo .LBB0_2
+
+; Thumb-android: mov r4, #48
+; Thumb-android-NEXT: mov r5, #0
+; Thumb-android-NEXT: push {lr}
+; Thumb-android-NEXT: bl __morestack
+; Thumb-android-NEXT: ldr lr, [sp], #4
+; Thumb-android-NEXT: pop {r4, r5}
+; Thumb-android-NEXT: bx lr
+
+; Thumb-android: pop {r4, r5}
+
+}
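
The Android flavour differs only in how the stack limit is found: instead of a literal-pool load of a global, the prologue reads the thread pointer from TPIDRURO via mrc p15 and loads the limit from a fixed offset (252 in the checks above), and the return path reloads lr with a post-indexed ldr rather than popping it through r4. A hypothetical reduction isolating just that lookup, reusing the RUN options from this file (not part of this commit; the CHECK lines mirror the test above):

; Hypothetical reduction, not part of this commit.
; RUN: llc < %s -mtriple=thumb-linux-androideabi -march=thumb -mcpu=arm1156t2-s \
; RUN:   -mattr=+thumb2 -segmented-stacks | FileCheck %s

declare void @use(i32*)

define void @sketch() {
  %buf = alloca i32, i32 10
  call void @use(i32* %buf)
  ret void
; CHECK: mrc p15, #0, r4, c13, c0, #3
; CHECK: ldr r4, [r4, #252]
; CHECK: bl __morestack
; CHECK: ldr lr, [sp], #4
}
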
diff --git a/test/CodeGen/Thumb2/tail-call-r9.ll b/test/CodeGen/Thumb2/tail-call-r9.ll
index 24c76c9..673aa7c 100644
--- a/test/CodeGen/Thumb2/tail-call-r9.ll
+++ b/test/CodeGen/Thumb2/tail-call-r9.ll
@@ -6,7 +6,7 @@
; the destination address. It's callee-saved in AAPCS.
define arm_aapcscc void @test(i32 %a) nounwind {
; CHECK-LABEL: test:
-; CHECK-NOT bx r9
+; CHECK-NOT: bx r9
%tmp = load void ()** @foo, align 4
tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r12}"() nounwind
tail call arm_aapcscc void %tmp() nounwind
diff --git a/test/CodeGen/Thumb2/thumb2-adc.ll b/test/CodeGen/Thumb2/thumb2-adc.ll
index 7c34cfd..58e4c59 100644
--- a/test/CodeGen/Thumb2/thumb2-adc.ll
+++ b/test/CodeGen/Thumb2/thumb2-adc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 734439407618 = 0x000000ab00000002
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-add.ll b/test/CodeGen/Thumb2/thumb2-add.ll
index c23c74a..5e81fcf 100644
--- a/test/CodeGen/Thumb2/thumb2-add.ll
+++ b/test/CodeGen/Thumb2/thumb2-add.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @t2ADDrc_255(i32 %lhs) {
; CHECK-LABEL: t2ADDrc_255:
diff --git a/test/CodeGen/Thumb2/thumb2-add2.ll b/test/CodeGen/Thumb2/thumb2-add2.ll
index 3bbc3bf..ff0e087 100644
--- a/test/CodeGen/Thumb2/thumb2-add2.ll
+++ b/test/CodeGen/Thumb2/thumb2-add2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 171 = 0x000000ab
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-add3.ll b/test/CodeGen/Thumb2/thumb2-add3.ll
index 6cd818c..bb7788f 100644
--- a/test/CodeGen/Thumb2/thumb2-add3.ll
+++ b/test/CodeGen/Thumb2/thumb2-add3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a) {
%tmp = add i32 %a, 4095
diff --git a/test/CodeGen/Thumb2/thumb2-add4.ll b/test/CodeGen/Thumb2/thumb2-add4.ll
index 8b95711..ed68d62 100644
--- a/test/CodeGen/Thumb2/thumb2-add4.ll
+++ b/test/CodeGen/Thumb2/thumb2-add4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 171 = 0x000000ab
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-add5.ll b/test/CodeGen/Thumb2/thumb2-add5.ll
index beaa09e..7ef756f 100644
--- a/test/CodeGen/Thumb2/thumb2-add5.ll
+++ b/test/CodeGen/Thumb2/thumb2-add5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-add6.ll b/test/CodeGen/Thumb2/thumb2-add6.ll
index 0d2f122..c4a13be 100644
--- a/test/CodeGen/Thumb2/thumb2-add6.ll
+++ b/test/CodeGen/Thumb2/thumb2-add6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-and.ll b/test/CodeGen/Thumb2/thumb2-and.ll
index c9578d9..3ffcfd7 100644
--- a/test/CodeGen/Thumb2/thumb2-and.ll
+++ b/test/CodeGen/Thumb2/thumb2-and.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-and2.ll b/test/CodeGen/Thumb2/thumb2-and2.ll
index c0501ab..3bfe9b2 100644
--- a/test/CodeGen/Thumb2/thumb2-and2.ll
+++ b/test/CodeGen/Thumb2/thumb2-and2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 171 = 0x000000ab
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-asr.ll b/test/CodeGen/Thumb2/thumb2-asr.ll
index ba782dd..fbe3971 100644
--- a/test/CodeGen/Thumb2/thumb2-asr.ll
+++ b/test/CodeGen/Thumb2/thumb2-asr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-asr2.ll b/test/CodeGen/Thumb2/thumb2-asr2.ll
index 3685bad..321b3f5 100644
--- a/test/CodeGen/Thumb2/thumb2-asr2.ll
+++ b/test/CodeGen/Thumb2/thumb2-asr2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-bcc.ll b/test/CodeGen/Thumb2/thumb2-bcc.ll
index 81f7de9..61171ac 100644
--- a/test/CodeGen/Thumb2/thumb2-bcc.ll
+++ b/test/CodeGen/Thumb2/thumb2-bcc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; If-conversion defeats the purpose of this test, which is to check CBZ
; generation, so use memory barrier instruction to make sure it doesn't
; happen and we get actual branches.
diff --git a/test/CodeGen/Thumb2/thumb2-bfc.ll b/test/CodeGen/Thumb2/thumb2-bfc.ll
index 327b6d1..844fb4a 100644
--- a/test/CodeGen/Thumb2/thumb2-bfc.ll
+++ b/test/CodeGen/Thumb2/thumb2-bfc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 4278190095 = 0xff00000f
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-bic.ll b/test/CodeGen/Thumb2/thumb2-bic.ll
index 5938fa1..fc57ec8 100644
--- a/test/CodeGen/Thumb2/thumb2-bic.ll
+++ b/test/CodeGen/Thumb2/thumb2-bic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-clz.ll b/test/CodeGen/Thumb2/thumb2-clz.ll
index dbdaae2..a5cd074 100644
--- a/test/CodeGen/Thumb2/thumb2-clz.ll
+++ b/test/CodeGen/Thumb2/thumb2-clz.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+v7 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+v7 | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-cmn.ll b/test/CodeGen/Thumb2/thumb2-cmn.ll
index 8bcaa7e..da7d4b1 100644
--- a/test/CodeGen/Thumb2/thumb2-cmn.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; These tests could be improved by 'movs r0, #0' being rematerialized below the
; test as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-cmn2.ll b/test/CodeGen/Thumb2/thumb2-cmn2.ll
index f5db728..a09a149 100644
--- a/test/CodeGen/Thumb2/thumb2-cmn2.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmn2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; -0x000000bb = 4294967109
define i1 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-cmp.ll b/test/CodeGen/Thumb2/thumb2-cmp.ll
index 8741344..06c611d 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; test as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-cmp2.ll b/test/CodeGen/Thumb2/thumb2-cmp2.ll
index 5b880f1..8ca3caf 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp2.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; test as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-eor.ll b/test/CodeGen/Thumb2/thumb2-eor.ll
index b3e323c..6dfc5cd 100644
--- a/test/CodeGen/Thumb2/thumb2-eor.ll
+++ b/test/CodeGen/Thumb2/thumb2-eor.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-eor2.ll b/test/CodeGen/Thumb2/thumb2-eor2.ll
index 5daa13d..cf27448 100644
--- a/test/CodeGen/Thumb2/thumb2-eor2.ll
+++ b/test/CodeGen/Thumb2/thumb2-eor2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 0x000000bb = 187
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-jtb.ll b/test/CodeGen/Thumb2/thumb2-jtb.ll
index 0748b9b..11620c2 100644
--- a/test/CodeGen/Thumb2/thumb2-jtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-jtb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -arm-adjust-jump-tables=0 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -arm-adjust-jump-tables=0 | FileCheck %s
; Do not use tbb / tbh if any destination is before the jumptable.
; rdar://7102917
diff --git a/test/CodeGen/Thumb2/thumb2-ldr.ll b/test/CodeGen/Thumb2/thumb2-ldr.ll
index 7f68f66..09212d3 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32* %v) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-ldr_ext.ll b/test/CodeGen/Thumb2/thumb2-ldr_ext.ll
index 9e6aef4..b865cf4 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr_ext.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr_ext.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | grep ldrb | count 1
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | grep ldrh | count 1
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | grep ldrsb | count 1
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | grep ldrsh | count 1
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep ldrb | count 1
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep ldrh | count 1
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep ldrsb | count 1
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep ldrsh | count 1
define i32 @test1(i8* %v.pntr.s0.u1) {
%tmp.u = load i8* %v.pntr.s0.u1
diff --git a/test/CodeGen/Thumb2/thumb2-ldr_post.ll b/test/CodeGen/Thumb2/thumb2-ldr_post.ll
index bce8474..4f04647 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr_post.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr_post.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @test(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b ; <i32> [#uses=2]
diff --git a/test/CodeGen/Thumb2/thumb2-ldr_pre.ll b/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
index 601c0b5..4907dec 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | \
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | \
; RUN: grep "ldr.*\!" | count 3
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | \
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | \
; RUN: grep "ldrsb.*\!" | count 1
define i32* @test1(i32* %X, i32* %dest) {
diff --git a/test/CodeGen/Thumb2/thumb2-ldrb.ll b/test/CodeGen/Thumb2/thumb2-ldrb.ll
index c135eff..c79f732 100644
--- a/test/CodeGen/Thumb2/thumb2-ldrb.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldrb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i8 @f1(i8* %v) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-ldrh.ll b/test/CodeGen/Thumb2/thumb2-ldrh.ll
index 99f6aba..7ba9f22 100644
--- a/test/CodeGen/Thumb2/thumb2-ldrh.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldrh.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i16 @f1(i16* %v) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-lsl.ll b/test/CodeGen/Thumb2/thumb2-lsl.ll
index 1b48538..015a9dd 100644
--- a/test/CodeGen/Thumb2/thumb2-lsl.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsl2.ll b/test/CodeGen/Thumb2/thumb2-lsl2.ll
index bc0978e..c64897a 100644
--- a/test/CodeGen/Thumb2/thumb2-lsl2.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsl2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsr.ll b/test/CodeGen/Thumb2/thumb2-lsr.ll
index a3b207c..24973c7 100644
--- a/test/CodeGen/Thumb2/thumb2-lsr.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsr2.ll b/test/CodeGen/Thumb2/thumb2-lsr2.ll
index ae55735..0b199bb 100644
--- a/test/CodeGen/Thumb2/thumb2-lsr2.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsr2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsr3.ll b/test/CodeGen/Thumb2/thumb2-lsr3.ll
index e7ba782..c814123 100644
--- a/test/CodeGen/Thumb2/thumb2-lsr3.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsr3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i1 @test1(i64 %poscnt, i32 %work) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-mla.ll b/test/CodeGen/Thumb2/thumb2-mla.ll
index 709fa13..a99ffe7 100644
--- a/test/CodeGen/Thumb2/thumb2-mla.ll
+++ b/test/CodeGen/Thumb2/thumb2-mla.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 \
+; RUN: -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
diff --git a/test/CodeGen/Thumb2/thumb2-mls.ll b/test/CodeGen/Thumb2/thumb2-mls.ll
index 86e147b..45d6d13 100644
--- a/test/CodeGen/Thumb2/thumb2-mls.ll
+++ b/test/CodeGen/Thumb2/thumb2-mls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
diff --git a/test/CodeGen/Thumb2/thumb2-mov.ll b/test/CodeGen/Thumb2/thumb2-mov.ll
index 148bafe..7c0dc01 100644
--- a/test/CodeGen/Thumb2/thumb2-mov.ll
+++ b/test/CodeGen/Thumb2/thumb2-mov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; Test #<const>
diff --git a/test/CodeGen/Thumb2/thumb2-mul.ll b/test/CodeGen/Thumb2/thumb2-mul.ll
index a989989..5f68250 100644
--- a/test/CodeGen/Thumb2/thumb2-mul.ll
+++ b/test/CodeGen/Thumb2/thumb2-mul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-mulhi.ll b/test/CodeGen/Thumb2/thumb2-mulhi.ll
index 9d4840a..e32bd26 100644
--- a/test/CodeGen/Thumb2/thumb2-mulhi.ll
+++ b/test/CodeGen/Thumb2/thumb2-mulhi.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2dsp | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2dsp | FileCheck %s
define i32 @smulhi(i32 %x, i32 %y) {
; CHECK: smulhi
diff --git a/test/CodeGen/Thumb2/thumb2-mvn2.ll b/test/CodeGen/Thumb2/thumb2-mvn2.ll
index bce54a3..cee6f23 100644
--- a/test/CodeGen/Thumb2/thumb2-mvn2.ll
+++ b/test/CodeGen/Thumb2/thumb2-mvn2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-neg.ll b/test/CodeGen/Thumb2/thumb2-neg.ll
index 40e8098..491e4de 100644
--- a/test/CodeGen/Thumb2/thumb2-neg.ll
+++ b/test/CodeGen/Thumb2/thumb2-neg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-orn.ll b/test/CodeGen/Thumb2/thumb2-orn.ll
index 5bbe653..08676b1 100644
--- a/test/CodeGen/Thumb2/thumb2-orn.ll
+++ b/test/CodeGen/Thumb2/thumb2-orn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
diff --git a/test/CodeGen/Thumb2/thumb2-orn2.ll b/test/CodeGen/Thumb2/thumb2-orn2.ll
index eff3ae3..a8f4a84 100644
--- a/test/CodeGen/Thumb2/thumb2-orn2.ll
+++ b/test/CodeGen/Thumb2/thumb2-orn2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 0x000000bb = 187
diff --git a/test/CodeGen/Thumb2/thumb2-orr.ll b/test/CodeGen/Thumb2/thumb2-orr.ll
index 13ed862..776d7fe 100644
--- a/test/CodeGen/Thumb2/thumb2-orr.ll
+++ b/test/CodeGen/Thumb2/thumb2-orr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-orr2.ll b/test/CodeGen/Thumb2/thumb2-orr2.ll
index 837bb1c..37885e2 100644
--- a/test/CodeGen/Thumb2/thumb2-orr2.ll
+++ b/test/CodeGen/Thumb2/thumb2-orr2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 0x000000bb = 187
diff --git a/test/CodeGen/Thumb2/thumb2-pack.ll b/test/CodeGen/Thumb2/thumb2-pack.ll
index 1052dd2..9a0d889 100644
--- a/test/CodeGen/Thumb2/thumb2-pack.ll
+++ b/test/CodeGen/Thumb2/thumb2-pack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk | FileCheck %s
; CHECK: test1
; CHECK: pkhbt r0, r0, r1, lsl #16
diff --git a/test/CodeGen/Thumb2/thumb2-rev.ll b/test/CodeGen/Thumb2/thumb2-rev.ll
index 67cd623..d710113 100644
--- a/test/CodeGen/Thumb2/thumb2-rev.ll
+++ b/test/CodeGen/Thumb2/thumb2-rev.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+v7,+t2xtpk | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+v7,+t2xtpk | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-rev16.ll b/test/CodeGen/Thumb2/thumb2-rev16.ll
index 10cd539..3e26587 100644
--- a/test/CodeGen/Thumb2/thumb2-rev16.ll
+++ b/test/CodeGen/Thumb2/thumb2-rev16.ll
@@ -1,7 +1,7 @@
; XFAIL: *
; fixme rev16 pattern is not matching
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | grep "rev16\W*r[0-9]*,\W*r[0-9]*" | count 1
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep "rev16\W*r[0-9]*,\W*r[0-9]*" | count 1
; 0xff00ff00 = 4278255360
; 0x00ff00ff = 16711935
diff --git a/test/CodeGen/Thumb2/thumb2-ror.ll b/test/CodeGen/Thumb2/thumb2-ror.ll
index 2a218ea..3a21560 100644
--- a/test/CodeGen/Thumb2/thumb2-ror.ll
+++ b/test/CodeGen/Thumb2/thumb2-ror.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; RUN: llc < %s -march=thumb | FileCheck %s -check-prefix=THUMB1
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-rsb.ll b/test/CodeGen/Thumb2/thumb2-rsb.ll
index 150a25f..94a1fb0 100644
--- a/test/CodeGen/Thumb2/thumb2-rsb.ll
+++ b/test/CodeGen/Thumb2/thumb2-rsb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
%tmp = shl i32 %b, 5
diff --git a/test/CodeGen/Thumb2/thumb2-rsb2.ll b/test/CodeGen/Thumb2/thumb2-rsb2.ll
index 15aa8af..248ab16 100644
--- a/test/CodeGen/Thumb2/thumb2-rsb2.ll
+++ b/test/CodeGen/Thumb2/thumb2-rsb2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 171 = 0x000000ab
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-sbc.ll b/test/CodeGen/Thumb2/thumb2-sbc.ll
index 0c37984..7c69451 100644
--- a/test/CodeGen/Thumb2/thumb2-sbc.ll
+++ b/test/CodeGen/Thumb2/thumb2-sbc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=thumb -mattr=+thumb2 < %s | FileCheck %s
+; RUN: llc -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 < %s | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK: f1
diff --git a/test/CodeGen/Thumb2/thumb2-select.ll b/test/CodeGen/Thumb2/thumb2-select.ll
index 5f5fa19..949b611 100644
--- a/test/CodeGen/Thumb2/thumb2-select.ll
+++ b/test/CodeGen/Thumb2/thumb2-select.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -show-mc-encoding | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -show-mc-encoding | FileCheck %s
define i32 @f1(i32 %a.s) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-select_xform.ll b/test/CodeGen/Thumb2/thumb2-select_xform.ll
index ed4d26d..f8ceba2 100644
--- a/test/CodeGen/Thumb2/thumb2-select_xform.ll
+++ b/test/CodeGen/Thumb2/thumb2-select_xform.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK: t1
diff --git a/test/CodeGen/Thumb2/thumb2-smla.ll b/test/CodeGen/Thumb2/thumb2-smla.ll
index aaaedfa..f96263e 100644
--- a/test/CodeGen/Thumb2/thumb2-smla.ll
+++ b/test/CodeGen/Thumb2/thumb2-smla.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f3(i32 %a, i16 %x, i32 %y) {
; CHECK: f3
diff --git a/test/CodeGen/Thumb2/thumb2-smul.ll b/test/CodeGen/Thumb2/thumb2-smul.ll
index 7a13269..742e766 100644
--- a/test/CodeGen/Thumb2/thumb2-smul.ll
+++ b/test/CodeGen/Thumb2/thumb2-smul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
@x = weak global i16 0 ; <i16*> [#uses=1]
@y = weak global i16 0 ; <i16*> [#uses=0]
diff --git a/test/CodeGen/Thumb2/thumb2-str.ll b/test/CodeGen/Thumb2/thumb2-str.ll
index fb5fa16..f800974 100644
--- a/test/CodeGen/Thumb2/thumb2-str.ll
+++ b/test/CodeGen/Thumb2/thumb2-str.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32* %v) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-str_post.ll b/test/CodeGen/Thumb2/thumb2-str_post.ll
index 2133d28..716c2d2 100644
--- a/test/CodeGen/Thumb2/thumb2-str_post.ll
+++ b/test/CodeGen/Thumb2/thumb2-str_post.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i16 @test1(i32* %X, i16* %A) {
; CHECK-LABEL: test1:
diff --git a/test/CodeGen/Thumb2/thumb2-str_pre.ll b/test/CodeGen/Thumb2/thumb2-str_pre.ll
index 1e6616a..83b3779 100644
--- a/test/CodeGen/Thumb2/thumb2-str_pre.ll
+++ b/test/CodeGen/Thumb2/thumb2-str_pre.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define void @test1(i32* %X, i32* %A, i32** %dest) {
; CHECK: test1
diff --git a/test/CodeGen/Thumb2/thumb2-strb.ll b/test/CodeGen/Thumb2/thumb2-strb.ll
index cc39b7d..39e376d 100644
--- a/test/CodeGen/Thumb2/thumb2-strb.ll
+++ b/test/CodeGen/Thumb2/thumb2-strb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i8 @f1(i8 %a, i8* %v) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-strh.ll b/test/CodeGen/Thumb2/thumb2-strh.ll
index d686938..9444383 100644
--- a/test/CodeGen/Thumb2/thumb2-strh.ll
+++ b/test/CodeGen/Thumb2/thumb2-strh.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i16 @f1(i16 %a, i16* %v) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-sub.ll b/test/CodeGen/Thumb2/thumb2-sub.ll
index f83dfe2..ad5eda1 100644
--- a/test/CodeGen/Thumb2/thumb2-sub.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; 171 = 0x000000ab
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-sub2.ll b/test/CodeGen/Thumb2/thumb2-sub2.ll
index 47eb1e1..f114892 100644
--- a/test/CodeGen/Thumb2/thumb2-sub2.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a) {
%tmp = sub i32 %a, 4095
diff --git a/test/CodeGen/Thumb2/thumb2-sub3.ll b/test/CodeGen/Thumb2/thumb2-sub3.ll
index 1dbda57..ae12b28 100644
--- a/test/CodeGen/Thumb2/thumb2-sub3.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub3.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=thumb -mattr=+thumb2 < %s | FileCheck %s
+; RUN: llc -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 < %s | FileCheck %s
; 171 = 0x000000ab
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-sub4.ll b/test/CodeGen/Thumb2/thumb2-sub4.ll
index ff1441a..873080a 100644
--- a/test/CodeGen/Thumb2/thumb2-sub4.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-sub5.ll b/test/CodeGen/Thumb2/thumb2-sub5.ll
index 5941dd6..02c83f6 100644
--- a/test/CodeGen/Thumb2/thumb2-sub5.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub5.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 -mattr=+32bit | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+32bit \
+; RUN: | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
index f3d0edf..75bbd83 100644
--- a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk | FileCheck %s
define i32 @test0(i8 %A) {
; CHECK: test0
diff --git a/test/CodeGen/Thumb2/thumb2-teq.ll b/test/CodeGen/Thumb2/thumb2-teq.ll
index 5acda35..6b34e70 100644
--- a/test/CodeGen/Thumb2/thumb2-teq.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; test as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-teq2.ll b/test/CodeGen/Thumb2/thumb2-teq2.ll
index 27ecad8..ea43e560 100644
--- a/test/CodeGen/Thumb2/thumb2-teq2.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; tst as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-tst.ll b/test/CodeGen/Thumb2/thumb2-tst.ll
index 31eafea..c17510d 100644
--- a/test/CodeGen/Thumb2/thumb2-tst.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; tst as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-tst2.ll b/test/CodeGen/Thumb2/thumb2-tst2.ll
index f71e91d..764e3d4 100644
--- a/test/CodeGen/Thumb2/thumb2-tst2.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; tst as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/tls1.ll b/test/CodeGen/Thumb2/tls1.ll
index d91e3b3..4097356 100644
--- a/test/CodeGen/Thumb2/tls1.ll
+++ b/test/CodeGen/Thumb2/tls1.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi | \
-; RUN: grep "i(tpoff)"
+; RUN: grep "i(TPOFF)"
; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi | \
; RUN: grep "__aeabi_read_tp"
; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi \
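
The tls1.ll change is purely a case fix: the integrated assembler now spells the relocation specifier i(TPOFF) in upper case, so the grep is updated to match. The file's body is not shown in this hunk; the following is a sketch of the kind of IR that produces both the TPOFF operand and the __aeabi_read_tp call being grepped for, assuming the default static relocation model, and is not the verbatim contents of tls1.ll:

; Hypothetical sketch, not the verbatim contents of tls1.ll.
@i = thread_local global i32 15

define i32 @f() {
entry:
  %tmp1 = load i32* @i
  ret i32 %tmp1
}
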
diff --git a/test/CodeGen/Thumb2/tls2.ll b/test/CodeGen/Thumb2/tls2.ll
index 6cb019f..e6bed2f 100644
--- a/test/CodeGen/Thumb2/tls2.ll
+++ b/test/CodeGen/Thumb2/tls2.ll
@@ -8,7 +8,7 @@ entry:
; CHECK-NOT-PIC-LABEL: f:
; CHECK-NOT-PIC: add r0, pc
; CHECK-NOT-PIC: ldr r1, [r0]
-; CHECK-NOT-PIC: i(gottpoff)
+; CHECK-NOT-PIC: i(GOTTPOFF)
; CHECK-PIC-LABEL: f:
; CHECK-PIC: bl __tls_get_addr(PLT)
@@ -21,7 +21,7 @@ entry:
; CHECK-NOT-PIC-LABEL: g:
; CHECK-NOT-PIC: add r0, pc
; CHECK-NOT-PIC: ldr r1, [r0]
-; CHECK-NOT-PIC: i(gottpoff)
+; CHECK-NOT-PIC: i(GOTTPOFF)
; CHECK-PIC-LABEL: g:
; CHECK-PIC: bl __tls_get_addr(PLT)
diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll
index 30250c8..2f352d6 100644
--- a/test/CodeGen/Thumb2/v8_IT_5.ll
+++ b/test/CodeGen/Thumb2/v8_IT_5.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
; CHECK: it ne
; CHECK-NEXT: cmpne
-; CHECK-NEXT: beq
+; CHECK-NEXT: bne [[JUMPTARGET:.LBB[0-9]+_[0-9]+]]
; CHECK: cmp
; CHECK-NEXT: beq
; CHECK-NEXT: %if.else163
@@ -10,6 +10,7 @@
; CHECK-NEXT: b
; CHECK-NEXT: %if.else145
; CHECK-NEXT: mov.w
+; CHECK: [[JUMPTARGET]]:{{.*}}%if.else173
%struct.hc = type { i32, i32, i32, i32 }
diff --git a/test/CodeGen/Thumb2/v8_IT_6.ll b/test/CodeGen/Thumb2/v8_IT_6.ll
new file mode 100644
index 0000000..b12c479
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_6.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -mtriple=thumbv8 -show-mc-encoding | FileCheck %s
+; CHECK-NOT: orrsne r0, r1 @ encoding: [0x08,0x43]
+; Narrow tORR cannot be predicated and set CPSR at the same time!
+
+declare void @f(i32)
+
+define void @initCloneLookups() #1 {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc24, %entry
+ %cmp108 = phi i1 [ true, %entry ], [ %cmp, %for.inc24 ]
+ %y.0105 = phi i32 [ 1, %entry ], [ %inc25, %for.inc24 ]
+ %notlhs = icmp slt i32 %y.0105, 6
+ %notlhs69 = icmp sgt i32 %y.0105, 4
+ %sub = add nsw i32 %y.0105, -1
+ %cmp1.i = icmp sgt i32 %sub, 5
+ %cmp1.i54 = icmp sgt i32 %y.0105, 5
+ br i1 %cmp108, label %if.then.us, label %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+
+for.cond1.preheader.for.cond1.preheader.split_crit_edge: ; preds = %for.cond1.preheader
+ br i1 %notlhs, label %for.inc.us101, label %for.inc
+
+if.then.us: ; preds = %for.cond1.preheader, %for.inc.us
+ %x.071.us = phi i32 [ %inc.us.pre-phi, %for.inc.us ], [ 1, %for.cond1.preheader ]
+ %notrhs.us = icmp sge i32 %x.071.us, %y.0105
+ %or.cond44.not.us = or i1 %notrhs.us, %notlhs
+ %notrhs70.us = icmp sle i32 %x.071.us, %y.0105
+ %tobool.us = or i1 %notrhs70.us, %notlhs69
+ %or.cond66.us = and i1 %or.cond44.not.us, %tobool.us
+ br i1 %or.cond66.us, label %getHexxagonIndex.exit52.us, label %if.then.us.for.inc.us_crit_edge
+
+if.then.us.for.inc.us_crit_edge: ; preds = %if.then.us
+ %inc.us.pre = add nsw i32 %x.071.us, 1
+ br label %for.inc.us
+
+getHexxagonIndex.exit52.us: ; preds = %if.then.us
+ %cmp3.i.us = icmp slt i32 %x.071.us, 5
+ %or.cond.i.us = and i1 %cmp1.i, %cmp3.i.us
+ %..i.us = sext i1 %or.cond.i.us to i32
+ tail call void @f(i32 %..i.us) #3
+ %add.us = add nsw i32 %x.071.us, 1
+ %cmp3.i55.us = icmp slt i32 %add.us, 5
+ %or.cond.i56.us = and i1 %cmp1.i54, %cmp3.i55.us
+ %..i57.us = sext i1 %or.cond.i56.us to i32
+ tail call void @f(i32 %..i57.us) #3
+ %or.cond.i48.us = and i1 %notlhs69, %cmp3.i55.us
+ %..i49.us = sext i1 %or.cond.i48.us to i32
+ tail call void @f(i32 %..i49.us) #3
+ br label %for.inc.us
+
+for.inc.us: ; preds = %if.then.us.for.inc.us_crit_edge, %getHexxagonIndex.exit52.us
+ %inc.us.pre-phi = phi i32 [ %inc.us.pre, %if.then.us.for.inc.us_crit_edge ], [ %add.us, %getHexxagonIndex.exit52.us ]
+ %exitcond109 = icmp eq i32 %inc.us.pre-phi, 10
+ br i1 %exitcond109, label %for.inc24, label %if.then.us
+
+for.inc.us101: ; preds = %for.cond1.preheader.for.cond1.preheader.split_crit_edge, %for.inc.us101
+ %x.071.us74 = phi i32 [ %add.us89, %for.inc.us101 ], [ 1, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ]
+ %cmp3.i.us84 = icmp slt i32 %x.071.us74, 5
+ %or.cond.i.us85 = and i1 %cmp1.i, %cmp3.i.us84
+ %..i.us86 = sext i1 %or.cond.i.us85 to i32
+ tail call void @f(i32 %..i.us86) #3
+ %add.us89 = add nsw i32 %x.071.us74, 1
+ %cmp3.i55.us93 = icmp slt i32 %add.us89, 5
+ %or.cond.i56.us94 = and i1 %cmp1.i54, %cmp3.i55.us93
+ %..i57.us95 = sext i1 %or.cond.i56.us94 to i32
+ tail call void @f(i32 %..i57.us95) #3
+ %or.cond.i48.us97 = and i1 %notlhs69, %cmp3.i55.us93
+ %..i49.us98 = sext i1 %or.cond.i48.us97 to i32
+ tail call void @f(i32 %..i49.us98) #3
+ %exitcond110 = icmp eq i32 %add.us89, 10
+ br i1 %exitcond110, label %for.inc24, label %for.inc.us101
+
+for.inc: ; preds = %for.cond1.preheader.for.cond1.preheader.split_crit_edge, %for.inc
+ %x.071 = phi i32 [ %add, %for.inc ], [ 1, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ]
+ %cmp3.i = icmp slt i32 %x.071, 5
+ %or.cond.i = and i1 %cmp1.i, %cmp3.i
+ %..i = sext i1 %or.cond.i to i32
+ tail call void @f(i32 %..i) #3
+ %add = add nsw i32 %x.071, 1
+ %cmp3.i55 = icmp slt i32 %add, 5
+ %or.cond.i56 = and i1 %cmp1.i54, %cmp3.i55
+ %..i57 = sext i1 %or.cond.i56 to i32
+ tail call void @f(i32 %..i57) #3
+ %or.cond.i48 = and i1 %notlhs69, %cmp3.i55
+ %..i49 = sext i1 %or.cond.i48 to i32
+ tail call void @f(i32 %..i49) #3
+ %exitcond = icmp eq i32 %add, 10
+ br i1 %exitcond, label %for.inc24, label %for.inc
+
+for.inc24: ; preds = %for.inc, %for.inc.us101, %for.inc.us
+ %inc25 = add nsw i32 %y.0105, 1
+ %cmp = icmp slt i32 %inc25, 10
+ %exitcond111 = icmp eq i32 %inc25, 10
+ br i1 %exitcond111, label %for.end26, label %for.cond1.preheader
+
+for.end26: ; preds = %for.inc24
+ ret void
+}
+
diff --git a/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll b/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
index d906da4..1b3fc38 100644
--- a/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
+++ b/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | \
+; RUN: llc < %s -march=x86 -mcpu=generic | \
; RUN: grep shld | count 1
;
; Check that the isel does not fold the shld, which already folds a load
diff --git a/test/CodeGen/X86/2006-07-20-InlineAsm.ll b/test/CodeGen/X86/2006-07-20-InlineAsm.ll
index cac47cd..1facf15 100644
--- a/test/CodeGen/X86/2006-07-20-InlineAsm.ll
+++ b/test/CodeGen/X86/2006-07-20-InlineAsm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86 -no-integrated-as
; PR833
@G = weak global i32 0 ; <i32*> [#uses=3]
diff --git a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
index c4b08a3..2a9c832 100644
--- a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
+++ b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
@@ -1,5 +1,5 @@
; PR850
-; RUN: llc < %s -march=x86 -x86-asm-syntax=att | FileCheck %s
+; RUN: llc < %s -march=x86 -x86-asm-syntax=att -no-integrated-as | FileCheck %s
; CHECK: {{movl 4[(]%eax[)],%ebp}}
; CHECK: {{movl 0[(]%eax[)], %ebx}}
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
index 3b2e443..93fb344 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "mov %gs:72, %eax"
+; RUN: llc < %s -march=x86 -no-integrated-as | grep "mov %gs:72, %eax"
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
index 366f583..6cf8bf9 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=yonah -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=yonah -march=x86 -no-integrated-as | FileCheck %s
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-05-05-Personality.ll b/test/CodeGen/X86/2007-05-05-Personality.ll
index 7d21b71..5b8fe72 100644
--- a/test/CodeGen/X86/2007-05-05-Personality.ll
+++ b/test/CodeGen/X86/2007-05-05-Personality.ll
@@ -1,7 +1,12 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s
-
-; CHECK: .cfi_personality 0, __gnat_eh_personality
-; CHECK: .cfi_lsda 0, .Lexception0
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s --check-prefix=LIN
+; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -o - | FileCheck %s --check-prefix=LIN
+; RUN: llc < %s -mtriple=i386-pc-mingw32 -o - | FileCheck %s --check-prefix=WIN
+; RUN: llc < %s -mtriple=i686-pc-windows-gnu -o - | FileCheck %s --check-prefix=WIN
+
+; LIN: .cfi_personality 0, __gnat_eh_personality
+; LIN: .cfi_lsda 0, .Lexception0
+; WIN: .cfi_personality 0, ___gnat_eh_personality
+; WIN: .cfi_lsda 0, Lexception0
@error = external global i8
diff --git a/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll b/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll
deleted file mode 100644
index 15466a1..0000000
--- a/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll
+++ /dev/null
@@ -1,67 +0,0 @@
-; RUN: llc < %s -disable-cfi -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
-
-; CHECK: "_-[NSString(local) isNullOrNil].eh":
-
- %struct.NSString = type { }
- %struct._objc__method_prototype_list = type opaque
- %struct._objc_category = type { i8*, i8*, %struct._objc_method_list*, %struct._objc_method_list*, %struct._objc_protocol**, i32, %struct._prop_list_t* }
- %struct._objc_method = type { %struct.objc_selector*, i8*, i8* }
- %struct._objc_method_list = type opaque
- %struct._objc_module = type { i32, i32, i8*, %struct._objc_symtab* }
- %struct._objc_protocol = type { %struct._objc_protocol_extension*, i8*, %struct._objc_protocol**, %struct._objc__method_prototype_list*, %struct._objc__method_prototype_list* }
- %struct._objc_protocol_extension = type opaque
- %struct._objc_symtab = type { i32, %struct.objc_selector**, i16, i16, [1 x i8*] }
- %struct._prop_list_t = type opaque
- %struct.anon = type { %struct._objc__method_prototype_list*, i32, [1 x %struct._objc_method] }
- %struct.objc_selector = type opaque
-@"\01L_OBJC_SYMBOLS" = internal global { i32, i32, i16, i16, [1 x %struct._objc_category*] } {
- i32 0,
- i32 0,
- i16 0,
- i16 1,
- [1 x %struct._objc_category*] [ %struct._objc_category* bitcast ({ i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 }* @"\01L_OBJC_CATEGORY_NSString_local" to %struct._objc_category*) ] }, section "__OBJC,__symbols,regular,no_dead_strip" ; <{ i32, i32, i16, i16, [1 x %struct._objc_category*] }*> [#uses=2]
-@"\01L_OBJC_CATEGORY_INSTANCE_METHODS_NSString_local" = internal global { i32, i32, [1 x %struct._objc_method] } {
- i32 0,
- i32 1,
- [1 x %struct._objc_method] [ %struct._objc_method {
- %struct.objc_selector* bitcast ([12 x i8]* @"\01L_OBJC_METH_VAR_NAME_0" to %struct.objc_selector*),
- i8* getelementptr ([7 x i8]* @"\01L_OBJC_METH_VAR_TYPE_0", i32 0, i32 0),
- i8* bitcast (i8 (%struct.NSString*, %struct.objc_selector*) * @"-[NSString(local) isNullOrNil]" to i8*) } ] }, section "__OBJC,__cat_inst_meth,regular,no_dead_strip" ; <{ i32, i32, [1 x %struct._objc_method] }*> [#uses=3]
-@"\01L_OBJC_CATEGORY_NSString_local" = internal global { i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 } {
- i8* getelementptr ([6 x i8]* @"\01L_OBJC_CLASS_NAME_0", i32 0, i32 0),
- i8* getelementptr ([9 x i8]* @"\01L_OBJC_CLASS_NAME_1", i32 0, i32 0),
- %struct._objc_method_list* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01L_OBJC_CATEGORY_INSTANCE_METHODS_NSString_local" to %struct._objc_method_list*),
- i32 0,
- i32 0,
- i32 28,
- i32 0 }, section "__OBJC,__category,regular,no_dead_strip" ; <{ i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 }*> [#uses=2]
-@"\01L_OBJC_IMAGE_INFO" = internal constant [2 x i32] zeroinitializer, section "__OBJC,__image_info,regular" ; <[2 x i32]*> [#uses=1]
-@"\01L_OBJC_MODULES" = internal global %struct._objc_module {
- i32 7,
- i32 16,
- i8* getelementptr ([1 x i8]* @"\01L_OBJC_CLASS_NAME_2", i32 0, i32 0),
- %struct._objc_symtab* bitcast ({ i32, i32, i16, i16, [1 x %struct._objc_category*] }* @"\01L_OBJC_SYMBOLS" to %struct._objc_symtab*) }, section "__OBJC,__module_info,regular,no_dead_strip" ; <%struct._objc_module*> [#uses=1]
-@"\01.objc_class_ref_NSString" = internal global i8* @"\01.objc_class_name_NSString" ; <i8**> [#uses=0]
-@"\01.objc_class_name_NSString" = external global i8 ; <i8*> [#uses=1]
-@"\01.objc_category_name_NSString_local" = constant i32 0 ; <i32*> [#uses=1]
-@"\01L_OBJC_CLASS_NAME_2" = internal global [1 x i8] zeroinitializer, section "__TEXT,__cstring,cstring_literals" ; <[1 x i8]*> [#uses=2]
-@"\01L_OBJC_CLASS_NAME_1" = internal global [9 x i8] c"NSString\00", section "__TEXT,__cstring,cstring_literals" ; <[9 x i8]*> [#uses=2]
-@"\01L_OBJC_CLASS_NAME_0" = internal global [6 x i8] c"local\00", section "__TEXT,__cstring,cstring_literals" ; <[6 x i8]*> [#uses=2]
-@"\01L_OBJC_METH_VAR_NAME_0" = internal global [12 x i8] c"isNullOrNil\00", section "__TEXT,__cstring,cstring_literals" ; <[12 x i8]*> [#uses=3]
-@"\01L_OBJC_METH_VAR_TYPE_0" = internal global [7 x i8] c"c8@0:4\00", section "__TEXT,__cstring,cstring_literals" ; <[7 x i8]*> [#uses=2]
-@llvm.used = appending global [11 x i8*] [ i8* bitcast ({ i32, i32, i16, i16, [1 x %struct._objc_category*] }* @"\01L_OBJC_SYMBOLS" to i8*), i8* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01L_OBJC_CATEGORY_INSTANCE_METHODS_NSString_local" to i8*), i8* bitcast ({ i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 }* @"\01L_OBJC_CATEGORY_NSString_local" to i8*), i8* bitcast ([2 x i32]* @"\01L_OBJC_IMAGE_INFO" to i8*), i8* bitcast (%struct._objc_module* @"\01L_OBJC_MODULES" to i8*), i8* bitcast (i32* @"\01.objc_category_name_NSString_local" to i8*), i8* getelementptr ([1 x i8]* @"\01L_OBJC_CLASS_NAME_2", i32 0, i32 0), i8* getelementptr ([9 x i8]* @"\01L_OBJC_CLASS_NAME_1", i32 0, i32 0), i8* getelementptr ([6 x i8]* @"\01L_OBJC_CLASS_NAME_0", i32 0, i32 0), i8* getelementptr ([12 x i8]* @"\01L_OBJC_METH_VAR_NAME_0", i32 0, i32 0), i8* getelementptr ([7 x i8]* @"\01L_OBJC_METH_VAR_TYPE_0", i32 0, i32 0) ], section "llvm.metadata" ; <[11 x i8*]*> [#uses=0]
-
-define internal signext i8 @"-[NSString(local) isNullOrNil]"(%struct.NSString* %self, %struct.objc_selector* %_cmd) {
-entry:
- %self_addr = alloca %struct.NSString* ; <%struct.NSString**> [#uses=1]
- %_cmd_addr = alloca %struct.objc_selector* ; <%struct.objc_selector**> [#uses=1]
- %retval = alloca i8, align 1 ; <i8*> [#uses=1]
- %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- store %struct.NSString* %self, %struct.NSString** %self_addr
- store %struct.objc_selector* %_cmd, %struct.objc_selector** %_cmd_addr
- br label %return
-
-return: ; preds = %entry
- %retval1 = load i8* %retval ; <i8> [#uses=1]
- ret i8 %retval1
-}
diff --git a/test/CodeGen/X86/2007-10-17-IllegalAsm.ll b/test/CodeGen/X86/2007-10-17-IllegalAsm.ll
deleted file mode 100644
index c0bb55e..0000000
--- a/test/CodeGen/X86/2007-10-17-IllegalAsm.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | grep addb | not grep x
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | grep cmpb | not grep x
-; PR1734
-
-target triple = "x86_64-unknown-linux-gnu"
- %struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
- %struct.eh_status = type opaque
- %struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack*, i32, %struct.location_t, i32, i8*, %struct.rtx_def** }
- %struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
- %struct.function = type { %struct.eh_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.CUMULATIVE_ARGS, %struct.rtx_def*, %struct.rtx_def*, %struct.initial_value_struct*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i8, i32, i64, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.varray_head_tag*, %struct.temp_slot*, i32, %struct.var_refs_queue*, i32, i32, %struct.rtvec_def*, %struct.tree_node*, i32, i32, i32, %struct.machine_function*, i32, i32, i8, i8, %struct.language_function*, %struct.rtx_def*, i32, i32, i32, i32, %struct.location_t, %struct.varray_head_tag*, %struct.tree_node*, %struct.tree_node*, i8, i8, i8 }
- %struct.initial_value_struct = type opaque
- %struct.lang_decl = type opaque
- %struct.language_function = type opaque
- %struct.location_t = type { i8*, i32 }
- %struct.machine_function = type { %struct.stack_local_entry*, i8*, %struct.rtx_def*, i32, i32, i32, i32, i32 }
- %struct.rtunion = type { i8* }
- %struct.rtvec_def = type { i32, [1 x %struct.rtx_def*] }
- %struct.rtx_def = type { i16, i8, i8, %struct.u }
- %struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack* }
- %struct.stack_local_entry = type opaque
- %struct.temp_slot = type opaque
- %struct.tree_common = type { %struct.tree_node*, %struct.tree_node*, %union.tree_ann_d*, i8, i8, i8, i8, i8 }
- %struct.tree_decl = type { %struct.tree_common, %struct.location_t, i32, %struct.tree_node*, i8, i8, i8, i8, i8, i8, i8, i8, i32, %struct.tree_decl_u1, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, i32, %struct.tree_decl_u2, %struct.tree_node*, %struct.tree_node*, i64, %struct.lang_decl* }
- %struct.tree_decl_u1 = type { i64 }
- %struct.tree_decl_u2 = type { %struct.function* }
- %struct.tree_node = type { %struct.tree_decl }
- %struct.u = type { [1 x %struct.rtunion] }
- %struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* }
- %struct.varasm_status = type opaque
- %struct.varray_data = type { [1 x i64] }
- %struct.varray_head_tag = type { i64, i64, i32, i8*, %struct.varray_data }
- %union.tree_ann_d = type opaque
-
-define void @layout_type(%struct.tree_node* %type) {
-entry:
- %tmp32 = load i32* null, align 8 ; <i32> [#uses=3]
- %tmp3435 = trunc i32 %tmp32 to i8 ; <i8> [#uses=1]
- %tmp53 = icmp eq %struct.tree_node* null, null ; <i1> [#uses=1]
- br i1 %tmp53, label %cond_next57, label %UnifiedReturnBlock
-
-cond_next57: ; preds = %entry
- %tmp65 = and i32 %tmp32, 255 ; <i32> [#uses=1]
- switch i32 %tmp65, label %UnifiedReturnBlock [
- i32 6, label %bb140
- i32 7, label %bb140
- i32 8, label %bb140
- i32 13, label %bb478
- ]
-
-bb140: ; preds = %cond_next57, %cond_next57, %cond_next57
- %tmp219 = load i32* null, align 8 ; <i32> [#uses=1]
- %tmp221222 = trunc i32 %tmp219 to i8 ; <i8> [#uses=1]
- %tmp223 = icmp eq i8 %tmp221222, 24 ; <i1> [#uses=1]
- br i1 %tmp223, label %cond_true226, label %cond_next340
-
-cond_true226: ; preds = %bb140
- switch i8 %tmp3435, label %cond_true288 [
- i8 6, label %cond_next340
- i8 9, label %cond_next340
- i8 7, label %cond_next340
- i8 8, label %cond_next340
- i8 10, label %cond_next340
- ]
-
-cond_true288: ; preds = %cond_true226
- unreachable
-
-cond_next340: ; preds = %cond_true226, %cond_true226, %cond_true226, %cond_true226, %cond_true226, %bb140
- ret void
-
-bb478: ; preds = %cond_next57
- br i1 false, label %cond_next500, label %cond_true497
-
-cond_true497: ; preds = %bb478
- unreachable
-
-cond_next500: ; preds = %bb478
- %tmp513 = load i32* null, align 8 ; <i32> [#uses=1]
- %tmp545 = and i32 %tmp513, 8192 ; <i32> [#uses=1]
- %tmp547 = and i32 %tmp32, -8193 ; <i32> [#uses=1]
- %tmp548 = or i32 %tmp547, %tmp545 ; <i32> [#uses=1]
- store i32 %tmp548, i32* null, align 8
- ret void
-
-UnifiedReturnBlock: ; preds = %cond_next57, %entry
- ret void
-}
diff --git a/test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll b/test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll
index 984094d..d02346d 100644
--- a/test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll
+++ b/test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -no-integrated-as < %s
; PR1748
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll b/test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll
index 6b871aa..ec3bce9 100644
--- a/test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll
+++ b/test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu
+; RUN: llc -no-integrated-as < %s -mtriple=x86_64-unknown-linux-gnu
; PR1767
define void @xor_sse_2(i64 %bytes, i64* %p1, i64* %p2) {
diff --git a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
index c467024..d1699d5 100644
--- a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
+++ b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=static | FileCheck %s
+; RUN: llc < %s -relocation-model=static -no-integrated-as | FileCheck %s
; PR1761
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-pc-linux"
diff --git a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
index b06b249..319e884 100644
--- a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
+++ b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -no-integrated-as < %s | FileCheck %s
; PR2078
; The clobber list says that "ax" is clobbered. Make sure that eax isn't
; allocated to the input/output register.
diff --git a/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll b/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
index 0b4eb3a..11b55a6 100644
--- a/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
+++ b/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86 -no-integrated-as
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-pc-linux-gnu"
diff --git a/test/CodeGen/X86/2008-03-14-SpillerCrash.ll b/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
index 18b3714..6b374a7 100644
--- a/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
+++ b/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
@@ -6,7 +6,7 @@
%struct.locale_data = type { i8*, i8*, i32, i32, { void (%struct.locale_data*)*, %struct.anon }, i32, i32, i32, [0 x %struct.locale_data_value] }
%struct.locale_data_value = type { i32* }
-@wcstoll_l = alias i64 (i32*, i32**, i32, %struct.__locale_struct*)* @__wcstoll_l ; <i64 (i32*, i32**, i32, %struct.__locale_struct*)*> [#uses=0]
+@wcstoll_l = alias i64 (i32*, i32**, i32, %struct.__locale_struct*)* @__wcstoll_l
define i64 @____wcstoll_l_internal(i32* %nptr, i32** %endptr, i32 %base, i32 %group, %struct.__locale_struct* %loc) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-04-02-unnamedEH.ll b/test/CodeGen/X86/2008-04-02-unnamedEH.ll
index ab8ec80..70812ea 100644
--- a/test/CodeGen/X86/2008-04-02-unnamedEH.ll
+++ b/test/CodeGen/X86/2008-04-02-unnamedEH.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-cfi | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
@@ -11,6 +11,8 @@ define internal void @""() {
call i32 @_Z3barv( ) ; <i32>:4 [#uses=1]
ret void
}
-; CHECK: unnamed_1.eh
+
+; CHECK: ___unnamed_1:
+; CHECK-NEXT: .cfi_startproc
declare i32 @_Z3barv()
diff --git a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
index 5089e8c..d439e82 100644
--- a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx
+; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+mmx
define i32 @t2() nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll b/test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll
index d4805b4..6d45f1f 100644
--- a/test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll
+++ b/test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -no-integrated-as < %s | FileCheck %s
; rdar://5720231
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
index 496779c..51064f1 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: movq %rsp, %rbp
; CHECK: popq %rbp
; CHECK: movq %rcx, %rsp
-; CHECK: ret # eh_return, addr: %rcx
+; CHECK: retq # eh_return, addr: %rcx
define i8* @test(i64 %a, i8* %b) {
entry:
call void @llvm.eh.unwind.init()
diff --git a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
index 5c2fbee..f4a43a1 100644
--- a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
+++ b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=basic | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=greedy | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 -no-integrated-as | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=basic -no-integrated-as | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=greedy -no-integrated-as | FileCheck %s
; The 1st, 2nd, 3rd and 5th registers must all be different. The registers
; referenced in the 4th and 6th operands must not be the same as the 1st or 5th
diff --git a/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll b/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
index b2e6061..2b2f704 100644
--- a/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
+++ b/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -march=x86 -no-integrated-as
+; RUN: llc < %s -march=x86-64 -no-integrated-as
define void @test(i64 %x) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll b/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
index 353d1c7..e23dfe5 100644
--- a/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
+++ b/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -march=x86 -no-integrated-as
+; RUN: llc < %s -march=x86-64 -no-integrated-as
; from gcc.c-torture/compile/920520-1.c
diff --git a/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll b/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll
deleted file mode 100644
index 2e27811..0000000
--- a/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -disable-cfi -march=x86-64 -mtriple=x86_64-apple-darwin9 | grep ^__Z1fv.eh
-; RUN: llc < %s -disable-cfi -march=x86 -mtriple=i386-apple-darwin9 | grep ^__Z1fv.eh
-
-define void @_Z1fv() {
-entry:
- br label %return
-
-return:
- ret void
-}
diff --git a/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll b/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
index 7549651..5004f04 100644
--- a/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
+++ b/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -no-integrated-as | FileCheck %s
; ModuleID = 'shant.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll b/test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll
index 3d70b58..bd1b47a 100644
--- a/test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll
+++ b/test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin
+; RUN: llc < %s -mtriple=i386-apple-darwin -no-integrated-as
; rdar://6781755
; PR3934
diff --git a/test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll b/test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll
index 7468acb..fa240f6 100644
--- a/test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll
+++ b/test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=static | FileCheck %s
+; RUN: llc < %s -relocation-model=static -no-integrated-as | FileCheck %s
; PR4152
; CHECK: {{1: ._pv_cpu_ops[+]8}}
diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index 5f5d5cc..50c62df 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 > %t1
+; RUN: llc < %s -march=x86 -mcpu=core2 > %t1
; RUN: grep movzwl %t1 | count 2
; RUN: grep movzbl %t1 | count 1
; RUN: grep movd %t1 | count 4
diff --git a/test/CodeGen/X86/2009-08-23-linkerprivate.ll b/test/CodeGen/X86/2009-08-23-linkerprivate.ll
deleted file mode 100644
index 90fac15..0000000
--- a/test/CodeGen/X86/2009-08-23-linkerprivate.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
-
-; ModuleID = '/Volumes/MacOS9/tests/WebKit/JavaScriptCore/profiler/ProfilerServer.mm'
-
-@"\01l_objc_msgSend_fixup_alloc" = linker_private_weak hidden global i32 0, section "__DATA, __objc_msgrefs, coalesced", align 16
-
-; CHECK: .globl l_objc_msgSend_fixup_alloc
-; CHECK: .weak_definition l_objc_msgSend_fixup_alloc
diff --git a/test/CodeGen/X86/2009-09-19-earlyclobber.ll b/test/CodeGen/X86/2009-09-19-earlyclobber.ll
index 66f5118..7df62fd 100644
--- a/test/CodeGen/X86/2009-09-19-earlyclobber.ll
+++ b/test/CodeGen/X86/2009-09-19-earlyclobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -no-integrated-as < %s | FileCheck %s
; ModuleID = '4964.c'
; PR 4964
; Registers other than RAX, RCX are OK, but they must be different.
diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
index 08a99e3..b828c27 100644
--- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
+++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://7396984
-@str = private constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1
+@str = private unnamed_addr constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1
define void @t(i32 %count) ssp nounwind {
entry:
diff --git a/test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll b/test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll
index b166447..5c10c55 100644
--- a/test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll
+++ b/test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
; pr5391
define void @t() nounwind ssp {
diff --git a/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll b/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
index 74a5ec2..fc8c895 100644
--- a/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
+++ b/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -regalloc=fast | FileCheck %s
+; RUN: llc < %s -O0 -regalloc=fast -no-integrated-as | FileCheck %s
; PR6520
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index c5736eb..e11b538 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -26,7 +26,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ]
!1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31, metadata !31, metadata !""} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31, metadata !37, metadata !""} ; [ DW_TAG_compile_unit ]
!3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
!5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ]
@@ -61,7 +61,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!34 = metadata !{metadata !8}
!35 = metadata !{metadata !18, metadata !25, metadata !26}
!36 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
-!37 = metadata !{i32 0}
+!37 = metadata !{}
; The variable bar:myvar changes registers after the first movq.
; It is cobbered by popq %rbx
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 9b47bb7..0f8855d 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast -optimize-regalloc=0 < %s | FileCheck %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 -no-integrated-as < %s | FileCheck %s
; PR7382
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll b/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
index 68a6a13..0df9dc1 100644
--- a/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
+++ b/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-fp-elim -mtriple=i686-pc-mingw32
+; RUN: llc < %s -disable-fp-elim -mtriple=i686-pc-mingw32 -no-integrated-as
%struct.__SEH2Frame = type {}
diff --git a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
index e1491a0..d7bc21f 100644
--- a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
+++ b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -O0 | FileCheck %s
+; RUN: llc < %s -march=x86 -O0 -no-integrated-as | FileCheck %s
; PR7509
target triple = "i386-apple-darwin10"
%asmtype = type { i32, i8*, i32, i32 }
diff --git a/test/CodeGen/X86/2010-06-28-matched-g-constraint.ll b/test/CodeGen/X86/2010-06-28-matched-g-constraint.ll
index 82dac9d..a0798ae 100644
--- a/test/CodeGen/X86/2010-06-28-matched-g-constraint.ll
+++ b/test/CodeGen/X86/2010-06-28-matched-g-constraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin11 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -no-integrated-as | FileCheck %s
; Any register is OK for %0, but it must be a register, not memory.
define i32 @foo() nounwind ssp {
diff --git a/test/CodeGen/X86/2010-07-02-asm-alignstack.ll b/test/CodeGen/X86/2010-07-02-asm-alignstack.ll
index 0bbb24f..4302add 100644
--- a/test/CodeGen/X86/2010-07-02-asm-alignstack.ll
+++ b/test/CodeGen/X86/2010-07-02-asm-alignstack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -no-integrated-as | FileCheck %s
define void @foo() nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/2010-07-06-asm-RIP.ll b/test/CodeGen/X86/2010-07-06-asm-RIP.ll
index 9526b8d..818bbc6 100644
--- a/test/CodeGen/X86/2010-07-06-asm-RIP.ll
+++ b/test/CodeGen/X86/2010-07-06-asm-RIP.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
; PR 4752
@n = global i32 0 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/X86/2010-07-13-indirectXconstraint.ll b/test/CodeGen/X86/2010-07-13-indirectXconstraint.ll
index 97cbe3e..306e22a 100644
--- a/test/CodeGen/X86/2010-07-13-indirectXconstraint.ll
+++ b/test/CodeGen/X86/2010-07-13-indirectXconstraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
; PR 7528
; formerly crashed
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index 9aa41c3..a65b632 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -1,6 +1,6 @@
; RUN: llc -O0 -mtriple=x86_64-apple-darwin10 < %s - | FileCheck %s
; Radar 8286101
-; CHECK: .file 2 "<stdin>"
+; CHECK: .file {{[0-9]+}} "<stdin>"
define i32 @foo() nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll b/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
index 0e4118a..f69cedc 100644
--- a/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
+++ b/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
@@ -18,7 +18,7 @@ entry:
loop:
; CHECK: lock
; CHECK-NEXT: cmpxchg8b
- %r = cmpxchg i64* %ptr, i64 0, i64 1 monotonic
+ %r = cmpxchg i64* %ptr, i64 0, i64 1 monotonic monotonic
%stored1 = icmp eq i64 %r, 0
br i1 %stored1, label %loop, label %continue
continue:
diff --git a/test/CodeGen/X86/2010-12-02-MC-Set.ll b/test/CodeGen/X86/2010-12-02-MC-Set.ll
deleted file mode 100644
index 5a407d3..0000000
--- a/test/CodeGen/X86/2010-12-02-MC-Set.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc < %s -disable-dot-loc -mtriple=x86_64-apple-darwin -O0 | FileCheck %s
-
-
-define void @foo() nounwind ssp {
-entry:
- ret void, !dbg !5
-}
-
-!llvm.dbg.cu = !{!2}
-!llvm.module.flags = !{!10}
-!7 = metadata !{metadata !0}
-
-!0 = metadata !{i32 786478, metadata !9, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !9} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !9, i32 12, metadata !"clang version 2.9 (trunk 120563)", i1 false, metadata !"", i32 0, metadata !8, metadata !8, metadata !7, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !9, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{i32 5, i32 1, metadata !6, null}
-!6 = metadata !{i32 786443, metadata !9, metadata !0, i32 3, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 0}
-!9 = metadata !{metadata !"e.c", metadata !"/private/tmp"}
-
-; CHECK: .subsections_via_symbols
-; CHECK-NEXT: __debug_line
-; CHECK-NEXT: Lline_table_start0
-; CHECK-NEXT: Ltmp{{[0-9]}} = (Ltmp
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index d534030..f016528 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -1,14 +1,20 @@
-; RUN: llc < %s | FileCheck %s
-; RUN: llc < %s -regalloc=basic | FileCheck %s
+; RUN: llc < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; RUN: llc < %s -filetype=obj -regalloc=basic | llvm-dwarfdump -debug-dump=info - | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
; Check debug info for variable z_s
-;CHECK: .long Lset14
-;CHECK-NEXT: ## DW_AT_decl_file
-;CHECK-NEXT: ## DW_AT_decl_line
-;CHECK-NEXT: ## DW_AT_type
-;CHECK-NEXT: ## DW_AT_location
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_variable
+; CHECK: DW_TAG_variable
+; CHECK-NEXT: DW_AT_name {{.*}} "z_s"
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT: DW_AT_type{{.*}}{[[TYPE:.*]]}
+; CHECK-NEXT: DW_AT_location
+; CHECK: [[TYPE]]:
+; CHECK-NEXT: DW_AT_name {{.*}} "int"
@.str1 = private unnamed_addr constant [14 x i8] c"m=%u, z_s=%d\0A\00"
diff --git a/test/CodeGen/X86/2011-05-09-loaduse.ll b/test/CodeGen/X86/2011-05-09-loaduse.ll
index adcea5c..c772e4c 100644
--- a/test/CodeGen/X86/2011-05-09-loaduse.ll
+++ b/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
;CHECK-LABEL: test:
-;CHECK-not: pshufd
+;CHECK-NOT: pshufd
;CHECK: ret
define float @test(<4 x float>* %A) nounwind {
entry:
diff --git a/test/CodeGen/X86/2011-10-11-SpillDead.ll b/test/CodeGen/X86/2011-10-11-SpillDead.ll
index 8e70d65..19c3d6c 100644
--- a/test/CodeGen/X86/2011-10-11-SpillDead.ll
+++ b/test/CodeGen/X86/2011-10-11-SpillDead.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-regalloc
+; RUN: llc < %s -verify-regalloc -no-integrated-as
; PR11125
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index e08c5b2..222068d 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -1,12 +1,10 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
-target triple = "x86_64-unknown-linux-gnu"
-
-; Make sure that we don't crash when legalizng vselect and vsetcc and that
+; Make sure that we don't crash when legalizing vselect and vsetcc and that
; we are able to generate vector blend instructions.
-; CHECK: simple_widen
-; CHECK: blend
+; CHECK-LABEL: simple_widen
+; CHECK-NOT: blend
; CHECK: ret
define void @simple_widen() {
entry:
@@ -15,7 +13,7 @@ entry:
ret void
}
-; CHECK: complex_inreg_work
+; CHECK-LABEL: complex_inreg_work
; CHECK: blend
; CHECK: ret
@@ -27,8 +25,8 @@ entry:
ret void
}
-; CHECK: zero_test
-; CHECK: blend
+; CHECK-LABEL: zero_test
+; CHECK: xorps %xmm0, %xmm0
; CHECK: ret
define void @zero_test() {
@@ -38,7 +36,7 @@ entry:
ret void
}
-; CHECK: full_test
+; CHECK-LABEL: full_test
; CHECK: blend
; CHECK: ret
diff --git a/test/CodeGen/X86/2011-12-28-vselecti8.ll b/test/CodeGen/X86/2011-12-28-vselecti8.ll
index dbc122a..c916466 100644
--- a/test/CodeGen/X86/2011-12-28-vselecti8.ll
+++ b/test/CodeGen/X86/2011-12-28-vselecti8.ll
@@ -3,10 +3,20 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin11.2.0"
-; CHECK: @foo8
-; CHECK: psll
-; CHECK: psraw
-; CHECK: pblendvb
+; During legalization, the vselect mask is 'type legalized' into a
+; wider BUILD_VECTOR. This causes the introduction of a new
+; sign_extend_inreg in the DAG.
+;
+; A sign_extend_inreg of a vector of ConstantSDNode or undef can be
+; always folded into a simple build_vector.
+;
+; Make sure that the sign_extend_inreg is simplified and that we
+; don't generate psll, psraw and pblendvb from the vselect.
+
+; CHECK-LABEL: foo8
+; CHECK-NOT: psll
+; CHECK-NOT: psraw
+; CHECK-NOT: pblendvb
; CHECK: ret
define void @foo8(float* nocapture %RET) nounwind {
allocas:
@@ -17,4 +27,3 @@ allocas:
ret void
}
-
diff --git a/test/CodeGen/X86/2012-08-17-legalizer-crash.ll b/test/CodeGen/X86/2012-08-17-legalizer-crash.ll
index 971e56d..0d18267 100644
--- a/test/CodeGen/X86/2012-08-17-legalizer-crash.ll
+++ b/test/CodeGen/X86/2012-08-17-legalizer-crash.ll
@@ -27,5 +27,5 @@ if.end: ; preds = %if.then, %entry
; CHECK-LABEL: fn1:
; CHECK: shrq $32, [[REG:%.*]]
-; CHECK: je
+; CHECK: sete
}
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index d41b432..62ee1e1 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -38,10 +38,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!12}
-!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{null}
+!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
+!2 = metadata !{}
!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
!5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
!6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 7befa6b..650839a 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -65,10 +65,9 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!35}
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
+!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{}
+!2 = metadata !{}
!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815]
!5 = metadata !{i32 786443, metadata !6, i32 815, i32 0, metadata !14, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!6 = metadata !{i32 786443, metadata !7, i32 812, i32 0, metadata !14, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
@@ -78,7 +77,7 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!10 = metadata !{i32 786443, metadata !11, i32 434, i32 0, metadata !14, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!11 = metadata !{i32 786443, metadata !12, i32 250, i32 0, metadata !14, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!12 = metadata !{i32 786443, metadata !13, i32 249, i32 0, metadata !14, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!13 = metadata !{i32 786443, metadata !3, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = metadata !{i32 786443, metadata !2, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
index 3455b68..bbba796 100644
--- a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
+++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
@@ -3,7 +3,7 @@
; During X86 fastisel, the address of indirect call was resolved
; through bitcast, ptrtoint, and inttoptr instructions. This is valid
; only if the related instructions are in that same basic block, otherwise
-; we may reference variables that were not live accross basic blocks
+; we may reference variables that were not live across basic blocks
; resulting in undefined virtual registers.
;
; In this example, this is illustrated by a the spill/reload of the
@@ -25,7 +25,7 @@
; CHECK: movq [[ARG2_SLOT]], %rdi
; Load the second argument
; CHECK: movq [[ARG2_SLOT]], %rsi
-; Load the thrid argument
+; Load the third argument
; CHECK: movq [[ARG2_SLOT]], %rdx
; Load the function pointer.
; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
@@ -64,7 +64,7 @@ label_end:
; CHECK: movq [[ARG2_SLOT]], %rdi
; Load the second argument
; CHECK: movq [[ARG2_SLOT]], %rsi
-; Load the thrid argument
+; Load the third argument
; CHECK: movq [[ARG2_SLOT]], %rdx
; Load the function pointer.
; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
@@ -103,7 +103,7 @@ label_end:
; CHECK: movq [[ARG2_SLOT]], %rdi
; Load the second argument
; CHECK: movq [[ARG2_SLOT]], %rsi
-; Load the thrid argument
+; Load the third argument
; CHECK: movq [[ARG2_SLOT]], %rdx
; Load the function pointer.
; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
diff --git a/test/CodeGen/X86/3addr-16bit.ll b/test/CodeGen/X86/3addr-16bit.ll
index fafdfdb..2d6a5e7 100644
--- a/test/CodeGen/X86/3addr-16bit.ll
+++ b/test/CodeGen/X86/3addr-16bit.ll
@@ -34,7 +34,7 @@ entry:
; 64BIT-LABEL: t2:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: decl %eax
+; 64BIT: leal -1(%rsi), %eax
; 64BIT: movzwl %ax
%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
%1 = add i16 %k, -1 ; <i16> [#uses=3]
@@ -59,7 +59,7 @@ entry:
; 64BIT-LABEL: t3:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: addl $2, %eax
+; 64BIT: leal 2(%rsi), %eax
%0 = add i16 %k, 2 ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
@@ -82,7 +82,7 @@ entry:
; 64BIT-LABEL: t4:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: addl %edi, %eax
+; 64BIT: leal (%rsi,%rdi), %eax
%0 = add i16 %k, %c ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
diff --git a/test/CodeGen/X86/Atomics-64.ll b/test/CodeGen/X86/Atomics-64.ll
index 8b0a349..c274688 100644
--- a/test/CodeGen/X86/Atomics-64.ll
+++ b/test/CodeGen/X86/Atomics-64.ll
@@ -704,7 +704,7 @@ entry:
%3 = zext i8 %2 to i32
%4 = trunc i32 %3 to i8
%5 = trunc i32 %1 to i8
- %6 = cmpxchg i8* @sc, i8 %4, i8 %5 monotonic
+ %6 = cmpxchg i8* @sc, i8 %4, i8 %5 monotonic monotonic
store i8 %6, i8* @sc, align 1
%7 = load i8* @sc, align 1
%8 = zext i8 %7 to i32
@@ -712,7 +712,7 @@ entry:
%10 = zext i8 %9 to i32
%11 = trunc i32 %10 to i8
%12 = trunc i32 %8 to i8
- %13 = cmpxchg i8* @uc, i8 %11, i8 %12 monotonic
+ %13 = cmpxchg i8* @uc, i8 %11, i8 %12 monotonic monotonic
store i8 %13, i8* @uc, align 1
%14 = load i8* @sc, align 1
%15 = sext i8 %14 to i16
@@ -722,7 +722,7 @@ entry:
%19 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
%20 = trunc i32 %18 to i16
%21 = trunc i32 %16 to i16
- %22 = cmpxchg i16* %19, i16 %20, i16 %21 monotonic
+ %22 = cmpxchg i16* %19, i16 %20, i16 %21 monotonic monotonic
store i16 %22, i16* @ss, align 2
%23 = load i8* @sc, align 1
%24 = sext i8 %23 to i16
@@ -732,49 +732,49 @@ entry:
%28 = bitcast i8* bitcast (i16* @us to i8*) to i16*
%29 = trunc i32 %27 to i16
%30 = trunc i32 %25 to i16
- %31 = cmpxchg i16* %28, i16 %29, i16 %30 monotonic
+ %31 = cmpxchg i16* %28, i16 %29, i16 %30 monotonic monotonic
store i16 %31, i16* @us, align 2
%32 = load i8* @sc, align 1
%33 = sext i8 %32 to i32
%34 = load i8* @uc, align 1
%35 = zext i8 %34 to i32
%36 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %37 = cmpxchg i32* %36, i32 %35, i32 %33 monotonic
+ %37 = cmpxchg i32* %36, i32 %35, i32 %33 monotonic monotonic
store i32 %37, i32* @si, align 4
%38 = load i8* @sc, align 1
%39 = sext i8 %38 to i32
%40 = load i8* @uc, align 1
%41 = zext i8 %40 to i32
%42 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %43 = cmpxchg i32* %42, i32 %41, i32 %39 monotonic
+ %43 = cmpxchg i32* %42, i32 %41, i32 %39 monotonic monotonic
store i32 %43, i32* @ui, align 4
%44 = load i8* @sc, align 1
%45 = sext i8 %44 to i64
%46 = load i8* @uc, align 1
%47 = zext i8 %46 to i64
%48 = bitcast i8* bitcast (i64* @sl to i8*) to i64*
- %49 = cmpxchg i64* %48, i64 %47, i64 %45 monotonic
+ %49 = cmpxchg i64* %48, i64 %47, i64 %45 monotonic monotonic
store i64 %49, i64* @sl, align 8
%50 = load i8* @sc, align 1
%51 = sext i8 %50 to i64
%52 = load i8* @uc, align 1
%53 = zext i8 %52 to i64
%54 = bitcast i8* bitcast (i64* @ul to i8*) to i64*
- %55 = cmpxchg i64* %54, i64 %53, i64 %51 monotonic
+ %55 = cmpxchg i64* %54, i64 %53, i64 %51 monotonic monotonic
store i64 %55, i64* @ul, align 8
%56 = load i8* @sc, align 1
%57 = sext i8 %56 to i64
%58 = load i8* @uc, align 1
%59 = zext i8 %58 to i64
%60 = bitcast i8* bitcast (i64* @sll to i8*) to i64*
- %61 = cmpxchg i64* %60, i64 %59, i64 %57 monotonic
+ %61 = cmpxchg i64* %60, i64 %59, i64 %57 monotonic monotonic
store i64 %61, i64* @sll, align 8
%62 = load i8* @sc, align 1
%63 = sext i8 %62 to i64
%64 = load i8* @uc, align 1
%65 = zext i8 %64 to i64
%66 = bitcast i8* bitcast (i64* @ull to i8*) to i64*
- %67 = cmpxchg i64* %66, i64 %65, i64 %63 monotonic
+ %67 = cmpxchg i64* %66, i64 %65, i64 %63 monotonic monotonic
store i64 %67, i64* @ull, align 8
%68 = load i8* @sc, align 1
%69 = zext i8 %68 to i32
@@ -782,7 +782,7 @@ entry:
%71 = zext i8 %70 to i32
%72 = trunc i32 %71 to i8
%73 = trunc i32 %69 to i8
- %74 = cmpxchg i8* @sc, i8 %72, i8 %73 monotonic
+ %74 = cmpxchg i8* @sc, i8 %72, i8 %73 monotonic monotonic
%75 = icmp eq i8 %74, %72
%76 = zext i1 %75 to i8
%77 = zext i8 %76 to i32
@@ -793,7 +793,7 @@ entry:
%81 = zext i8 %80 to i32
%82 = trunc i32 %81 to i8
%83 = trunc i32 %79 to i8
- %84 = cmpxchg i8* @uc, i8 %82, i8 %83 monotonic
+ %84 = cmpxchg i8* @uc, i8 %82, i8 %83 monotonic monotonic
%85 = icmp eq i8 %84, %82
%86 = zext i1 %85 to i8
%87 = zext i8 %86 to i32
@@ -805,7 +805,7 @@ entry:
%92 = zext i8 %91 to i32
%93 = trunc i32 %92 to i8
%94 = trunc i32 %90 to i8
- %95 = cmpxchg i8* bitcast (i16* @ss to i8*), i8 %93, i8 %94 monotonic
+ %95 = cmpxchg i8* bitcast (i16* @ss to i8*), i8 %93, i8 %94 monotonic monotonic
%96 = icmp eq i8 %95, %93
%97 = zext i1 %96 to i8
%98 = zext i8 %97 to i32
@@ -817,7 +817,7 @@ entry:
%103 = zext i8 %102 to i32
%104 = trunc i32 %103 to i8
%105 = trunc i32 %101 to i8
- %106 = cmpxchg i8* bitcast (i16* @us to i8*), i8 %104, i8 %105 monotonic
+ %106 = cmpxchg i8* bitcast (i16* @us to i8*), i8 %104, i8 %105 monotonic monotonic
%107 = icmp eq i8 %106, %104
%108 = zext i1 %107 to i8
%109 = zext i8 %108 to i32
@@ -828,7 +828,7 @@ entry:
%113 = zext i8 %112 to i32
%114 = trunc i32 %113 to i8
%115 = trunc i32 %111 to i8
- %116 = cmpxchg i8* bitcast (i32* @si to i8*), i8 %114, i8 %115 monotonic
+ %116 = cmpxchg i8* bitcast (i32* @si to i8*), i8 %114, i8 %115 monotonic monotonic
%117 = icmp eq i8 %116, %114
%118 = zext i1 %117 to i8
%119 = zext i8 %118 to i32
@@ -839,7 +839,7 @@ entry:
%123 = zext i8 %122 to i32
%124 = trunc i32 %123 to i8
%125 = trunc i32 %121 to i8
- %126 = cmpxchg i8* bitcast (i32* @ui to i8*), i8 %124, i8 %125 monotonic
+ %126 = cmpxchg i8* bitcast (i32* @ui to i8*), i8 %124, i8 %125 monotonic monotonic
%127 = icmp eq i8 %126, %124
%128 = zext i1 %127 to i8
%129 = zext i8 %128 to i32
@@ -850,7 +850,7 @@ entry:
%133 = zext i8 %132 to i64
%134 = trunc i64 %133 to i8
%135 = trunc i64 %131 to i8
- %136 = cmpxchg i8* bitcast (i64* @sl to i8*), i8 %134, i8 %135 monotonic
+ %136 = cmpxchg i8* bitcast (i64* @sl to i8*), i8 %134, i8 %135 monotonic monotonic
%137 = icmp eq i8 %136, %134
%138 = zext i1 %137 to i8
%139 = zext i8 %138 to i32
@@ -861,7 +861,7 @@ entry:
%143 = zext i8 %142 to i64
%144 = trunc i64 %143 to i8
%145 = trunc i64 %141 to i8
- %146 = cmpxchg i8* bitcast (i64* @ul to i8*), i8 %144, i8 %145 monotonic
+ %146 = cmpxchg i8* bitcast (i64* @ul to i8*), i8 %144, i8 %145 monotonic monotonic
%147 = icmp eq i8 %146, %144
%148 = zext i1 %147 to i8
%149 = zext i8 %148 to i32
@@ -872,7 +872,7 @@ entry:
%153 = zext i8 %152 to i64
%154 = trunc i64 %153 to i8
%155 = trunc i64 %151 to i8
- %156 = cmpxchg i8* bitcast (i64* @sll to i8*), i8 %154, i8 %155 monotonic
+ %156 = cmpxchg i8* bitcast (i64* @sll to i8*), i8 %154, i8 %155 monotonic monotonic
%157 = icmp eq i8 %156, %154
%158 = zext i1 %157 to i8
%159 = zext i8 %158 to i32
@@ -883,7 +883,7 @@ entry:
%163 = zext i8 %162 to i64
%164 = trunc i64 %163 to i8
%165 = trunc i64 %161 to i8
- %166 = cmpxchg i8* bitcast (i64* @ull to i8*), i8 %164, i8 %165 monotonic
+ %166 = cmpxchg i8* bitcast (i64* @ull to i8*), i8 %164, i8 %165 monotonic monotonic
%167 = icmp eq i8 %166, %164
%168 = zext i1 %167 to i8
%169 = zext i8 %168 to i32
diff --git a/test/CodeGen/X86/GC/ocaml-gc.ll b/test/CodeGen/X86/GC/ocaml-gc.ll
index 6d5f8ae..37ddaf9 100644
--- a/test/CodeGen/X86/GC/ocaml-gc.ll
+++ b/test/CodeGen/X86/GC/ocaml-gc.ll
@@ -1,8 +1,10 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
-define i32 @main(i32 %x) nounwind gc "ocaml" {
; CHECK: .text
-; CHECK-NEXT: .globl "caml<stdin>__code_begin"
+; CHECK-NEXT: .file "<stdin>"
+
+define i32 @main(i32 %x) nounwind gc "ocaml" {
+; CHECK: .globl "caml<stdin>__code_begin"
; CHECK-NEXT: "caml<stdin>__code_begin":
; CHECK-NEXT: .data
; CHECK-NEXT: .globl "caml<stdin>__data_begin"
diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll
new file mode 100644
index 0000000..a893152
--- /dev/null
+++ b/test/CodeGen/X86/MachineBranchProb.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s
+
+;; Make sure a transformation in SelectionDAGBuilder that converts "or + br" to
+;; two branches correctly updates the branch probability.
+
+@max_regno = common global i32 0, align 4
+
+define void @test(i32* %old, i32 %final) {
+for.cond:
+ br label %for.cond2
+
+for.cond2: ; preds = %for.inc, %for.cond
+ %i.1 = phi i32 [ %inc19, %for.inc ], [ 0, %for.cond ]
+ %bit.0 = phi i32 [ %shl, %for.inc ], [ 1, %for.cond ]
+ %tobool = icmp eq i32 %bit.0, 0
+ %v3 = load i32* @max_regno, align 4
+ %cmp4 = icmp eq i32 %i.1, %v3
+ %or.cond = or i1 %tobool, %cmp4
+ br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0
+; CHECK: BB#1: derived from LLVM BB %for.cond2
+; CHECK: Successors according to CFG: BB#3(56008718) BB#4(2203492365)
+; CHECK: BB#4: derived from LLVM BB %for.cond2
+; CHECK: Successors according to CFG: BB#3(112017436) BB#2(4294967294)
+
+for.inc: ; preds = %for.cond2
+ %shl = shl i32 %bit.0, 1
+ %inc19 = add nsw i32 %i.1, 1
+ br label %for.cond2
+
+for.inc20: ; preds = %for.cond2
+ ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 112017436, i32 -735157296}
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 584e644..4ce2fb3 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -13,8 +13,8 @@ define i32 @foo(i32 %i, i32* nocapture %c) nounwind uwtable readonly ssp {
bb1: ; preds = %0
;CHECK: DEBUG_VALUE: a
-;CHECK-NEXT: .loc 1 5 5
-;CHECK-NEXT: addl
+;CHECK: .loc 1 5 5
+;CHECK-NEXT: addl
%gh = add nsw i32 %ab, 2, !dbg !16
br label %bb2, !dbg !16
diff --git a/test/CodeGen/X86/alias-error.ll b/test/CodeGen/X86/alias-error.ll
deleted file mode 100644
index 8f01dcf..0000000
--- a/test/CodeGen/X86/alias-error.ll
+++ /dev/null
@@ -1,5 +0,0 @@
-; RUN: not llc -mtriple=i686-pc-linux-gnu %s -o /dev/null 2>&1 | FileCheck %s
-
-@a = external global i32
-@b = alias i32* @a
-; CHECK: b: Target doesn't support aliases to declarations
diff --git a/test/CodeGen/X86/anyregcc-crash.ll b/test/CodeGen/X86/anyregcc-crash.ll
index cf6f6ed..3abe3d1 100644
--- a/test/CodeGen/X86/anyregcc-crash.ll
+++ b/test/CodeGen/X86/anyregcc-crash.ll
@@ -7,11 +7,11 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
i64 %v13, i64 %v14, i64 %v15, i64 %v16) {
entry:
- %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
i64 %v13, i64 %v14, i64 %v15, i64 %v16)
ret i64 %result
}
-declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/anyregcc.ll b/test/CodeGen/X86/anyregcc.ll
index 8109f87..98ba17c 100644
--- a/test/CodeGen/X86/anyregcc.ll
+++ b/test/CodeGen/X86/anyregcc.ll
@@ -1,17 +1,44 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck --check-prefix=SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck --check-prefix=AVX %s
+
; Stackmap Header: no constants - 6 callsites
-; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
-; CHECK-NEXT: __LLVM_StackMaps:
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
; Header
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 8
; Num Constants
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 0
; Num Callsites
-; CHECK-NEXT: .long 8
+; CHECK-NEXT: .long 8
+
+; Functions and stack size
+; CHECK-NEXT: .quad _test
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _property_access1
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _property_access2
+; CHECK-NEXT: .quad 24
+; CHECK-NEXT: .quad _property_access3
+; CHECK-NEXT: .quad 24
+; CHECK-NEXT: .quad _anyreg_test1
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _anyreg_test2
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _patchpoint_spilldef
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _patchpoint_spillargs
+; CHECK-NEXT: .quad 88
+; No constants
+
+; Callsites
; test
-; CHECK-NEXT: .long 0
; CHECK-LABEL: .long L{{.*}}-_test
; CHECK-NEXT: .short 0
; 3 locations
@@ -33,12 +60,11 @@
; CHECK-NEXT: .long 3
define i64 @test() nounwind ssp uwtable {
entry:
- call anyregcc void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
ret i64 0
}
; property access 1 - %obj is an anyreg call argument and should therefore be in a register
-; CHECK-NEXT: .long 1
; CHECK-LABEL: .long L{{.*}}-_property_access1
; CHECK-NEXT: .short 0
; 2 locations
@@ -56,12 +82,11 @@ entry:
define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 1, i32 15, i8* %f, i32 1, i8* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 15, i8* %f, i32 1, i8* %obj)
ret i64 %ret
}
; property access 2 - %obj is an anyreg call argument and should therefore be in a register
-; CHECK-NEXT: .long 2
; CHECK-LABEL: .long L{{.*}}-_property_access2
; CHECK-NEXT: .short 0
; 2 locations
@@ -80,12 +105,11 @@ define i64 @property_access2() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 2, i32 15, i8* %f, i32 1, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %f, i32 1, i64* %obj)
ret i64 %ret
}
; property access 3 - %obj is a frame index
-; CHECK-NEXT: .long 3
; CHECK-LABEL: .long L{{.*}}-_property_access3
; CHECK-NEXT: .short 0
; 2 locations
@@ -95,21 +119,20 @@ entry:
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .long 0
-; Loc 1: Register <-- this will be folded once folding for FI is implemented
-; CHECK-NEXT: .byte 1
+; Loc 1: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
define i64 @property_access3() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 3, i32 15, i8* %f, i32 0, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 15, i8* %f, i32 0, i64* %obj)
ret i64 %ret
}
; anyreg_test1
-; CHECK-NEXT: .long 4
; CHECK-LABEL: .long L{{.*}}-_anyreg_test1
; CHECK-NEXT: .short 0
; 14 locations
@@ -187,12 +210,11 @@ entry:
define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
; anyreg_test2
-; CHECK-NEXT: .long 5
; CHECK-LABEL: .long L{{.*}}-_anyreg_test2
; CHECK-NEXT: .short 0
; 14 locations
@@ -270,7 +292,7 @@ entry:
define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -278,7 +300,6 @@ entry:
;
; <rdar://problem/15432754> [JS] Assertion: "Folded a def to a non-store!"
;
-; CHECK-LABEL: .long 12
; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 3
@@ -299,7 +320,7 @@ entry:
; CHECK-NEXT: .long 0
define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
- %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
ret i64 %result
}
@@ -308,7 +329,6 @@ entry:
;
; <rdar://problem/15487687> [JS] AnyRegCC argument ends up being spilled
;
-; CHECK-LABEL: .long 13
; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 5
@@ -330,19 +350,119 @@ entry:
; Loc 3: Arg2 spilled to RBP +
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short 7
-; CHECK-NEXT: .long {{[0-9]+}}
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
; Loc 4: Arg3 spilled to RBP +
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short 7
-; CHECK-NEXT: .long {{[0-9]+}}
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
- %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
ret i64 %result
}
-declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
-declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
+; Make sure all regs are spilled
+define anyregcc void @anyregcc1() {
+entry:
+;SSE-LABEL: anyregcc1
+;SSE: pushq %rbp
+;SSE: pushq %rax
+;SSE: pushq %r15
+;SSE: pushq %r14
+;SSE: pushq %r13
+;SSE: pushq %r12
+;SSE: pushq %r11
+;SSE: pushq %r10
+;SSE: pushq %r9
+;SSE: pushq %r8
+;SSE: pushq %rdi
+;SSE: pushq %rsi
+;SSE: pushq %rdx
+;SSE: pushq %rcx
+;SSE: pushq %rbx
+;SSE: movaps %xmm15
+;SSE-NEXT: movaps %xmm14
+;SSE-NEXT: movaps %xmm13
+;SSE-NEXT: movaps %xmm12
+;SSE-NEXT: movaps %xmm11
+;SSE-NEXT: movaps %xmm10
+;SSE-NEXT: movaps %xmm9
+;SSE-NEXT: movaps %xmm8
+;SSE-NEXT: movaps %xmm7
+;SSE-NEXT: movaps %xmm6
+;SSE-NEXT: movaps %xmm5
+;SSE-NEXT: movaps %xmm4
+;SSE-NEXT: movaps %xmm3
+;SSE-NEXT: movaps %xmm2
+;SSE-NEXT: movaps %xmm1
+;SSE-NEXT: movaps %xmm0
+;AVX-LABEL:anyregcc1
+;AVX: pushq %rbp
+;AVX: pushq %rax
+;AVX: pushq %r15
+;AVX: pushq %r14
+;AVX: pushq %r13
+;AVX: pushq %r12
+;AVX: pushq %r11
+;AVX: pushq %r10
+;AVX: pushq %r9
+;AVX: pushq %r8
+;AVX: pushq %rdi
+;AVX: pushq %rsi
+;AVX: pushq %rdx
+;AVX: pushq %rcx
+;AVX: pushq %rbx
+;AVX: vmovaps %ymm15
+;AVX-NEXT: vmovaps %ymm14
+;AVX-NEXT: vmovaps %ymm13
+;AVX-NEXT: vmovaps %ymm12
+;AVX-NEXT: vmovaps %ymm11
+;AVX-NEXT: vmovaps %ymm10
+;AVX-NEXT: vmovaps %ymm9
+;AVX-NEXT: vmovaps %ymm8
+;AVX-NEXT: vmovaps %ymm7
+;AVX-NEXT: vmovaps %ymm6
+;AVX-NEXT: vmovaps %ymm5
+;AVX-NEXT: vmovaps %ymm4
+;AVX-NEXT: vmovaps %ymm3
+;AVX-NEXT: vmovaps %ymm2
+;AVX-NEXT: vmovaps %ymm1
+;AVX-NEXT: vmovaps %ymm0
+ call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{rbp},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"()
+ ret void
+}
+
+; Make sure we don't spill any XMMs/YMMs
+declare anyregcc void @foo()
+define void @anyregcc2() {
+entry:
+;SSE-LABEL: anyregcc2
+;SSE-NOT: movaps %xmm
+;AVX-LABEL: anyregcc2
+;AVX-NOT: vmovups %ymm
+ %a0 = call <2 x double> asm sideeffect "", "={xmm0}"() nounwind
+ %a1 = call <2 x double> asm sideeffect "", "={xmm1}"() nounwind
+ %a2 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+ %a3 = call <2 x double> asm sideeffect "", "={xmm3}"() nounwind
+ %a4 = call <2 x double> asm sideeffect "", "={xmm4}"() nounwind
+ %a5 = call <2 x double> asm sideeffect "", "={xmm5}"() nounwind
+ %a6 = call <2 x double> asm sideeffect "", "={xmm6}"() nounwind
+ %a7 = call <2 x double> asm sideeffect "", "={xmm7}"() nounwind
+ %a8 = call <2 x double> asm sideeffect "", "={xmm8}"() nounwind
+ %a9 = call <2 x double> asm sideeffect "", "={xmm9}"() nounwind
+ %a10 = call <2 x double> asm sideeffect "", "={xmm10}"() nounwind
+ %a11 = call <2 x double> asm sideeffect "", "={xmm11}"() nounwind
+ %a12 = call <2 x double> asm sideeffect "", "={xmm12}"() nounwind
+ %a13 = call <2 x double> asm sideeffect "", "={xmm13}"() nounwind
+ %a14 = call <2 x double> asm sideeffect "", "={xmm14}"() nounwind
+ %a15 = call <2 x double> asm sideeffect "", "={xmm15}"() nounwind
+ call anyregcc void @foo()
+ call void asm sideeffect "", "{xmm0},{xmm1},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},{xmm8},{xmm9},{xmm10},{xmm11},{xmm12},{xmm13},{xmm14},{xmm15}"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3, <2 x double> %a4, <2 x double> %a5, <2 x double> %a6, <2 x double> %a7, <2 x double> %a8, <2 x double> %a9, <2 x double> %a10, <2 x double> %a11, <2 x double> %a12, <2 x double> %a13, <2 x double> %a14, <2 x double> %a15)
+ ret void
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/asm-block-labels.ll b/test/CodeGen/X86/asm-block-labels.ll
index a43d430..6dbfb16 100644
--- a/test/CodeGen/X86/asm-block-labels.ll
+++ b/test/CodeGen/X86/asm-block-labels.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc
+; RUN: opt < %s -std-compile-opts | llc -no-integrated-as
; ModuleID = 'block12.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
diff --git a/test/CodeGen/X86/asm-global-imm.ll b/test/CodeGen/X86/asm-global-imm.ll
index ebf585a..9e79f6f 100644
--- a/test/CodeGen/X86/asm-global-imm.ll
+++ b/test/CodeGen/X86/asm-global-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
+; RUN: llc < %s -march=x86 -relocation-model=static -no-integrated-as | FileCheck %s
; PR882
target datalayout = "e-p:32:32"
diff --git a/test/CodeGen/X86/atom-cmpb.ll b/test/CodeGen/X86/atom-cmpb.ll
new file mode 100644
index 0000000..034bf2f
--- /dev/null
+++ b/test/CodeGen/X86/atom-cmpb.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s
+; CHECK: movl
+; CHECK: movb
+; CHECK: movb
+; CHECK: cmpb
+; CHECK: notb
+; CHECK: notb
+
+; Test that the conversion to cmp32 is cancelled for Atom
+; in function 'X86TargetLowering::EmitCmp'
+
+define i8 @run_test(i8* %rd_p) {
+entry:
+ %incdec.ptr = getelementptr inbounds i8* %rd_p, i64 1
+ %ld1 = load i8* %rd_p, align 1
+ %incdec.ptr1 = getelementptr inbounds i8* %rd_p, i64 2
+ %ld2 = load i8* %incdec.ptr, align 1
+ %x4 = xor i8 %ld1, -1
+ %x5 = xor i8 %ld2, -1
+ %cmp34 = icmp ult i8 %ld2, %ld1
+ br i1 %cmp34, label %if.then3, label %if.else
+
+if.then3:
+ %sub7 = sub i8 %x4, %x5
+ br label %if.end4
+
+if.else:
+ %sub8 = sub i8 %x5, %x4
+ br label %if.end4
+
+if.end4:
+ %res = phi i8 [ %sub7, %if.then3 ], [ %sub8, %if.else ]
+ ret i8 %res
+
+}
+
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
index ec2887e..45d3ff4 100644
--- a/test/CodeGen/X86/atomic16.ll
+++ b/test/CodeGen/X86/atomic16.ll
@@ -217,7 +217,7 @@ define void @atomic_fetch_umin16(i16 %x) nounwind {
}
define void @atomic_fetch_cmpxchg16() nounwind {
- %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire
+ %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire acquire
; X64: lock
; X64: cmpxchgw
; X32: lock
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
index 3cb9ca1..474c0e6 100644
--- a/test/CodeGen/X86/atomic32.ll
+++ b/test/CodeGen/X86/atomic32.ll
@@ -243,7 +243,7 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
}
define void @atomic_fetch_cmpxchg32() nounwind {
- %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire
+ %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire acquire
; X64: lock
; X64: cmpxchgl
; X32: lock
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
index aa00045..4f55edc 100644
--- a/test/CodeGen/X86/atomic64.ll
+++ b/test/CodeGen/X86/atomic64.ll
@@ -183,7 +183,7 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
}
define void @atomic_fetch_cmpxchg64() nounwind {
- %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire acquire
; X64: lock
; X64: cmpxchgq
; X32: lock
diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll
index 31e66c8..c0f7267 100644
--- a/test/CodeGen/X86/atomic6432.ll
+++ b/test/CodeGen/X86/atomic6432.ll
@@ -184,7 +184,7 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
}
define void @atomic_fetch_cmpxchg64() nounwind {
- %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire acquire
; X32: lock
; X32: cmpxchg8b
ret void
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
index 3278ed1..203b26f 100644
--- a/test/CodeGen/X86/atomic8.ll
+++ b/test/CodeGen/X86/atomic8.ll
@@ -217,7 +217,7 @@ define void @atomic_fetch_umin8(i8 %x) nounwind {
}
define void @atomic_fetch_cmpxchg8() nounwind {
- %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire
+ %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire acquire
; X64: lock
; X64: cmpxchgb
; X32: lock
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index a378d6e..b3045ed 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -101,11 +101,11 @@ entry:
%neg1 = sub i32 0, 10 ; <i32> [#uses=1]
; CHECK: lock
; CHECK: cmpxchgl
- %16 = cmpxchg i32* %val2, i32 %neg1, i32 1 monotonic
+ %16 = cmpxchg i32* %val2, i32 %neg1, i32 1 monotonic monotonic
store i32 %16, i32* %old
; CHECK: lock
; CHECK: cmpxchgl
- %17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic
+ %17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic monotonic
store i32 %17, i32* %old
; CHECK: movl [[R17atomic:.*]], %eax
; CHECK: movl $1401, %[[R17mask:[a-z]*]]
@@ -133,6 +133,6 @@ entry:
; CHECK: lock
; CHECK: cmpxchgl %{{.*}}, %gs:(%{{.*}})
- %0 = cmpxchg i32 addrspace(256)* %P, i32 0, i32 1 monotonic
+ %0 = cmpxchg i32 addrspace(256)* %P, i32 0, i32 1 monotonic monotonic
ret void
}
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index a98e076..5fcd5ff 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -6,7 +6,7 @@
;CHECK: vblendvps
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
ret <4 x float> %vsel
}
@@ -15,13 +15,13 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK: vblendvps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
ret <4 x i32> %vsel
}
;CHECK-LABEL: vsel_double:
-;CHECK: vblendvpd
+;CHECK: vmovsd
;CHECK: ret
define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
@@ -30,7 +30,7 @@ define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
;CHECK-LABEL: vsel_i64:
-;CHECK: vblendvpd
+;CHECK: vmovsd
;CHECK: ret
define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
@@ -51,6 +51,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
;CHECK-LABEL: vsel_float8:
+;CHECK-NOT: vinsertf128
;CHECK: vblendvps
;CHECK: ret
define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
@@ -59,8 +60,9 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
}
;CHECK-LABEL: vsel_i328:
+;CHECK-NOT: vinsertf128
;CHECK: vblendvps
-;CHECK: ret
+;CHECK-NEXT: ret
define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
ret <8 x i32> %vsel
@@ -82,6 +84,15 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
ret <8 x i64> %vsel
}
+;CHECK-LABEL: vsel_double4:
+;CHECK-NOT: vinsertf128
+;CHECK: vblendvpd
+;CHECK-NEXT: ret
+define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
+ ret <4 x double> %vsel
+}
+
;; TEST blend + compares
; CHECK: testa
define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
diff --git a/test/CodeGen/X86/avx-cvt-2.ll b/test/CodeGen/X86/avx-cvt-2.ll
new file mode 100644
index 0000000..8cc7190
--- /dev/null
+++ b/test/CodeGen/X86/avx-cvt-2.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+
+; Check that we generate vector conversions from float to narrower integer types
+
+%f32vec_t = type <8 x float>
+%i16vec_t = type <8 x i16>
+%i8vec_t = type <8 x i8>
+
+define void @fptoui16(%f32vec_t %a, %i16vec_t *%p) {
+; CHECK-LABEL: fptoui16:
+; CHECK: vcvttps2dq %ymm
+; CHECK-NOT: vcvttss2si
+ %b = fptoui %f32vec_t %a to %i16vec_t
+ store %i16vec_t %b, %i16vec_t * %p
+ ret void
+}
+
+define void @fptosi16(%f32vec_t %a, %i16vec_t *%p) {
+; CHECK-LABEL: fptosi16:
+; CHECK: vcvttps2dq %ymm
+; CHECK-NOT: vcvttss2si
+ %b = fptosi %f32vec_t %a to %i16vec_t
+ store %i16vec_t %b, %i16vec_t * %p
+ ret void
+}
+
+define void @fptoui8(%f32vec_t %a, %i8vec_t *%p) {
+; CHECK-LABEL: fptoui8:
+; CHECK: vcvttps2dq %ymm
+; CHECK-NOT: vcvttss2si
+ %b = fptoui %f32vec_t %a to %i8vec_t
+ store %i8vec_t %b, %i8vec_t * %p
+ ret void
+}
+
+define void @fptosi8(%f32vec_t %a, %i8vec_t *%p) {
+; CHECK-LABEL: fptosi8:
+; CHECK: vcvttps2dq %ymm
+; CHECK-NOT: vcvttss2si
+ %b = fptosi %f32vec_t %a to %i8vec_t
+ store %i8vec_t %b, %i8vec_t * %p
+ ret void
+}
diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll
index d79dfcc..a70d45a 100644
--- a/test/CodeGen/X86/avx-shift.ll
+++ b/test/CodeGen/X86/avx-shift.ll
@@ -115,8 +115,8 @@ define <8 x i32> @vshift08(<8 x i32> %a) nounwind {
; PR15141
; CHECK: _vshift13:
; CHECK-NOT: vpsll
-; CHECK: vcvttps2dq
-; CHECK-NEXT: vpmulld
+; CHECK-NOT: vcvttps2dq
+; CHECK: vpmulld
define <4 x i32> @vshift13(<4 x i32> %in) {
%T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %T
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 0956361..02aa617 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -297,3 +297,12 @@ entry:
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+; This test case simply should not fail.
+define void @test20() {
+ %a0 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double 0.000000e+00, i32 2
+ store <3 x double> %a0, <3 x double>* undef, align 1
+ %a1 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double undef, i32 2
+ store <3 x double> %a1, <3 x double>* undef, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index 58d0a35..bf8d9a7 100644
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -1,13 +1,15 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp{
-; CHECK: trunc_64_32
-; CHECK: pshufd
+; CHECK-LABEL: trunc_64_32
+; CHECK: shufps
+; CHECK-NOT: pshufd
+; CHECK-NOT: movlhps
%B = trunc <4 x i64> %A to <4 x i32>
ret <4 x i32>%B
}
define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
-; CHECK: trunc_32_16
+; CHECK-LABEL: trunc_32_16
; CHECK: pshufb
%B = trunc <8 x i32> %A to <8 x i16>
ret <8 x i16>%B
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 0d403d4..2ebe6fd 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -141,3 +141,66 @@ entry:
ret <4 x float> %t
}
+
+; These tests check that a vbroadcast instruction is used when we have a splat
+; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
+; (via the insertelements).
+
+; CHECK-LABEL: splat_concat1
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastss (%
+; CHECK-NEXT: ret
+define <8 x float> @splat_concat1(float* %p) {
+ %1 = load float* %p, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %3 = insertelement <4 x float> %2, float %1, i32 1
+ %4 = insertelement <4 x float> %3, float %1, i32 2
+ %5 = insertelement <4 x float> %4, float %1, i32 3
+ %6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %6
+}
+
+; CHECK-LABEL: splat_concat2
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastss (%
+; CHECK-NEXT: ret
+define <8 x float> @splat_concat2(float* %p) {
+ %1 = load float* %p, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %3 = insertelement <4 x float> %2, float %1, i32 1
+ %4 = insertelement <4 x float> %3, float %1, i32 2
+ %5 = insertelement <4 x float> %4, float %1, i32 3
+ %6 = insertelement <4 x float> undef, float %1, i32 0
+ %7 = insertelement <4 x float> %6, float %1, i32 1
+ %8 = insertelement <4 x float> %7, float %1, i32 2
+ %9 = insertelement <4 x float> %8, float %1, i32 3
+ %10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %10
+}
+
+; CHECK-LABEL: splat_concat3
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastsd (%
+; CHECK-NEXT: ret
+define <4 x double> @splat_concat3(double* %p) {
+ %1 = load double* %p, align 8
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = insertelement <2 x double> %2, double %1, i32 1
+ %4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %4
+}
+
+; CHECK-LABEL: splat_concat4
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastsd (%
+; CHECK-NEXT: ret
+define <4 x double> @splat_concat4(double* %p) {
+ %1 = load double* %p, align 8
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = insertelement <2 x double> %2, double %1, i32 1
+ %4 = insertelement <2 x double> undef, double %1, i32 0
+ %5 = insertelement <2 x double> %2, double %1, i32 1
+ %6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %6
+}
+
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index bf4ab5b..a2163a2 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@@ -36,20 +37,38 @@ entry:
ret <8 x float> %c
}
+;; Check that vzeroupper is emitted for tail calls.
+
+; CHECK: _test02
+define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp {
+entry:
+ %add.i = fadd <8 x float> %a, %b
+ %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
+ ; CHECK: vzeroupper
+ ; CHECK: jmp _do_sse
+ %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
+ ret <4 x float> %call3
+}
+
;; Test the pass convergence and also that vzeroupper is only issued when necessary,
;; for this function it should be only once
-; CHECK: _test02
-define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+; CHECK: _test03
+define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
%add.i = fadd <4 x float> %a, %b
- br label %for.body
+ br label %while.cond
-for.body: ; preds = %for.body, %entry
+while.cond:
+ %call = tail call i32 @foo()
+ %tobool = icmp eq i32 %call, 0
+ br i1 %tobool, label %for.body, label %while.cond
+
+for.body:
; CHECK: LBB
; CHECK-NOT: vzeroupper
- %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
- %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
+ %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
+ %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
; CHECK: callq _do_sse
%call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
; CHECK-NEXT: callq _do_sse
@@ -63,14 +82,14 @@ for.body: ; preds = %for.body, %entry
%exitcond = icmp eq i32 %1, 4
br i1 %exitcond, label %for.end, label %for.body
-for.end: ; preds = %for.body
+for.end:
ret <4 x float> %call14
}
;; Check that we also perform vzeroupper when we return from a function.
-; CHECK: _test03
-define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+; CHECK: _test04
+define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
%shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NOT: vzeroupper
diff --git a/test/CodeGen/X86/avx2-gather.ll b/test/CodeGen/X86/avx2-gather.ll
index ee50c45..a9ac025 100644
--- a/test/CodeGen/X86/avx2-gather.ll
+++ b/test/CodeGen/X86/avx2-gather.ll
@@ -15,4 +15,20 @@ define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1,
; CHECK: vgatherdps
; CHECK-NOT: [[DST]]
; CHECK: [[DST:%xmm[0-9]+]]{{$}}
+; CHECK: vmovaps
+; CHECK: ret
+
+declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
+ <4 x i32>, <2 x double>, i8) nounwind readonly
+
+define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1,
+ <4 x i32> %idx, <2 x double> %mask) {
+ %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef,
+ i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
+ ret <2 x double> %res
+}
+
+; CHECK: test_x86_avx2_gather_d_pd
+; CHECK: vgatherdpd
+; CHECK: vmovapd
; CHECK: ret
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index a6141b0..ab3d591 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -753,7 +753,7 @@ declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
- ; CHECK: vpbroadcastd
+ ; CHECK: vbroadcastss
%res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -761,7 +761,7 @@ declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
- ; CHECK: vpbroadcastd
+ ; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
%res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -777,7 +777,7 @@ declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
- ; CHECK: vpbroadcastq
+ ; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
%res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -1142,7 +1142,7 @@ define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a,
<8 x i32> %idx, <8 x float> %mask,
float* nocapture %out) {
; CHECK: test_gather_mask
-; CHECK: vmovdqa %ymm2, [[DEST:%.*]]
+; CHECK: vmovaps %ymm2, [[DEST:%.*]]
; CHECK: vgatherdps [[DEST]]
;; gather with mask
%a_i8 = bitcast float* %a to i8*
diff --git a/test/CodeGen/X86/avx2-shift.ll b/test/CodeGen/X86/avx2-shift.ll
index 7fdbaaa..025d52e 100644
--- a/test/CodeGen/X86/avx2-shift.ll
+++ b/test/CodeGen/X86/avx2-shift.ll
@@ -266,3 +266,36 @@ define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
%c = sext <8 x i16> %b to <8 x i32>
ret <8 x i32> %c
}
+
+define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: variable_shl16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsllvd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+ %res = shl <8 x i16> %lhs, %rhs
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: variable_ashr16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovsxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsravd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+ %res = ashr <8 x i16> %lhs, %rhs
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @variable_lshr16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: variable_lshr16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsrlvd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+ %res = lshr <8 x i16> %lhs, %rhs
+ ret <8 x i16> %res
+} \ No newline at end of file
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 5610416..66f586d 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -98,7 +98,7 @@ entry:
%qf = insertelement <16 x i16> %qe, i16 %q, i32 15
ret <16 x i16> %qf
}
-; CHECK: vpbroadcastd (%
+; CHECK: vbroadcastss (%
define <4 x i32> @D32(i32* %ptr) nounwind uwtable readnone ssp {
entry:
%q = load i32* %ptr, align 4
@@ -108,7 +108,7 @@ entry:
%q3 = insertelement <4 x i32> %q2, i32 %q, i32 3
ret <4 x i32> %q3
}
-; CHECK: vpbroadcastd (%
+; CHECK: vbroadcastss (%
define <8 x i32> @DD32(i32* %ptr) nounwind uwtable readnone ssp {
entry:
%q = load i32* %ptr, align 4
@@ -130,7 +130,7 @@ entry:
%q1 = insertelement <2 x i64> %q0, i64 %q, i32 1
ret <2 x i64> %q1
}
-; CHECK: vpbroadcastq (%
+; CHECK: vbroadcastsd (%
define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
entry:
%q = load i64* %ptr, align 4
@@ -293,7 +293,7 @@ define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
;CHECK-LABEL: _inreg4xi64:
-;CHECK: vpbroadcastq
+;CHECK: vbroadcastsd
;CHECK: ret
define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -325,7 +325,7 @@ define <2 x double> @_inreg2xdouble(<2 x double> %a) {
}
;CHECK-LABEL: _inreg8xi32:
-;CHECK: vpbroadcastd
+;CHECK: vbroadcastss
;CHECK: ret
define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
%b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -333,7 +333,7 @@ define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
}
;CHECK-LABEL: _inreg4xi32:
-;CHECK: vpbroadcastd
+;CHECK: vbroadcastss
;CHECK: ret
define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
%b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -355,3 +355,219 @@ define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
%b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %b
}
+
+; These tests check that a vbroadcast instruction is used when we have a splat
+; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
+; (via the insertelements).
+
+; CHECK-LABEL: splat_concat1
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastss
+; CHECK-NEXT: ret
+define <8 x float> @splat_concat1(float %f) {
+ %1 = insertelement <4 x float> undef, float %f, i32 0
+ %2 = insertelement <4 x float> %1, float %f, i32 1
+ %3 = insertelement <4 x float> %2, float %f, i32 2
+ %4 = insertelement <4 x float> %3, float %f, i32 3
+ %5 = shufflevector <4 x float> %4, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %5
+}
+
+; CHECK-LABEL: splat_concat2
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastss
+; CHECK-NEXT: ret
+define <8 x float> @splat_concat2(float %f) {
+ %1 = insertelement <4 x float> undef, float %f, i32 0
+ %2 = insertelement <4 x float> %1, float %f, i32 1
+ %3 = insertelement <4 x float> %2, float %f, i32 2
+ %4 = insertelement <4 x float> %3, float %f, i32 3
+ %5 = insertelement <4 x float> undef, float %f, i32 0
+ %6 = insertelement <4 x float> %5, float %f, i32 1
+ %7 = insertelement <4 x float> %6, float %f, i32 2
+ %8 = insertelement <4 x float> %7, float %f, i32 3
+ %9 = shufflevector <4 x float> %4, <4 x float> %8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %9
+}
+
+; CHECK-LABEL: splat_concat3
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastsd
+; CHECK-NEXT: ret
+define <4 x double> @splat_concat3(double %d) {
+ %1 = insertelement <2 x double> undef, double %d, i32 0
+ %2 = insertelement <2 x double> %1, double %d, i32 1
+ %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %3
+}
+
+; CHECK-LABEL: splat_concat4
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastsd
+; CHECK-NEXT: ret
+define <4 x double> @splat_concat4(double %d) {
+ %1 = insertelement <2 x double> undef, double %d, i32 0
+ %2 = insertelement <2 x double> %1, double %d, i32 1
+ %3 = insertelement <2 x double> undef, double %d, i32 0
+ %4 = insertelement <2 x double> %3, double %d, i32 1
+ %5 = shufflevector <2 x double> %2, <2 x double> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %5
+}
+
+; Test cases for <rdar://problem/16074331>.
+; Instruction selection for the broadcast instruction fails if
+; the load cannot be folded into the broadcast.
+; This happens if the load initially has one use but other uses are
+; created later, or if the selection DAG cannot prove that folding the
+; load will not create a cycle in the DAG.
+; These test cases exercise the latter.
+
+; CHECK-LABEL: isel_crash_16b
+; CHECK: vpbroadcastb {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_16b(i8* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <2 x i64>, align 16
+ %__b.addr.i = alloca <2 x i64>, align 16
+ %vCr = alloca <2 x i64>, align 16
+ store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+ %tmp = load <2 x i64>* %vCr, align 16
+ %tmp2 = load i8* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <16 x i8> undef, i8 %tmp2, i32 0
+ %splat.splat = shufflevector <16 x i8> %splat.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %tmp3 = bitcast <16 x i8> %splat.splat to <2 x i64>
+ store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+ store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_32b
+; CHECK: vpbroadcastb {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_32b(i8* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <4 x i64>, align 16
+ %__b.addr.i = alloca <4 x i64>, align 16
+ %vCr = alloca <4 x i64>, align 16
+ store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+ %tmp = load <4 x i64>* %vCr, align 16
+ %tmp2 = load i8* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <32 x i8> undef, i8 %tmp2, i32 0
+ %splat.splat = shufflevector <32 x i8> %splat.splatinsert, <32 x i8> undef, <32 x i32> zeroinitializer
+ %tmp3 = bitcast <32 x i8> %splat.splat to <4 x i64>
+ store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+ store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_8w
+; CHECK: vpbroadcastw {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_8w(i16* %cV_R.addr) {
+entry:
+ %__a.addr.i = alloca <2 x i64>, align 16
+ %__b.addr.i = alloca <2 x i64>, align 16
+ %vCr = alloca <2 x i64>, align 16
+ store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+ %tmp = load <2 x i64>* %vCr, align 16
+ %tmp2 = load i16* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <8 x i16> undef, i16 %tmp2, i32 0
+ %splat.splat = shufflevector <8 x i16> %splat.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %tmp3 = bitcast <8 x i16> %splat.splat to <2 x i64>
+ store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+ store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_16w
+; CHECK: vpbroadcastw {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_16w(i16* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <4 x i64>, align 16
+ %__b.addr.i = alloca <4 x i64>, align 16
+ %vCr = alloca <4 x i64>, align 16
+ store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+ %tmp = load <4 x i64>* %vCr, align 16
+ %tmp2 = load i16* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <16 x i16> undef, i16 %tmp2, i32 0
+ %splat.splat = shufflevector <16 x i16> %splat.splatinsert, <16 x i16> undef, <16 x i32> zeroinitializer
+ %tmp3 = bitcast <16 x i16> %splat.splat to <4 x i64>
+ store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+ store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_4d
+; CHECK: vbroadcastss {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_4d(i32* %cV_R.addr) {
+entry:
+ %__a.addr.i = alloca <2 x i64>, align 16
+ %__b.addr.i = alloca <2 x i64>, align 16
+ %vCr = alloca <2 x i64>, align 16
+ store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+ %tmp = load <2 x i64>* %vCr, align 16
+ %tmp2 = load i32* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <4 x i32> undef, i32 %tmp2, i32 0
+ %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %tmp3 = bitcast <4 x i32> %splat.splat to <2 x i64>
+ store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+ store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_8d
+; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_8d(i32* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <4 x i64>, align 16
+ %__b.addr.i = alloca <4 x i64>, align 16
+ %vCr = alloca <4 x i64>, align 16
+ store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+ %tmp = load <4 x i64>* %vCr, align 16
+ %tmp2 = load i32* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <8 x i32> undef, i32 %tmp2, i32 0
+ %splat.splat = shufflevector <8 x i32> %splat.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+ %tmp3 = bitcast <8 x i32> %splat.splat to <4 x i64>
+ store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+ store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_2q
+; CHECK: vpbroadcastq {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_2q(i64* %cV_R.addr) {
+entry:
+ %__a.addr.i = alloca <2 x i64>, align 16
+ %__b.addr.i = alloca <2 x i64>, align 16
+ %vCr = alloca <2 x i64>, align 16
+ store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+ %tmp = load <2 x i64>* %vCr, align 16
+ %tmp2 = load i64* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <2 x i64> undef, i64 %tmp2, i32 0
+ %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+ store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+ store <2 x i64> %splat.splat, <2 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_4q
+; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_4q(i64* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <4 x i64>, align 16
+ %__b.addr.i = alloca <4 x i64>, align 16
+ %vCr = alloca <4 x i64>, align 16
+ store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+ %tmp = load <4 x i64>* %vCr, align 16
+ %tmp2 = load i64* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <4 x i64> undef, i64 %tmp2, i32 0
+ %splat.splat = shufflevector <4 x i64> %splat.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+ store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+ store <4 x i64> %splat.splat, <4 x i64>* %__b.addr.i, align 16
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index 5592e6c..4ae2905 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -9,7 +9,7 @@ entry:
}
; CHECK-LABEL: test_sllw_1:
-; CHECK: vpsllw $0, %ymm0, %ymm0
+; CHECK-NOT: vpsllw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
@@ -24,12 +24,12 @@ entry:
define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
entry:
- %shl = shl <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
; CHECK-LABEL: test_sllw_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsllw $15, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
@@ -39,7 +39,7 @@ entry:
}
; CHECK-LABEL: test_slld_1:
-; CHECK: vpslld $0, %ymm0, %ymm0
+; CHECK-NOT: vpslld $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
@@ -54,12 +54,12 @@ entry:
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
entry:
- %shl = shl <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
; CHECK-LABEL: test_slld_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpslld $31, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
@@ -69,7 +69,7 @@ entry:
}
; CHECK-LABEL: test_sllq_1:
-; CHECK: vpsllq $0, %ymm0, %ymm0
+; CHECK-NOT: vpsllq $0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
@@ -84,12 +84,12 @@ entry:
define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
entry:
- %shl = shl <4 x i64> %InVec, <i64 64, i64 64, i64 64, i64 64>
+ %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
ret <4 x i64> %shl
}
; CHECK-LABEL: test_sllq_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsllq $63, %ymm0, %ymm0
; CHECK: ret
; AVX2 Arithmetic Shift
@@ -101,7 +101,7 @@ entry:
}
; CHECK-LABEL: test_sraw_1:
-; CHECK: vpsraw $0, %ymm0, %ymm0
+; CHECK-NOT: vpsraw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
@@ -116,7 +116,7 @@ entry:
define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
entry:
- %shl = ashr <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
@@ -131,7 +131,7 @@ entry:
}
; CHECK-LABEL: test_srad_1:
-; CHECK: vpsrad $0, %ymm0, %ymm0
+; CHECK-NOT: vpsrad $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
@@ -146,7 +146,7 @@ entry:
define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
entry:
- %shl = ashr <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
@@ -163,7 +163,7 @@ entry:
}
; CHECK-LABEL: test_srlw_1:
-; CHECK: vpsrlw $0, %ymm0, %ymm0
+; CHECK-NOT: vpsrlw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
@@ -178,12 +178,12 @@ entry:
define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
entry:
- %shl = lshr <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
; CHECK-LABEL: test_srlw_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsrlw $15, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
@@ -193,7 +193,7 @@ entry:
}
; CHECK-LABEL: test_srld_1:
-; CHECK: vpsrld $0, %ymm0, %ymm0
+; CHECK-NOT: vpsrld $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
@@ -208,12 +208,12 @@ entry:
define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
entry:
- %shl = lshr <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
; CHECK-LABEL: test_srld_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsrld $31, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
@@ -223,7 +223,7 @@ entry:
}
; CHECK-LABEL: test_srlq_1:
-; CHECK: vpsrlq $0, %ymm0, %ymm0
+; CHECK-NOT: vpsrlq $0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
@@ -238,10 +238,21 @@ entry:
define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
entry:
- %shl = lshr <4 x i64> %InVec, <i64 64, i64 64, i64 64, i64 64>
+ %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
ret <4 x i64> %shl
}
; CHECK-LABEL: test_srlq_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsrlq $63, %ymm0, %ymm0
; CHECK: ret
+
+; CHECK-LABEL: @srl_trunc_and_v4i64
+; CHECK: vpand
+; CHECK-NEXT: vpsrlvd
+; CHECK: ret
+define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
+ %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
+ %trunc = trunc <4 x i64> %and to <4 x i32>
+ %sra = lshr <4 x i32> %x, %trunc
+ ret <4 x i32> %sra
+}
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index e27600e..4d1c9f7 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
; CHECK-LABEL: addpd512
; CHECK: vaddpd
@@ -163,6 +163,40 @@ define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
ret <8 x i64> %x
}
+; CHECK-LABEL: vpaddq_fold_test
+; CHECK: vpaddq (%
+; CHECK: ret
+define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
+ %tmp = load <8 x i64>* %j, align 4
+ %x = add <8 x i64> %i, %tmp
+ ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq_broadcast_test
+; CHECK: vpaddq LCP{{.*}}(%rip){1to8}
+; CHECK: ret
+define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
+ %x = add <8 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq_broadcast2_test
+; CHECK: vpaddq (%rdi){1to8}
+; CHECK: ret
+define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
+ %tmp = load i64* %j
+ %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0
+ %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
+ %j.2 = insertelement <8 x i64> %j.1, i64 %tmp, i32 2
+ %j.3 = insertelement <8 x i64> %j.2, i64 %tmp, i32 3
+ %j.4 = insertelement <8 x i64> %j.3, i64 %tmp, i32 4
+ %j.5 = insertelement <8 x i64> %j.4, i64 %tmp, i32 5
+ %j.6 = insertelement <8 x i64> %j.5, i64 %tmp, i32 6
+ %j.7 = insertelement <8 x i64> %j.6, i64 %tmp, i32 7
+ %x = add <8 x i64> %i, %j.7
+ ret <8 x i64> %x
+}
+
; CHECK-LABEL: vpaddd_test
; CHECK: vpaddd %zmm
; CHECK: ret
@@ -171,6 +205,85 @@ define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
ret <16 x i32> %x
}
+; CHECK-LABEL: vpaddd_fold_test
+; CHECK: vpaddd (%
+; CHECK: ret
+define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
+ %tmp = load <16 x i32>* %j, align 4
+ %x = add <16 x i32> %i, %tmp
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
+; CHECK: ret
+define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
+ %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd_mask_test
+; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_maskz_test
+; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_mask_fold_test
+; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %j = load <16 x i32>* %j.ptr
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_mask_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_maskz_fold_test
+; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
+; CHECK: ret
+define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %j = load <16 x i32>* %j.ptr
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_maskz_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
+; CHECK: ret
+define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
; CHECK-LABEL: vpsubq_test
; CHECK: vpsubq %zmm
; CHECK: ret
@@ -196,7 +309,7 @@ define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
}
; CHECK-LABEL: sqrtA
-; CHECK: vsqrtssz
+; CHECK: vsqrtss {{.*}} encoding: [0x62
; CHECK: ret
declare float @sqrtf(float) readnone
define float @sqrtA(float %a) nounwind uwtable readnone ssp {
@@ -206,7 +319,7 @@ entry:
}
; CHECK-LABEL: sqrtB
-; CHECK: vsqrtsdz
+; CHECK: vsqrtsd {{.*}}## encoding: [0x62
; CHECK: ret
declare double @sqrt(double) readnone
define double @sqrtB(double %a) nounwind uwtable readnone ssp {
@@ -216,7 +329,7 @@ entry:
}
; CHECK-LABEL: sqrtC
-; CHECK: vsqrtssz
+; CHECK: vsqrtss {{.*}}## encoding: [0x62
; CHECK: ret
declare float @llvm.sqrt.f32(float)
define float @sqrtC(float %a) nounwind {
@@ -224,6 +337,24 @@ define float @sqrtC(float %a) nounwind {
ret float %b
}
+; CHECK-LABEL: sqrtD
+; CHECK: vsqrtps {{.*}}
+; CHECK: ret
+declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
+define <16 x float> @sqrtD(<16 x float> %a) nounwind {
+ %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: sqrtE
+; CHECK: vsqrtpd {{.*}}
+; CHECK: ret
+declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
+define <8 x double> @sqrtE(<8 x double> %a) nounwind {
+ %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
+ ret <8 x double> %b
+}
+
; CHECK-LABEL: fadd_broadcast
; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0
; CHECK: ret
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index bc4560b..b5a2aa8 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -15,4 +15,16 @@ define <16 x i32> @test1(i32* %x) {
define <16 x i32> @test2(<16 x i32> %x) {
%res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test3
+; CHECK: vinsertf128
+; CHECK: vinsertf64x4
+; CHECK: ret
+define <16 x float> @test3(<4 x float> %a) {
+ %b = extractelement <4 x float> %a, i32 2
+ %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
+ %b1 = extractelement <4 x float> %a, i32 0
+ %c1 = insertelement <16 x float> %c, float %b1, i32 6
+ ret <16 x float>%c1
} \ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index ba52745..47e50a9 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
-; CHECK: vucomisdz
+; CHECK-LABEL: test1
+; CHECK: vucomisd {{.*}}encoding: [0x62
define double @test1(double %a, double %b) nounwind {
%tobool = fcmp une double %a, %b
br i1 %tobool, label %l1, label %l2
@@ -13,7 +14,8 @@ l2:
ret double %c1
}
-; CHECK: vucomissz
+; CHECK-LABEL: test2
+; CHECK: vucomiss {{.*}}encoding: [0x62
define float @test2(float %a, float %b) nounwind {
%tobool = fcmp olt float %a, %b
br i1 %tobool, label %l1, label %l2
@@ -25,3 +27,62 @@ l2:
%c1 = fadd float %a, %b
ret float %c1
}
+
+; CHECK-LABEL: test3
+; CHECK: vcmpeqss
+; CHECK: kmov
+; CHECK: ret
+define i32 @test3(float %a, float %b) {
+
+ %cmp10.i = fcmp oeq float %a, %b
+ %conv11.i = zext i1 %cmp10.i to i32
+ ret i32 %conv11.i
+}
+
+; CHECK-LABEL: test5
+; CHECK: ret
+define float @test5(float %p) #0 {
+entry:
+ %cmp = fcmp oeq float %p, 0.000000e+00
+ br i1 %cmp, label %return, label %if.end
+
+if.end: ; preds = %entry
+ %cmp1 = fcmp ogt float %p, 0.000000e+00
+ %cond = select i1 %cmp1, float 1.000000e+00, float -1.000000e+00
+ br label %return
+
+return: ; preds = %if.end, %entry
+ %retval.0 = phi float [ %cond, %if.end ], [ %p, %entry ]
+ ret float %retval.0
+}
+
+; CHECK-LABEL: test6
+; CHECK: cmpl
+; CHECK-NOT: kmov
+; CHECK: ret
+define i32 @test6(i32 %a, i32 %b) {
+ %cmp = icmp eq i32 %a, %b
+ %res = zext i1 %cmp to i32
+ ret i32 %res
+}
+
+; CHECK-LABEL: test7
+; CHECK: vucomisd
+; CHECK-NOT: kmov
+; CHECK: ret
+define i32 @test7(double %x, double %y) #2 {
+entry:
+ %0 = fcmp one double %x, %y
+ %or = zext i1 %0 to i32
+ ret i32 %or
+}
+
+define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
+ %tmp1 = icmp eq i32 %a1, -1
+ %tmp2 = icmp eq i32 %a2, -2147483648
+ %tmp3 = and i1 %tmp1, %tmp2
+ %tmp4 = icmp eq i32 %a3, 0
+ %tmp5 = or i1 %tmp3, %tmp4
+ %res = select i1 %tmp5, i32 1, i32 %a3
+ ret i32 %res
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index ed68ff7..1d83485 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
; CHECK-LABEL: sitof32
; CHECK: vcvtdq2ps %zmm
@@ -67,7 +67,7 @@ define <8 x double> @fpext00(<8 x float> %b) nounwind {
}
; CHECK-LABEL: funcA
-; CHECK: vcvtsi2sdqz (%
+; CHECK: vcvtsi2sdq (%rdi){{.*}} encoding: [0x62
; CHECK: ret
define double @funcA(i64* nocapture %e) {
entry:
@@ -77,7 +77,7 @@ entry:
}
; CHECK-LABEL: funcB
-; CHECK: vcvtsi2sdlz (%
+; CHECK: vcvtsi2sdl (%{{.*}} encoding: [0x62
; CHECK: ret
define double @funcB(i32* %e) {
entry:
@@ -87,7 +87,7 @@ entry:
}
; CHECK-LABEL: funcC
-; CHECK: vcvtsi2sslz (%
+; CHECK: vcvtsi2ssl (%{{.*}} encoding: [0x62
; CHECK: ret
define float @funcC(i32* %e) {
entry:
@@ -97,7 +97,7 @@ entry:
}
; CHECK-LABEL: i64tof32
-; CHECK: vcvtsi2ssqz (%
+; CHECK: vcvtsi2ssq (%{{.*}} encoding: [0x62
; CHECK: ret
define float @i64tof32(i64* %e) {
entry:
@@ -107,7 +107,7 @@ entry:
}
; CHECK-LABEL: fpext
-; CHECK: vcvtss2sdz
+; CHECK: vcvtss2sd {{.*}} encoding: [0x62
; CHECK: ret
define void @fpext() {
entry:
@@ -120,9 +120,9 @@ entry:
}
; CHECK-LABEL: fpround_scalar
-; CHECK: vmovsdz
-; CHECK: vcvtsd2ssz
-; CHECK: vmovssz
+; CHECK: vmovsd {{.*}} encoding: [0x62
+; CHECK: vcvtsd2ss {{.*}} encoding: [0x62
+; CHECK: vmovss {{.*}} encoding: [0x62
; CHECK: ret
define void @fpround_scalar() nounwind uwtable {
entry:
@@ -135,7 +135,7 @@ entry:
}
; CHECK-LABEL: long_to_double
-; CHECK: vmovqz
+; CHECK: vmovq {{.*}} encoding: [0x62
; CHECK: ret
define double @long_to_double(i64 %x) {
%res = bitcast i64 %x to double
@@ -143,7 +143,7 @@ define double @long_to_double(i64 %x) {
}
; CHECK-LABEL: double_to_long
-; CHECK: vmovqz
+; CHECK: vmovq {{.*}} encoding: [0x62
; CHECK: ret
define i64 @double_to_long(double %x) {
%res = bitcast double %x to i64
@@ -151,7 +151,7 @@ define i64 @double_to_long(double %x) {
}
; CHECK-LABEL: int_to_float
-; CHECK: vmovdz
+; CHECK: vmovd {{.*}} encoding: [0x62
; CHECK: ret
define float @int_to_float(i32 %x) {
%res = bitcast i32 %x to float
@@ -159,7 +159,7 @@ define float @int_to_float(i32 %x) {
}
; CHECK-LABEL: float_to_int
-; CHECK: vmovdz
+; CHECK: vmovd {{.*}} encoding: [0x62
; CHECK: ret
define i32 @float_to_int(float %x) {
%res = bitcast float %x to i32
@@ -185,7 +185,7 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind {
}
; CHECK-LABEL: @fptosi02
-; CHECK vcvttss2siz
+; CHECK: vcvttss2si {{.*}} encoding: [0x62
; CHECK: ret
define i32 @fptosi02(float %a) nounwind {
%b = fptosi float %a to i32
@@ -193,7 +193,7 @@ define i32 @fptosi02(float %a) nounwind {
}
; CHECK-LABEL: @fptoui02
-; CHECK vcvttss2usiz
+; CHECK: vcvttss2usi {{.*}} encoding: [0x62
; CHECK: ret
define i32 @fptoui02(float %a) nounwind {
%b = fptoui float %a to i32
@@ -201,7 +201,7 @@ define i32 @fptoui02(float %a) nounwind {
}
; CHECK-LABEL: @uitofp02
-; CHECK vcvtusi2ss
+; CHECK: vcvtusi2ss
; CHECK: ret
define float @uitofp02(i32 %a) nounwind {
%b = uitofp i32 %a to float
@@ -209,7 +209,7 @@ define float @uitofp02(i32 %a) nounwind {
}
; CHECK-LABEL: @uitofp03
-; CHECK vcvtusi2sd
+; CHECK: vcvtusi2sd
; CHECK: ret
define double @uitofp03(i32 %a) nounwind {
%b = uitofp i32 %a to double
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 0321e95..e429a22 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -223,3 +223,81 @@ define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf) {
call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
ret void
}
+
+;CHECK-LABEL: gather_mask_dpd_execdomain
+;CHECK: vgatherdpd
+;CHECK: vmovapd
+;CHECK: ret
+define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ store <8 x double> %x, <8 x double>* %stbuf
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_qpd_execdomain
+;CHECK: vgatherqpd
+;CHECK: vmovapd
+;CHECK: ret
+define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ store <8 x double> %x, <8 x double>* %stbuf
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_dps_execdomain
+;CHECK: vgatherdps
+;CHECK: vmovaps
+;CHECK: ret
+define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
+ %res = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ ret <16 x float> %res;
+}
+
+;CHECK-LABEL: gather_mask_qps_execdomain
+;CHECK: vgatherqps
+;CHECK: vmovaps
+;CHECK: ret
+define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
+ %res = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ ret <8 x float> %res;
+}
+
+;CHECK-LABEL: scatter_mask_dpd_execdomain
+;CHECK: vmovapd
+;CHECK: vscatterdpd
+;CHECK: ret
+define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = load <8 x double>* %src, align 64
+ call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: scatter_mask_qpd_execdomain
+;CHECK: vmovapd
+;CHECK: vscatterqpd
+;CHECK: ret
+define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = load <8 x double>* %src, align 64
+ call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: scatter_mask_dps_execdomain
+;CHECK: vmovaps
+;CHECK: vscatterdps
+;CHECK: ret
+define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
+ %x = load <16 x float>* %src, align 64
+ call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: scatter_mask_qps_execdomain
+;CHECK: vmovaps
+;CHECK: vscatterqps
+;CHECK: ret
+define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = load <8 x float>* %src, align 32
+ call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 3f06740..6557ac3 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -44,7 +44,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
}
;CHECK-LABEL: test5:
-;CHECK: vextractpsz
+;CHECK: vextractps
;CHECK: ret
define i32 @test5(<4 x float> %x) nounwind {
%ef = extractelement <4 x float> %x, i32 3
@@ -53,7 +53,7 @@ define i32 @test5(<4 x float> %x) nounwind {
}
;CHECK-LABEL: test6:
-;CHECK: vextractpsz {{.*}}, (%rdi)
+;CHECK: vextractps {{.*}}, (%rdi)
;CHECK: ret
define void @test6(<4 x float> %x, float* %out) nounwind {
%ef = extractelement <4 x float> %x, i32 3
@@ -62,7 +62,7 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
}
;CHECK-LABEL: test7
-;CHECK: vmovdz
+;CHECK: vmovd
;CHECK: vpermps %zmm
;CHECK: ret
define float @test7(<16 x float> %x, i32 %ind) nounwind {
@@ -71,7 +71,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
}
;CHECK-LABEL: test8
-;CHECK: vmovqz
+;CHECK: vmovq
;CHECK: vpermpd %zmm
;CHECK: ret
define double @test8(<8 x double> %x, i32 %ind) nounwind {
@@ -89,9 +89,9 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
}
;CHECK-LABEL: test10
-;CHECK: vmovdz
+;CHECK: vmovd
;CHECK: vpermd %zmm
-;CHEKK: vmovdz %xmm0, %eax
+;CHECK: vmovd %xmm0, %eax
;CHECK: ret
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
%e = extractelement <16 x i32> %x, i32 %ind
@@ -99,27 +99,62 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
}
;CHECK-LABEL: test11
-;CHECK: movl $260
-;CHECK: bextrl
-;CHECK: movl $268
-;CHECK: bextrl
+;CHECK: vpcmpltud
+;CHECK: kshiftlw $11
+;CHECK: kshiftrw $15
+;CHECK: kortestw
+;CHECK: je
+;CHECK: ret
;CHECK: ret
define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
%cmp_res = icmp ult <16 x i32> %a, %b
%ia = extractelement <16 x i1> %cmp_res, i32 4
- %ib = extractelement <16 x i1> %cmp_res, i32 12
-
br i1 %ia, label %A, label %B
-
A:
ret <16 x i32>%b
B:
%c = add <16 x i32>%b, %a
- br i1 %ib, label %C, label %D
- C:
- %c1 = sub <16 x i32>%c, %a
- ret <16 x i32>%c1
- D:
- %c2 = mul <16 x i32>%c, %a
- ret <16 x i32>%c2
+ ret <16 x i32>%c
+}
+
+;CHECK-LABEL: test12
+;CHECK: vpcmpgtq
+;CHECK: kshiftlw $15
+;CHECK: kshiftrw $15
+;CHECK: kortestw
+;CHECK: ret
+
+define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
+
+ %cmpvector_func.i = icmp slt <16 x i64> %a, %b
+ %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
+ %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
+ ret i64 %res
+}
+
+;CHECK-LABEL: test13
+;CHECK: cmpl
+;CHECK: sbbl
+;CHECK: orl $65532
+;CHECK: ret
+define i16 @test13(i32 %a, i32 %b) {
+ %cmp_res = icmp ult i32 %a, %b
+ %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
+ %res = bitcast <16 x i1> %maskv to i16
+ ret i16 %res
+}
+
+;CHECK-LABEL: test14
+;CHECK: vpcmpgtq
+;CHECK: kshiftlw $11
+;CHECK: kshiftrw $15
+;CHECK: kortestw
+;CHECK: ret
+
+define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
+
+ %cmpvector_func.i = icmp slt <8 x i64> %a, %b
+ %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
+ %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
+ ret i64 %res
}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 5bdabf2..3fb38ed 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1,108 +1,136 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
-declare i32 @llvm.x86.avx512.kortestz(i16, i16) nounwind readnone
-; CHECK: test_kortestz
+declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
+; CHECK-LABEL: test_kortestz
; CHECK: kortestw
; CHECK: sete
define i32 @test_kortestz(i16 %a0, i16 %a1) {
- %res = call i32 @llvm.x86.avx512.kortestz(i16 %a0, i16 %a1)
+ %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
ret i32 %res
}
-declare i32 @llvm.x86.avx512.kortestc(i16, i16) nounwind readnone
-; CHECK: test_kortestc
+declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
+; CHECK-LABEL: test_kortestc
; CHECK: kortestw
; CHECK: sbbl
define i32 @test_kortestc(i16 %a0, i16 %a1) {
- %res = call i32 @llvm.x86.avx512.kortestc(i16 %a0, i16 %a1)
+ %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1)
ret i32 %res
}
+declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
+; CHECK-LABEL: test_kand
+; CHECK: kandw
+; CHECK: kandw
+define i16 @test_kand(i16 %a0, i16 %a1) {
+ %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
+ %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
+ ret i16 %t2
+}
+
+declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
+; CHECK-LABEL: test_knot
+; CHECK: knotw
+define i16 @test_knot(i16 %a0) {
+ %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
+
+; CHECK-LABEL: unpckbw_test
+; CHECK: kunpckbw
+; CHECK:ret
+define i16 @unpckbw_test(i16 %a0, i16 %a1) {
+ %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
+ ret i16 %res
+}
+
define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
- ; CHECK: vrcp14ps
- %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ; CHECK: vrcp14ps {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x4c,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
- ; CHECK: vrcp14pd
- %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+ ; CHECK: vrcp14pd {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x4c,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
- ; CHECK: vrcp28ps
- %res = call <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
- ; CHECK: vrcp28pd
- %res = call <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+ ; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1]
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-define <8 x double> @test_rndscale_pd_512(<8 x double> %a0) {
- ; CHECK: vrndscale
- %res = call <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double> %a0, i32 7) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
+declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+define <8 x double> @test7(<8 x double> %a) {
+; CHECK: vrndscalepd {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x0b]
+ %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> zeroinitializer, i8 -1, i32 4)
+ ret <8 x double>%res
}
-declare <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double>, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
-define <16 x float> @test_rndscale_ps_512(<16 x float> %a0) {
- ; CHECK: vrndscale
- %res = call <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float> %a0, i32 7) ; <<16 x float>> [#uses=1]
- ret <16 x float> %res
+define <16 x float> @test8(<16 x float> %a) {
+; CHECK: vrndscaleps {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x0b]
+ %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> zeroinitializer, i16 -1, i32 4)
+ ret <16 x float>%res
}
-declare <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float>, i32) nounwind readnone
-
define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
- ; CHECK: vrsqrt14ps
- %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ; CHECK: vrsqrt14ps {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x4e,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
- ; CHECK: vrsqrt28ps
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
- ; CHECK: vrsqrt14ss
- %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
- ; CHECK: vrsqrt28ss
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
- ; CHECK: vrcp14ss
- %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
- ; CHECK: vrcp28ss
- %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK: vsqrtpd
@@ -119,42 +147,42 @@ define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone
define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: vsqrtssz
+ ; CHECK: vsqrtss {{.*}}encoding: [0x62
%res = call <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float>, <4 x float>) nounwind readnone
define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: vsqrtsdz
+ ; CHECK: vsqrtsd {{.*}}encoding: [0x62
%res = call <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double>, <2 x double>) nounwind readnone
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
- ; CHECK: vcvtsd2siz
+ ; CHECK: vcvtsd2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
ret i64 %res
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
- ; CHECK: vcvtsi2sdqz
+ ; CHECK: vcvtsi2sdq {{.*}}encoding: [0x62
%res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
define <2 x double> @test_x86_avx512_cvtusi642sd(<2 x double> %a0, i64 %a1) {
- ; CHECK: vcvtusi2sdqz
+ ; CHECK: vcvtusi2sdq {{.*}}encoding: [0x62
%res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64) nounwind readnone
define i64 @test_x86_sse2_cvttsd2si64(<2 x double> %a0) {
- ; CHECK: vcvttsd2siz
+ ; CHECK: vcvttsd2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
ret i64 %res
}
@@ -162,7 +190,7 @@ declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
- ; CHECK: vcvtss2siz
+ ; CHECK: vcvtss2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
ret i64 %res
}
@@ -170,7 +198,7 @@ declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
- ; CHECK: vcvtsi2ssqz
+ ; CHECK: vcvtsi2ssq {{.*}}encoding: [0x62
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -178,33 +206,34 @@ declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
- ; CHECK: vcvttss2siz
+ ; CHECK: vcvttss2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
ret i64 %res
}
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
- ; CHECK: vcvtsd2usiz
+ ; CHECK: vcvtsd2usi {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
ret i64 %res
}
declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone
define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
- ; CHECK: vcvtph2ps
- %res = call <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16> %a0)
+ ; CHECK: vcvtph2ps %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x13,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16>) nounwind readonly
+declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
- ; CHECK: vcvtps2ph
- %res = call <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float> %a0, i32 0)
+ ; CHECK: vcvtps2ph $2, %zmm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1d,0xc0,0x02]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
-declare <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float>, i32) nounwind readonly
+
+declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly
define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
; CHECK: vbroadcastss
@@ -262,113 +291,249 @@ define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
}
declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
-define <16 x i32> @test_x86_pmaxu_d(<16 x i32> %a0, <16 x i32> %a1) {
- ; CHECK: vpmaxud
- %res = call <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+define <16 x i32> @test_conflict_d(<16 x i32> %a) {
+ ; CHECK: movw $-1, %ax
+ ; CHECK: vpxor
+ ; CHECK: vpconflictd
+ %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32>, <16 x i32>) nounwind readonly
-define <8 x i64> @test_x86_pmaxu_q(<8 x i64> %a0, <8 x i64> %a1) {
- ; CHECK: vpmaxuq
- %res = call <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
+
+define <8 x i64> @test_conflict_q(<8 x i64> %a) {
+ ; CHECK: movb $-1, %al
+ ; CHECK: vpxor
+ ; CHECK: vpconflictq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64>, <8 x i64>) nounwind readonly
-define <16 x i32> @test_x86_pmaxs_d(<16 x i32> %a0, <16 x i32> %a1) {
- ; CHECK: vpmaxsd
- %res = call <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+
+
+define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
+ ; CHECK: vpconflictd
+ %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32>, <16 x i32>) nounwind readonly
-define <8 x i64> @test_x86_pmaxs_q(<8 x i64> %a0, <8 x i64> %a1) {
- ; CHECK: vpmaxsq
- %res = call <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+ ; CHECK: vpconflictq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64>, <8 x i64>) nounwind readonly
-define <16 x i32> @test_x86_pminu_d(<16 x i32> %a0, <16 x i32> %a1) {
- ; CHECK: vpminud
- %res = call <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK: vblendmps
+ %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly
+
+define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK: vblendmpd
+ %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
+ ; CHECK-LABEL: test_x86_mask_blend_pd_512_memop
+ ; CHECK: vblendmpd (%
+ %b = load <8 x double>* %ptr
+ %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a, <8 x double> %b, i8 %mask) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8) nounwind readonly
+
+define <16 x i32> @test_x86_mask_blend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
+ ; CHECK: vpblendmd
+ %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32> %a1, <16 x i32> %a2, i16 %a0) ; <<16 x i32>> [#uses=1]
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32>, <16 x i32>) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
-define <8 x i64> @test_x86_pminu_q(<8 x i64> %a0, <8 x i64> %a1) {
- ; CHECK: vpminuq
- %res = call <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+ ; CHECK: vpblendmq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64>, <8 x i64>) nounwind readonly
+declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+
+ define <8 x i32> @test_cvtpd2udq(<8 x double> %a) {
+ ;CHECK: vcvtpd2udq {ru-sae}{{.*}}encoding: [0x62,0xf1,0xfc,0x58,0x79,0xc0]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %a, <8 x i32>zeroinitializer, i8 -1, i32 2)
+ ret <8 x i32>%res
+ }
+ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
+
+ define <16 x i32> @test_cvtps2udq(<16 x float> %a) {
+ ;CHECK: vcvtps2udq {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x79,0xc0]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %a, <16 x i32>zeroinitializer, i16 -1, i32 1)
+ ret <16 x i32>%res
+ }
+ declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
+
+ define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
+ ;CHECK: vcmpleps {sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x02]
+ %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
+ ret i16 %res
+ }
+ declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
+
+ define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
+ ;CHECK: vcmpneqpd %zmm{{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc1,0x04]
+ %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
+ ret i8 %res
+ }
+ declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
+
+ ; cvt intrinsics
+ define <16 x float> @test_cvtdq2ps(<16 x i32> %a) {
+ ;CHECK: vcvtdq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x5b,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1)
+ ret <16 x float>%res
+ }
+ declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)
+
+ define <16 x float> @test_cvtudq2ps(<16 x i32> %a) {
+ ;CHECK: vcvtudq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7f,0x38,0x7a,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1)
+ ret <16 x float>%res
+ }
+ declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32)
+
+ define <8 x double> @test_cvtdq2pd(<8 x i32> %a) {
+ ;CHECK: vcvtdq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1)
+ ret <8 x double>%res
+ }
+ declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
+
+ define <8 x double> @test_cvtudq2pd(<8 x i32> %a) {
+ ;CHECK: vcvtudq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1)
+ ret <8 x double>%res
+ }
+ declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
+
+ ; fp min - max
+define <16 x float> @test_vmaxps(<16 x float> %a0, <16 x float> %a1) {
+ ; CHECK: vmaxps
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1,
+ <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>,
+ <16 x float>, i16, i32)
-define <16 x i32> @test_x86_pmins_d(<16 x i32> %a0, <16 x i32> %a1) {
- ; CHECK: vpminsd
- %res = call <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
- ret <16 x i32> %res
+define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
+ ; CHECK: vmaxpd
+ %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
+ <8 x double>zeroinitializer, i8 -1, i32 4)
+ ret <8 x double> %res
}
-declare <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32>, <16 x i32>) nounwind readonly
+declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
+ <8 x double>, i8, i32)
-define <8 x i64> @test_x86_pmins_q(<8 x i64> %a0, <8 x i64> %a1) {
- ; CHECK: vpminsq
- %res = call <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+define <16 x float> @test_vminps(<16 x float> %a0, <16 x float> %a1) {
+ ; CHECK: vminps
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1,
+ <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>,
+ <16 x float>, i16, i32)
+
+define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
+ ; CHECK: vminpd
+ %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
+ <8 x double>zeroinitializer, i8 -1, i32 4)
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
+ <8 x double>, i8, i32)
+
+ define <8 x float> @test_cvtpd2ps(<8 x double> %a) {
+ ;CHECK: vcvtpd2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0xfd,0x38,0x5a,0xc0]
+ %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %a, <8 x float>zeroinitializer, i8 -1, i32 1)
+ ret <8 x float>%res
+ }
+ declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
+
+ define <16 x i32> @test_pabsd(<16 x i32> %a) {
+ ;CHECK: vpabsd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xc0]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %a, <16 x i32>zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+ }
+ declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
+
+ define <8 x i64> @test_pabsq(<8 x i64> %a) {
+ ;CHECK: vpabsq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xc0]
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %a, <8 x i64>zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+ }
+ declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_vpmaxq(<8 x i64> %a0, <8 x i64> %a1) {
+ ; CHECK: vpmaxsq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xc1]
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %a0, <8 x i64> %a1,
+ <8 x i64>zeroinitializer, i8 -1)
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64>, <8 x i64>) nounwind readonly
+declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-define <16 x i32> @test_conflict_d(<16 x i32> %a) {
- ; CHECK: vpconflictd
- %res = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a)
+define <16 x i32> @test_vpminud(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpminud {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xc1]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %a0, <16 x i32> %a1,
+ <16 x i32>zeroinitializer, i16 -1)
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
- ; CHECK: vpconflictd %zmm0, %zmm0 {%k1} {z}
- %vmask = bitcast i16 %mask to <16 x i1>
- %res = call <16 x i32> @llvm.x86.avx512.conflict.d.maskz.512(<16 x i1> %vmask, <16 x i32> %a)
+define <16 x i32> @test_vpmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpmaxsd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xc1]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %a0, <16 x i32> %a1,
+ <16 x i32>zeroinitializer, i16 -1)
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.conflict.d.maskz.512(<16 x i1>,<16 x i32>) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
- ; CHECK: vpconflictq {{.*}} {%k1}
- %vmask = bitcast i8 %mask to <8 x i1>
- %res = call <8 x i64> @llvm.x86.avx512.conflict.q.mask.512(<8 x i64> %b, <8 x i1> %vmask, <8 x i64> %a)
+define <8 x i64> @test_vpmuludq(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpmuludq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a0, <16 x i32> %a1,
+ <8 x i64>zeroinitializer, i8 -1)
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.conflict.q.mask.512(<8 x i64>, <8 x i1>,<8 x i64>) nounwind readonly
+declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
-define <16 x float> @test_x86_mskblend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK: vblendmps
- %m0 = bitcast i16 %a0 to <16 x i1>
- %res = call <16 x float> @llvm.x86.avx512.mskblend.ps.512(<16 x i1> %m0, <16 x float> %a1, <16 x float> %a2) ; <<16 x float>> [#uses=1]
- ret <16 x float> %res
+define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
+ ; CHECK: vptestmq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc1]
+ %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
+ ret i8 %res
}
-declare <16 x float> @llvm.x86.avx512.mskblend.ps.512(<16 x i1> %a0, <16 x float> %a1, <16 x float> %a2) nounwind readonly
+declare i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
-define <8 x double> @test_x86_mskblend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK: vblendmpd
- %m0 = bitcast i8 %a0 to <8 x i1>
- %res = call <8 x double> @llvm.x86.avx512.mskblend.pd.512(<8 x i1> %m0, <8 x double> %a1, <8 x double> %a2) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
+define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vptestmd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc1]
+ %res = call i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
+ ret i16 %res
}
-declare <8 x double> @llvm.x86.avx512.mskblend.pd.512(<8 x i1> %a0, <8 x double> %a1, <8 x double> %a2) nounwind readonly
+declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
-define <16 x i32> @test_x86_mskblend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
- ; CHECK: vpblendmd
- %m0 = bitcast i16 %a0 to <16 x i1>
- %res = call <16 x i32> @llvm.x86.avx512.mskblend.d.512(<16 x i1> %m0, <16 x i32> %a1, <16 x i32> %a2) ; <<16 x i32>> [#uses=1]
- ret <16 x i32> %res
+define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK: vmovups {{.*}}encoding: [0x62,0xf1,0x7c,0x49,0x11,0x07]
+ call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+ ret void
}
-declare <16 x i32> @llvm.x86.avx512.mskblend.d.512(<16 x i1> %a0, <16 x i32> %a1, <16 x i32> %a2) nounwind readonly
-define <8 x i64> @test_x86_mskblend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
- ; CHECK: vpblendmq
- %m0 = bitcast i8 %a0 to <8 x i1>
- %res = call <8 x i64> @llvm.x86.avx512.mskblend.q.512(<8 x i1> %m0, <8 x i64> %a1, <8 x i64> %a2) ; <<8 x i64>> [#uses=1]
- ret <8 x i64> %res
+declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
+
+define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK: vmovupd {{.*}}encoding: [0x62,0xf1,0xfd,0x49,0x11,0x07]
+ call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+ ret void
}
-declare <8 x i64> @llvm.x86.avx512.mskblend.q.512(<8 x i1> %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind readonly
+
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 ) \ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index ef5cb56..dd33ffd 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -33,19 +33,6 @@ define i16 @mand16(i16 %x, i16 %y) {
ret i16 %ret
}
-; CHECK: unpckbw_test
-; CHECK: kunpckbw
-; CHECK:ret
-declare <16 x i1> @llvm.x86.kunpck.v16i1(<8 x i1>, <8 x i1>) nounwind readnone
-
-define i16 @unpckbw_test(i8 %x, i8 %y) {
- %m0 = bitcast i8 %x to <8 x i1>
- %m1 = bitcast i8 %y to <8 x i1>
- %k = tail call <16 x i1> @llvm.x86.kunpck.v16i1(<8 x i1> %m0, <8 x i1> %m1)
- %r = bitcast <16 x i1> %k to i16
- ret i16 %r
-}
-
; CHECK: shuf_test1
; CHECK: kshiftrw $8
; CHECK:ret
@@ -55,3 +42,39 @@ define i8 @shuf_test1(i16 %v) nounwind {
%mask1 = bitcast <8 x i1> %mask to i8
ret i8 %mask1
}
+
+; CHECK: zext_test1
+; CHECK: kshiftlw
+; CHECK: kshiftrw
+; CHECK: kmovw
+; CHECK:ret
+define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i32
+ ret i32 %res
+}
+
+; CHECK: zext_test2
+; CHECK: kshiftlw
+; CHECK: kshiftrw
+; CHECK: kmovw
+; CHECK:ret
+define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i16
+ ret i16 %res
+}
+
+; CHECK: zext_test3
+; CHECK: kshiftlw
+; CHECK: kshiftrw
+; CHECK: kmovw
+; CHECK:ret
+define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i8
+ ret i8 %res
+}
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 91242b1..13e6843 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
; CHECK-LABEL: @test1
-; CHECK: vmovdz %xmm0, %eax
+; CHECK: vmovd %xmm0, %eax ## encoding: [0x62
; CHECK: ret
define i32 @test1(float %x) {
%res = bitcast float %x to i32
@@ -9,7 +9,7 @@ define i32 @test1(float %x) {
}
; CHECK-LABEL: @test2
-; CHECK: vmovdz %edi
+; CHECK: vmovd %edi, %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test2(i32 %x) {
%res = insertelement <4 x i32>undef, i32 %x, i32 0
@@ -17,7 +17,7 @@ define <4 x i32> @test2(i32 %x) {
}
; CHECK-LABEL: @test3
-; CHECK: vmovqz %rdi
+; CHECK: vmovq %rdi, %xmm0 ## encoding: [0x62
; CHECK: ret
define <2 x i64> @test3(i64 %x) {
%res = insertelement <2 x i64>undef, i64 %x, i32 0
@@ -25,7 +25,7 @@ define <2 x i64> @test3(i64 %x) {
}
; CHECK-LABEL: @test4
-; CHECK: vmovdz (%rdi)
+; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test4(i32* %x) {
%y = load i32* %x
@@ -34,7 +34,7 @@ define <4 x i32> @test4(i32* %x) {
}
; CHECK-LABEL: @test5
-; CHECK: vmovssz %xmm0, (%rdi)
+; CHECK: vmovss %xmm0, (%rdi) ## encoding: [0x62
; CHECK: ret
define void @test5(float %x, float* %y) {
store float %x, float* %y, align 4
@@ -42,7 +42,7 @@ define void @test5(float %x, float* %y) {
}
; CHECK-LABEL: @test6
-; CHECK: vmovsdz %xmm0, (%rdi)
+; CHECK: vmovsd %xmm0, (%rdi) ## encoding: [0x62
; CHECK: ret
define void @test6(double %x, double* %y) {
store double %x, double* %y, align 8
@@ -50,7 +50,7 @@ define void @test6(double %x, double* %y) {
}
; CHECK-LABEL: @test7
-; CHECK: vmovssz (%rdi), %xmm0
+; CHECK: vmovss (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define float @test7(i32* %x) {
%y = load i32* %x
@@ -59,7 +59,7 @@ define float @test7(i32* %x) {
}
; CHECK-LABEL: @test8
-; CHECK: vmovdz %xmm0, %eax
+; CHECK: vmovd %xmm0, %eax ## encoding: [0x62
; CHECK: ret
define i32 @test8(<4 x i32> %x) {
%res = extractelement <4 x i32> %x, i32 0
@@ -67,7 +67,7 @@ define i32 @test8(<4 x i32> %x) {
}
; CHECK-LABEL: @test9
-; CHECK: vmovqz %xmm0, %rax
+; CHECK: vmovq %xmm0, %rax ## encoding: [0x62
; CHECK: ret
define i64 @test9(<2 x i64> %x) {
%res = extractelement <2 x i64> %x, i32 0
@@ -75,7 +75,7 @@ define i64 @test9(<2 x i64> %x) {
}
; CHECK-LABEL: @test10
-; CHECK: vmovdz (%rdi)
+; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test10(i32* %x) {
%y = load i32* %x, align 4
@@ -84,7 +84,7 @@ define <4 x i32> @test10(i32* %x) {
}
; CHECK-LABEL: @test11
-; CHECK: vmovssz (%rdi)
+; CHECK: vmovss (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x float> @test11(float* %x) {
%y = load float* %x, align 4
@@ -93,7 +93,7 @@ define <4 x float> @test11(float* %x) {
}
; CHECK-LABEL: @test12
-; CHECK: vmovsdz (%rdi)
+; CHECK: vmovsd (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <2 x double> @test12(double* %x) {
%y = load double* %x, align 8
@@ -102,7 +102,7 @@ define <2 x double> @test12(double* %x) {
}
; CHECK-LABEL: @test13
-; CHECK: vmovqz %rdi
+; CHECK: vmovq %rdi, %xmm0 ## encoding: [0x62
; CHECK: ret
define <2 x i64> @test13(i64 %x) {
%res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
@@ -110,7 +110,7 @@ define <2 x i64> @test13(i64 %x) {
}
; CHECK-LABEL: @test14
-; CHECK: vmovdz %edi
+; CHECK: vmovd %edi, %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test14(i32 %x) {
%res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
@@ -118,7 +118,7 @@ define <4 x i32> @test14(i32 %x) {
}
; CHECK-LABEL: @test15
-; CHECK: vmovdz (%rdi)
+; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test15(i32* %x) {
%y = load i32* %x, align 4
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index d2d6681..83f4698 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -20,3 +20,22 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
ret <8 x i64> %res
}
+; CHECK-LABEL: @select02
+; CHECK: cmpless %xmm0, %xmm3, %k1
+; CHECK-NEXT: vmovss %xmm2, {{.*}}%xmm1 {%k1}
+; CHECK: ret
+define float @select02(float %a, float %b, float %c, float %eps) {
+ %cmp = fcmp oge float %a, %eps
+ %cond = select i1 %cmp, float %c, float %b
+ ret float %cond
+}
+
+; CHECK-LABEL: @select03
+; CHECK: cmplesd %xmm0, %xmm3, %k1
+; CHECK-NEXT: vmovsd %xmm2, {{.*}}%xmm1 {%k1}
+; CHECK: ret
+define double @select03(double %a, double %b, double %c, double %eps) {
+ %cmp = fcmp oge double %a, %eps
+ %cond = select i1 %cmp, double %c, double %b
+ ret double %cond
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index c9e0c2b..59d7010 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
; CHECK: LCP
; CHECK: .long 2
; CHECK: .long 5
@@ -49,7 +49,7 @@ define <8 x double> @test4(<8 x double> %a) nounwind {
}
; CHECK-LABEL: test5:
-; CHECK: vpermi2pd
+; CHECK: vpermt2pd
; CHECK: ret
define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
@@ -65,7 +65,7 @@ define <8 x i64> @test6(<8 x i64> %a) nounwind {
}
; CHECK-LABEL: test7:
-; CHECK: vpermi2q
+; CHECK: vpermt2q
; CHECK: ret
define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
%c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
@@ -73,7 +73,7 @@ define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
}
; CHECK-LABEL: test8:
-; CHECK: vpermi2d
+; CHECK: vpermt2d
; CHECK: ret
define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@@ -81,7 +81,7 @@ define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
}
; CHECK-LABEL: test9:
-; CHECK: vpermi2ps
+; CHECK: vpermt2ps
; CHECK: ret
define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
%c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@@ -89,7 +89,7 @@ define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
}
; CHECK-LABEL: test10:
-; CHECK: vpermi2ps (
+; CHECK: vpermt2ps (
; CHECK: ret
define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
%c = load <16 x float>* %b
@@ -98,7 +98,7 @@ define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
}
; CHECK-LABEL: test11:
-; CHECK: vpermi2d (
+; CHECK: vpermt2d
; CHECK: ret
define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
%c = load <16 x i32>* %b
@@ -107,7 +107,7 @@ define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
}
; CHECK-LABEL: test12
-; CHECK: vmovlhpsz %xmm
+; CHECK: vmovlhps {{.*}}## encoding: [0x62
; CHECK: ret
define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -186,7 +186,7 @@ define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
}
; CHECK-LABEL: test22
-; CHECK: vmovhlpsz %xmm
+; CHECK: vmovhlps {{.*}}## encoding: [0x62
; CHECK: ret
define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -202,7 +202,7 @@ define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
}
; CHECK-LABEL: @test24
-; CHECK: vpermi2d
+; CHECK: vpermt2d
; CHECK: ret
define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -223,4 +223,11 @@ define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
define <16 x i32> @test26(<16 x i32> %a) nounwind {
%c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
ret <16 x i32> %c
-} \ No newline at end of file
+}
+
+; CHECK-LABEL: @test27
+; CHECK: ret
+define <16 x i32> @test27(<4 x i32>%a) {
+ %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i32> %res
+}
diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll
index 31db68c..5e097be 100644
--- a/test/CodeGen/X86/avx512-trunc-ext.ll
+++ b/test/CodeGen/X86/avx512-trunc-ext.ll
@@ -18,7 +18,7 @@ define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) nounwind readnone {
; CHECK-LABEL: zext_16x8_to_16x32
-; CHECK; vpmovzxbd {{.*}}%zmm
+; CHECK: vpmovzxbd {{.*}}%zmm
; CHECK: ret
define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
%x = zext <16 x i8> %i to <16 x i32>
@@ -26,7 +26,7 @@ define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
}
; CHECK-LABEL: sext_16x8_to_16x32
-; CHECK; vpmovsxbd {{.*}}%zmm
+; CHECK: vpmovsxbd {{.*}}%zmm
; CHECK: ret
define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
%x = sext <16 x i8> %i to <16 x i32>
@@ -35,7 +35,7 @@ define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
; CHECK-LABEL: zext_16x16_to_16x32
-; CHECK; vpmovzxwd {{.*}}%zmm
+; CHECK: vpmovzxwd {{.*}}%zmm
; CHECK: ret
define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %i) nounwind readnone {
%x = zext <16 x i16> %i to <16 x i32>
@@ -43,7 +43,7 @@ define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %i) nounwind readnone {
}
; CHECK-LABEL: zext_8x16_to_8x64
-; CHECK; vpmovzxwq
+; CHECK: vpmovzxwq
; CHECK: ret
define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %i) nounwind readnone {
%x = zext <8 x i16> %i to <8 x i64>
@@ -116,7 +116,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
ret i8 %mask
}
-; CHECK: sext_8i1_8i32
+; CHECK-LABEL: sext_8i1_8i32
; CHECK: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
; CHECK: ret
define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
@@ -125,3 +125,24 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
%y = sext <8 x i1> %x1 to <8 x i32>
ret <8 x i32> %y
}
+
+; CHECK-LABEL: trunc_v16i32_to_v16i16
+; CHECK: vpmovdw
+; CHECK: ret
+define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) {
+ %1 = trunc <16 x i32> %x to <16 x i16>
+ ret <16 x i16> %1
+}
+
+; CHECK-LABEL: trunc_i32_to_i1
+; CHECK: andl
+; CHECK: kmov
+; CHECK: kortest
+; CHECK: orl $65532
+; CHECK: ret
+define i16 @trunc_i32_to_i1(i32 %a) {
+ %a_i = trunc i32 %a to i1
+ %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
+ %res = bitcast <16 x i1> %maskv to i16
+ ret i16 %res
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 6f89d6c..9c6db11 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
;CHECK-LABEL: _inreg16xi32:
;CHECK: vpbroadcastd {{.*}}, %zmm
@@ -19,7 +19,7 @@ define <8 x i64> @_inreg8xi64(i64 %a) {
}
;CHECK-LABEL: _inreg16xfloat:
-;CHECK: vbroadcastssz {{.*}}, %zmm
+;CHECK: vbroadcastss {{.*}}, %zmm
;CHECK: ret
define <16 x float> @_inreg16xfloat(float %a) {
%b = insertelement <16 x float> undef, float %a, i32 0
@@ -28,7 +28,7 @@ define <16 x float> @_inreg16xfloat(float %a) {
}
;CHECK-LABEL: _inreg8xdouble:
-;CHECK: vbroadcastsdz {{.*}}, %zmm
+;CHECK: vbroadcastsd {{.*}}, %zmm
;CHECK: ret
define <8 x double> @_inreg8xdouble(double %a) {
%b = insertelement <8 x double> undef, double %a, i32 0
@@ -45,9 +45,20 @@ define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
}
;CHECK-LABEL: _xmm16xfloat
-;CHECK: vbroadcastssz
+;CHECK: vbroadcastss {{.*}}## encoding: [0x62
;CHECK: ret
define <16 x float> @_xmm16xfloat(<16 x float> %a) {
%b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
ret <16 x float> %b
}
+
+define <16 x i32> @test_vbroadcast() {
+ ; CHECK: vpbroadcastd
+entry:
+ %0 = sext <16 x i1> zeroinitializer to <16 x i32>
+ %1 = fcmp uno <16 x float> undef, zeroinitializer
+ %2 = sext <16 x i1> %1 to <16 x i32>
+ %3 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> %2
+ ret <16 x i32> %3
+}
+
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 6ca5bcc..d762f00 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -5,9 +5,9 @@
; CHECK: vmovups
; CHECK: ret
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
- %mask = fcmp ole <16 x float> %x, %y
- %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
- ret <16 x float> %max
+ %mask = fcmp ole <16 x float> %x, %y
+ %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
+ ret <16 x float> %max
}
; CHECK-LABEL: test2
@@ -15,9 +15,9 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK: vmovupd
; CHECK: ret
define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
- %mask = fcmp ole <8 x double> %x, %y
- %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
- ret <8 x double> %max
+ %mask = fcmp ole <8 x double> %x, %y
+ %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
+ ret <8 x double> %max
}
; CHECK-LABEL: test3
@@ -26,9 +26,9 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK: ret
define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
%y = load <16 x i32>* %yp, align 4
- %mask = icmp eq <16 x i32> %x, %y
- %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
- ret <16 x i32> %max
+ %mask = icmp eq <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
}
; CHECK-LABEL: @test4_unsigned
@@ -36,9 +36,9 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
; CHECK: vmovdqu32
; CHECK: ret
define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
- %mask = icmp uge <16 x i32> %x, %y
- %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
- ret <16 x i32> %max
+ %mask = icmp uge <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %max
}
; CHECK-LABEL: test5
@@ -46,9 +46,9 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
; CHECK: vmovdqu64 {{.*}}%k1
; CHECK: ret
define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
- %mask = icmp eq <8 x i64> %x, %y
- %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
- ret <8 x i64> %max
+ %mask = icmp eq <8 x i64> %x, %y
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+ ret <8 x i64> %max
}
; CHECK-LABEL: test6_unsigned
@@ -56,9 +56,9 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK: vmovdqu64 {{.*}}%k1
; CHECK: ret
define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
- %mask = icmp ugt <8 x i64> %x, %y
- %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
- ret <8 x i64> %max
+ %mask = icmp ugt <8 x i64> %x, %y
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+ ret <8 x i64> %max
}
; CHECK-LABEL: test7
@@ -111,3 +111,54 @@ define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
}
+
+; CHECK-LABEL: test12
+; CHECK: vpcmpeqq %zmm2, %zmm0, [[LO:%k[0-7]]]
+; CHECK: vpcmpeqq %zmm3, %zmm1, [[HI:%k[0-7]]]
+; CHECK: kunpckbw [[LO]], [[HI]], {{%k[0-7]}}
+
+define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
+ %res = icmp eq <16 x i64> %a, %b
+ %res1 = bitcast <16 x i1> %res to i16
+ ret i16 %res1
+}
+
+; CHECK-LABEL: test13
+; CHECK: vcmpeqps %zmm
+; CHECK: vpbroadcastd
+; CHECK: ret
+define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
+{
+ %cmpvector_i = fcmp oeq <16 x float> %a, %b
+ %conv = zext <16 x i1> %cmpvector_i to <16 x i32>
+ ret <16 x i32> %conv
+}
+
+; CHECK-LABEL: test14
+; CHECK: vpcmp
+; CHECK-NOT: vpcmp
+; CHECK: vmovdqu32 {{.*}}{%k1} {z}
+; CHECK: ret
+define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
+ %sub_r = sub <16 x i32> %a, %b
+ %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
+ %sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
+ %mask = icmp eq <16 x i32> %sext.i3.i, zeroinitializer
+ %res = select <16 x i1> %mask, <16 x i32> zeroinitializer, <16 x i32> %sub_r
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test15
+; CHECK: vpcmpgtq
+; CHECK-NOT: vpcmp
+; CHECK: vmovdqu64 {{.*}}{%k1} {z}
+; CHECK: ret
+define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
+ %sub_r = sub <8 x i64> %a, %b
+ %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
+ %sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
+ %mask = icmp eq <8 x i64> %sext.i3.i, zeroinitializer
+ %res = select <8 x i1> %mask, <8 x i64> zeroinitializer, <8 x i64> %sub_r
+ ret <8 x i64>%res
+}
+
diff --git a/test/CodeGen/X86/avx512-vselect-crash.ll b/test/CodeGen/X86/avx512-vselect-crash.ll
new file mode 100644
index 0000000..9d652d3
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vselect-crash.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test
+; CHECK: vpxord
+; CHECK: ret
+define <16 x i32> @test() {
+entry:
+ %0 = icmp slt <16 x i32> undef, undef
+ %1 = select <16 x i1> %0, <16 x i32> undef, <16 x i32> zeroinitializer
+ ret <16 x i32> %1
+}
diff --git a/test/CodeGen/X86/avx512-zext-load-crash.ll b/test/CodeGen/X86/avx512-zext-load-crash.ll
new file mode 100644
index 0000000..07ded13
--- /dev/null
+++ b/test/CodeGen/X86/avx512-zext-load-crash.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+define <8 x i16> @test_zext_load() {
+ ; CHECK: vmovq
+entry:
+ %0 = load <2 x i16> ** undef, align 8
+ %1 = getelementptr inbounds <2 x i16>* %0, i64 1
+ %2 = load <2 x i16>* %0, align 1
+ %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %4 = load <2 x i16>* %1, align 1
+ %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %6
+}
diff --git a/test/CodeGen/X86/barrier-sse.ll b/test/CodeGen/X86/barrier-sse.ll
index bbfeea6..80c0cc8 100644
--- a/test/CodeGen/X86/barrier-sse.ll
+++ b/test/CodeGen/X86/barrier-sse.ll
@@ -1,11 +1,14 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep sfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep lfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep mfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep MEMBARRIER
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
define void @test() {
fence acquire
+ ; CHECK: #MEMBARRIER
+
fence release
+ ; CHECK: #MEMBARRIER
+
fence acq_rel
+ ; CHECK: #MEMBARRIER
+
ret void
}
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 4f2060f..6b46596 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -1,13 +1,11 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-; In this test we check that sign-extend of the mask bit is performed by
-; shifting the needed bit to the MSB, and not using shl+sra.
+; Verify that we produce movss instead of blendvps when possible.
;CHECK-LABEL: vsel_float:
-;CHECK: movl $-2147483648
-;CHECK-NEXT: movd
-;CHECK-NEXT: blendvps
+;CHECK-NOT: blendvps
+;CHECK: movss
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -15,9 +13,8 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
}
;CHECK-LABEL: vsel_4xi8:
-;CHECK: movl $-2147483648
-;CHECK-NEXT: movd
-;CHECK-NEXT: blendvps
+;CHECK-NOT: blendvps
+;CHECK: movss
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
@@ -26,12 +23,12 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not r
+; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
; reduce the mask in this case.
;CHECK-LABEL: vsel_8xi16:
-;CHECK: psllw
-;CHECK: psraw
-;CHECK: pblendvb
+;CHECK: andps
+;CHECK: andps
+;CHECK: orps
;CHECK: ret
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index d3e05d6..2681c10 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -701,7 +701,7 @@ exit:
define void @unanalyzable_branch_to_best_succ(i1 %cond) {
; Ensure that we can handle unanalyzable branches where the destination block
-; gets selected as the optimal sucessor to merge.
+; gets selected as the optimal successor to merge.
;
; CHECK: unanalyzable_branch_to_best_succ
; CHECK: %entry
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
new file mode 100644
index 0000000..6b77176
--- /dev/null
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mcpu=x86_64 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
+
+define <2 x i64> @foo(<2 x i64> %v) #0 {
+entry:
+ %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
+ ret <2 x i64> %r
+}
+
+; CHECK-LABEL: @foo
+; CHECK: bswapq
+; CHECK: bswapq
+; CHECK: retq
+
+attributes #0 = { nounwind uwtable }
+
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index f12a354..036ec0a 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -20,7 +20,7 @@
define void @test2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: test2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %ecx, %eax
; CHECK: jb
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/cache-intrinsic.ll b/test/CodeGen/X86/cache-intrinsic.ll
new file mode 100644
index 0000000..3091b5f
--- /dev/null
+++ b/test/CodeGen/X86/cache-intrinsic.ll
@@ -0,0 +1,26 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@buffer = global [32 x i8] c"This is a largely unused buffer\00", align 16
+@.str = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
+@.str1 = private unnamed_addr constant [25 x i8] c"Still, largely unused...\00", align 1
+
+define i32 @main() {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0))
+ %call1 = call i8* @strcpy(i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8]* @.str1, i32 0, i32 0)) #3
+ call void @llvm.clear_cache(i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds (i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i32 32)) #3
+ %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0))
+ ret i32 0
+}
+
+; CHECK-NOT: __clear_cache
+
+declare i32 @printf(i8*, ...)
+
+declare i8* @strcpy(i8*, i8*)
+
+declare void @llvm.clear_cache(i8*, i8*)
diff --git a/test/CodeGen/X86/call-imm.ll b/test/CodeGen/X86/call-imm.ll
index 8753594..898b4ec 100644
--- a/test/CodeGen/X86/call-imm.ll
+++ b/test/CodeGen/X86/call-imm.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=static | FileCheck -check-prefix X86STA %s
; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic | FileCheck -check-prefix X86PIC %s
; RUN: llc < %s -mtriple=i386-pc-linux -relocation-model=dynamic-no-pic | FileCheck -check-prefix X86DYN %s
+; RUN: llc < %s -mtriple=i386-pc-win32 -relocation-model=static | FileCheck -check-prefix X86WINSTA %s
; Call to immediate is not safe on x86-64 unless we *know* that the
; call will be within 32-bits pcrel from the dest immediate.
@@ -20,4 +21,5 @@ entry:
; X86STA: {{call.*12345678}}
; X86PIC-NOT: {{call.*12345678}}
; X86DYN: {{call.*12345678}}
+; X86WINSTA: {{call.*[*]%eax}}
; X64: {{call.*[*]%rax}}
diff --git a/test/CodeGen/X86/cas.ll b/test/CodeGen/X86/cas.ll
index c2dd05e..ec519c6 100644
--- a/test/CodeGen/X86/cas.ll
+++ b/test/CodeGen/X86/cas.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-pc-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnu %s -o - -no-integrated-as | FileCheck %s
; C code this came from
;bool cas(float volatile *p, float *expected, float desired) {
diff --git a/test/CodeGen/X86/catch.ll b/test/CodeGen/X86/catch.ll
new file mode 100644
index 0000000..6f70213
--- /dev/null
+++ b/test/CodeGen/X86/catch.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic | FileCheck %s
+
+; PR18390
+; We used to assert while creating this label. The name itself is not critical. It
+; just needs to be a unique local symbol.
+; CHECK: .L.Lstr.DW.stub:
+; CHECK-NEXT: .quad .Lstr
+
+@str = private unnamed_addr constant [12 x i8] c"NSException\00"
+define void @f() {
+ invoke void @g()
+ to label %invoke.cont unwind label %lpad
+invoke.cont:
+ ret void
+lpad:
+ %tmp14 = landingpad { i8*, i32 } personality i8* bitcast (void ()* @h to i8*)
+ catch i8* getelementptr inbounds ([12 x i8]* @str, i64 0, i64 0)
+ ret void
+}
+declare void @g()
+declare void @h()
diff --git a/test/CodeGen/X86/cdecl-method-return.ll b/test/CodeGen/X86/cdecl-method-return.ll
new file mode 100644
index 0000000..2baa47a
--- /dev/null
+++ b/test/CodeGen/X86/cdecl-method-return.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 -mcpu=core2 | FileCheck %s
+
+; The sret flag causes the first two parameters to be reordered on the stack.
+
+define x86_cdeclmethodcc void @foo(i32* sret %dst, i32* %src) {
+ %v = load i32* %src
+ store i32 %v, i32* %dst
+ ret void
+}
+
+; CHECK-LABEL: _foo:
+; CHECK: movl 8(%esp), %[[dst:[^ ]*]]
+; CHECK: movl 4(%esp), %[[src:[^ ]*]]
+; CHECK: movl (%[[src]]), %[[v:[^ ]*]]
+; CHECK: movl %[[v]], (%[[dst]])
+; CHECK: retl
+
+define i32 @bar() {
+ %src = alloca i32
+ %dst = alloca i32
+ store i32 42, i32* %src
+ call x86_cdeclmethodcc void @foo(i32* sret %dst, i32* %src)
+ %v = load i32* %dst
+ ret i32 %v
+}
+
+; CHECK-LABEL: _bar:
+; CHECK: movl $42, [[src:[^,]*]]
+; CHECK: leal [[src]], %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: leal [[dst:[^,]*]], %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], 4(%esp)
+; CHECK: calll _foo
+; CHECK: movl [[dst]], %eax
+; CHECK: retl
+
+; If we don't have the sret flag, parameters are not reordered.
+
+define x86_cdeclmethodcc void @baz(i32* %dst, i32* %src) {
+ %v = load i32* %src
+ store i32 %v, i32* %dst
+ ret void
+}
+
+; CHECK-LABEL: _baz:
+; CHECK: movl 4(%esp), %[[dst:[^ ]*]]
+; CHECK: movl 8(%esp), %[[src:[^ ]*]]
+; CHECK: movl (%[[src]]), %[[v:[^ ]*]]
+; CHECK: movl %[[v]], (%[[dst]])
+; CHECK: retl
+
+define i32 @qux() {
+ %src = alloca i32
+ %dst = alloca i32
+ store i32 42, i32* %src
+ call x86_cdeclmethodcc void @baz(i32* %dst, i32* %src)
+ %v = load i32* %dst
+ ret i32 %v
+}
+
+; CHECK-LABEL: _qux:
+; CHECK: movl $42, [[src:[^,]*]]
+; CHECK: leal [[src]], %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], 4(%esp)
+; CHECK: leal [[dst:[^,]*]], %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: calll _baz
+; CHECK: movl [[dst]], %eax
+; CHECK: retl
diff --git a/test/CodeGen/X86/cfstring.ll b/test/CodeGen/X86/cfstring.ll
index 8cdd59e..cae4320 100644
--- a/test/CodeGen/X86/cfstring.ll
+++ b/test/CodeGen/X86/cfstring.ll
@@ -7,7 +7,7 @@
; Make sure that the string ends up the correct section.
; CHECK: .section __TEXT,__cstring
-; CHECK-NEXT: l_.str3:
+; CHECK-NEXT: L_.str3:
; CHECK: .section __DATA,__cfstring
; CHECK-NEXT: .align 4
@@ -15,13 +15,13 @@
; CHECK-NEXT: .quad ___CFConstantStringClassReference
; CHECK-NEXT: .long 1992
; CHECK-NEXT: .space 4
-; CHECK-NEXT: .quad l_.str3
+; CHECK-NEXT: .quad L_.str3
; CHECK-NEXT: .long 0
; CHECK-NEXT: .space 4
@isLogVisible = global i8 0, align 1
@__CFConstantStringClassReference = external global [0 x i32]
-@.str3 = linker_private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+@.str3 = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
@_unnamed_cfstring_4 = private constant %struct.NSConstantString { i32* getelementptr inbounds ([0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([1 x i8]* @.str3, i32 0, i32 0), i32 0 }, section "__DATA,__cfstring"
@null.array = weak_odr constant [1 x i8] zeroinitializer, align 1
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 215b862..d38d2b4 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -41,8 +41,8 @@ declare void @bar(i64) nounwind
define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
; CHECK-LABEL: test3:
-; CHECK: cmovnel %edi, %esi
-; CHECK-NEXT: movl %esi, %edi
+; CHECK: cmov{{n?}}el %[[R1:e..]], %[[R2:e..]]
+; CHECK-NEXT: movl %[[R2]], %{{e..}}
%c = trunc i64 %a to i32
%d = trunc i64 %b to i32
diff --git a/test/CodeGen/X86/cmpxchg16b.ll b/test/CodeGen/X86/cmpxchg16b.ll
index edbd0bc..1d5bb85 100644
--- a/test/CodeGen/X86/cmpxchg16b.ll
+++ b/test/CodeGen/X86/cmpxchg16b.ll
@@ -6,7 +6,7 @@ entry:
; CHECK: movl $1, %ebx
; CHECK: lock
; CHECK-NEXT: cmpxchg16b
- %r = cmpxchg i128* %p, i128 0, i128 1 seq_cst
+ %r = cmpxchg i128* %p, i128 0, i128 1 seq_cst seq_cst
ret void
}
diff --git a/test/CodeGen/X86/coalescer-remat.ll b/test/CodeGen/X86/coalescer-remat.ll
index eb7b7a8..468b70b 100644
--- a/test/CodeGen/X86/coalescer-remat.ll
+++ b/test/CodeGen/X86/coalescer-remat.ll
@@ -5,7 +5,7 @@
define i32 @main() nounwind {
entry:
- %0 = cmpxchg i64* @val, i64 0, i64 1 monotonic
+ %0 = cmpxchg i64* @val, i64 0, i64 1 monotonic monotonic
%1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([7 x i8]* @"\01LC", i32 0, i64 0), i64 %0) nounwind
ret i32 0
}
diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
new file mode 100644
index 0000000..e3d6b34
--- /dev/null
+++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -0,0 +1,303 @@
+; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
+; This file tests the different cases that are involved when codegen prepare
+; tries to get sign extensions out of the way of the addressing mode.
+; These tests require an actual target, as addressing mode decisions depend
+; on the target.
+
+target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+
+; Check that we correctly promote both operands of the promotable add.
+; CHECK-LABEL: @twoArgsPromotion
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
+; CHECK: [[ARG2SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg2 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], [[ARG2SEXT]]
+; CHECK: inttoptr i64 [[PROMOTED]] to i8*
+; CHECK: ret
+define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) {
+ %add = add nsw i32 %arg1, %arg2
+ %sextadd = sext i32 %add to i64
+ %base = inttoptr i64 %sextadd to i8*
+ %res = load i8* %base
+ ret i8 %res
+}
+
+; Check that we do not promote both operands of the promotable add when
+; the instruction will not be folded into the addressing mode.
+; Otherwise, we would increase the number of instructions executed.
+; (This is a heuristic of course, because the new sext could have been
+; merged with something else.)
+; CHECK-LABEL: @twoArgsNoPromotion
+; CHECK: add nsw i32 %arg1, %arg2
+; CHECK: ret
+define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, i8* %base) {
+ %add = add nsw i32 %arg1, %arg2
+ %sextadd = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we do not promote when the related instruction does not have
+; the nsw flag.
+; CHECK-LABEL: @noPromotion
+; CHECK-NOT: add i64
+; CHECK: ret
+define i8 @noPromotion(i32 %arg1, i32 %arg2, i8* %base) {
+ %add = add i32 %arg1, %arg2
+ %sextadd = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we correctly promote constant arguments.
+; CHECK-LABEL: @oneArgPromotion
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotion(i32 %arg1, i8* %base) {
+ %add = add nsw i32 %arg1, 1
+ %sextadd = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we do not promote a truncate when we cannot determine the
+; bits that are dropped.
+; CHECK-LABEL: @oneArgPromotionBlockTrunc1
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 %arg1 to i8
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, i8* %base) {
+ %trunc = trunc i32 %arg1 to i8
+ %add = add nsw i8 %trunc, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we do not promote a truncate when we cannot determine all the
+; bits that are dropped.
+; CHECK-LABEL: @oneArgPromotionBlockTrunc2
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i16 %arg1 to i32
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
+; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, i8* %base) {
+ %sextarg1 = sext i16 %arg1 to i32
+ %trunc = trunc i32 %sextarg1 to i8
+ %add = add nsw i8 %trunc, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we are able to promote a truncate when we know all the bits
+; that are dropped.
+; CHECK-LABEL: @oneArgPromotionPassTruncKeepSExt
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, i8* %base) {
+ %sextarg1 = sext i1 %arg1 to i32
+ %trunc = trunc i32 %sextarg1 to i8
+ %add = add nsw i8 %trunc, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; On X86, truncates are free. Check that we are able to promote the add
+; to be used in the addressing mode and that we insert a truncate for the other
+; use.
+; CHECK-LABEL: @oneArgPromotionTruncInsert
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
+; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8* [[GEP]]
+; CHECK: add i8 [[LOAD]], [[TRUNC]]
+; CHECK: ret
+define i8 @oneArgPromotionTruncInsert(i8 %arg1, i8* %base) {
+ %add = add nsw i8 %arg1, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ %finalres = add i8 %res, %add
+ ret i8 %finalres
+}
+
+; Cannot sext from a larger type than the promoted type.
+; CHECK-LABEL: @oneArgPromotionLargerType
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i128 %arg1 to i8
+; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionLargerType(i128 %arg1, i8* %base) {
+ %trunc = trunc i128 %arg1 to i8
+ %add = add nsw i8 %trunc, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ %finalres = add i8 %res, %add
+ ret i8 %finalres
+}
+
+; Use the same inserted trunc.
+; On X86, truncates are free. Check that we are able to promote the add
+; to be used in the addressing mode and that we insert a truncate for
+; *all* the other uses.
+; CHECK-LABEL: @oneArgPromotionTruncInsertSeveralUse
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
+; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8* [[GEP]]
+; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = add i8 [[LOAD]], [[TRUNC]]
+; CHECK: add i8 [[ADDRES]], [[TRUNC]]
+; CHECK: ret
+define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, i8* %base) {
+ %add = add nsw i8 %arg1, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ %almostfinalres = add i8 %res, %add
+ %finalres = add i8 %almostfinalres, %add
+ ret i8 %finalres
+}
+
+; Check that the promoted instruction is used for all uses of the original
+; sign extension.
+; CHECK-LABEL: @oneArgPromotionSExtSeveralUse
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8* [[GEP]]
+; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = zext i8 [[LOAD]] to i64
+; CHECK: add i64 [[ADDRES]], [[PROMOTED]]
+; CHECK: ret
+define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, i8* %base) {
+ %add = add nsw i8 %arg1, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ %almostfinalres = zext i8 %res to i64
+ %finalres = add i64 %almostfinalres, %sextadd
+ ret i64 %finalres
+}
+
+; Check all types of rollback mechanism.
+; For this test, the sign extension stays in place.
+; However, the matching process goes as far as promoting both operands
+; of the first promotable add.
+; At this point the rollback mechanism kicks in and restores the state
+; until the addressing mode matcher is able to match something: in that
+; case it promotes nothing.
+; Along the way, the promotion mechanism involves:
+; - Mutating the type of %promotableadd1 and %promotableadd2.
+; - Creating a sext for %arg1 and %arg2.
+; - Creating a trunc for a use of %promotableadd1.
+; - Replacing a bunch of uses.
+; - Setting the operands of the promoted instruction with the promoted values.
+; - Moving instructions around (mainly the sext when promoting an instruction).
+; Each of those promotion steps has to be undone at least once during this
+; specific test.
+; CHECK-LABEL: @twoArgsPromotionNest
+; CHECK: [[ORIG:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ORIG]], [[ORIG]]
+; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; CHECK: getelementptr inbounds i8* %base, i64 [[SEXT]]
+; CHECK: ret
+define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, i8* %base) {
+ %promotableadd1 = add nsw i32 %arg1, %arg2
+ %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1
+ %sextadd = sext i32 %promotableadd2 to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Test the InstructionRemover undo, which was the only one that did not
+; kick in during the previous test.
+; The matcher first promotes the add, removes the trunc and promotes
+; the sext of arg1.
+; Then, the matcher cannot use an addressing mode r + r + r, thus it
+; rolls back.
+; CHECK-LABEL: @twoArgsNoPromotionRemove
+; CHECK: [[SEXTARG1:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
+; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[TRUNC]], %arg2
+; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i64
+; CHECK: getelementptr inbounds i8* %base, i64 [[SEXT]]
+; CHECK: ret
+define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, i8* %base) {
+ %sextarg1 = sext i1 %arg1 to i32
+ %trunc = trunc i32 %sextarg1 to i8
+ %add = add nsw i8 %trunc, %arg2
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Ensure that when the profitability check kicks in, the IR is not modified
+; while IgnoreProfitability is on.
+; The profitability check happens when a candidate instruction has several uses.
+; The matcher will create a new matcher for each use and check if the
+; instruction is in the list of the matched instructions of this new matcher.
+; All changes made by the new matchers must be dropped before continuing,
+; otherwise the state of the original matcher will be wrong.
+;
+; Without the profitability check, when checking for the second use of
+; arrayidx, the matcher promotes everything all the way to %arg1, %arg2.
+; Check that we did not promote anything in the final matching.
+;
+; <rdar://problem/16020230>
+; CHECK-LABEL: @checkProfitability
+; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64
+; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64
+; CHECK: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
+; CHECK: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; BB then
+; CHECK: [[BASE1:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTADD]], 48
+; CHECK: [[ADDR1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[BASE1]] to i32*
+; CHECK: load i32* [[ADDR1]]
+; BB else
+; CHECK: [[BASE2:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTADD]], 48
+; CHECK: [[ADDR2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[BASE2]] to i32*
+; CHECK: load i32* [[ADDR2]]
+; CHECK: ret
+define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) {
+ %shl = shl nsw i32 %arg1, 1
+ %add1 = add nsw i32 %shl, %arg2
+ %sextidx1 = sext i32 %add1 to i64
+ %tmpptr = inttoptr i64 %sextidx1 to i32*
+ %arrayidx1 = getelementptr i32* %tmpptr, i64 12
+ br i1 %test, label %then, label %else
+then:
+ %res1 = load i32* %arrayidx1
+ br label %end
+else:
+ %res2 = load i32* %arrayidx1
+ br label %end
+end:
+ %tmp = phi i32 [%res1, %then], [%res2, %else]
+ %res = add i32 %tmp, %add1
+ %addr = inttoptr i32 %res to i32*
+ %final = load i32* %addr
+ ret i32 %final
+}
diff --git a/test/CodeGen/X86/codegen-prepare-cast.ll b/test/CodeGen/X86/codegen-prepare-cast.ll
index 2a8ead8..59c5133 100644
--- a/test/CodeGen/X86/codegen-prepare-cast.ll
+++ b/test/CodeGen/X86/codegen-prepare-cast.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -march=x86-64
; PR4297
+; RUN: opt -S < %s -codegenprepare | FileCheck %s
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -8,6 +9,9 @@ target triple = "x86_64-unknown-linux-gnu"
%"char[][]" = type { i64, %"byte[]"* }
@.str = external constant [7 x i8] ; <[7 x i8]*> [#uses=1]
+; CHECK-LABEL: @_Dmain
+; CHECK: load i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0)
+; CHECK: ret
define fastcc i32 @_Dmain(%"char[][]" %unnamed) {
entry:
%tmp = getelementptr [7 x i8]* @.str, i32 0, i32 0 ; <i8*> [#uses=1]
diff --git a/test/CodeGen/X86/codegen-prepare-extload.ll b/test/CodeGen/X86/codegen-prepare-extload.ll
index 14df815..9320706 100644
--- a/test/CodeGen/X86/codegen-prepare-extload.ll
+++ b/test/CodeGen/X86/codegen-prepare-extload.ll
@@ -5,7 +5,7 @@
; CodeGenPrepare should move the zext into the block with the load
; so that SelectionDAG can select it with the load.
-; CHECK: movzbl ({{%rdi|%rcx}}), %eax
+; CHECK: movsbl ({{%rdi|%rcx}}), %eax
define void @foo(i8* %p, i32* %q) {
entry:
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
new file mode 100644
index 0000000..c1ce533
--- /dev/null
+++ b/test/CodeGen/X86/combine-or.ll
@@ -0,0 +1,269 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+
+; Verify that each of the following test cases is folded into a single
+; instruction which performs a blend operation.
+
+define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK: ret
+
+
+define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+ %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
+ %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+ %and1 = and <2 x i64> %a, <i64 -1, i64 0>
+ %and2 = and <2 x i64> %b, <i64 0, i64 -1>
+ %or = or <2 x i64> %and1, %and2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+ %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
+ %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK: ret
+
+
+define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+ %and1 = and <2 x i64> %a, <i64 0, i64 -1>
+ %and2 = and <2 x i64> %b, <i64 -1, i64 0>
+ %or = or <2 x i64> %and1, %and2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+ %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
+ %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+ %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
+ %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NEXT: ret
+
+
+; Verify that the following test cases are folded into single shuffles.
+
+define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: pslldq
+; CHECK-NOT: por
+; CHECK: punpcklqdq
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NOT: shufps
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: pslldq
+; CHECK-NOT: por
+; CHECK: punpcklqdq
+; CHECK: ret
+
+
+; Verify that the dag-combiner does not fold an OR of two shuffles into a single
+; shuffle instruction when the shuffle indexes are not compatible.
+
+define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test17
+; CHECK: por
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test18
+; CHECK: orps
+; CHECK: ret
+
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test19
+; CHECK: por
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: xorps
+; CHECK: orps
+; CHECK-NEXT: movq
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test21
+; CHECK: por
+; CHECK-NEXT: pslldq
+; CHECK-NEXT: ret
+
+
diff --git a/test/CodeGen/X86/combine-vec-shuffle.ll b/test/CodeGen/X86/combine-vec-shuffle.ll
new file mode 100644
index 0000000..9e6ab89
--- /dev/null
+++ b/test/CodeGen/X86/combine-vec-shuffle.ll
@@ -0,0 +1,253 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+; Verify that the DAGCombiner correctly folds according to the following rules:
+
+; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
+; fold (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C)
+; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
+
+; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
+; fold (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B))
+; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
+
+
+
+define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: pshufd
+; CHECK: pand
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: pshufd
+; CHECK: por
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: pshufd
+; CHECK: pxor
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: pshufd
+; CHECK: pand
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: pshufd
+; CHECK: por
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: pshufd
+; CHECK: pxor
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+; Verify that DAGCombiner moves the shuffle after the xor/and/or even if the
+; shuffles are not performing a swizzle operation.
+
+define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test1b
+; CHECK-NOT: blendps
+; CHECK: andps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test2b
+; CHECK-NOT: blendps
+; CHECK: orps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3b
+; CHECK-NOT: blendps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test4b
+; CHECK-NOT: blendps
+; CHECK: andps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+
+define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test5b
+; CHECK-NOT: blendps
+; CHECK: orps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+
+define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6b
+; CHECK-NOT: blendps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test1c
+; CHECK-NOT: shufps
+; CHECK: andps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test2c
+; CHECK-NOT: shufps
+; CHECK: orps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3c
+; CHECK-NOT: shufps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test4c
+; CHECK-NOT: shufps
+; CHECK: andps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
+
+define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test5c
+; CHECK-NOT: shufps
+; CHECK: orps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
+
+define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6c
+; CHECK-NOT: shufps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/const-base-addr.ll b/test/CodeGen/X86/const-base-addr.ll
new file mode 100644
index 0000000..f859d7f
--- /dev/null
+++ b/test/CodeGen/X86/const-base-addr.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%T = type { i32, i32, i32, i32 }
+
+define i32 @test1() nounwind {
+; CHECK-LABEL: test1
+; CHECK: movabsq $123456789012345678, %rcx
+; CHECK-NEXT: movl 4(%rcx), %eax
+; CHECK-NEXT: addl 8(%rcx), %eax
+; CHECK-NEXT: addl 12(%rcx), %eax
+ %addr1 = getelementptr %T* inttoptr (i64 123456789012345678 to %T*), i32 0, i32 1
+ %tmp1 = load i32* %addr1
+ %addr2 = getelementptr %T* inttoptr (i64 123456789012345678 to %T*), i32 0, i32 2
+ %tmp2 = load i32* %addr2
+ %addr3 = getelementptr %T* inttoptr (i64 123456789012345678 to %T*), i32 0, i32 3
+ %tmp3 = load i32* %addr3
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+}
+
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 051150e..ee73377 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -1,7 +1,7 @@
; REQUIRES: asserts
-; RUN: llc -march=x86 < %s -verify-machineinstrs -precompute-phys-liveness
-; RUN: llc -march=x86-64 < %s -verify-machineinstrs -precompute-phys-liveness
-
+; RUN: llc -march=x86 -no-integrated-as < %s -verify-machineinstrs -precompute-phys-liveness
+; RUN: llc -march=x86-64 -no-integrated-as < %s -verify-machineinstrs -precompute-phys-liveness
+
; PR6497
; Chain and flag folding issues.
diff --git a/test/CodeGen/X86/cse-add-with-overflow.ll b/test/CodeGen/X86/cse-add-with-overflow.ll
new file mode 100644
index 0000000..1fcc03f
--- /dev/null
+++ b/test/CodeGen/X86/cse-add-with-overflow.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=x86_64-darwin -mcpu=generic | FileCheck %s
+; XFAIL: *
+; rdar:15661073 simple example of redundant adds
+;
+; MachineCSE should coalesce trivial subregister copies.
+;
+; The extra movl+addl should be removed during MachineCSE.
+; CHECK-LABEL: redundantadd
+; CHECK: cmpq
+; CHECK: movq
+; CHECK-NOT: movl
+; CHECK: addl
+; CHECK-NOT: addl
+; CHECK: ret
+
+define i64 @redundantadd(i64* %a0, i64* %a1) {
+entry:
+ %tmp8 = load i64* %a0, align 8
+ %tmp12 = load i64* %a1, align 8
+ %tmp13 = icmp ult i64 %tmp12, -281474976710656
+ br i1 %tmp13, label %exit1, label %body
+
+exit1:
+ unreachable
+
+body:
+ %tmp14 = trunc i64 %tmp8 to i32
+ %tmp15 = trunc i64 %tmp12 to i32
+ %tmp16 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %tmp14, i32 %tmp15)
+ %tmp17 = extractvalue { i32, i1 } %tmp16, 1
+ br i1 %tmp17, label %exit2, label %return
+
+exit2:
+ unreachable
+
+return:
+ %tmp18 = add i64 %tmp12, %tmp8
+ %tmp19 = and i64 %tmp18, 4294967295
+ %tmp20 = or i64 %tmp19, -281474976710656
+ ret i64 %tmp20
+}
+
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 786f7f9..463505b 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -35,6 +35,6 @@ define i32 @test3(i64 %x) nounwind readnone {
%conv = zext i1 %cmp to i32
ret i32 %conv
; CHECK-LABEL: test3:
-; CHECK: cmpb $2
+; CHECK: cmpl $2
; CHECK: ret
}
diff --git a/test/CodeGen/X86/darwin-no-dead-strip.ll b/test/CodeGen/X86/darwin-no-dead-strip.ll
index 452d1f8..35196aa 100644
--- a/test/CodeGen/X86/darwin-no-dead-strip.ll
+++ b/test/CodeGen/X86/darwin-no-dead-strip.ll
@@ -1,7 +1,13 @@
-; RUN: llc < %s | grep no_dead_strip
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin8.7.2"
-@x = weak global i32 0 ; <i32*> [#uses=1]
-@llvm.used = appending global [1 x i8*] [ i8* bitcast (i32* @x to i8*) ] ; <[1 x i8*]*> [#uses=0]
+@x = weak global i32 0
+; CHECK: .no_dead_strip _x
+
+@"\01Ly" = private global i8 0
+; CHECK: no_dead_strip Ly
+
+@llvm.used = appending global [2 x i8*] [ i8* bitcast (i32* @x to i8*),
+ i8* @"\01Ly" ]
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
new file mode 100644
index 0000000..23f8335
--- /dev/null
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -0,0 +1,109 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: opt -strip-debug < %s | llc -march=x86-64 -mtriple=x86_64-linux | FileCheck %s
+; http://llvm.org/PR19051. Minor code-motion difference with -g.
+; Presence of debug info shouldn't affect the codegen. Make sure that
+; we generate the same code sequence with and without debug info.
+;
+; CHECK: callq _Z3fooPcjPKc
+; CHECK: callq _Z3fooPcjPKc
+; CHECK: leaq (%rsp), %rdi
+; CHECK: movl $4, %esi
+; CHECK: testl {{%[a-z]+}}, {{%[a-z]+}}
+; CHECK: je .LBB0_4
+
+; Regenerate test with this command:
+; clang -emit-llvm -S -O2 -g
+; from this source:
+;
+; extern void foo(char *dst,unsigned siz,const char *src);
+; extern const char * i2str(int);
+;
+; struct AAA3 {
+; AAA3(const char *value) { foo(text,sizeof(text),value);}
+; void operator=(const char *value) { foo(text,sizeof(text),value);}
+; operator const char*() const { return text;}
+; char text[4];
+; };
+;
+; void bar (int param1,int param2) {
+; const char * temp(0);
+;
+; if (param2) {
+; temp = i2str(param2);
+; }
+; AAA3 var1("");
+; AAA3 var2("");
+;
+; if (param1)
+; var2 = "+";
+; else
+; var2 = "-";
+; var1 = "";
+; }
+
+%struct.AAA3 = type { [4 x i8] }
+
+@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+@.str1 = private unnamed_addr constant [2 x i8] c"+\00", align 1
+@.str2 = private unnamed_addr constant [2 x i8] c"-\00", align 1
+
+; Function Attrs: uwtable
+define void @_Z3barii(i32 %param1, i32 %param2) #0 {
+entry:
+ %var1 = alloca %struct.AAA3, align 1
+ %var2 = alloca %struct.AAA3, align 1
+ %tobool = icmp eq i32 %param2, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %call = call i8* @_Z5i2stri(i32 %param2)
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !60)
+ call void @llvm.dbg.value(metadata !62, i64 0, metadata !63)
+ %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
+ call void @llvm.dbg.declare(metadata !{%struct.AAA3* %var2}, metadata !38)
+ %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
+ %tobool1 = icmp eq i32 %param1, 0
+ br i1 %tobool1, label %if.else, label %if.then2
+
+if.then2: ; preds = %if.end
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0))
+ br label %if.end3
+
+if.else: ; preds = %if.end
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0))
+ br label %if.end3
+
+if.end3: ; preds = %if.else, %if.then2
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+declare i8* @_Z5i2stri(i32) #2
+
+declare void @_Z3fooPcjPKc(i8*, i32, i8*) #2
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!48, !49}
+!llvm.ident = !{!50}
+
+!38 = metadata !{i32 786688, null, metadata !"var2", null, i32 20, null, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 20]
+!48 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!50 = metadata !{metadata !"clang version 3.5 (202418)"}
+!60 = metadata !{i32 786689, null, metadata !"this", null, i32 16777216, null, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!62 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
+!63 = metadata !{i32 786689, null, metadata !"value", null, i32 33554439, null, i32 0, null} ; [ DW_TAG_arg_variable ] [value] [line 7]
diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
new file mode 100644
index 0000000..0b17c45
--- /dev/null
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux | FileCheck %s
+
+; The Peephole optimizer should fold the load into the cmp even with debug info.
+; CHECK-LABEL: _ZN3Foo3batEv
+; CHECK-NOT: movq pfoo
+; CHECK: cmpq {{%[a-z]+}}, pfoo(%rip)
+;
+; CHECK-LABEL: _Z3bazv
+; CHECK-NOT: movq wibble2
+; CHECK: cmpq {{%[a-z]+}}, wibble2(%rip)
+
+; Regenerate test with this command:
+; clang -emit-llvm -S -O2 -g
+; from this source:
+; struct Foo {
+; bool bat();
+; bool operator==(Foo &arg) { return (this == &arg); }
+; };
+; Foo *pfoo;
+; bool Foo::bat() { return (*this == *pfoo); }
+;
+; struct Wibble {
+; int x;
+; } *wibble1, *wibble2;
+; struct Flibble {
+; void bar(Wibble *c) {
+; if (c < wibble2)
+; wibble2 = 0;
+; c->x = 0;
+; }
+; } flibble;
+; void baz() { flibble.bar(wibble1); }
+
+%struct.Foo = type { i8 }
+%struct.Wibble = type { i32 }
+%struct.Flibble = type { i8 }
+
+@pfoo = global %struct.Foo* null, align 8
+@wibble1 = global %struct.Wibble* null, align 8
+@wibble2 = global %struct.Wibble* null, align 8
+@flibble = global %struct.Flibble zeroinitializer, align 1
+
+; Function Attrs: nounwind readonly uwtable
+define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 {
+entry:
+ %0 = load %struct.Foo** @pfoo, align 8
+ tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62)
+ %cmp.i = icmp eq %struct.Foo* %0, %this
+ ret i1 %cmp.i
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z3bazv() #1 {
+entry:
+ %0 = load %struct.Wibble** @wibble1, align 8
+ tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65)
+ %1 = load %struct.Wibble** @wibble2, align 8
+ %cmp.i = icmp ugt %struct.Wibble* %1, %0
+ br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit
+
+if.then.i: ; preds = %entry
+ store %struct.Wibble* null, %struct.Wibble** @wibble2, align 8
+ br label %_ZN7Flibble3barEP6Wibble.exit
+
+_ZN7Flibble3barEP6Wibble.exit: ; preds = %entry, %if.then.i
+ %x.i = getelementptr inbounds %struct.Wibble* %0, i64 0, i32 0
+ store i32 0, i32* %x.i, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #2
+
+attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+
+
+!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
+!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
+!62 = metadata !{i32 786689, null, metadata !"arg", null, i32 33554436, metadata !17, i32 0, null} ; [ DW_TAG_arg_variable ] [arg] [line 4]
+!64 = metadata !{%struct.Flibble* undef}
+!65 = metadata !{i32 786689, null, metadata !"this", null, i32 16777229, metadata !45, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 13]
diff --git a/test/CodeGen/X86/dll-linkage.ll b/test/CodeGen/X86/dll-linkage.ll
deleted file mode 100644
index a0c2a54..0000000
--- a/test/CodeGen/X86/dll-linkage.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=i386-pc-mingw32 | FileCheck %s
-
-; RUN: llc < %s -mtriple=i386-pc-mingw32 -O0 | FileCheck %s -check-prefix=FAST
-; PR6275
-
-declare dllimport void @foo()
-
-define void @bar() nounwind {
-; CHECK: calll *__imp__foo
-; FAST: movl __imp__foo, [[R:%[a-z]{3}]]
-; FAST: calll *[[R]]
- call void @foo()
- ret void
-}
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
new file mode 100644
index 0000000..a38c2d8
--- /dev/null
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
+; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck -check-prefix=CHECK -check-prefix=MINGW %s
+
+; CHECK: .text
+
+define void @notExported() {
+ ret void
+}
+
+; CHECK: .globl f1
+define dllexport void @f1() {
+ ret void
+}
+
+; CHECK: .globl f2
+define dllexport void @f2() unnamed_addr {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,lnk1
+; CHECK: .globl lnk1
+define linkonce_odr dllexport void @lnk1() {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,lnk2
+; CHECK: .globl lnk2
+define linkonce_odr dllexport void @lnk2() alwaysinline {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,weak1
+; CHECK: .globl weak1
+define weak_odr dllexport void @weak1() {
+ ret void
+}
+
+
+; CHECK: .data
+; CHECK: .globl Var1
+@Var1 = dllexport global i32 1, align 4
+
+; CHECK: .rdata,"r"
+; CHECK: .globl Var2
+@Var2 = dllexport unnamed_addr constant i32 1
+
+; CHECK: .comm Var3
+@Var3 = common dllexport global i32 0, align 4
+
+; CHECK: .section .data,"w",discard,WeakVar1
+; CHECK: .globl WeakVar1
+@WeakVar1 = weak_odr dllexport global i32 1, align 4
+
+; CHECK: .section .rdata,"r",discard,WeakVar2
+; CHECK: .globl WeakVar2
+@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
+
+
+; CHECK: .globl alias
+; CHECK: alias = notExported
+@alias = dllexport alias void()* @notExported
+
+; CHECK: .globl alias2
+; CHECK: alias2 = f1
+@alias2 = dllexport alias void()* @f1
+
+; CHECK: .globl alias3
+; CHECK: alias3 = notExported
+@alias3 = dllexport alias void()* @alias
+
+; CHECK: .weak weak_alias
+; CHECK: weak_alias = f1
+@weak_alias = dllexport alias weak_odr void()* @f1
+
+
+; CHECK: .section .drectve
+; WIN32: /EXPORT:Var1,DATA
+; WIN32: /EXPORT:Var2,DATA
+; WIN32: /EXPORT:Var3,DATA
+; WIN32: /EXPORT:WeakVar1,DATA
+; WIN32: /EXPORT:WeakVar2,DATA
+; WIN32: /EXPORT:f1
+; WIN32: /EXPORT:f2
+; WIN32: /EXPORT:lnk1
+; WIN32: /EXPORT:lnk2
+; WIN32: /EXPORT:weak1
+; WIN32: /EXPORT:alias
+; WIN32: /EXPORT:alias2
+; WIN32: /EXPORT:alias3
+; WIN32: /EXPORT:weak_alias
+; MINGW: -export:Var1,data
+; MINGW: -export:Var2,data
+; MINGW: -export:Var3,data
+; MINGW: -export:WeakVar1,data
+; MINGW: -export:WeakVar2,data
+; MINGW: -export:f1
+; MINGW: -export:f2
+; MINGW: -export:lnk1
+; MINGW: -export:lnk2
+; MINGW: -export:weak1
+; MINGW: -export:alias
+; MINGW: -export:alias2
+; MINGW: -export:alias3
+; MINGW: -export:weak_alias
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index bf57e78..1b34d23 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -1,12 +1,125 @@
-; RUN: llc < %s | FileCheck %s
-; PR2936
+; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
+; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck -check-prefix=CHECK -check-prefix=MINGW %s
-target triple = "i386-pc-mingw32"
+; CHECK: .text
-define dllexport x86_fastcallcc i32 @foo() nounwind {
-entry:
+define void @notExported() {
+ ret void
+}
+
+; CHECK: .globl _f1
+define dllexport void @f1() {
+ ret void
+}
+
+; CHECK: .globl _f2
+define dllexport void @f2() unnamed_addr {
+ ret void
+}
+
+; CHECK: .globl _stdfun@0
+define dllexport x86_stdcallcc void @stdfun() nounwind {
+ ret void
+}
+
+; CHECK: .globl @fastfun@0
+define dllexport x86_fastcallcc i32 @fastfun() nounwind {
ret i32 0
}
+; CHECK: .globl _thisfun
+define dllexport x86_thiscallcc void @thisfun() nounwind {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,_lnk1
+; CHECK: .globl _lnk1
+define linkonce_odr dllexport void @lnk1() {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,_lnk2
+; CHECK: .globl _lnk2
+define linkonce_odr dllexport void @lnk2() alwaysinline {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,_weak1
+; CHECK: .globl _weak1
+define weak_odr dllexport void @weak1() {
+ ret void
+}
+
+
+; CHECK: .data
+; CHECK: .globl _Var1
+@Var1 = dllexport global i32 1, align 4
+
+; CHECK: .rdata,"r"
+; CHECK: .globl _Var2
+@Var2 = dllexport unnamed_addr constant i32 1
+
+; CHECK: .comm _Var3
+@Var3 = common dllexport global i32 0, align 4
+
+; CHECK: .section .data,"w",discard,_WeakVar1
+; CHECK: .globl _WeakVar1
+@WeakVar1 = weak_odr dllexport global i32 1, align 4
+
+; CHECK: .section .rdata,"r",discard,_WeakVar2
+; CHECK: .globl _WeakVar2
+@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
+
+
+; CHECK: .globl _alias
+; CHECK: _alias = _notExported
+@alias = dllexport alias void()* @notExported
+
+; CHECK: .globl _alias2
+; CHECK: _alias2 = _f1
+@alias2 = dllexport alias void()* @f1
+
+; CHECK: .globl _alias3
+; CHECK: _alias3 = _notExported
+@alias3 = dllexport alias void()* @alias
+
+; CHECK: .weak _weak_alias
+; CHECK: _weak_alias = _f1
+@weak_alias = dllexport alias weak_odr void()* @f1
+
+
; CHECK: .section .drectve
-; CHECK: -export:@foo@0
+; WIN32: /EXPORT:_Var1,DATA
+; WIN32: /EXPORT:_Var2,DATA
+; WIN32: /EXPORT:_Var3,DATA
+; WIN32: /EXPORT:_WeakVar1,DATA
+; WIN32: /EXPORT:_WeakVar2,DATA
+; WIN32: /EXPORT:_f1
+; WIN32: /EXPORT:_f2
+; WIN32: /EXPORT:_stdfun@0
+; WIN32: /EXPORT:@fastfun@0
+; WIN32: /EXPORT:_thisfun
+; WIN32: /EXPORT:_lnk1
+; WIN32: /EXPORT:_lnk2
+; WIN32: /EXPORT:_weak1
+; WIN32: /EXPORT:_alias
+; WIN32: /EXPORT:_alias2
+; WIN32: /EXPORT:_alias3
+; WIN32: /EXPORT:_weak_alias
+; MINGW: -export:_Var1,data
+; MINGW: -export:_Var2,data
+; MINGW: -export:_Var3,data
+; MINGW: -export:_WeakVar1,data
+; MINGW: -export:_WeakVar2,data
+; MINGW: -export:_f1
+; MINGW: -export:_f2
+; MINGW: -export:_stdfun@0
+; MINGW: -export:@fastfun@0
+; MINGW: -export:_thisfun
+; MINGW: -export:_lnk1
+; MINGW: -export:_lnk2
+; MINGW: -export:_weak1
+; MINGW: -export:_alias
+; MINGW: -export:_alias2
+; MINGW: -export:_alias3
+; MINGW: -export:_weak_alias
diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll
new file mode 100644
index 0000000..666409f
--- /dev/null
+++ b/test/CodeGen/X86/dllimport-x86_64.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck %s
+; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck %s
+;
+; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
+; PR6275
+;
+; RUN: opt -mtriple x86_64-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+
+@Var1 = external dllimport global i32
+@Var2 = available_externally dllimport unnamed_addr constant i32 1
+
+declare dllimport void @fun()
+
+define available_externally dllimport void @inline1() {
+ ret void
+}
+
+define available_externally dllimport void @inline2() {
+ ret void
+}
+
+declare void @dummy(...)
+
+define void @use() nounwind {
+; CHECK: callq *__imp_fun(%rip)
+; FAST: movq __imp_fun(%rip), [[R:%[a-z]{3}]]
+; FAST-NEXT: callq *[[R]]
+ call void @fun()
+
+; CHECK: callq *__imp_inline1(%rip)
+; CHECK: callq *__imp_inline2(%rip)
+ call void @inline1()
+ call void @inline2()
+
+; available_externally uses go away
+; OPT-NOT: call void @inline1()
+; OPT-NOT: call void @inline2()
+; OPT-NOT: load i32* @Var2
+; OPT: call void (...)* @dummy(i32 %1, i32 1)
+
+; CHECK-DAG: movq __imp_Var1(%rip), [[R1:%[a-z]{3}]]
+; CHECK-DAG: movq __imp_Var2(%rip), [[R2:%[a-z]{3}]]
+ %1 = load i32* @Var1
+ %2 = load i32* @Var2
+ call void(...)* @dummy(i32 %1, i32 %2)
+
+ ret void
+}
diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll
new file mode 100644
index 0000000..695bfce
--- /dev/null
+++ b/test/CodeGen/X86/dllimport.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s
+; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck %s
+;
+; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
+; PR6275
+;
+; RUN: opt -mtriple i386-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+
+@Var1 = external dllimport global i32
+@Var2 = available_externally dllimport unnamed_addr constant i32 1
+
+declare dllimport void @fun()
+
+define available_externally dllimport void @inline1() {
+ ret void
+}
+
+define available_externally dllimport void @inline2() alwaysinline {
+ ret void
+}
+
+declare dllimport x86_stdcallcc void @stdfun() nounwind
+declare dllimport x86_fastcallcc void @fastfun() nounwind
+declare dllimport x86_thiscallcc void @thisfun() nounwind
+
+declare void @dummy(...)
+
+define void @use() nounwind {
+; CHECK: calll *__imp__fun
+; FAST: movl __imp__fun, [[R:%[a-z]{3}]]
+; FAST-NEXT: calll *[[R]]
+ call void @fun()
+
+; CHECK: calll *__imp__inline1
+; CHECK: calll *__imp__inline2
+ call void @inline1()
+ call void @inline2()
+
+; CHECK: calll *__imp__stdfun@0
+; CHECK: calll *__imp_@fastfun@0
+; CHECK: calll *__imp__thisfun
+ call void @stdfun()
+ call void @fastfun()
+ call void @thisfun()
+
+; available_externally uses go away
+; OPT-NOT: call void @inline1()
+; OPT-NOT: call void @inline2()
+; OPT-NOT: load i32* @Var2
+; OPT: call void (...)* @dummy(i32 %1, i32 1)
+
+; CHECK-DAG: movl __imp__Var1, [[R1:%[a-z]{3}]]
+; CHECK-DAG: movl __imp__Var2, [[R2:%[a-z]{3}]]
+ %1 = load i32* @Var1
+ %2 = load i32* @Var2
+ call void(...)* @dummy(i32 %1, i32 %2)
+
+ ret void
+}
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index 3b4a868..c8d7527 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -7,10 +7,12 @@ target triple = "x86_64-unknown-linux-gnu"
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5}
-!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{i32 0}
+!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{}
!3 = metadata !{i32 786473, metadata !4} ; [ DW_TAG_file_type ]
!4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"}
+!6 = metadata !{i32 786451, metadata !4, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!7 = metadata !{metadata !6}
; The important part of the following check is that dir = #0.
; Dir Mod Time File Len File Name
diff --git a/test/CodeGen/X86/dynamic-alloca-in-entry.ll b/test/CodeGen/X86/dynamic-alloca-in-entry.ll
new file mode 100644
index 0000000..7ed471c
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-alloca-in-entry.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+; Allocas with unknown size in the entry block are dynamic.
+define void @foo(i32 %n) {
+ %m = alloca i32, i32 %n
+ ret void
+}
+; CHECK-LABEL: _foo:
+; CHECK: calll __chkstk
+; CHECK: retl
+
+; Use of inalloca implies that the alloca is not static.
+define void @bar() {
+ %m = alloca inalloca i32
+ ret void
+}
+; CHECK-LABEL: _bar:
+; CHECK: calll __chkstk
+; CHECK: retl
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
new file mode 100644
index 0000000..a18f751
--- /dev/null
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -0,0 +1,128 @@
+; RUN: llc -O3 -mtriple=x86_64-apple-macosx -o - < %s -mattr=+avx2 -enable-unsafe-fp-math -mcpu=core2 | FileCheck %s
+; Check that the ExeDepsFix pass correctly fixes the domain for broadcast instructions.
+; <rdar://problem/16354675>
+
+; CHECK-LABEL: ExeDepsFix_broadcastss
+; CHECK: broadcastss
+; CHECK: vandps
+; CHECK: vmaxps
+; CHECK: ret
+define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2) {
+ %bitcast = bitcast <4 x float> %arg to <4 x i32>
+ %and = and <4 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %floatcast = bitcast <4 x i32> %and to <4 x float>
+ %max_is_x = fcmp oge <4 x float> %floatcast, %arg2
+ %max = select <4 x i1> %max_is_x, <4 x float> %floatcast, <4 x float> %arg2
+ ret <4 x float> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastss256
+; CHECK: broadcastss
+; CHECK: vandps
+; CHECK: vmaxps
+; CHECK: ret
+define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg2) {
+ %bitcast = bitcast <8 x float> %arg to <8 x i32>
+ %and = and <8 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %floatcast = bitcast <8 x i32> %and to <8 x float>
+ %max_is_x = fcmp oge <8 x float> %floatcast, %arg2
+ %max = select <8 x i1> %max_is_x, <8 x float> %floatcast, <8 x float> %arg2
+ ret <8 x float> %max
+}
+
+
+; CHECK-LABEL: ExeDepsFix_broadcastss_inreg
+; CHECK: broadcastss
+; CHECK: vandps
+; CHECK: vmaxps
+; CHECK: ret
+define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %arg2, i32 %broadcastvalue) {
+ %bitcast = bitcast <4 x float> %arg to <4 x i32>
+ %in = insertelement <4 x i32> undef, i32 %broadcastvalue, i32 0
+ %mask = shufflevector <4 x i32> %in, <4 x i32> undef, <4 x i32> zeroinitializer
+ %and = and <4 x i32> %bitcast, %mask
+ %floatcast = bitcast <4 x i32> %and to <4 x float>
+ %max_is_x = fcmp oge <4 x float> %floatcast, %arg2
+ %max = select <4 x i1> %max_is_x, <4 x float> %floatcast, <4 x float> %arg2
+ ret <4 x float> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg
+; CHECK: broadcastss
+; CHECK: vandps
+; CHECK: vmaxps
+; CHECK: ret
+define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float> %arg2, i32 %broadcastvalue) {
+ %bitcast = bitcast <8 x float> %arg to <8 x i32>
+ %in = insertelement <8 x i32> undef, i32 %broadcastvalue, i32 0
+ %mask = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
+ %and = and <8 x i32> %bitcast, %mask
+ %floatcast = bitcast <8 x i32> %and to <8 x float>
+ %max_is_x = fcmp oge <8 x float> %floatcast, %arg2
+ %max = select <8 x i1> %max_is_x, <8 x float> %floatcast, <8 x float> %arg2
+ ret <8 x float> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastsd
+; In that case the broadcast is directly folded into vandpd.
+; CHECK: vandpd
+; CHECK: vmaxpd
+; CHECK: ret
+define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg2) {
+ %bitcast = bitcast <2 x double> %arg to <2 x i64>
+ %and = and <2 x i64> %bitcast, <i64 2147483647, i64 2147483647>
+ %floatcast = bitcast <2 x i64> %and to <2 x double>
+ %max_is_x = fcmp oge <2 x double> %floatcast, %arg2
+ %max = select <2 x i1> %max_is_x, <2 x double> %floatcast, <2 x double> %arg2
+ ret <2 x double> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastsd256
+; CHECK: broadcastsd
+; CHECK: vandpd
+; CHECK: vmaxpd
+; CHECK: ret
+define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %arg2) {
+ %bitcast = bitcast <4 x double> %arg to <4 x i64>
+ %and = and <4 x i64> %bitcast, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
+ %floatcast = bitcast <4 x i64> %and to <4 x double>
+ %max_is_x = fcmp oge <4 x double> %floatcast, %arg2
+ %max = select <4 x i1> %max_is_x, <4 x double> %floatcast, <4 x double> %arg2
+ ret <4 x double> %max
+}
+
+
+; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
+; ExeDepsFix works top down, so it coalesces the vmovlhps domain with that of
+; vandps; after that, nothing more can be done to match vmaxpd.
+; CHECK: vmovlhps
+; CHECK: vandps
+; CHECK: vmaxpd
+; CHECK: ret
+define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
+ %bitcast = bitcast <2 x double> %arg to <2 x i64>
+ %in = insertelement <2 x i64> undef, i64 %broadcastvalue, i32 0
+ %mask = shufflevector <2 x i64> %in, <2 x i64> undef, <2 x i32> zeroinitializer
+ %and = and <2 x i64> %bitcast, %mask
+ %floatcast = bitcast <2 x i64> %and to <2 x double>
+ %max_is_x = fcmp oge <2 x double> %floatcast, %arg2
+ %max = select <2 x i1> %max_is_x, <2 x double> %floatcast, <2 x double> %arg2
+ ret <2 x double> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg
+; CHECK: broadcastsd
+; CHECK: vandpd
+; CHECK: vmaxpd
+; CHECK: ret
+define <4 x double> @ExeDepsFix_broadcastsd256_inreg(<4 x double> %arg, <4 x double> %arg2, i64 %broadcastvalue) {
+ %bitcast = bitcast <4 x double> %arg to <4 x i64>
+ %in = insertelement <4 x i64> undef, i64 %broadcastvalue, i32 0
+ %mask = shufflevector <4 x i64> %in, <4 x i64> undef, <4 x i32> zeroinitializer
+ %and = and <4 x i64> %bitcast, %mask
+ %floatcast = bitcast <4 x i64> %and to <4 x double>
+ %max_is_x = fcmp oge <4 x double> %floatcast, %arg2
+ %max = select <4 x i1> %max_is_x, <4 x double> %floatcast, <4 x double> %arg2
+ ret <4 x double> %max
+}
+
diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll
new file mode 100644
index 0000000..27d9380
--- /dev/null
+++ b/test/CodeGen/X86/extract-store.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse4.1 | FileCheck %s -check-prefix=SSE41
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX
+
+define void @pextrb(i8* nocapture %dst, <16 x i8> %foo) {
+; AVX: vpextrb
+; SSE41: pextrb
+; AVX-NOT: movb
+; SSE41-NOT: movb
+ %vecext = extractelement <16 x i8> %foo, i32 15
+ store i8 %vecext, i8* %dst, align 1
+ ret void
+}
+
+define void @pextrw(i16* nocapture %dst, <8 x i16> %foo) {
+; AVX: vpextrw
+; SSE41: pextrw
+; AVX-NOT: movw
+; SSE41-NOT: movw
+ %vecext = extractelement <8 x i16> %foo, i32 7
+ store i16 %vecext, i16* %dst, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-args-fail.ll b/test/CodeGen/X86/fast-isel-args-fail.ll
index e748e1c..7467edd 100644
--- a/test/CodeGen/X86/fast-isel-args-fail.ll
+++ b/test/CodeGen/X86/fast-isel-args-fail.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-apple-darwin10
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win64 | FileCheck %s -check-prefix=WIN64
-; Requires: Asserts
+; REQUIRES: asserts
; Previously, this would cause an assert.
define i31 @t1(i31 %a, i31 %b, i31 %c) {
diff --git a/test/CodeGen/X86/fast-isel-select.ll b/test/CodeGen/X86/fast-isel-select.ll
new file mode 100644
index 0000000..53158bc
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple x86_64-apple-darwin -O0 -o - < %s | FileCheck %s
+; Make sure we use only the least significant bit of the value that feeds the
+; select. Otherwise, we may treat the value as non-zero even though its
+; LSB is zero.
+; <rdar://problem/15651765>
+
+; CHECK-LABEL: fastisel_select:
+; CHECK: subb {{%[a-z0-9]+}}, [[RES:%[a-z0-9]+]]
+; CHECK: testb $1, [[RES]]
+; CHECK: cmovel
+define i32 @fastisel_select(i1 %exchSub2211_, i1 %trunc_8766) {
+ %shuffleInternal15257_8932 = sub i1 %exchSub2211_, %trunc_8766
+ %counter_diff1345 = select i1 %shuffleInternal15257_8932, i32 1204476887, i32 0
+ ret i32 %counter_diff1345
+}
+
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index ba86e88..a212a7c 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -3,7 +3,7 @@
; This should use flds to set the return value.
; CHECK-LABEL: test0:
; CHECK: flds
-; CHECK: ret
+; CHECK: retl
@G = external global float
define float @test0() nounwind {
%t = load float* @G
@@ -12,7 +12,7 @@ define float @test0() nounwind {
; This should pop 4 bytes on return.
; CHECK-LABEL: test1:
-; CHECK: ret $4
+; CHECK: retl $4
define void @test1({i32, i32, i32, i32}* sret %p) nounwind {
store {i32, i32, i32, i32} zeroinitializer, {i32, i32, i32, i32}* %p
ret void
@@ -25,7 +25,7 @@ define void @test1({i32, i32, i32, i32}* sret %p) nounwind {
; CHECK-NEXT: L2$pb:
; CHECK-NEXT: pop
; CHECK: HHH
-; CHECK: ret
+; CHECK: retl
@HHH = external global i32
define i32 @test2() nounwind {
%t = load i32* @HHH
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index 132df2b..bc79184 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -march=x86 -mattr=sse2
-; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=x86_64-apple-darwin10
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -march=x86 -mattr=sse2 -no-integrated-as
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=x86_64-apple-darwin10 -no-integrated-as
; This tests very minimal fast-isel functionality.
diff --git a/test/CodeGen/X86/fastcall-correct-mangling.ll b/test/CodeGen/X86/fastcall-correct-mangling.ll
index 3569d36..00dc44e 100644
--- a/test/CodeGen/X86/fastcall-correct-mangling.ll
+++ b/test/CodeGen/X86/fastcall-correct-mangling.ll
@@ -1,14 +1,33 @@
-; RUN: llc < %s -mtriple=i386-unknown-mingw32 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-mingw32 | \
+; RUN: FileCheck --check-prefix=CHECK32 %s
+
+; RUN: llc < %s -mtriple=i386-unknown-win32 | \
+; RUN: FileCheck --check-prefix=CHECK32 %s
+
+; RUN: llc < %s -mtriple=x86_64-unknown-mingw32 | \
+; RUN: FileCheck --check-prefix=CHECK64 %s
+
+; RUN: llc < %s -mtriple=x86_64-unknown-mingw32 | \
+; RUN: FileCheck --check-prefix=CHECK64 %s
; Check that a fastcall function gets correct mangling
define x86_fastcallcc void @func(i64 %X, i8 %Y, i8 %G, i16 %Z) {
-; CHECK: @func@20:
+; CHECK32-LABEL: {{^}}@func@20:
+; CHECK64-LABEL: {{^}}func:
ret void
}
define x86_fastcallcc i32 @"\01DoNotMangle"(i32 %a) {
-; CHECK: DoNotMangle:
+; CHECK32-LABEL: {{^}}DoNotMangle:
+; CHECK64-LABEL: {{^}}DoNotMangle:
entry:
ret i32 %a
}
+
+define private x86_fastcallcc void @dontCrash() {
+; The name is fairly arbitrary since it is private. Just don't crash.
+; CHECK32-LABEL: {{^}}L@dontCrash@0:
+; CHECK64-LABEL: {{^}}.LdontCrash:
+ ret void
+}
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 917eac0..47252ec 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -42,6 +42,39 @@ entry:
ret float %call
}
+; Test FMA3 variant selection
+; CHECK: fma3_select231ssX:
+; CHECK: vfmadd231ss xmm
+define float @fma3_select231ssX(float %x, float %y) #0 {
+entry:
+ br label %while.body
+while.body: ; preds = %while.body, %while.body
+ %acc.01 = phi float [ 0.000000e+00, %entry ], [ %acc, %while.body ]
+ %acc = tail call float @llvm.fma.f32(float %x, float %y, float %acc.01) nounwind readnone
+ %b = fcmp ueq float %acc, 0.0
+ br i1 %b, label %while.body, label %while.end
+while.end: ; preds = %while.body, %entry
+ ret float %acc
+}
+
+; Test FMA3 variant selection
+; CHECK: fma3_select231pdY:
+; CHECK: vfmadd231pd ymm
+define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 {
+entry:
+ br label %while.body
+while.body: ; preds = %entry, %while.body
+ %acc.04 = phi <4 x double> [ zeroinitializer, %entry ], [ %add, %while.body ]
+ %add = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %acc.04)
+ %vecext = extractelement <4 x double> %add, i32 0
+ %cmp = fcmp oeq double %vecext, 0.000000e+00
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body
+ ret <4 x double> %add
+}
+
declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) nounwind readnone
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index e3910a6..9a25096 100644
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fmadd213ss %xmm
+ ; CHECK: fmadd213ss (%r8), %xmm
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
@@ -24,7 +24,7 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fnmadd213ss %xmm
+ ; CHECK: fnmadd213ss (%r8), %xmm
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index 7a1a9ae..494cb28 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
; VFMADD
diff --git a/test/CodeGen/X86/fold-call-oper.ll b/test/CodeGen/X86/fold-call-oper.ll
new file mode 100644
index 0000000..94e2a6f
--- /dev/null
+++ b/test/CodeGen/X86/fold-call-oper.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+;
+; PR18396: Assertion: MO->isDead "Cannot fold physreg def".
+; InlineSpiller::foldMemoryOperand needs to handle undef call operands.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@a = external global i32**, align 8
+@b = external global i32, align 4
+
+; Check that the call targets are folded, and we don't crash!
+; CHECK-LABEL: foldCallOper:
+; CHECK: callq *{{.*}}(%rbp)
+; CHECK: callq *{{.*}}(%rbp)
+define void @foldCallOper(i32 (i32*, i32, i32**)* nocapture %p1) #0 {
+entry:
+ %0 = load i32*** @a, align 8
+ br label %for.body.i
+
+for.body.i: ; preds = %for.body.i, %entry
+ %exitcond5.i = icmp eq i32 undef, undef
+ br i1 %exitcond5.i, label %for.body3.lr.ph.i, label %for.body.i
+
+for.body3.lr.ph.i: ; preds = %for.body.i
+ %call.i = tail call i32 %p1(i32* undef, i32 0, i32** null)
+ %tobool.i = icmp eq i32 %call.i, 0
+ br label %for.body3.i
+
+for.body3.i: ; preds = %for.inc8.i, %for.body3.lr.ph.i
+ %1 = phi i32* [ undef, %for.body3.lr.ph.i ], [ %.pre.i, %for.inc8.i ]
+ %indvars.iv.i = phi i64 [ 1, %for.body3.lr.ph.i ], [ %phitmp.i, %for.inc8.i ]
+ %call5.i = tail call i32 %p1(i32* %1, i32 0, i32** %0)
+ br i1 %tobool.i, label %for.inc8.i, label %if.then.i
+
+if.then.i: ; preds = %for.body3.i
+ %2 = load i32* %1, align 4
+ store i32 %2, i32* @b, align 4
+ br label %for.inc8.i
+
+for.inc8.i: ; preds = %if.then.i, %for.body3.i
+ %lftr.wideiv.i = trunc i64 %indvars.iv.i to i32
+ %arrayidx4.phi.trans.insert.i = getelementptr inbounds [0 x i32*]* undef, i64 0, i64 %indvars.iv.i
+ %.pre.i = load i32** %arrayidx4.phi.trans.insert.i, align 8
+ %phitmp.i = add i64 %indvars.iv.i, 1
+ br label %for.body3.i
+}
+
+attributes #0 = { noreturn uwtable "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/X86/fold-vector-sext-crash.ll b/test/CodeGen/X86/fold-vector-sext-crash.ll
new file mode 100644
index 0000000..52ea7a9
--- /dev/null
+++ b/test/CodeGen/X86/fold-vector-sext-crash.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-unknown-linux-gnu -mattr=+avx,+popcnt,+cmov
+
+; Make sure that we don't introduce illegal build_vector dag nodes
+; when trying to fold a sign_extend of a constant build_vector.
+; After r200234 the test case below was crashing the compiler with an assertion failure
+; due to an illegal build_vector of type MVT::v4i64.
+
+define <4 x i64> @foo(<4 x i64> %A) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %A, <4 x i64><i64 undef, i64 undef, i64 0, i64 0>
+ ret <4 x i64> %1
+}
+
diff --git a/test/CodeGen/X86/fold-vector-sext-zext.ll b/test/CodeGen/X86/fold-vector-sext-zext.ll
new file mode 100644
index 0000000..aeaab44
--- /dev/null
+++ b/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -0,0 +1,291 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; Verify that the backend correctly folds a sign/zero extend of a vector whose
+; elements are all constant values or UNDEFs.
+; The backend should be able to optimize all of the test functions below into
+; simple loads of the result from the constant pool, because the resulting
+; vector is known at compile time.
+
+
+define <4 x i16> @test1() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i16>
+ ret <4 x i16> %5
+}
+; CHECK-LABEL: test1
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i16> @test2() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i16>
+ ret <4 x i16> %5
+}
+; CHECK-LABEL: test2
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test3() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
+; CHECK-LABEL: test3
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test4() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
+; CHECK-LABEL: test4
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+
+define <4 x i64> @test5() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test6() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test7() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = sext <8 x i8> %4 to <8 x i16>
+ ret <8 x i16> %9
+}
+; CHECK-LABEL: test7
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test8() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = sext <8 x i8> %4 to <8 x i32>
+ ret <8 x i32> %9
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test9() {
+ %1 = insertelement <8 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 undef, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 undef, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = sext <8 x i8> %4 to <8 x i16>
+ ret <8 x i16> %9
+}
+; CHECK-LABEL: test9
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test10() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 undef, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 undef, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 undef, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 undef, i32 7
+ %9 = sext <8 x i8> %4 to <8 x i32>
+ ret <8 x i32> %9
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+
+define <4 x i16> @test11() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i16>
+ ret <4 x i16> %5
+}
+; CHECK-LABEL: test11
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test12() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
+; CHECK-LABEL: test12
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test13() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i16> @test14() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i16>
+ ret <4 x i16> %5
+}
+; CHECK-LABEL: test14
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test15() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 undef, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 undef, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
+; CHECK-LABEL: test15
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test16() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 undef, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test17() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = zext <8 x i8> %8 to <8 x i16>
+ ret <8 x i16> %9
+}
+; CHECK-LABEL: test17
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test18() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = zext <8 x i8> %8 to <8 x i32>
+ ret <8 x i32> %9
+}
+; CHECK-LABEL: test18
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test19() {
+ %1 = insertelement <8 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 undef, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 undef, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = zext <8 x i8> %8 to <8 x i16>
+ ret <8 x i16> %9
+}
+; CHECK-LABEL: test19
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test20() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 undef, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 undef, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 undef, i32 7
+ %9 = zext <8 x i8> %8 to <8 x i32>
+ ret <8 x i32> %9
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
diff --git a/test/CodeGen/X86/fold-xmm-zero.ll b/test/CodeGen/X86/fold-xmm-zero.ll
index b4eeb40..c92d45c 100644
--- a/test/CodeGen/X86/fold-xmm-zero.ll
+++ b/test/CodeGen/X86/fold-xmm-zero.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-macosx10.6.7 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-macosx10.6.7 -mattr=+sse2 -no-integrated-as | FileCheck %s
; Simple test to make sure folding for special constants (like float zero)
; isn't completely broken.
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
index 07baca8..7b08ad6 100644
--- a/test/CodeGen/X86/fp-fast.ll
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mattr=+avx,-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7-avx -enable-unsafe-fp-math < %s | FileCheck %s
; CHECK-LABEL: test1
define float @test1(float %a) {
diff --git a/test/CodeGen/X86/gcc_except_table.ll b/test/CodeGen/X86/gcc_except_table.ll
index fcc4e9f..7a29b07 100644
--- a/test/CodeGen/X86/gcc_except_table.ll
+++ b/test/CodeGen/X86/gcc_except_table.ll
@@ -1,12 +1,35 @@
-; RUN: llc -mtriple x86_64-apple-darwin %s -o - | FileCheck %s
+; RUN: llc -mtriple x86_64-apple-darwin %s -o - | FileCheck %s --check-prefix=APPLE
+; RUN: llc -mtriple x86_64-pc-windows-gnu %s -o - | FileCheck %s --check-prefix=MINGW64
+; RUN: llc -mtriple i686-pc-windows-gnu %s -o - | FileCheck %s --check-prefix=MINGW32
@_ZTIi = external constant i8*
define i32 @main() uwtable optsize ssp {
-; CHECK: .cfi_startproc
-; CHECK: .cfi_personality 155, ___gxx_personality_v0
-; CHECK: .cfi_lsda 16, Lexception0
-; CHECK: .cfi_def_cfa_offset 16
-; CHECK: .cfi_endproc
+; APPLE: .cfi_startproc
+; APPLE: .cfi_personality 155, ___gxx_personality_v0
+; APPLE: .cfi_lsda 16, Lexception0
+; APPLE: .cfi_def_cfa_offset 16
+; APPLE: callq __Unwind_Resume
+; APPLE: .cfi_endproc
+; APPLE: GCC_except_table0:
+; APPLE: Lexception0:
+
+; MINGW64: .cfi_startproc
+; MINGW64: .cfi_personality 0, __gxx_personality_v0
+; MINGW64: .cfi_lsda 0, .Lexception0
+; MINGW64: .cfi_def_cfa_offset 16
+; MINGW64: callq _Unwind_Resume
+; MINGW64: .cfi_endproc
+; MINGW64: GCC_except_table0:
+; MINGW64: Lexception0:
+
+; MINGW32: .cfi_startproc
+; MINGW32: .cfi_personality 0, ___gxx_personality_v0
+; MINGW32: .cfi_lsda 0, Lexception0
+; MINGW32: .cfi_def_cfa_offset 8
+; MINGW32: calll __Unwind_Resume
+; MINGW32: .cfi_endproc
+; MINGW32: GCC_except_table0:
+; MINGW32: Lexception0:
entry:
invoke void @_Z1fv() optsize
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index d8743ac..5ad5047 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -1,7 +1,16 @@
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s -check-prefix=LINUX
; RUN: llc < %s -mtriple=i386-apple-darwin9.7 | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -relocation-model=static | FileCheck %s -check-prefix=DARWIN-STATIC
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=DARWIN64
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -fdata-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=i686-pc-win32 -fdata-sections -ffunction-sections | FileCheck %s -check-prefix=WIN32-SECTIONS
+define void @F1() {
+ ret void
+}
+
+; WIN32-SECTIONS: .section .text,"xr",one_only,_F1
+; WIN32-SECTIONS: .globl _F1
; int G1;
@G1 = common global i32 0
@@ -39,6 +48,9 @@
; LINUX-SECTIONS: .section .rodata.G3,"a",@progbits
; LINUX-SECTIONS: .globl G3
+; WIN32-SECTIONS: .section .rdata,"r",one_only,_G3
+; WIN32-SECTIONS: .globl _G3
+
; _Complex long long const G4 = 34;
@G4 = unnamed_addr constant {i64,i64} { i64 34, i64 0 }
@@ -47,6 +59,14 @@
; DARWIN: _G4:
; DARWIN: .long 34
+; DARWIN-STATIC: .section __TEXT,__literal16,16byte_literals
+; DARWIN-STATIC: _G4:
+; DARWIN-STATIC: .long 34
+
+; DARWIN64: .section __TEXT,__literal16,16byte_literals
+; DARWIN64: _G4:
+; DARWIN64: .quad 34
+
; int G5 = 47;
@G5 = global i32 47
@@ -107,6 +127,9 @@
; LINUX-SECTIONS: .section .rodata.G7,"aMS",@progbits,1
; LINUX-SECTIONS: .globl G7
+; WIN32-SECTIONS: .section .rdata,"r",one_only,_G7
+; WIN32-SECTIONS: .globl _G7
+
@G8 = unnamed_addr constant [4 x i16] [ i16 1, i16 2, i16 3, i16 0 ]
@@ -158,3 +181,16 @@
; DARWIN: .zerofill __DATA,__common,_G12,1,3
; DARWIN: .globl _G13
; DARWIN: .zerofill __DATA,__common,_G13,1,3
+
+@G14 = private unnamed_addr constant [4 x i8] c"foo\00", align 1
+
+; LINUX-SECTIONS: .type .LG14,@object # @G14
+; LINUX-SECTIONS: .section .rodata..LG14,"aMS",@progbits,1
+; LINUX-SECTIONS: .LG14:
+; LINUX-SECTIONS: .asciz "foo"
+; LINUX-SECTIONS: .size .LG14, 4
+
+; WIN32-SECTIONS: .section .rdata,"r"
+; WIN32-SECTIONS: L_G14:
+; WIN32-SECTIONS: .asciz "foo"
+
diff --git a/test/CodeGen/X86/hidden-vis-pic.ll b/test/CodeGen/X86/hidden-vis-pic.ll
index 67be3d0..1caab7a 100644
--- a/test/CodeGen/X86/hidden-vis-pic.ll
+++ b/test/CodeGen/X86/hidden-vis-pic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-cfi -mtriple=i386-apple-darwin9 -relocation-model=pic -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin9 -relocation-model=pic -disable-fp-elim | FileCheck %s
@@ -48,8 +48,3 @@ return: ; preds = %entry
%retval1 = load i32* %retval ; <i32> [#uses=1]
ret i32 %retval1
}
-
-; CHECK: .private_extern _func.eh
-; CHECK: .private_extern _main.eh
-
-
diff --git a/test/CodeGen/X86/i64-mem-copy.ll b/test/CodeGen/X86/i64-mem-copy.ll
index dce12ae..bf77896 100644
--- a/test/CodeGen/X86/i64-mem-copy.ll
+++ b/test/CodeGen/X86/i64-mem-copy.ll
@@ -3,7 +3,7 @@
; X64: movq ({{%rsi|%rdx}}), %r
; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X32
-; X32: movsd (%eax), %xmm
+; X32: movsd ({{%ecx|%eax}}), %xmm
; Uses movsd to load / store i64 values if sse2 is available.
diff --git a/test/CodeGen/X86/inalloca-ctor.ll b/test/CodeGen/X86/inalloca-ctor.ll
new file mode 100644
index 0000000..7cfa929
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-ctor.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%Foo = type { i32, i32 }
+
+%frame = type { %Foo, i32, %Foo }
+
+declare void @f(%frame* inalloca %a)
+
+declare void @Foo_ctor(%Foo* %this)
+
+define void @g() {
+entry:
+ %args = alloca inalloca %frame
+ %c = getelementptr %frame* %args, i32 0, i32 2
+; CHECK: movl $20, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %esp,
+ call void @Foo_ctor(%Foo* %c)
+; CHECK: leal 12(%{{.*}}),
+; CHECK: subl $4, %esp
+; CHECK: calll _Foo_ctor
+; CHECK: addl $4, %esp
+ %b = getelementptr %frame* %args, i32 0, i32 1
+ store i32 42, i32* %b
+; CHECK: movl $42,
+ %a = getelementptr %frame* %args, i32 0, i32 0
+ call void @Foo_ctor(%Foo* %a)
+; CHECK: subl $4, %esp
+; CHECK: calll _Foo_ctor
+; CHECK: addl $4, %esp
+ call void @f(%frame* inalloca %args)
+; CHECK: calll _f
+ ret void
+}
diff --git a/test/CodeGen/X86/inalloca-invoke.ll b/test/CodeGen/X86/inalloca-invoke.ll
new file mode 100644
index 0000000..6cff9ac
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-invoke.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%Iter = type { i32, i32, i32 }
+
+%frame.reverse = type { %Iter, %Iter }
+
+declare void @llvm.stackrestore(i8*)
+declare i8* @llvm.stacksave()
+declare void @begin(%Iter* sret)
+declare void @plus(%Iter* sret, %Iter*, i32)
+declare void @reverse(%frame.reverse* inalloca align 4)
+
+define i32 @main() {
+ %temp.lvalue = alloca %Iter
+ br label %blah
+
+blah:
+ %inalloca.save = call i8* @llvm.stacksave()
+ %rev_args = alloca inalloca %frame.reverse, align 4
+ %beg = getelementptr %frame.reverse* %rev_args, i32 0, i32 0
+ %end = getelementptr %frame.reverse* %rev_args, i32 0, i32 1
+
+; CHECK: calll __chkstk
+; CHECK: movl %[[beg:[^,]*]], %esp
+; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
+
+ call void @begin(%Iter* sret %temp.lvalue)
+; CHECK: calll _begin
+
+ invoke void @plus(%Iter* sret %end, %Iter* %temp.lvalue, i32 4)
+ to label %invoke.cont unwind label %lpad
+
+; Uses end as sret param.
+; CHECK: movl %[[end]], (%esp)
+; CHECK: calll _plus
+
+invoke.cont:
+ call void @begin(%Iter* sret %beg)
+
+; CHECK: movl %[[beg]],
+; CHECK: calll _begin
+
+ invoke void @reverse(%frame.reverse* inalloca align 4 %rev_args)
+ to label %invoke.cont5 unwind label %lpad
+
+invoke.cont5: ; preds = %invoke.cont
+ call void @llvm.stackrestore(i8* %inalloca.save)
+ ret i32 0
+
+lpad: ; preds = %invoke.cont, %entry
+ %lp = landingpad { i8*, i32 } personality i8* null
+ cleanup
+ unreachable
+}
diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll
new file mode 100644
index 0000000..54f97d9
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-stdcall.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%Foo = type { i32, i32 }
+
+declare x86_stdcallcc void @f(%Foo* inalloca %a)
+declare x86_stdcallcc void @i(i32 %a)
+
+define void @g() {
+ %b = alloca inalloca %Foo
+; CHECK: movl $8, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %[[REG:[^,]*]], %esp
+ %f1 = getelementptr %Foo* %b, i32 0, i32 0
+ %f2 = getelementptr %Foo* %b, i32 0, i32 1
+ store i32 13, i32* %f1
+ store i32 42, i32* %f2
+; CHECK: movl $13, (%[[REG]])
+; CHECK: movl $42, 4(%[[REG]])
+ call x86_stdcallcc void @f(%Foo* inalloca %b)
+; CHECK: calll _f@8
+; CHECK-NOT: %esp
+; CHECK: subl $4, %esp
+; CHECK: calll _i@4
+ call x86_stdcallcc void @i(i32 0)
+ ret void
+}
diff --git a/test/CodeGen/X86/inalloca.ll b/test/CodeGen/X86/inalloca.ll
new file mode 100644
index 0000000..12643f9
--- /dev/null
+++ b/test/CodeGen/X86/inalloca.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%Foo = type { i32, i32 }
+
+declare void @f(%Foo* inalloca %b)
+
+define void @a() {
+; CHECK-LABEL: _a:
+entry:
+ %b = alloca inalloca %Foo
+; CHECK: movl $8, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %[[REG:[^,]*]], %esp
+ %f1 = getelementptr %Foo* %b, i32 0, i32 0
+ %f2 = getelementptr %Foo* %b, i32 0, i32 1
+ store i32 13, i32* %f1
+ store i32 42, i32* %f2
+; CHECK: movl $13, (%[[REG]])
+; CHECK: movl $42, 4(%[[REG]])
+ call void @f(%Foo* inalloca %b)
+; CHECK: calll _f
+ ret void
+}
+
+declare void @inreg_with_inalloca(i32 inreg %a, %Foo* inalloca %b)
+
+define void @b() {
+; CHECK-LABEL: _b:
+entry:
+ %b = alloca inalloca %Foo
+; CHECK: movl $8, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %[[REG:[^,]*]], %esp
+ %f1 = getelementptr %Foo* %b, i32 0, i32 0
+ %f2 = getelementptr %Foo* %b, i32 0, i32 1
+ store i32 13, i32* %f1
+ store i32 42, i32* %f2
+; CHECK: movl $13, (%[[REG]])
+; CHECK: movl $42, 4(%[[REG]])
+ call void @inreg_with_inalloca(i32 inreg 1, %Foo* inalloca %b)
+; CHECK: movl $1, %eax
+; CHECK: calll _inreg_with_inalloca
+ ret void
+}
+
+declare x86_thiscallcc void @thiscall_with_inalloca(i8* %a, %Foo* inalloca %b)
+
+define void @c() {
+; CHECK-LABEL: _c:
+entry:
+ %b = alloca inalloca %Foo
+; CHECK: movl $8, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %[[REG:[^,]*]], %esp
+ %f1 = getelementptr %Foo* %b, i32 0, i32 0
+ %f2 = getelementptr %Foo* %b, i32 0, i32 1
+ store i32 13, i32* %f1
+ store i32 42, i32* %f2
+; CHECK-DAG: movl $13, (%[[REG]])
+; CHECK-DAG: movl $42, 4(%[[REG]])
+ call x86_thiscallcc void @thiscall_with_inalloca(i8* null, %Foo* inalloca %b)
+; CHECK-DAG: xorl %ecx, %ecx
+; CHECK: calll _thiscall_with_inalloca
+ ret void
+}
diff --git a/test/CodeGen/X86/inline-asm-flag-clobber.ll b/test/CodeGen/X86/inline-asm-flag-clobber.ll
index 45f4d2f..bb7c33e 100644
--- a/test/CodeGen/X86/inline-asm-flag-clobber.ll
+++ b/test/CodeGen/X86/inline-asm-flag-clobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -march=x86-64 -no-integrated-as < %s | FileCheck %s
; PR3701
define i64 @t(i64* %arg) nounwind {
diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll
index e83c065..91c477b 100644
--- a/test/CodeGen/X86/inline-asm-fpstack.ll
+++ b/test/CodeGen/X86/inline-asm-fpstack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as | FileCheck %s
; There should be no stack manipulations between the inline asm and ret.
; CHECK: test1
diff --git a/test/CodeGen/X86/inline-asm-h.ll b/test/CodeGen/X86/inline-asm-h.ll
index 53cf419..8c3e45a 100644
--- a/test/CodeGen/X86/inline-asm-h.ll
+++ b/test/CodeGen/X86/inline-asm-h.ll
@@ -9,4 +9,4 @@ entry:
}
; CHECK: zed
-; CHECK: movq %mm2,foobar+8(%rip)
+; CHECK: movq %mm2, foobar+8(%rip)
diff --git a/test/CodeGen/X86/inline-asm-modifier-n.ll b/test/CodeGen/X86/inline-asm-modifier-n.ll
index b069c46..072c7c4 100644
--- a/test/CodeGen/X86/inline-asm-modifier-n.ll
+++ b/test/CodeGen/X86/inline-asm-modifier-n.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep " 37"
+; RUN: llc < %s -march=x86 -no-integrated-as | grep " 37"
; rdar://7008959
define void @bork() nounwind {
diff --git a/test/CodeGen/X86/inline-asm-modifier-q.ll b/test/CodeGen/X86/inline-asm-modifier-q.ll
new file mode 100644
index 0000000..8063d48
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-modifier-q.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86 -no-integrated-as | FileCheck %s
+
+; If the target does not have 64-bit integer registers, emit 32-bit register
+; names.
+
+; CHECK: movq (%e{{[abcd]}}x, %ebx, 4)
+
+define void @q_modifier(i32* %p) {
+entry:
+ tail call void asm sideeffect "movq (${0:q}, %ebx, 4), %mm0", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p)
+ ret void
+}
diff --git a/test/CodeGen/X86/inline-asm-mrv.ll b/test/CodeGen/X86/inline-asm-mrv.ll
index 733205d..a96e7b8 100644
--- a/test/CodeGen/X86/inline-asm-mrv.ll
+++ b/test/CodeGen/X86/inline-asm-mrv.ll
@@ -1,8 +1,8 @@
; PR2094
-; RUN: llc < %s -march=x86-64 | grep movslq
-; RUN: llc < %s -march=x86-64 | grep addps
-; RUN: llc < %s -march=x86-64 | grep paddd
-; RUN: llc < %s -march=x86-64 | not grep movq
+; RUN: llc < %s -march=x86-64 -no-integrated-as | grep movslq
+; RUN: llc < %s -march=x86-64 -no-integrated-as | grep addps
+; RUN: llc < %s -march=x86-64 -no-integrated-as | grep paddd
+; RUN: llc < %s -march=x86-64 -no-integrated-as | not grep movq
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin8"
diff --git a/test/CodeGen/X86/inline-asm-q-regs.ll b/test/CodeGen/X86/inline-asm-q-regs.ll
index fca68ba..53a56ae 100644
--- a/test/CodeGen/X86/inline-asm-q-regs.ll
+++ b/test/CodeGen/X86/inline-asm-q-regs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx
+; RUN: llc < %s -march=x86-64 -mattr=+avx -no-integrated-as
; rdar://7066579
%0 = type { i64, i64, i64, i64, i64 } ; type %0
diff --git a/test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll b/test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll
new file mode 100644
index 0000000..b55571b
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -force-align-stack -mtriple i386-apple-darwin -mcpu=i486 | FileCheck %s
+
+%struct.foo = type { [88 x i8] }
+
+declare void @bar(i8* nocapture, %struct.foo* align 4 byval) nounwind
+
+; PR19012
+; Don't clobber %esi if we have inline asm that clobbers %esp.
+define void @test1(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
+ call void @bar(i8* %z, %struct.foo* align 4 byval %x)
+ call void asm sideeffect inteldialect "xor esp, esp", "=*m,~{flags},~{esp},~{esp},~{dirflag},~{fpsr},~{flags}"(i8* %z)
+ ret void
+
+; CHECK-LABEL: test1:
+; CHECK: movl %esp, %esi
+; CHECK-NOT: rep;movsl
+}
diff --git a/test/CodeGen/X86/inline-asm-stack-realign.ll b/test/CodeGen/X86/inline-asm-stack-realign.ll
new file mode 100644
index 0000000..f2ac0f4
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-stack-realign.ll
@@ -0,0 +1,16 @@
+; RUN: not llc -mtriple=i686-pc-win32 < %s 2>&1 | FileCheck %s
+
+; FIXME: This is miscompiled due to our unconditional use of ESI as the base
+; pointer.
+; XFAIL: *
+
+; CHECK: Stack realignment in presence of dynamic stack adjustments is not supported with inline assembly
+
+define i32 @foo() {
+entry:
+ %r = alloca i32, align 16
+ store i32 -1, i32* %r, align 16
+ call void asm sideeffect inteldialect "push esi\0A\09xor esi, esi\0A\09mov dword ptr $0, esi\0A\09pop esi", "=*m,~{flags},~{esi},~{esp},~{dirflag},~{fpsr},~{flags}"(i32* %r)
+ %0 = load i32* %r, align 16
+ ret i32 %0
+}
diff --git a/test/CodeGen/X86/inline-asm-stack-realign2.ll b/test/CodeGen/X86/inline-asm-stack-realign2.ll
new file mode 100644
index 0000000..0e4e7e1
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-stack-realign2.ll
@@ -0,0 +1,16 @@
+; RUN: not llc -mtriple=i686-pc-win32 < %s 2>&1 | FileCheck %s
+
+; FIXME: This is miscompiled due to our unconditional use of ESI as the base
+; pointer.
+; XFAIL: *
+
+; CHECK: Stack realignment in presence of dynamic stack adjustments is not supported with inline assembly
+
+define i32 @foo() {
+entry:
+ %r = alloca i32, align 16
+ store i32 -1, i32* %r, align 16
+ call void asm sideeffect "push %esi\0A\09xor %esi, %esi\0A\09mov %esi, $0\0A\09pop %esi", "=*m,~{flags},~{esi},~{esp},~{dirflag},~{fpsr},~{flags}"(i32* %r)
+ %0 = load i32* %r, align 16
+ ret i32 %0
+}
diff --git a/test/CodeGen/X86/inline-asm-stack-realign3.ll b/test/CodeGen/X86/inline-asm-stack-realign3.ll
new file mode 100644
index 0000000..3baaaaa
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-stack-realign3.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=x86 -no-integrated-as < %s | FileCheck %s
+
+declare void @bar(i32* %junk)
+
+define i32 @foo(i1 %cond) {
+entry:
+ %r = alloca i32, align 128
+ store i32 -1, i32* %r, align 128
+ br i1 %cond, label %doit, label %skip
+
+doit:
+ call void asm sideeffect "xor %ecx, %ecx\0A\09mov %ecx, $0", "=*m,~{ecx},~{flags}"(i32* %r)
+ %junk = alloca i32
+ call void @bar(i32* %junk)
+ br label %skip
+
+skip:
+ %0 = load i32* %r, align 128
+ ret i32 %0
+}
+
+; CHECK-LABEL: foo:
+; CHECK: pushl %ebp
+; CHECK: andl $-128, %esp
+; CHECK: xor %ecx, %ecx
+; CHECK-NEXT: mov %ecx, (%esi)
+; CHECK: movl (%esi), %eax
+; CHECK: popl %ebp
+; CHECK: ret
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 597236e..fb5896b 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic -no-integrated-as | FileCheck %s
; rdar://6992609
; CHECK: movl [[EDX:%e..]], 4(%esp)
diff --git a/test/CodeGen/X86/inline-asm-x-scalar.ll b/test/CodeGen/X86/inline-asm-x-scalar.ll
index 5a9628b..64a7fe8 100644
--- a/test/CodeGen/X86/inline-asm-x-scalar.ll
+++ b/test/CodeGen/X86/inline-asm-x-scalar.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah
+; RUN: llc < %s -march=x86 -mcpu=yonah -no-integrated-as
define void @test1() {
tail call void asm sideeffect "ucomiss $0", "x"( float 0x41E0000000000000)
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index d201ebd..5ec4f46 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86 -no-integrated-as
define i32 @test1() nounwind {
; Dest is AX, dest type = i32.
@@ -59,3 +59,18 @@ entry:
%asm = tail call i32 asm sideeffect "", "={ax},i,~{eax},~{flags},~{rax}"(i64 61) nounwind
ret i32 %asm
}
+
+@test8_v = global i32 42
+
+define void @test8() {
+ call void asm sideeffect "${0:P}", "i"( i32* @test8_v )
+ ret void
+}
+
+define void @test9() {
+ call void asm sideeffect "${0:P}", "X"( i8* blockaddress(@test9, %bb) )
+ br label %bb
+
+bb:
+ ret void
+}
diff --git a/test/CodeGen/X86/ins_split_regalloc.ll b/test/CodeGen/X86/ins_split_regalloc.ll
new file mode 100644
index 0000000..f5c5254
--- /dev/null
+++ b/test/CodeGen/X86/ins_split_regalloc.ll
@@ -0,0 +1,33 @@
+; RUN: llc -O1 -regalloc=greedy -mtriple=x86_64-apple-macosx -march x86-64 < %s -o - | FileCheck %s
+; Check that the last-chance split (RAGreedy::tryInstructionSplit) splits only
+; when it is beneficial; otherwise we end up with uncoalesced copies.
+; <rdar://problem/15570057>
+
+target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
+
+@f = external constant void (i32)*
+
+; CHECK-LABEL: test:
+; Get the address of f in the GOT.
+; CHECK: movq _f@{{[^,]+}}, [[F_ENTRY_ADDR:%[a-z0-9]+]]
+; Read the actual address of f.
+; CHECK: movq ([[F_ENTRY_ADDR]]), [[F_ADDR:%[a-z0-9]+]]
+; Check that we do not have useless split points before each call.
+; CHECK-NOT: movq
+; CHECK: callq *[[F_ADDR]]
+; Check that we do not have useless split points before each call.
+; CHECK-NOT: movq
+; CHECK: callq *[[F_ADDR]]
+; The last call is a tail call, so the address of the function cannot use
+; a callee-saved register.
+; CHECK: movq [[F_ADDR]], [[F_ADDR_TC:%[a-z0-9]+]]
+; CHECK: popq [[F_ADDR]]
+; CHECK: jmpq *[[F_ADDR_TC]]
+define void @test(i32 %a, i32 %b, i32 %c) {
+entry:
+ %fct_f = load void (i32)** @f, align 8
+ tail call void %fct_f(i32 %a)
+ tail call void %fct_f(i32 %b)
+ tail call void %fct_f(i32 %c)
+ ret void
+}
diff --git a/test/CodeGen/X86/isint.ll b/test/CodeGen/X86/isint.ll
index 4a98e63..38d05c6 100644
--- a/test/CodeGen/X86/isint.ll
+++ b/test/CodeGen/X86/isint.ll
@@ -1,6 +1,11 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 -mcpu=penryn | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 -mcpu=penryn | FileCheck %s
+
+; PR19059
+; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 -mcpu=penryn | FileCheck -check-prefix=CHECK32 %s
define i32 @isint_return(double %d) nounwind {
+; CHECK-LABEL: isint_return:
; CHECK-NOT: xor
; CHECK: cvt
%i = fptosi double %d to i32
@@ -8,6 +13,24 @@ define i32 @isint_return(double %d) nounwind {
%e = sitofp i32 %i to double
; CHECK: cmpeqsd
%c = fcmp oeq double %d, %e
+; CHECK32-NOT: movd {{.*}}, %r{{.*}}
+; CHECK32-NOT: andq
+; CHECK-NEXT: movd
+; CHECK-NEXT: andl
+ %z = zext i1 %c to i32
+ ret i32 %z
+}
+
+define i32 @isint_float_return(float %f) nounwind {
+; CHECK-LABEL: isint_float_return:
+; CHECK-NOT: xor
+; CHECK: cvt
+ %i = fptosi float %f to i32
+; CHECK-NEXT: cvt
+ %g = sitofp i32 %i to float
+; CHECK: cmpeqss
+ %c = fcmp oeq float %f, %g
+; CHECK-NOT: movd {{.*}}, %r{{.*}}
; CHECK-NEXT: movd
; CHECK-NEXT: andl
%z = zext i1 %c to i32
@@ -17,6 +40,7 @@ define i32 @isint_return(double %d) nounwind {
declare void @foo()
define void @isint_branch(double %d) nounwind {
+; CHECK-LABEL: isint_branch:
; CHECK: cvt
%i = fptosi double %d to i32
; CHECK-NEXT: cvt
diff --git a/test/CodeGen/X86/large-constants.ll b/test/CodeGen/X86/large-constants.ll
new file mode 100644
index 0000000..157ecc4
--- /dev/null
+++ b/test/CodeGen/X86/large-constants.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -mtriple=x86_64-darwin -mcpu=corei7 | grep movabsq | count 3
+
+define i64 @constant_hoisting(i64 %o0, i64 %o1, i64 %o2, i64 %o3, i64 %o4, i64 %o5) {
+entry:
+ %l0 = and i64 %o0, -281474976710654
+ %c0 = icmp ne i64 %l0, 0
+ br i1 %c0, label %fail, label %bb1
+
+bb1:
+ %l1 = and i64 %o1, -281474976710654
+ %c1 = icmp ne i64 %l1, 0
+ br i1 %c1, label %fail, label %bb2
+
+bb2:
+ %l2 = and i64 %o2, -281474976710654
+ %c2 = icmp ne i64 %l2, 0
+ br i1 %c2, label %fail, label %bb3
+
+bb3:
+ %l3 = and i64 %o3, -281474976710654
+ %c3 = icmp ne i64 %l3, 0
+ br i1 %c3, label %fail, label %bb4
+
+bb4:
+ %l4 = and i64 %o4, -281474976710653
+ %c4 = icmp ne i64 %l4, 0
+ br i1 %c4, label %fail, label %bb5
+
+bb5:
+ %l5 = and i64 %o5, -281474976710652
+ %c5 = icmp ne i64 %l5, 0
+ br i1 %c5, label %fail, label %bb6
+
+bb6:
+ ret i64 %l5
+
+fail:
+ ret i64 -1
+}
+
+define void @constant_expressions() {
+entry:
+ %0 = load i64* inttoptr (i64 add (i64 51250129900, i64 0) to i64*)
+ %1 = load i64* inttoptr (i64 add (i64 51250129900, i64 8) to i64*)
+ %2 = load i64* inttoptr (i64 add (i64 51250129900, i64 16) to i64*)
+ %3 = load i64* inttoptr (i64 add (i64 51250129900, i64 24) to i64*)
+ %4 = add i64 %0, %1
+ %5 = add i64 %2, %3
+ %6 = add i64 %4, %5
+ store i64 %6, i64* inttoptr (i64 add (i64 51250129900, i64 0) to i64*)
+ ret void
+}
+
+
+define void @constant_expressions2() {
+entry:
+ %0 = load i64* inttoptr (i64 51250129900 to i64*)
+ %1 = load i64* inttoptr (i64 51250129908 to i64*)
+ %2 = load i64* inttoptr (i64 51250129916 to i64*)
+ %3 = load i64* inttoptr (i64 51250129924 to i64*)
+ %4 = add i64 %0, %1
+ %5 = add i64 %2, %3
+ %6 = add i64 %4, %5
+ store i64 %6, i64* inttoptr (i64 51250129900 to i64*)
+ ret void
+}
+
diff --git a/test/CodeGen/X86/load-slice.ll b/test/CodeGen/X86/load-slice.ll
index 85fd7f0..49eb131 100644
--- a/test/CodeGen/X86/load-slice.ll
+++ b/test/CodeGen/X86/load-slice.ll
@@ -6,7 +6,7 @@
%class.Complex = type { float, float }
-; Check that independant slices leads to independant loads then the slices leads to
+; Check that independent slices lead to independent loads and that the slices then lead to
; different register file.
;
; The layout is:
diff --git a/test/CodeGen/X86/lsr-interesting-step.ll b/test/CodeGen/X86/lsr-interesting-step.ll
index d4a7ac7..8ea3c53 100644
--- a/test/CodeGen/X86/lsr-interesting-step.ll
+++ b/test/CodeGen/X86/lsr-interesting-step.ll
@@ -3,26 +3,24 @@
; The inner loop should require only one add (and no leas either).
; rdar://8100380
-; CHECK: BB0_3:
-; CHECK-NEXT: movb $0, flags(%rdx)
-; CHECK-NEXT: addq %rax, %rdx
-; CHECK-NEXT: cmpq $8192, %rdx
+; CHECK: BB0_2:
+; CHECK-NEXT: movb $0, flags(%rcx)
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: cmpq $8192, %rcx
; CHECK-NEXT: jl
@flags = external global [8192 x i8], align 16 ; <[8192 x i8]*> [#uses=1]
define void @foo() nounwind {
entry:
- %tmp = icmp slt i64 2, 8192 ; <i1> [#uses=1]
- br i1 %tmp, label %bb, label %bb21
+ br label %bb
bb: ; preds = %entry
br label %bb7
bb7: ; preds = %bb, %bb17
%tmp8 = phi i64 [ %tmp18, %bb17 ], [ 2, %bb ] ; <i64> [#uses=2]
- %tmp9 = icmp slt i64 2, 8192 ; <i1> [#uses=1]
- br i1 %tmp9, label %bb10, label %bb17
+ br label %bb10
bb10: ; preds = %bb7
br label %bb11
diff --git a/test/CodeGen/X86/lsr-normalization.ll b/test/CodeGen/X86/lsr-normalization.ll
index bbf8f01..2775558 100644
--- a/test/CodeGen/X86/lsr-normalization.ll
+++ b/test/CodeGen/X86/lsr-normalization.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; REQUIRES: asserts
+; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=ASM
+; RUN: llc -debug -o /dev/null < %s -march=x86-64 2>&1 | FileCheck %s --check-prefix=DBG
; rdar://8168938
; This testcase involves SCEV normalization with the exit value from
@@ -6,8 +8,9 @@
; loop. The expression should be properly normalized and simplified,
; and require only a single division.
-; CHECK: div
-; CHECK-NOT: div
+; DBG-NOT: DISCARDING (NORMALIZATION ISN'T INVERTIBLE)
+; ASM: div
+; ASM-NOT: div
%0 = type { %0*, %0* }
diff --git a/test/CodeGen/X86/machine-cp.ll b/test/CodeGen/X86/machine-cp.ll
index f04e111..0006b6e 100644
--- a/test/CodeGen/X86/machine-cp.ll
+++ b/test/CodeGen/X86/machine-cp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona -verify-machineinstrs < %s | FileCheck %s
; After tail duplication, two copies in an early exit BB can be cancelled out.
; rdar://10640363
@@ -34,3 +34,27 @@ entry:
%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
ret <8 x i16> %tmp8
}
+
+define i32 @t3(i64 %a, i64 %b) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: je [[LABEL:.*BB.*]]
+ %cmp1 = icmp eq i64 %b, 0
+ br i1 %cmp1, label %while.end, label %while.body
+
+; CHECK: [[LABEL]]:
+; CHECK-NOT: mov
+; CHECK: ret
+
+while.body: ; preds = %entry, %while.body
+ %a.addr.03 = phi i64 [ %b.addr.02, %while.body ], [ %a, %entry ]
+ %b.addr.02 = phi i64 [ %rem, %while.body ], [ %b, %entry ]
+ %rem = srem i64 %a.addr.03, %b.addr.02
+ %cmp = icmp eq i64 %rem, 0
+ br i1 %cmp, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %a.addr.0.lcssa = phi i64 [ %a, %entry ], [ %b.addr.02, %while.body ]
+ %t = trunc i64 %a.addr.0.lcssa to i32
+ ret i32 %t
+}
diff --git a/test/CodeGen/X86/mature-mc-support.ll b/test/CodeGen/X86/mature-mc-support.ll
new file mode 100644
index 0000000..9d956f4
--- /dev/null
+++ b/test/CodeGen/X86/mature-mc-support.ll
@@ -0,0 +1,18 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+
+; RUN: not llc -march=x86 < %s > /dev/null 2> %t1
+; RUN: FileCheck %s < %t1
+
+; RUN: not llc -march=x86 -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+; RUN: not llc -march=x86-64 < %s > /dev/null 2> %t3
+; RUN: FileCheck %s < %t3
+
+; RUN: not llc -march=x86-64 -filetype=obj < %s > /dev/null 2> %t4
+; RUN: FileCheck %s < %t4
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index cb0797d..0a53492 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -22,8 +22,9 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp2:
-; CHECK: movw ([[A0:%rdi|%rcx]]), %ax
-; CHECK: cmpw ([[A1:%rsi|%rdx]]), %ax
+; CHECK: movzwl
+; CHECK-NEXT: movzwl
+; CHECK-NEXT: cmpl
; NOBUILTIN-LABEL: memcmp2:
; NOBUILTIN: callq
}
@@ -41,7 +42,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp2a:
-; CHECK: cmpw $28527, ([[A0]])
+; CHECK: movzwl
+; CHECK-NEXT: cmpl $28527,
}
@@ -58,8 +60,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp4:
-; CHECK: movl ([[A0]]), %eax
-; CHECK: cmpl ([[A1]]), %eax
+; CHECK: movl
+; CHECK-NEXT: cmpl
}
define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind {
@@ -75,7 +77,7 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp4a:
-; CHECK: cmpl $1869573999, ([[A0]])
+; CHECK: cmpl $1869573999,
}
define void @memcmp8(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
@@ -91,8 +93,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp8:
-; CHECK: movq ([[A0]]), %rax
-; CHECK: cmpq ([[A1]]), %rax
+; CHECK: movq
+; CHECK: cmpq
}
define void @memcmp8a(i8* %X, i32* nocapture %P) nounwind {
@@ -108,7 +110,7 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp8a:
-; CHECK: movabsq $8029759185026510694, %rax
-; CHECK: cmpq %rax, ([[A0]])
+; CHECK: movabsq $8029759185026510694,
+; CHECK: cmpq
}
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index d0a3c7a..a87ef2e 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -5,7 +5,7 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
define fastcc void @t1() nounwind {
entry:
; CHECK-LABEL: t1:
-; CHECK: calll _memset
+; CHECK: calll L_memset$stub
call void @llvm.memset.p0i8.i32(i8* null, i8 0, i32 188, i32 1, i1 false)
unreachable
}
@@ -13,7 +13,7 @@ entry:
define fastcc void @t2(i8 signext %c) nounwind {
entry:
; CHECK-LABEL: t2:
-; CHECK: calll _memset
+; CHECK: calll L_memset$stub
call void @llvm.memset.p0i8.i32(i8* undef, i8 %c, i32 76, i32 1, i1 false)
unreachable
}
diff --git a/test/CodeGen/X86/misched-aa-colored.ll b/test/CodeGen/X86/misched-aa-colored.ll
new file mode 100644
index 0000000..52a5e5d
--- /dev/null
+++ b/test/CodeGen/X86/misched-aa-colored.ll
@@ -0,0 +1,189 @@
+; RUN: llc < %s -mcpu=x86-64 -enable-misched -misched-bottomup=0 -misched-topdown=0 -misched=shuffle -enable-aa-sched-mi | FileCheck %s
+; REQUIRES: asserts
+; -misched=shuffle is only available in asserts (non-NDEBUG) builds!
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%"class.llvm::SDNode.10.610.970.1930.2050.2290.4090" = type { %"class.llvm::FoldingSetImpl::Node.0.600.960.1920.2040.2280.4080", %"class.llvm::ilist_node.2.602.962.1922.2042.2282.4082", i16, [2 x i8], i32, %"class.llvm::SDUse.4.604.964.1924.2044.2284.4084"*, %"struct.llvm::EVT.8.608.968.1928.2048.2288.4088"*, %"class.llvm::SDUse.4.604.964.1924.2044.2284.4084"*, i16, i16, %"class.llvm::DebugLoc.9.609.969.1929.2049.2289.4089", i32 }
+%"class.llvm::FoldingSetImpl::Node.0.600.960.1920.2040.2280.4080" = type { i8* }
+%"class.llvm::ilist_node.2.602.962.1922.2042.2282.4082" = type { %"class.llvm::ilist_half_node.1.601.961.1921.2041.2281.4081", %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* }
+%"class.llvm::ilist_half_node.1.601.961.1921.2041.2281.4081" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* }
+%"struct.llvm::EVT.8.608.968.1928.2048.2288.4088" = type { %"class.llvm::MVT.5.605.965.1925.2045.2285.4085", %"class.llvm::Type.7.607.967.1927.2047.2287.4087"* }
+%"class.llvm::MVT.5.605.965.1925.2045.2285.4085" = type { i32 }
+%"class.llvm::SDUse.4.604.964.1924.2044.2284.4084" = type { %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, %"class.llvm::SDUse.4.604.964.1924.2044.2284.4084"**, %"class.llvm::SDUse.4.604.964.1924.2044.2284.4084"* }
+%"class.llvm::SDValue.3.603.963.1923.2043.2283.4083" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 }
+%"class.llvm::DebugLoc.9.609.969.1929.2049.2289.4089" = type { i32, i32 }
+%"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184" = type { %"class.llvm::TargetMachine.17.617.977.1937.2057.2297.4097"*, %"class.llvm::TargetSelectionDAGInfo.18.618.978.1938.2058.2298.4098"*, %"class.llvm::TargetTransformInfo.19.619.979.1939.2059.2299.4099"*, %"class.llvm::TargetLowering.51.651.1011.1971.2091.2331.4131"*, %"class.llvm::MachineFunction.52.652.1012.1972.2092.2332.4132"*, %"class.llvm::LLVMContext.6.606.966.1926.2046.2286.4086"*, i32, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090", %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", %"struct.llvm::ilist.55.655.1015.1975.2095.2335.4135", %"class.llvm::RecyclingAllocator.65.665.1025.1985.2105.2345.4145", %"class.llvm::FoldingSet.67.667.1027.1987.2107.2347.4147", %"class.llvm::BumpPtrAllocator.64.664.1024.1984.2104.2344.4144", %"class.llvm::BumpPtrAllocator.64.664.1024.1984.2104.2344.4144", %"class.llvm::SDDbgInfo.79.679.1039.1999.2119.2359.4159"*, i8, %"struct.llvm::SelectionDAG::DAGUpdateListener.80.680.1040.2000.2120.2360.4160"*, %"class.std::map.43.84.684.1044.2004.2124.2364.4164", %"class.llvm::FoldingSet.50.85.685.1045.2005.2125.2365.4165", %"class.std::vector.51.89.689.1049.2009.2129.2369.4169", %"class.std::vector.56.92.692.1052.2012.2132.2372.4172", %"class.std::map.61.96.696.1056.2016.2136.2376.4176", %"class.llvm::StringMap.99.699.1059.2019.2139.2379.4179", %"class.std::map.66.103.703.1063.2023.2143.2383.4183" }
+%"class.llvm::TargetMachine.17.617.977.1937.2057.2297.4097" = type { i32 (...)**, %"class.llvm::Target.11.611.971.1931.2051.2291.4091"*, %"class.std::basic_string.13.613.973.1933.2053.2293.4093", %"class.std::basic_string.13.613.973.1933.2053.2293.4093", %"class.std::basic_string.13.613.973.1933.2053.2293.4093", %"class.llvm::MCCodeGenInfo.14.614.974.1934.2054.2294.4094"*, %"class.llvm::MCAsmInfo.15.615.975.1935.2055.2295.4095"*, i8, %"class.llvm::TargetOptions.16.616.976.1936.2056.2296.4096" }
+%"class.llvm::Target.11.611.971.1931.2051.2291.4091" = type opaque
+%"class.std::basic_string.13.613.973.1933.2053.2293.4093" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.12.612.972.1932.2052.2292.4092" }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.12.612.972.1932.2052.2292.4092" = type { i8* }
+%"class.llvm::MCCodeGenInfo.14.614.974.1934.2054.2294.4094" = type opaque
+%"class.llvm::MCAsmInfo.15.615.975.1935.2055.2295.4095" = type opaque
+%"class.llvm::TargetOptions.16.616.976.1936.2056.2296.4096" = type { [2 x i8], i32, i8, %"class.std::basic_string.13.613.973.1933.2053.2293.4093", i32, i32 }
+%"class.llvm::TargetSelectionDAGInfo.18.618.978.1938.2058.2298.4098" = type opaque
+%"class.llvm::TargetTransformInfo.19.619.979.1939.2059.2299.4099" = type opaque
+%"class.llvm::TargetLowering.51.651.1011.1971.2091.2331.4131" = type { %"class.llvm::TargetLoweringBase.50.650.1010.1970.2090.2330.4130" }
+%"class.llvm::TargetLoweringBase.50.650.1010.1970.2090.2330.4130" = type { i32 (...)**, %"class.llvm::TargetMachine.17.617.977.1937.2057.2297.4097"*, %"class.llvm::DataLayout.35.635.995.1955.2075.2315.4115"*, %"class.llvm::TargetLoweringObjectFile.36.636.996.1956.2076.2316.4116"*, i8, i8, i8, i8, %"class.llvm::DenseMap.11.38.638.998.1958.2078.2318.4118", i8, i8, i8, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8, i32, i32, i32, [58 x %"class.llvm::TargetRegisterClass.39.639.999.1959.2079.2319.4119"*], [58 x i8], [58 x %"class.llvm::MVT.5.605.965.1925.2045.2285.4085"], [58 x %"class.llvm::TargetRegisterClass.39.639.999.1959.2079.2319.4119"*], [58 x i8], [58 x %"class.llvm::MVT.5.605.965.1925.2045.2285.4085"], [58 x [188 x i8]], [58 x [4 x i8]], [58 x [58 x i8]], [58 x [5 x i8]], [24 x [4 x i32]], %"class.llvm::TargetLoweringBase::ValueTypeActionImpl.40.640.1000.1960.2080.2320.4120", %"class.std::vector.15.44.644.1004.1964.2084.2324.4124", [24 x i8], %"class.std::map.49.649.1009.1969.2089.2329.4129", [341 x i8*], [341 x i32], [341 x i32], i32, i32, i32, i32, i32, i32, i8 }
+%"class.llvm::DataLayout.35.635.995.1955.2075.2315.4115" = type { [28 x i8], i8, i32, i32, %"class.llvm::SmallVector.23.623.983.1943.2063.2303.4103", %"class.llvm::SmallVector.3.31.631.991.1951.2071.2311.4111", %"class.llvm::DenseMap.34.634.994.1954.2074.2314.4114", i8* }
+%"class.llvm::SmallVector.23.623.983.1943.2063.2303.4103" = type { [25 x i8], %"struct.llvm::SmallVectorStorage.22.622.982.1942.2062.2302.4102" }
+%"struct.llvm::SmallVectorStorage.22.622.982.1942.2062.2302.4102" = type { [7 x %"struct.llvm::AlignedCharArrayUnion.21.621.981.1941.2061.2301.4101"] }
+%"struct.llvm::AlignedCharArrayUnion.21.621.981.1941.2061.2301.4101" = type { %"struct.llvm::AlignedCharArray.20.620.980.1940.2060.2300.4100" }
+%"struct.llvm::AlignedCharArray.20.620.980.1940.2060.2300.4100" = type { [1 x i8] }
+%"class.llvm::SmallVector.3.31.631.991.1951.2071.2311.4111" = type { %"class.llvm::SmallVectorImpl.4.29.629.989.1949.2069.2309.4109", %"struct.llvm::SmallVectorStorage.9.30.630.990.1950.2070.2310.4110" }
+%"class.llvm::SmallVectorImpl.4.29.629.989.1949.2069.2309.4109" = type { %"class.llvm::SmallVectorTemplateBase.5.28.628.988.1948.2068.2308.4108" }
+%"class.llvm::SmallVectorTemplateBase.5.28.628.988.1948.2068.2308.4108" = type { %"class.llvm::SmallVectorTemplateCommon.6.27.627.987.1947.2067.2307.4107" }
+%"class.llvm::SmallVectorTemplateCommon.6.27.627.987.1947.2067.2307.4107" = type { %"class.llvm::SmallVectorBase.24.624.984.1944.2064.2304.4104", %"struct.llvm::AlignedCharArrayUnion.7.26.626.986.1946.2066.2306.4106" }
+%"class.llvm::SmallVectorBase.24.624.984.1944.2064.2304.4104" = type { i8*, i8*, i8* }
+%"struct.llvm::AlignedCharArrayUnion.7.26.626.986.1946.2066.2306.4106" = type { %"struct.llvm::AlignedCharArray.8.25.625.985.1945.2065.2305.4105" }
+%"struct.llvm::AlignedCharArray.8.25.625.985.1945.2065.2305.4105" = type { [8 x i8] }
+%"struct.llvm::SmallVectorStorage.9.30.630.990.1950.2070.2310.4110" = type { [15 x %"struct.llvm::AlignedCharArrayUnion.7.26.626.986.1946.2066.2306.4106"] }
+%"class.llvm::DenseMap.34.634.994.1954.2074.2314.4114" = type { %"struct.std::pair.10.33.633.993.1953.2073.2313.4113"*, i32, i32, i32 }
+%"struct.std::pair.10.33.633.993.1953.2073.2313.4113" = type { i32, %"struct.llvm::PointerAlignElem.32.632.992.1952.2072.2312.4112" }
+%"struct.llvm::PointerAlignElem.32.632.992.1952.2072.2312.4112" = type { i32, i32, i32, i32 }
+%"class.llvm::TargetLoweringObjectFile.36.636.996.1956.2076.2316.4116" = type opaque
+%"class.llvm::DenseMap.11.38.638.998.1958.2078.2318.4118" = type { %"struct.std::pair.14.37.637.997.1957.2077.2317.4117"*, i32, i32, i32 }
+%"struct.std::pair.14.37.637.997.1957.2077.2317.4117" = type { i32, i32 }
+%"class.llvm::TargetRegisterClass.39.639.999.1959.2079.2319.4119" = type opaque
+%"class.llvm::TargetLoweringBase::ValueTypeActionImpl.40.640.1000.1960.2080.2320.4120" = type { [58 x i8] }
+%"class.std::vector.15.44.644.1004.1964.2084.2324.4124" = type { %"struct.std::_Vector_base.16.43.643.1003.1963.2083.2323.4123" }
+%"struct.std::_Vector_base.16.43.643.1003.1963.2083.2323.4123" = type { %"struct.std::_Vector_base<std::pair<llvm::MVT, const llvm::TargetRegisterClass *>, std::allocator<std::pair<llvm::MVT, const llvm::TargetRegisterClass *> > >::_Vector_impl.42.642.1002.1962.2082.2322.4122" }
+%"struct.std::_Vector_base<std::pair<llvm::MVT, const llvm::TargetRegisterClass *>, std::allocator<std::pair<llvm::MVT, const llvm::TargetRegisterClass *> > >::_Vector_impl.42.642.1002.1962.2082.2322.4122" = type { %"struct.std::pair.20.41.641.1001.1961.2081.2321.4121"*, %"struct.std::pair.20.41.641.1001.1961.2081.2321.4121"*, %"struct.std::pair.20.41.641.1001.1961.2081.2321.4121"* }
+%"struct.std::pair.20.41.641.1001.1961.2081.2321.4121" = type { %"class.llvm::MVT.5.605.965.1925.2045.2285.4085", %"class.llvm::TargetRegisterClass.39.639.999.1959.2079.2319.4119"* }
+%"class.std::map.49.649.1009.1969.2089.2329.4129" = type { %"class.std::_Rb_tree.48.648.1008.1968.2088.2328.4128" }
+%"class.std::_Rb_tree.48.648.1008.1968.2088.2328.4128" = type { %"struct.std::_Rb_tree<std::pair<unsigned int, llvm::MVT::SimpleValueType>, std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType>, std::_Select1st<std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType> >, std::less<std::pair<unsigned int, llvm::MVT::SimpleValueType> >, std::allocator<std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType> > >::_Rb_tree_impl.47.647.1007.1967.2087.2327.4127" }
+%"struct.std::_Rb_tree<std::pair<unsigned int, llvm::MVT::SimpleValueType>, std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType>, std::_Select1st<std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType> >, std::less<std::pair<unsigned int, llvm::MVT::SimpleValueType> >, std::allocator<std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType> > >::_Rb_tree_impl.47.647.1007.1967.2087.2327.4127" = type { %"struct.std::less.45.645.1005.1965.2085.2325.4125", %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126", i64 }
+%"struct.std::less.45.645.1005.1965.2085.2325.4125" = type { i8 }
+%"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126" = type { i32, %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126"*, %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126"*, %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126"* }
+%"class.llvm::MachineFunction.52.652.1012.1972.2092.2332.4132" = type opaque
+%"class.llvm::LLVMContext.6.606.966.1926.2046.2286.4086" = type opaque
+%"struct.llvm::ilist.55.655.1015.1975.2095.2335.4135" = type { %"class.llvm::iplist.54.654.1014.1974.2094.2334.4134" }
+%"class.llvm::iplist.54.654.1014.1974.2094.2334.4134" = type { %"struct.llvm::ilist_traits.53.653.1013.1973.2093.2333.4133", %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* }
+%"struct.llvm::ilist_traits.53.653.1013.1973.2093.2333.4133" = type { %"class.llvm::ilist_half_node.1.601.961.1921.2041.2281.4081" }
+%"class.llvm::RecyclingAllocator.65.665.1025.1985.2105.2345.4145" = type { %"class.llvm::Recycler.59.659.1019.1979.2099.2339.4139", %"class.llvm::BumpPtrAllocator.64.664.1024.1984.2104.2344.4144" }
+%"class.llvm::Recycler.59.659.1019.1979.2099.2339.4139" = type { %"class.llvm::iplist.24.58.658.1018.1978.2098.2338.4138" }
+%"class.llvm::iplist.24.58.658.1018.1978.2098.2338.4138" = type { %"struct.llvm::ilist_traits.25.57.657.1017.1977.2097.2337.4137", %"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136"* }
+%"struct.llvm::ilist_traits.25.57.657.1017.1977.2097.2337.4137" = type { %"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136" }
+%"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136" = type { %"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136"*, %"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136"* }
+%"class.llvm::FoldingSet.67.667.1027.1987.2107.2347.4147" = type { %"class.llvm::FoldingSetImpl.66.666.1026.1986.2106.2346.4146" }
+%"class.llvm::FoldingSetImpl.66.666.1026.1986.2106.2346.4146" = type { i32 (...)**, i8**, i32, i32 }
+%"class.llvm::BumpPtrAllocator.64.664.1024.1984.2104.2344.4144" = type { i64, i64, %"class.llvm::MallocSlabAllocator.62.662.1022.1982.2102.2342.4142", %"class.llvm::SlabAllocator.60.660.1020.1980.2100.2340.4140"*, %"class.llvm::MemSlab.63.663.1023.1983.2103.2343.4143"*, i8*, i8*, i64 }
+%"class.llvm::MallocSlabAllocator.62.662.1022.1982.2102.2342.4142" = type { %"class.llvm::SlabAllocator.60.660.1020.1980.2100.2340.4140", %"class.llvm::MallocAllocator.61.661.1021.1981.2101.2341.4141" }
+%"class.llvm::SlabAllocator.60.660.1020.1980.2100.2340.4140" = type { i32 (...)** }
+%"class.llvm::MallocAllocator.61.661.1021.1981.2101.2341.4141" = type { i8 }
+%"class.llvm::MemSlab.63.663.1023.1983.2103.2343.4143" = type { i64, %"class.llvm::MemSlab.63.663.1023.1983.2103.2343.4143"* }
+%"class.llvm::SDDbgInfo.79.679.1039.1999.2119.2359.4159" = type { %"class.llvm::SmallVector.30.74.674.1034.1994.2114.2354.4154", %"class.llvm::SmallVector.30.74.674.1034.1994.2114.2354.4154", %"class.llvm::DenseMap.37.78.678.1038.1998.2118.2358.4158" }
+%"class.llvm::SmallVector.30.74.674.1034.1994.2114.2354.4154" = type { %"class.llvm::SmallVectorImpl.31.72.672.1032.1992.2112.2352.4152", %"struct.llvm::SmallVectorStorage.36.73.673.1033.1993.2113.2353.4153" }
+%"class.llvm::SmallVectorImpl.31.72.672.1032.1992.2112.2352.4152" = type { %"class.llvm::SmallVectorTemplateBase.32.71.671.1031.1991.2111.2351.4151" }
+%"class.llvm::SmallVectorTemplateBase.32.71.671.1031.1991.2111.2351.4151" = type { %"class.llvm::SmallVectorTemplateCommon.33.70.670.1030.1990.2110.2350.4150" }
+%"class.llvm::SmallVectorTemplateCommon.33.70.670.1030.1990.2110.2350.4150" = type { %"class.llvm::SmallVectorBase.24.624.984.1944.2064.2304.4104", %"struct.llvm::AlignedCharArrayUnion.34.69.669.1029.1989.2109.2349.4149" }
+%"struct.llvm::AlignedCharArrayUnion.34.69.669.1029.1989.2109.2349.4149" = type { %"struct.llvm::AlignedCharArray.35.68.668.1028.1988.2108.2348.4148" }
+%"struct.llvm::AlignedCharArray.35.68.668.1028.1988.2108.2348.4148" = type { [8 x i8] }
+%"struct.llvm::SmallVectorStorage.36.73.673.1033.1993.2113.2353.4153" = type { [31 x %"struct.llvm::AlignedCharArrayUnion.34.69.669.1029.1989.2109.2349.4149"] }
+%"class.llvm::DenseMap.37.78.678.1038.1998.2118.2358.4158" = type { %"struct.std::pair.40.77.677.1037.1997.2117.2357.4157"*, i32, i32, i32 }
+%"struct.std::pair.40.77.677.1037.1997.2117.2357.4157" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, %"class.llvm::SmallVector.41.76.676.1036.1996.2116.2356.4156" }
+%"class.llvm::SmallVector.41.76.676.1036.1996.2116.2356.4156" = type { %"class.llvm::SmallVectorImpl.31.72.672.1032.1992.2112.2352.4152", %"struct.llvm::SmallVectorStorage.42.75.675.1035.1995.2115.2355.4155" }
+%"struct.llvm::SmallVectorStorage.42.75.675.1035.1995.2115.2355.4155" = type { [1 x %"struct.llvm::AlignedCharArrayUnion.34.69.669.1029.1989.2109.2349.4149"] }
+%"struct.llvm::SelectionDAG::DAGUpdateListener.80.680.1040.2000.2120.2360.4160" = type { i32 (...)**, %"struct.llvm::SelectionDAG::DAGUpdateListener.80.680.1040.2000.2120.2360.4160"*, %"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"* }
+%"class.std::map.43.84.684.1044.2004.2124.2364.4164" = type { %"class.std::_Rb_tree.44.83.683.1043.2003.2123.2363.4163" }
+%"class.std::_Rb_tree.44.83.683.1043.2003.2123.2363.4163" = type { %"struct.std::_Rb_tree<const llvm::SDNode *, std::pair<const llvm::SDNode *const, std::basic_string<char> >, std::_Select1st<std::pair<const llvm::SDNode *const, std::basic_string<char> > >, std::less<const llvm::SDNode *>, std::allocator<std::pair<const llvm::SDNode *const, std::basic_string<char> > > >::_Rb_tree_impl.82.682.1042.2002.2122.2362.4162" }
+%"struct.std::_Rb_tree<const llvm::SDNode *, std::pair<const llvm::SDNode *const, std::basic_string<char> >, std::_Select1st<std::pair<const llvm::SDNode *const, std::basic_string<char> > >, std::less<const llvm::SDNode *>, std::allocator<std::pair<const llvm::SDNode *const, std::basic_string<char> > > >::_Rb_tree_impl.82.682.1042.2002.2122.2362.4162" = type { %"struct.std::less.48.81.681.1041.2001.2121.2361.4161", %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126", i64 }
+%"struct.std::less.48.81.681.1041.2001.2121.2361.4161" = type { i8 }
+%"class.llvm::FoldingSet.50.85.685.1045.2005.2125.2365.4165" = type { %"class.llvm::FoldingSetImpl.66.666.1026.1986.2106.2346.4146" }
+%"class.std::vector.51.89.689.1049.2009.2129.2369.4169" = type { %"struct.std::_Vector_base.52.88.688.1048.2008.2128.2368.4168" }
+%"struct.std::_Vector_base.52.88.688.1048.2008.2128.2368.4168" = type { %"struct.std::_Vector_base<llvm::CondCodeSDNode *, std::allocator<llvm::CondCodeSDNode *> >::_Vector_impl.87.687.1047.2007.2127.2367.4167" }
+%"struct.std::_Vector_base<llvm::CondCodeSDNode *, std::allocator<llvm::CondCodeSDNode *> >::_Vector_impl.87.687.1047.2007.2127.2367.4167" = type { %"class.llvm::CondCodeSDNode.86.686.1046.2006.2126.2366.4166"**, %"class.llvm::CondCodeSDNode.86.686.1046.2006.2126.2366.4166"**, %"class.llvm::CondCodeSDNode.86.686.1046.2006.2126.2366.4166"** }
+%"class.llvm::CondCodeSDNode.86.686.1046.2006.2126.2366.4166" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090", i32 }
+%"class.std::vector.56.92.692.1052.2012.2132.2372.4172" = type { %"struct.std::_Vector_base.57.91.691.1051.2011.2131.2371.4171" }
+%"struct.std::_Vector_base.57.91.691.1051.2011.2131.2371.4171" = type { %"struct.std::_Vector_base<llvm::SDNode *, std::allocator<llvm::SDNode *> >::_Vector_impl.90.690.1050.2010.2130.2370.4170" }
+%"struct.std::_Vector_base<llvm::SDNode *, std::allocator<llvm::SDNode *> >::_Vector_impl.90.690.1050.2010.2130.2370.4170" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"**, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"**, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"** }
+%"class.std::map.61.96.696.1056.2016.2136.2376.4176" = type { %"class.std::_Rb_tree.62.95.695.1055.2015.2135.2375.4175" }
+%"class.std::_Rb_tree.62.95.695.1055.2015.2135.2375.4175" = type { %"struct.std::_Rb_tree<llvm::EVT, std::pair<const llvm::EVT, llvm::SDNode *>, std::_Select1st<std::pair<const llvm::EVT, llvm::SDNode *> >, llvm::EVT::compareRawBits, std::allocator<std::pair<const llvm::EVT, llvm::SDNode *> > >::_Rb_tree_impl.94.694.1054.2014.2134.2374.4174" }
+%"struct.std::_Rb_tree<llvm::EVT, std::pair<const llvm::EVT, llvm::SDNode *>, std::_Select1st<std::pair<const llvm::EVT, llvm::SDNode *> >, llvm::EVT::compareRawBits, std::allocator<std::pair<const llvm::EVT, llvm::SDNode *> > >::_Rb_tree_impl.94.694.1054.2014.2134.2374.4174" = type { %"struct.llvm::EVT::compareRawBits.93.693.1053.2013.2133.2373.4173", %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126", i64 }
+%"struct.llvm::EVT::compareRawBits.93.693.1053.2013.2133.2373.4173" = type { i8 }
+%"class.llvm::StringMap.99.699.1059.2019.2139.2379.4179" = type { %"class.llvm::StringMapImpl.98.698.1058.2018.2138.2378.4178", %"class.llvm::MallocAllocator.61.661.1021.1981.2101.2341.4141" }
+%"class.llvm::StringMapImpl.98.698.1058.2018.2138.2378.4178" = type { %"class.llvm::StringMapEntryBase.97.697.1057.2017.2137.2377.4177"**, i32, i32, i32, i32 }
+%"class.llvm::StringMapEntryBase.97.697.1057.2017.2137.2377.4177" = type { i32 }
+%"class.std::map.66.103.703.1063.2023.2143.2383.4183" = type { %"class.std::_Rb_tree.67.102.702.1062.2022.2142.2382.4182" }
+%"class.std::_Rb_tree.67.102.702.1062.2022.2142.2382.4182" = type { %"struct.std::_Rb_tree<std::pair<std::basic_string<char>, unsigned char>, std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *>, std::_Select1st<std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *> >, std::less<std::pair<std::basic_string<char>, unsigned char> >, std::allocator<std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *> > >::_Rb_tree_impl.101.701.1061.2021.2141.2381.4181" }
+%"struct.std::_Rb_tree<std::pair<std::basic_string<char>, unsigned char>, std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *>, std::_Select1st<std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *> >, std::less<std::pair<std::basic_string<char>, unsigned char> >, std::allocator<std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *> > >::_Rb_tree_impl.101.701.1061.2021.2141.2381.4181" = type { %"struct.std::less.71.100.700.1060.2020.2140.2380.4180", %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126", i64 }
+%"struct.std::less.71.100.700.1060.2020.2140.2380.4180" = type { i8 }
+%"class.llvm::Type.7.607.967.1927.2047.2287.4087" = type { %"class.llvm::LLVMContext.6.606.966.1926.2046.2286.4086"*, i32, i32, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"** }
+%"class.llvm::DAGTypeLegalizer.117.717.1077.2037.2157.2397.4197" = type { %"class.llvm::TargetLowering.51.651.1011.1971.2091.2331.4131"*, %"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"*, %"class.llvm::TargetLoweringBase::ValueTypeActionImpl.40.640.1000.1960.2080.2320.4120", [6 x i8], %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallDenseMap.77.110.710.1070.2030.2150.2390.4190", %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallDenseMap.77.110.710.1070.2030.2150.2390.4190", %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallDenseMap.77.110.710.1070.2030.2150.2390.4190", %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallVector.82.116.716.1076.2036.2156.2396.4196" }
+%"class.llvm::SmallDenseMap.77.110.710.1070.2030.2150.2390.4190" = type { [4 x i8], i32, %"struct.llvm::AlignedCharArrayUnion.80.109.709.1069.2029.2149.2389.4189" }
+%"struct.llvm::AlignedCharArrayUnion.80.109.709.1069.2029.2149.2389.4189" = type { %"struct.llvm::AlignedCharArray.81.108.708.1068.2028.2148.2388.4188" }
+%"struct.llvm::AlignedCharArray.81.108.708.1068.2028.2148.2388.4188" = type { [384 x i8] }
+%"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187" = type { [4 x i8], i32, %"struct.llvm::AlignedCharArrayUnion.75.106.706.1066.2026.2146.2386.4186" }
+%"struct.llvm::AlignedCharArrayUnion.75.106.706.1066.2026.2146.2386.4186" = type { %"struct.llvm::AlignedCharArray.76.105.705.1065.2025.2145.2385.4185" }
+%"struct.llvm::AlignedCharArray.76.105.705.1065.2025.2145.2385.4185" = type { [256 x i8] }
+%"class.llvm::SmallVector.82.116.716.1076.2036.2156.2396.4196" = type { %"class.llvm::SmallVectorImpl.83.114.714.1074.2034.2154.2394.4194", %"struct.llvm::SmallVectorStorage.87.115.715.1075.2035.2155.2395.4195" }
+%"class.llvm::SmallVectorImpl.83.114.714.1074.2034.2154.2394.4194" = type { %"class.llvm::SmallVectorTemplateBase.84.113.713.1073.2033.2153.2393.4193" }
+%"class.llvm::SmallVectorTemplateBase.84.113.713.1073.2033.2153.2393.4193" = type { %"class.llvm::SmallVectorTemplateCommon.85.112.712.1072.2032.2152.2392.4192" }
+%"class.llvm::SmallVectorTemplateCommon.85.112.712.1072.2032.2152.2392.4192" = type { %"class.llvm::SmallVectorBase.24.624.984.1944.2064.2304.4104", %"struct.llvm::AlignedCharArrayUnion.86.111.711.1071.2031.2151.2391.4191" }
+%"struct.llvm::AlignedCharArrayUnion.86.111.711.1071.2031.2151.2391.4191" = type { %"struct.llvm::AlignedCharArray.35.68.668.1028.1988.2108.2348.4148" }
+%"struct.llvm::SmallVectorStorage.87.115.715.1075.2035.2155.2395.4195" = type { [127 x %"struct.llvm::AlignedCharArrayUnion.86.111.711.1071.2031.2151.2391.4191"] }
+%"struct.std::pair.112.119.719.1079.2039.2159.2399.4199" = type { i32, %"struct.llvm::EVT.8.608.968.1928.2048.2288.4088" }
+%"class.llvm::DenseMapBase.73.118.718.1078.2038.2158.2398.4198" = type { i8 }
+
+@.str61 = external hidden unnamed_addr constant [80 x i8], align 1
+@.str63 = external hidden unnamed_addr constant [80 x i8], align 1
+@.str74 = external hidden unnamed_addr constant [49 x i8], align 1
+@__PRETTY_FUNCTION__._ZN4llvm16DAGTypeLegalizer16GetWidenedVectorENS_7SDValueE = external hidden unnamed_addr constant [70 x i8], align 1
+@.str98 = external hidden unnamed_addr constant [46 x i8], align 1
+@__PRETTY_FUNCTION__._ZNK4llvm6SDNode12getValueTypeEj = external hidden unnamed_addr constant [57 x i8], align 1
+@.str99 = external hidden unnamed_addr constant [19 x i8], align 1
+@__PRETTY_FUNCTION__._ZN4llvm5SDLocC2EPKNS_6SDNodeE = external hidden unnamed_addr constant [41 x i8], align 1
+@.str100 = external hidden unnamed_addr constant [50 x i8], align 1
+@__PRETTY_FUNCTION__._ZNK4llvm6SDNode10getOperandEj = external hidden unnamed_addr constant [66 x i8], align 1
+
+declare { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm12SelectionDAG7getNodeEjNS_5SDLocENS_3EVTENS_7SDValueES3_(%"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"*, i32, i8*, i32, i32, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"*, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8)
+
+; Function Attrs: noreturn nounwind
+declare void @__assert_fail(i8*, i8*, i32, i8*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define hidden { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm16DAGTypeLegalizer18WidenVecRes_BinaryEPNS_6SDNodeE(%"class.llvm::DAGTypeLegalizer.117.717.1077.2037.2157.2397.4197"* %this, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* %N) #2 align 2 {
+entry:
+ %Op.i43 = alloca %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", align 8
+ %ref.tmp.i = alloca %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199", align 8
+ %Op.i = alloca %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", align 8
+ %0 = bitcast %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i to i8*
+ %retval.sroa.0.0.idx.i36 = getelementptr inbounds %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i, i64 0, i32 1, i32 0, i32 0
+ %retval.sroa.0.0.copyload.i37 = load i32* %retval.sroa.0.0.idx.i36, align 8
+ call void @llvm.lifetime.end(i64 24, i8* %0) #1
+ %agg.tmp8.sroa.2.0.copyload = load i32* undef, align 8
+ %1 = bitcast %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* %Op.i to i8*
+ call void @llvm.lifetime.start(i64 16, i8* %1) #1
+ %2 = getelementptr %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* %Op.i, i64 0, i32 1
+ store i32 %agg.tmp8.sroa.2.0.copyload, i32* %2, align 8
+
+; CHECK: movl (%rax), %eax
+; CHECK-NOT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK: movl [[OFF:[0-9]+]](%rsp), %r8d
+; CHECK: movl %eax, [[OFF]](%rsp)
+; CHECK: movl $-1, %ecx
+; CHECK: callq _ZN4llvm12SelectionDAG7getNodeEjNS_5SDLocENS_3EVTENS_7SDValueES3_
+
+ %call18 = call { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm12SelectionDAG7getNodeEjNS_5SDLocENS_3EVTENS_7SDValueES3_(%"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"* undef, i32 undef, i8* undef, i32 -1, i32 %retval.sroa.0.0.copyload.i37, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"* undef, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8 undef, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8 undef) #1
+ ret { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } %call18
+}
+
+; Function Attrs: nounwind uwtable
+declare hidden %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* @_ZN4llvm12DenseMapBaseINS_13SmallDenseMapINS_7SDValueES2_Lj8ENS_12DenseMapInfoIS2_EEEES2_S2_S4_EixERKS2_(%"class.llvm::DenseMapBase.73.118.718.1078.2038.2158.2398.4198"*, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* nocapture readonly) #2 align 2
+
+declare hidden void @_ZN4llvm16DAGTypeLegalizer10RemapValueERNS_7SDValueE(%"class.llvm::DAGTypeLegalizer.117.717.1077.2037.2157.2397.4197"*, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"*)
+
+; Function Attrs: nounwind uwtable
+declare hidden void @_ZNK4llvm18TargetLoweringBase17getTypeConversionERNS_11LLVMContextENS_3EVTE(%"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* noalias sret, %"class.llvm::TargetLoweringBase.50.650.1010.1970.2090.2330.4130"* readonly, %"class.llvm::LLVMContext.6.606.966.1926.2046.2286.4086"*, i32, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"*) #2 align 2
+
+attributes #0 = { noreturn nounwind }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind uwtable }
+
diff --git a/test/CodeGen/X86/misched-aa-mmos.ll b/test/CodeGen/X86/misched-aa-mmos.ll
new file mode 100644
index 0000000..343e26f
--- /dev/null
+++ b/test/CodeGen/X86/misched-aa-mmos.ll
@@ -0,0 +1,37 @@
+; RUN: llc -enable-misched -enable-aa-sched-mi < %s
+
+; This generates a decw instruction, which has two memory operands (MMOs), and an
+; alias SU edge query involving that instruction. Make sure this does not crash.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%s1 = type { i16, i16, i32 }
+%c1 = type { %s1*, %u1, i16, i8 }
+%u1 = type { i64 }
+
+declare zeroext i1 @bar(i64*, i32) #5
+
+define i32 @foo() #0 align 2 {
+entry:
+ %temp_rhs = alloca %c1, align 8
+ br i1 undef, label %if.else56, label %cond.end.i
+
+cond.end.i:
+ %significand.i18.i = getelementptr inbounds %c1* %temp_rhs, i64 0, i32 1
+ %exponent.i = getelementptr inbounds %c1* %temp_rhs, i64 0, i32 2
+ %0 = load i16* %exponent.i, align 8
+ %sub.i = add i16 %0, -1
+ store i16 %sub.i, i16* %exponent.i, align 8
+ %parts.i.i = bitcast %u1* %significand.i18.i to i64**
+ %1 = load i64** %parts.i.i, align 8
+ %call5.i = call zeroext i1 @bar(i64* %1, i32 undef) #1
+ unreachable
+
+if.else56:
+ unreachable
+}
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll
index 5454b7c..3ea6512 100644
--- a/test/CodeGen/X86/misched-matmul.ll
+++ b/test/CodeGen/X86/misched-matmul.ll
@@ -10,7 +10,7 @@
; more complex cases.
;
; CHECK: @wrap_mul4
-; CHECK: 23 regalloc - Number of spills inserted
+; CHECK: 22 regalloc - Number of spills inserted
define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 {
entry:
diff --git a/test/CodeGen/X86/movbe.ll b/test/CodeGen/X86/movbe.ll
index 3f459be..e248410 100644
--- a/test/CodeGen/X86/movbe.ll
+++ b/test/CodeGen/X86/movbe.ll
@@ -1,45 +1,66 @@
; RUN: llc -mtriple=x86_64-linux -mcpu=atom < %s | FileCheck %s
; RUN: llc -mtriple=x86_64-linux -mcpu=slm < %s | FileCheck %s -check-prefix=SLM
+declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
-define void @test1(i32* nocapture %x, i32 %y) nounwind {
+define void @test1(i16* nocapture %x, i16 %y) nounwind {
+ %bswap = call i16 @llvm.bswap.i16(i16 %y)
+ store i16 %bswap, i16* %x, align 2
+ ret void
+; CHECK-LABEL: test1:
+; CHECK: movbew %si, (%rdi)
+; SLM-LABEL: test1:
+; SLM: movbew %si, (%rdi)
+}
+
+define i16 @test2(i16* %x) nounwind {
+ %load = load i16* %x, align 2
+ %bswap = call i16 @llvm.bswap.i16(i16 %load)
+ ret i16 %bswap
+; CHECK-LABEL: test2:
+; CHECK: movbew (%rdi), %ax
+; SLM-LABEL: test2:
+; SLM: movbew (%rdi), %ax
+}
+
+define void @test3(i32* nocapture %x, i32 %y) nounwind {
%bswap = call i32 @llvm.bswap.i32(i32 %y)
store i32 %bswap, i32* %x, align 4
ret void
-; CHECK-LABEL: test1:
+; CHECK-LABEL: test3:
; CHECK: movbel %esi, (%rdi)
-; SLM-LABEL: test1:
+; SLM-LABEL: test3:
; SLM: movbel %esi, (%rdi)
}
-define i32 @test2(i32* %x) nounwind {
+define i32 @test4(i32* %x) nounwind {
%load = load i32* %x, align 4
%bswap = call i32 @llvm.bswap.i32(i32 %load)
ret i32 %bswap
-; CHECK-LABEL: test2:
+; CHECK-LABEL: test4:
; CHECK: movbel (%rdi), %eax
-; SLM-LABEL: test2:
+; SLM-LABEL: test4:
; SLM: movbel (%rdi), %eax
}
-define void @test3(i64* %x, i64 %y) nounwind {
+define void @test5(i64* %x, i64 %y) nounwind {
%bswap = call i64 @llvm.bswap.i64(i64 %y)
store i64 %bswap, i64* %x, align 8
ret void
-; CHECK-LABEL: test3:
+; CHECK-LABEL: test5:
; CHECK: movbeq %rsi, (%rdi)
-; SLM-LABEL: test3:
+; SLM-LABEL: test5:
; SLM: movbeq %rsi, (%rdi)
}
-define i64 @test4(i64* %x) nounwind {
+define i64 @test6(i64* %x) nounwind {
%load = load i64* %x, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %load)
ret i64 %bswap
-; CHECK-LABEL: test4:
+; CHECK-LABEL: test6:
; CHECK: movbeq (%rdi), %rax
-; SLM-LABEL: test4:
+; SLM-LABEL: test6:
; SLM: movbeq (%rdi), %rax
}
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
index 5e7ba37..6910515 100644
--- a/test/CodeGen/X86/ms-inline-asm.ll
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -1,11 +1,10 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core2 -no-integrated-as | FileCheck %s
define i32 @t1() nounwind {
entry:
%0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0A\09mov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
ret i32 %0
; CHECK: t1
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: mov eax, ecx
@@ -19,7 +18,6 @@ entry:
call void asm sideeffect inteldialect "mov eax, $$1", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
ret void
; CHECK: t2
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: mov eax, 1
@@ -34,7 +32,6 @@ entry:
call void asm sideeffect inteldialect "mov eax, DWORD PTR [$0]", "*m,~{eax},~{dirflag},~{fpsr},~{flags}"(i32* %V.addr) nounwind
ret void
; CHECK: t3
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: mov eax, DWORD PTR {{[[esp]}}
@@ -56,7 +53,6 @@ entry:
%0 = load i32* %b1, align 4
ret i32 %0
; CHECK: t18
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: lea ebx, foo
@@ -76,7 +72,6 @@ entry:
call void asm sideeffect inteldialect "call $0", "r,~{dirflag},~{fpsr},~{flags}"(void ()* @t19_helper) nounwind
ret void
; CHECK-LABEL: t19:
-; CHECK: movl %esp, %ebp
; CHECK: movl ${{_?}}t19_helper, %eax
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
@@ -95,7 +90,6 @@ entry:
%0 = load i32** %res, align 4
ret i32* %0
; CHECK-LABEL: t30:
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: lea edi, dword ptr [{{_?}}results]
@@ -103,8 +97,31 @@ entry:
; CHECK: {{## InlineAsm End|#NO_APP}}
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
-; CHECK: mov dword ptr [esi], edi
+; CHECK: mov dword ptr [esp], edi
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+; CHECK: movl (%esp), %eax
+}
+
+; Stack realignment plus MS inline asm that does *not* adjust the stack is no
+; longer an error.
+
+define i32 @t31() {
+entry:
+ %val = alloca i32, align 64
+ store i32 -1, i32* %val, align 64
+ call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) #1
+ %sp = load i32* %val, align 64
+ ret i32 %sp
+; CHECK-LABEL: t31:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: andl $-64, %esp
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov dword ptr [esp], esp
; CHECK: .att_syntax
; CHECK: {{## InlineAsm End|#NO_APP}}
-; CHECK: movl (%esi), %eax
+; CHECK: movl (%esp), %eax
+; CHECK: ret
}
diff --git a/test/CodeGen/X86/mul128_sext_loop.ll b/test/CodeGen/X86/mul128_sext_loop.ll
new file mode 100644
index 0000000..a516f03
--- /dev/null
+++ b/test/CodeGen/X86/mul128_sext_loop.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define void @test(i64* nocapture %arr, i64 %arrsize, i64 %factor) nounwind uwtable {
+ %1 = icmp sgt i64 %arrsize, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = sext i64 %factor to i128
+ br label %3
+
+; <label>:3 ; preds = %3, %.lr.ph
+; CHECK-NOT: mul
+; CHECK: imulq
+; CHECK-NOT: mul
+ %carry.02 = phi i128 [ 0, %.lr.ph ], [ %10, %3 ]
+ %i.01 = phi i64 [ 0, %.lr.ph ], [ %11, %3 ]
+ %4 = getelementptr inbounds i64* %arr, i64 %i.01
+ %5 = load i64* %4, align 8
+ %6 = sext i64 %5 to i128
+ %7 = mul nsw i128 %6, %2
+ %8 = add nsw i128 %7, %carry.02
+ %.tr = trunc i128 %8 to i64
+ %9 = and i64 %.tr, 9223372036854775807
+ store i64 %9, i64* %4, align 8
+ %10 = ashr i128 %8, 63
+ %11 = add nsw i64 %i.01, 1
+ %exitcond = icmp eq i64 %11, %arrsize
+ br i1 %exitcond, label %._crit_edge, label %3
+
+._crit_edge: ; preds = %3, %0
+ ret void
+}
diff --git a/test/CodeGen/X86/mult-alt-generic-i686.ll b/test/CodeGen/X86/mult-alt-generic-i686.ll
index 7c3499f..54bc3a4 100644
--- a/test/CodeGen/X86/mult-alt-generic-i686.ll
+++ b/test/CodeGen/X86/mult-alt-generic-i686.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86 -no-integrated-as
; ModuleID = 'mult-alt-generic.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i686"
diff --git a/test/CodeGen/X86/mult-alt-generic-x86_64.ll b/test/CodeGen/X86/mult-alt-generic-x86_64.ll
index f35bb5e..84a9c81 100644
--- a/test/CodeGen/X86/mult-alt-generic-x86_64.ll
+++ b/test/CodeGen/X86/mult-alt-generic-x86_64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -march=x86-64 -no-integrated-as
; ModuleID = 'mult-alt-generic.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64"
diff --git a/test/CodeGen/X86/mult-alt-x86.ll b/test/CodeGen/X86/mult-alt-x86.ll
index 06175da..cb2219a 100644
--- a/test/CodeGen/X86/mult-alt-x86.ll
+++ b/test/CodeGen/X86/mult-alt-x86.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -march=x86 -mattr=+sse2 -no-integrated-as
; ModuleID = 'mult-alt-x86.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i686-pc-win32"
diff --git a/test/CodeGen/X86/multiple-loop-post-inc.ll b/test/CodeGen/X86/multiple-loop-post-inc.ll
index 29b9f34..4edc1ff 100644
--- a/test/CodeGen/X86/multiple-loop-post-inc.ll
+++ b/test/CodeGen/X86/multiple-loop-post-inc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false -disable-branch-fold -disable-block-placement -disable-tail-duplicate -march=x86-64 -mcpu=nehalem < %s | FileCheck %s
+; RUN: llc -asm-verbose=false -disable-branch-fold -disable-block-placement -disable-tail-duplicate -march=x86-64 -mcpu=nehalem -no-integrated-as < %s | FileCheck %s
; rdar://7236213
;
; The scheduler's 2-address hack has been disabled, so there is
diff --git a/test/CodeGen/X86/negate-add-zero.ll b/test/CodeGen/X86/negate-add-zero.ll
index 92850f2..c961bd0 100644
--- a/test/CodeGen/X86/negate-add-zero.ll
+++ b/test/CodeGen/X86/negate-add-zero.ll
@@ -827,9 +827,7 @@ declare void @_ZN11MatrixTools9transposeI11FixedMatrixIdLi6ELi6ELi0ELi0EEEENT_13
declare void @_ZN21HNodeTranslateRotate311toCartesianEv(%struct.HNodeTranslateRotate3*)
define linkonce void @_ZN21HNodeTranslateRotate36setVelERK9CDSVectorIdLi1EN3CDS12DefaultAllocEE(%struct.HNodeTranslateRotate3* %this, %"struct.CDSVector<double,0,CDS::DefaultAlloc>"* %velv) {
-entry:
- %0 = add i32 0, -1 ; <i32> [#uses=1]
- %1 = getelementptr double* null, i32 %0 ; <double*> [#uses=1]
+ %1 = getelementptr double* null, i32 -1 ; <double*> [#uses=1]
%2 = load double* %1, align 8 ; <double> [#uses=1]
%3 = load double* null, align 8 ; <double> [#uses=2]
%4 = load double* null, align 8 ; <double> [#uses=2]
@@ -890,13 +888,12 @@ entry:
store double %52, double* %55, align 8
%56 = getelementptr %struct.HNodeTranslateRotate3* %this, i32 0, i32 0, i32 10, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
store double %53, double* %56, align 8
- %57 = add i32 0, 4 ; <i32> [#uses=1]
- %58 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 0 ; <%"struct.CDSVector<double,0,CDS::DefaultAlloc>"**> [#uses=1]
- store %"struct.CDSVector<double,0,CDS::DefaultAlloc>"* %velv, %"struct.CDSVector<double,0,CDS::DefaultAlloc>"** %58, align 8
- %59 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 1 ; <i32*> [#uses=1]
- store i32 %57, i32* %59, align 4
- %60 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 2 ; <i32*> [#uses=1]
- store i32 3, i32* %60, align 8
+ %57 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 0 ; <%"struct.CDSVector<double,0,CDS::DefaultAlloc>"**> [#uses=1]
+ store %"struct.CDSVector<double,0,CDS::DefaultAlloc>"* %velv, %"struct.CDSVector<double,0,CDS::DefaultAlloc>"** %57, align 8
+ %58 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 1 ; <i32*> [#uses=1]
+ store i32 4, i32* %58, align 4
+ %59 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 2 ; <i32*> [#uses=1]
+ store i32 3, i32* %59, align 8
unreachable
}
diff --git a/test/CodeGen/X86/no-elf-compact-unwind.ll b/test/CodeGen/X86/no-elf-compact-unwind.ll
deleted file mode 100644
index 8a15817..0000000
--- a/test/CodeGen/X86/no-elf-compact-unwind.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -disable-cfi | FileCheck -check-prefix=MACHO %s
-; RUN: llc < %s -mtriple x86_64-unknown-linux -disable-cfi | FileCheck -check-prefix=ELF %s
-
-; Make sure we don't generate a compact unwind for ELF.
-
-; MACHO-LABEL: _Z3barv:
-; MACHO: __compact_unwind
-
-; ELF-LABEL: _Z3barv:
-; ELF-NOT: __compact_unwind
-
-@_ZTIi = external constant i8*
-
-define void @_Z3barv() uwtable {
-entry:
- invoke void @_Z3foov()
- to label %try.cont unwind label %lpad
-
-lpad: ; preds = %entry
- %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- catch i8* bitcast (i8** @_ZTIi to i8*)
- %1 = extractvalue { i8*, i32 } %0, 1
- %2 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
- %matches = icmp eq i32 %1, %2
- br i1 %matches, label %catch, label %eh.resume
-
-catch: ; preds = %lpad
- %3 = extractvalue { i8*, i32 } %0, 0
- %4 = tail call i8* @__cxa_begin_catch(i8* %3)
- tail call void @__cxa_end_catch()
- br label %try.cont
-
-try.cont: ; preds = %entry, %catch
- ret void
-
-eh.resume: ; preds = %lpad
- resume { i8*, i32 } %0
-}
-
-declare void @_Z3foov()
-
-declare i32 @__gxx_personality_v0(...)
-
-declare i32 @llvm.eh.typeid.for(i8*)
-
-declare i8* @__cxa_begin_catch(i8*)
-
-declare void @__cxa_end_catch()
diff --git a/test/CodeGen/X86/nocx16.ll b/test/CodeGen/X86/nocx16.ll
index cceaac4..8b995da 100644
--- a/test/CodeGen/X86/nocx16.ll
+++ b/test/CodeGen/X86/nocx16.ll
@@ -2,7 +2,7 @@
define void @test(i128* %a) nounwind {
entry:
; CHECK: __sync_val_compare_and_swap_16
- %0 = cmpxchg i128* %a, i128 1, i128 1 seq_cst
+ %0 = cmpxchg i128* %a, i128 1, i128 1 seq_cst seq_cst
; CHECK: __sync_lock_test_and_set_16
%1 = atomicrmw xchg i128* %a, i128 1 seq_cst
; CHECK: __sync_fetch_and_add_16
diff --git a/test/CodeGen/X86/opaque-constant-asm.ll b/test/CodeGen/X86/opaque-constant-asm.ll
new file mode 100644
index 0000000..dd1cc8e
--- /dev/null
+++ b/test/CodeGen/X86/opaque-constant-asm.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
+; This test makes sure that we do not mistake the bitcast inside the asm statement
+; for an opaque constant. If we do, then the compilation will simply fail.
+
+%struct2 = type <{ i32, i32, i32, i32 }>
+%union.anon = type { [2 x i64], [4 x i32] }
+%struct1 = type { i32, %union.anon }
+
+define void @test() {
+; CHECK: #ASM $16
+ call void asm sideeffect "#ASM $0", "n"(i32 ptrtoint (i32* getelementptr inbounds (%struct2* bitcast (%union.anon* getelementptr inbounds (%struct1* null, i32 0, i32 1) to %struct2*), i32 0, i32 2) to i32))
+ ret void
+}
diff --git a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll
new file mode 100644
index 0000000..349ce7d
--- /dev/null
+++ b/test/CodeGen/X86/osx-private-labels.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; Test all the cases where an L label is safe. Removing any entry from
+; TargetLoweringObjectFileMachO::isSectionAtomizableBySymbols should cause
+; this to fail.
+; We also test some noteworthy cases that require an l label.
+
+@private1 = private unnamed_addr constant [4 x i8] c"zed\00"
+; CHECK: .section __TEXT,__cstring,cstring_literals
+; CHECK-NEXT: L_private1:
+
+@private2 = private unnamed_addr constant [5 x i16] [i16 116, i16 101,
+ i16 115, i16 116, i16 0]
+; CHECK: .section __TEXT,__ustring
+; CHECK-NEXT: .align 1
+; CHECK-NEXT: l_private2:
+
+; There is no dedicated 4 byte string section on MachO.
+
+%struct.NSConstantString = type { i32*, i32, i8*, i32 }
+@private3 = private constant %struct.NSConstantString { i32* null, i32 1992, i8* null, i32 0 }, section "__DATA,__cfstring"
+; CHECK: .section __DATA,__cfstring
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: L_private3:
+
+; There is no dedicated 1 or 2 byte constant section on MachO.
+
+@private4 = private unnamed_addr constant i32 42
+; CHECK: .section __TEXT,__literal4,4byte_literals
+; CHECK-NEXT: .align 2
+; CHECK-NEXT: L_private4:
+
+@private5 = private unnamed_addr constant i64 42
+; CHECK: .section __TEXT,__literal8,8byte_literals
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private5:
+
+@private6 = private unnamed_addr constant i128 42
+; CHECK: .section __TEXT,__literal16,16byte_literals
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private6:
+
+%struct._objc_class = type { i8* }
+@private7 = private global %struct._objc_class* null, section "__OBJC,__cls_refs,literal_pointers,no_dead_strip"
+; CHECK: .section __OBJC,__cls_refs,literal_pointers,no_dead_strip
+; CHECK: .align 3
+; CHECK: L_private7:
+
+@private8 = private global i32* null, section "__DATA,__nl_symbol_ptr,non_lazy_symbol_pointers"
+; CHECK: .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private8:
+
+@private9 = private global i32* null, section "__DATA,__la_symbol_ptr,lazy_symbol_pointers"
+; CHECK: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private9:
+
+@private10 = private global i32* null, section "__DATA,__mod_init_func,mod_init_funcs"
+; CHECK: .section __DATA,__mod_init_func,mod_init_funcs
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private10:
+
+@private11 = private global i32* null, section "__DATA,__mod_term_func,mod_term_funcs"
+; CHECK: .section __DATA,__mod_term_func,mod_term_funcs
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private11:
+
+@private12 = private global i32* null, section "__DATA,__foobar,interposing"
+; CHECK: .section __DATA,__foobar,interposing
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private12:
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
index d534639..62b1273 100644
--- a/test/CodeGen/X86/patchpoint.ll
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -7,16 +7,16 @@ entry:
; CHECK-LABEL: trivial_patchpoint_codegen:
; CHECK: movabsq $-559038736, %r11
; CHECK-NEXT: callq *%r11
-; CHECK-NEXT: nop
+; CHECK-NEXT: xchgw %ax, %ax
; CHECK: movq %rax, %[[REG:r.+]]
; CHECK: callq *%r11
-; CHECK-NEXT: nop
+; CHECK-NEXT: xchgw %ax, %ax
; CHECK: movq %[[REG]], %rax
; CHECK: ret
%resolveCall2 = inttoptr i64 -559038736 to i8*
- %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
%resolveCall3 = inttoptr i64 -559038737 to i8*
- tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
ret i64 %result
}
@@ -34,31 +34,65 @@ entry:
store i64 11, i64* %metadata
store i64 12, i64* %metadata
store i64 13, i64* %metadata
- call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 4, i32 0, i64* %metadata)
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
ret void
}
; Test the webkit_jscc calling convention.
-; Two arguments will be pushed on the stack.
+; One argument will be passed in a register, the other will be pushed on the stack.
; Return value in $rax.
define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen:
; CHECK: Ltmp
-; CHECK: movq %r{{.+}}, 8(%rsp)
; CHECK: movq %r{{.+}}, (%rsp)
+; CHECK: movq %r{{.+}}, %rax
; CHECK: Ltmp
; CHECK-NEXT: movabsq $-559038736, %r11
; CHECK-NEXT: callq *%r11
-; CHECK: movq %rax, 8(%rsp)
+; CHECK: movq %rax, (%rsp)
; CHECK: callq
%resolveCall2 = inttoptr i64 -559038736 to i8*
- %result = tail call webkit_jscc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %resolveCall2, i32 2, i64 %p1, i64 %p2)
+ %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
%resolveCall3 = inttoptr i64 -559038737 to i8*
- tail call webkit_jscc void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 6, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
ret void
}
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK: Ltmp
+; CHECK: movq $6, 24(%rsp)
+; CHECK-NEXT: movl $4, 16(%rsp)
+; CHECK-NEXT: movq $2, (%rsp)
+; CHECK: Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+ %call = inttoptr i64 -559038736 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+ ret i64 %result
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK: Ltmp
+; CHECK: movq $10, 48(%rsp)
+; CHECK-NEXT: movl $8, 36(%rsp)
+; CHECK-NEXT: movq $6, 24(%rsp)
+; CHECK-NEXT: movl $4, 16(%rsp)
+; CHECK-NEXT: movq $2, (%rsp)
+; CHECK: Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+ %call = inttoptr i64 -559038736 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+ ret i64 %result
+}
+
; Test patchpoints reusing the same TargetConstant.
; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
; There is no way to verify this, since it depends on memory allocation.
@@ -68,14 +102,14 @@ entry:
%tmp80 = add i64 %tmp79, -16
%tmp81 = inttoptr i64 %tmp80 to i64*
%tmp82 = load i64* %tmp81, align 8
- tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
- tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
%tmp83 = load i64* %tmp33, align 8
%tmp84 = add i64 %tmp83, -24
%tmp85 = inttoptr i64 %tmp84 to i64*
%tmp86 = load i64* %tmp85, align 8
- tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
- tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
ret i64 10
}
@@ -84,17 +118,13 @@ define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
; CHECK-LABEL: small_patchpoint_codegen:
; CHECK: Ltmp
-; CHECK: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
+; CHECK: nopl 8(%rax,%rax)
; CHECK-NEXT: popq
; CHECK-NEXT: ret
- %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
ret void
}
-declare void @llvm.experimental.stackmap(i32, i32, ...)
-declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
-declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
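Note on the hunk above: the stackmap and patchpoint intrinsics now take an i64 ID as their first operand instead of an i32. A minimal sketch of a call against the new declarations (hypothetical function, mirroring the call syntax used in the test):

define void @stackmap_example(i64 %live) {
entry:
  ; ID 42, 8 bytes of nop shadow, one live value recorded in the stack map.
  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 42, i32 8, i64 %live)
  ret void
}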
diff --git a/test/CodeGen/X86/peephole-multiple-folds.ll b/test/CodeGen/X86/peephole-multiple-folds.ll
new file mode 100644
index 0000000..d184569
--- /dev/null
+++ b/test/CodeGen/X86/peephole-multiple-folds.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s
+;
+; Test multiple peephole-time folds in a single basic block.
+; <rdar://problem/16478629>
+
+define <8 x float> @test_peephole_multi_fold(<8 x float>* %p1, <8 x float>* %p2) {
+entry:
+ br label %loopbody
+
+loopbody:
+; CHECK: test_peephole_multi_fold:
+; CHECK: vfmadd231ps (%rdi),
+; CHECK: vfmadd231ps (%rsi),
+ %vsum1 = phi <8 x float> [ %vsum1.next, %loopbody ], [ zeroinitializer, %entry ]
+ %vsum2 = phi <8 x float> [ %vsum2.next, %loopbody ], [ zeroinitializer, %entry ]
+ %m1 = load <8 x float>* %p1, align 1
+ %m2 = load <8 x float>* %p2, align 1
+ %vsum1.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m1, <8 x float> zeroinitializer, <8 x float> %vsum1)
+ %vsum2.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m2, <8 x float> zeroinitializer, <8 x float> %vsum2)
+ %vsum1.next.1 = extractelement <8 x float> %vsum1.next, i32 0
+ %c = fcmp oeq float %vsum1.next.1, 0.0
+ br i1 %c, label %loopbody, label %loopexit
+
+loopexit:
+ %r = fadd <8 x float> %vsum1.next, %vsum2.next
+ ret <8 x float> %r
+}
+
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
diff --git a/test/CodeGen/X86/personality.ll b/test/CodeGen/X86/personality.ll
index 51be7bc..424a307 100644
--- a/test/CodeGen/X86/personality.ll
+++ b/test/CodeGen/X86/personality.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -disable-cfi -mtriple=x86_64-apple-darwin9 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -disable-cfi -mtriple=i386-apple-darwin9 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i386-apple-darwin9 | FileCheck %s -check-prefix=X32
; PR1632
define void @_Z1fv() {
@@ -41,15 +41,10 @@ declare void @__cxa_end_catch()
declare i32 @__gxx_personality_v0(...)
-; X64: zPLR
-; X64: .byte 155
-; X64-NEXT: .long ___gxx_personality_v0@GOTPCREL+4
+; X64: .cfi_personality 155, ___gxx_personality_v0
+
+; X32: .cfi_personality 155, L___gxx_personality_v0$non_lazy_ptr
; X32: .section __IMPORT,__pointers,non_lazy_symbol_pointers
; X32-NEXT: L___gxx_personality_v0$non_lazy_ptr:
; X32-NEXT: .indirect_symbol ___gxx_personality_v0
-
-; X32: zPLR
-; X32: .byte 155
-; X32-NEXT: :
-; X32-NEXT: .long L___gxx_personality_v0$non_lazy_ptr-
diff --git a/test/CodeGen/X86/personality_size.ll b/test/CodeGen/X86/personality_size.ll
index 30a5d39..79d131b 100644
--- a/test/CodeGen/X86/personality_size.ll
+++ b/test/CodeGen/X86/personality_size.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -relocation-model=pic -disable-cfi -mtriple=x86_64-pc-solaris2.11 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -relocation-model=pic -disable-cfi -mtriple=i386-pc-solaris2.11 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -relocation-model=pic -mtriple=x86_64-pc-solaris2.11 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -relocation-model=pic -mtriple=i386-pc-solaris2.11 | FileCheck %s -check-prefix=X32
; PR1632
define void @_Z1fv() {
diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll
index 7bb127e..da1e224 100644
--- a/test/CodeGen/X86/pic.ll
+++ b/test/CodeGen/X86/pic.ll
@@ -192,7 +192,8 @@ bb12:
; LINUX: .LJTI7_0@GOTOFF(
; LINUX: jmpl *
-; LINUX: .LJTI7_0:
+; LINUX: .align 4
+; LINUX-NEXT: .LJTI7_0:
; LINUX: .long .LBB7_2@GOTOFF
; LINUX: .long .LBB7_8@GOTOFF
; LINUX: .long .LBB7_14@GOTOFF
diff --git a/test/CodeGen/X86/pr10420.ll b/test/CodeGen/X86/pr10420.ll
deleted file mode 100644
index 3993f24..0000000
--- a/test/CodeGen/X86/pr10420.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -disable-cfi | FileCheck %s
-
-define private void @foo() {
- ret void
-}
-
-define void @bar() {
- call void @foo()
- ret void;
-}
-
-; CHECK: _bar: ## @bar
-; CHECK-NEXT: Ltmp2:
-
-; CHECK: Ltmp12:
-; CHECK-NEXT: Ltmp13 = L_foo-Ltmp12 ## FDE initial location
-; CHECK-NEXT: .quad Ltmp13
-
-; CHECK: Ltmp19:
-; CHECK-NEXT: Ltmp20 = Ltmp2-Ltmp19 ## FDE initial location
-; CHECK-NEXT: .quad Ltmp20
diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll
deleted file mode 100644
index 2f7c720..0000000
--- a/test/CodeGen/X86/pr14090.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; RUN: llc < %s -march=x86-64 -print-before=stack-coloring -print-after=stack-coloring >%t 2>&1 && FileCheck <%t %s
-
-define void @foo(i64* %retval.i, i32 %call, i32* %.ph.i80, i32 %fourteen, i32* %out.lo, i32* %out.hi) nounwind align 2 {
-entry:
- %_Tmp.i39 = alloca i64, align 8
- %retval.i33 = alloca i64, align 8
- %_Tmp.i = alloca i64, align 8
- %retval.i.i = alloca i64, align 8
- %_First.i = alloca i64, align 8
-
- %0 = load i64* %retval.i, align 8
-
- %1 = load i64* %retval.i, align 8
-
- %_Tmp.i39.0.cast73 = bitcast i64* %_Tmp.i39 to i8*
- call void @llvm.lifetime.start(i64 8, i8* %_Tmp.i39.0.cast73)
- store i64 %1, i64* %_Tmp.i39, align 8
- %cmp.i.i.i40 = icmp slt i32 %call, 0
- %2 = lshr i64 %1, 32
- %3 = trunc i64 %2 to i32
- %sub.i.i.i44 = sub i32 0, %call
- %cmp2.i.i.i45 = icmp ult i32 %3, %sub.i.i.i44
- %or.cond.i.i.i46 = and i1 %cmp.i.i.i40, %cmp2.i.i.i45
- %add.i.i.i47 = add i32 %3, %call
- %sub5.i.i.i48 = lshr i32 %add.i.i.i47, 5
- %trunc.i50 = trunc i64 %1 to i32
- %inttoptr.i51 = inttoptr i32 %trunc.i50 to i32*
- %add61617.i.i.i52 = or i32 %sub5.i.i.i48, -134217728
- %add61617.i.sub5.i.i.i53 = select i1 %or.cond.i.i.i46, i32 %add61617.i.i.i52, i32 %sub5.i.i.i48
- %storemerge2.i.i54 = getelementptr inbounds i32* %inttoptr.i51, i32 %add61617.i.sub5.i.i.i53
- %_Tmp.i39.0.cast74 = bitcast i64* %_Tmp.i39 to i32**
- store i32* %storemerge2.i.i54, i32** %_Tmp.i39.0.cast74, align 8
- %storemerge.i.i55 = and i32 %add.i.i.i47, 31
- %_Tmp.i39.4.raw_idx = getelementptr inbounds i8* %_Tmp.i39.0.cast73, i32 4
- %_Tmp.i39.4.cast = bitcast i8* %_Tmp.i39.4.raw_idx to i32*
- store i32 %storemerge.i.i55, i32* %_Tmp.i39.4.cast, align 4
- %srcval.i56 = load i64* %_Tmp.i39, align 8
- call void @llvm.lifetime.end(i64 8, i8* %_Tmp.i39.0.cast73)
-
-; CHECK: Before Merge disjoint stack slots
-; CHECK: [[PREFIX15:MOV64mr.*<fi#]]{{[0-9]}}[[SUFFIX15:.*;]] mem:ST8[%fifteen]
-; CHECK: [[PREFIX87:MOV32mr.*;]] mem:ST4[%sunkaddr87]
-
-; CHECK: After Merge disjoint stack slots
-; CHECK: [[PREFIX15]]{{[0-9]}}[[SUFFIX15]] mem:ST8[%_Tmp.i39]
-; CHECK: [[PREFIX87]] mem:ST4[<unknown>]
-
- %fifteen = bitcast i64* %retval.i.i to i32**
- %sixteen = bitcast i64* %retval.i.i to i8*
- call void @llvm.lifetime.start(i64 8, i8* %sixteen)
- store i32* %.ph.i80, i32** %fifteen, align 8
- %sunkaddr = ptrtoint i64* %retval.i.i to i32
- %sunkaddr86 = add i32 %sunkaddr, 4
- %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32*
- store i32 %fourteen, i32* %sunkaddr87, align 4
- %seventeen = load i64* %retval.i.i, align 8
- call void @llvm.lifetime.end(i64 8, i8* %sixteen)
- %eighteen = lshr i64 %seventeen, 32
- %nineteen = trunc i64 %eighteen to i32
- %shl.i.i.i = shl i32 1, %nineteen
-
- store i32 %shl.i.i.i, i32* %out.lo, align 8
- store i32 %nineteen, i32* %out.hi, align 8
-
- ret void
-}
-
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/X86/pr1462.ll b/test/CodeGen/X86/pr1462.ll
index 62549a5..3aa1860 100644
--- a/test/CodeGen/X86/pr1462.ll
+++ b/test/CodeGen/X86/pr1462.ll
@@ -1,8 +1,7 @@
; RUN: llc < %s
; PR1462
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-
-v64:64:64-v128:128:128-a0:0:64"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "x86_64-unknown-linux-gnu"
define hidden i128 @__addvti3(i128 %a1, i128 %b2) {
diff --git a/test/CodeGen/X86/pr16031.ll b/test/CodeGen/X86/pr16031.ll
index ecf6218..dc16fd9 100644
--- a/test/CodeGen/X86/pr16031.ll
+++ b/test/CodeGen/X86/pr16031.ll
@@ -2,9 +2,9 @@
; CHECK-LABEL: main:
; CHECK: pushl %esi
+; CHECK-NEXT: testb $1, 8(%esp)
; CHECK-NEXT: movl $-12, %eax
; CHECK-NEXT: movl $-1, %edx
-; CHECK-NEXT: testb $1, 8(%esp)
; CHECK-NEXT: cmovel %edx, %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: movl %eax, %esi
diff --git a/test/CodeGen/X86/pr19049.ll b/test/CodeGen/X86/pr19049.ll
new file mode 100644
index 0000000..027c981
--- /dev/null
+++ b/test/CodeGen/X86/pr19049.ll
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple x86_64-pc-linux %s -o - | FileCheck %s
+
+module asm ".pushsection foo"
+module asm ".popsection"
+
+; CHECK: .section foo,"",@progbits
+; CHECK: .text
diff --git a/test/CodeGen/X86/preserve_allcc64.ll b/test/CodeGen/X86/preserve_allcc64.ll
new file mode 100644
index 0000000..545cd36
--- /dev/null
+++ b/test/CodeGen/X86/preserve_allcc64.ll
@@ -0,0 +1,104 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck --check-prefix=SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck --check-prefix=AVX %s
+
+define preserve_allcc void @preserve_allcc1() nounwind {
+entry:
+;SSE-LABEL: preserve_allcc1
+;SSE: pushq %r10
+;SSE-NEXT: pushq %r9
+;SSE-NEXT: pushq %r8
+;SSE-NEXT: pushq %rdi
+;SSE-NEXT: pushq %rsi
+;SSE-NEXT: pushq %rdx
+;SSE-NEXT: pushq %rcx
+;SSE-NEXT: pushq %rax
+;SSE-NEXT: pushq %rbp
+;SSE-NEXT: pushq %r15
+;SSE-NEXT: pushq %r14
+;SSE-NEXT: pushq %r13
+;SSE-NEXT: pushq %r12
+;SSE-NEXT: pushq %rbx
+;SSE: movaps %xmm15
+;SSE-NEXT: movaps %xmm14
+;SSE-NEXT: movaps %xmm13
+;SSE-NEXT: movaps %xmm12
+;SSE-NEXT: movaps %xmm11
+;SSE-NEXT: movaps %xmm10
+;SSE-NEXT: movaps %xmm9
+;SSE-NEXT: movaps %xmm8
+;SSE-NEXT: movaps %xmm7
+;SSE-NEXT: movaps %xmm6
+;SSE-NEXT: movaps %xmm5
+;SSE-NEXT: movaps %xmm4
+;SSE-NEXT: movaps %xmm3
+;SSE-NEXT: movaps %xmm2
+;SSE-NEXT: movaps %xmm1
+;SSE-NEXT: movaps %xmm0
+;AVX-LABEL: preserve_allcc1
+;AVX: pushq %r10
+;AVX-NEXT: pushq %r9
+;AVX-NEXT: pushq %r8
+;AVX-NEXT: pushq %rdi
+;AVX-NEXT: pushq %rsi
+;AVX-NEXT: pushq %rdx
+;AVX-NEXT: pushq %rcx
+;AVX-NEXT: pushq %rax
+;AVX-NEXT: pushq %rbp
+;AVX-NEXT: pushq %r15
+;AVX-NEXT: pushq %r14
+;AVX-NEXT: pushq %r13
+;AVX-NEXT: pushq %r12
+;AVX-NEXT: pushq %rbx
+;AVX: vmovups %ymm15
+;AVX-NEXT: vmovups %ymm14
+;AVX-NEXT: vmovups %ymm13
+;AVX-NEXT: vmovups %ymm12
+;AVX-NEXT: vmovups %ymm11
+;AVX-NEXT: vmovups %ymm10
+;AVX-NEXT: vmovups %ymm9
+;AVX-NEXT: vmovups %ymm8
+;AVX-NEXT: vmovups %ymm7
+;AVX-NEXT: vmovups %ymm6
+;AVX-NEXT: vmovups %ymm5
+;AVX-NEXT: vmovups %ymm4
+;AVX-NEXT: vmovups %ymm3
+;AVX-NEXT: vmovups %ymm2
+;AVX-NEXT: vmovups %ymm1
+;AVX-NEXT: vmovups %ymm0
+ call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{rbp},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"()
+ ret void
+}
+
+; Make sure only R11 is saved before the call
+declare preserve_allcc void @bar(i64, i64, double, double)
+define void @preserve_allcc2() nounwind {
+entry:
+;SSE-LABEL: preserve_allcc2
+;SSE: movq %r11, [[REG:%[a-z0-9]+]]
+;SSE-NOT: movaps %xmm
+;SSE: movq [[REG]], %r11
+ %a0 = call i64 asm sideeffect "", "={rax}"() nounwind
+ %a1 = call i64 asm sideeffect "", "={rcx}"() nounwind
+ %a2 = call i64 asm sideeffect "", "={rdx}"() nounwind
+ %a3 = call i64 asm sideeffect "", "={r8}"() nounwind
+ %a4 = call i64 asm sideeffect "", "={r9}"() nounwind
+ %a5 = call i64 asm sideeffect "", "={r10}"() nounwind
+ %a6 = call i64 asm sideeffect "", "={r11}"() nounwind
+ %a10 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+ %a11 = call <2 x double> asm sideeffect "", "={xmm3}"() nounwind
+ %a12 = call <2 x double> asm sideeffect "", "={xmm4}"() nounwind
+ %a13 = call <2 x double> asm sideeffect "", "={xmm5}"() nounwind
+ %a14 = call <2 x double> asm sideeffect "", "={xmm6}"() nounwind
+ %a15 = call <2 x double> asm sideeffect "", "={xmm7}"() nounwind
+ %a16 = call <2 x double> asm sideeffect "", "={xmm8}"() nounwind
+ %a17 = call <2 x double> asm sideeffect "", "={xmm9}"() nounwind
+ %a18 = call <2 x double> asm sideeffect "", "={xmm10}"() nounwind
+ %a19 = call <2 x double> asm sideeffect "", "={xmm11}"() nounwind
+ %a20 = call <2 x double> asm sideeffect "", "={xmm12}"() nounwind
+ %a21 = call <2 x double> asm sideeffect "", "={xmm13}"() nounwind
+ %a22 = call <2 x double> asm sideeffect "", "={xmm14}"() nounwind
+ %a23 = call <2 x double> asm sideeffect "", "={xmm15}"() nounwind
+ call preserve_allcc void @bar(i64 1, i64 2, double 3.0, double 4.0)
+ call void asm sideeffect "", "{rax},{rcx},{rdx},{r8},{r9},{r10},{r11},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},{xmm8},{xmm9},{xmm10},{xmm11},{xmm12},{xmm13},{xmm14},{xmm15}"(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, <2 x double> %a10, <2 x double> %a11, <2 x double> %a12, <2 x double> %a13, <2 x double> %a14, <2 x double> %a15, <2 x double> %a16, <2 x double> %a17, <2 x double> %a18, <2 x double> %a19, <2 x double> %a20, <2 x double> %a21, <2 x double> %a22, <2 x double> %a23)
+ ret void
+}
diff --git a/test/CodeGen/X86/preserve_mostcc64.ll b/test/CodeGen/X86/preserve_mostcc64.ll
new file mode 100644
index 0000000..4ee293e
--- /dev/null
+++ b/test/CodeGen/X86/preserve_mostcc64.ll
@@ -0,0 +1,86 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck --check-prefix=SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck --check-prefix=AVX %s
+
+; Every GPR should be saved - except r11
+define preserve_mostcc void @preserve_mostcc1() nounwind {
+entry:
+;SSE-LABEL: preserve_mostcc1
+;SSE: pushq %r10
+;SSE-NEXT: pushq %r9
+;SSE-NEXT: pushq %r8
+;SSE-NEXT: pushq %rdi
+;SSE-NEXT: pushq %rsi
+;SSE-NEXT: pushq %rdx
+;SSE-NEXT: pushq %rcx
+;SSE-NEXT: pushq %rax
+;SSE-NEXT: pushq %rbp
+;SSE-NEXT: pushq %r15
+;SSE-NEXT: pushq %r14
+;SSE-NEXT: pushq %r13
+;SSE-NEXT: pushq %r12
+;SSE-NEXT: pushq %rbx
+;AVX-LABEL: preserve_mostcc1
+;AVX: pushq %r10
+;AVX-NEXT: pushq %r9
+;AVX-NEXT: pushq %r8
+;AVX-NEXT: pushq %rdi
+;AVX-NEXT: pushq %rsi
+;AVX-NEXT: pushq %rdx
+;AVX-NEXT: pushq %rcx
+;AVX-NEXT: pushq %rax
+;AVX-NEXT: pushq %rbp
+;AVX-NEXT: pushq %r15
+;AVX-NEXT: pushq %r14
+;AVX-NEXT: pushq %r13
+;AVX-NEXT: pushq %r12
+;AVX-NEXT: pushq %rbx
+ call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{rbp},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"()
+ ret void
+}
+
+; Make sure R11 and XMMs are saved before the call
+declare preserve_mostcc void @foo(i64, i64, double, double)
+define void @preserve_mostcc2() nounwind {
+entry:
+;SSE-LABEL: preserve_mostcc2
+;SSE: movq %r11, [[REG:%[a-z0-9]+]]
+;SSE: movaps %xmm2
+;SSE: movaps %xmm3
+;SSE: movaps %xmm4
+;SSE: movaps %xmm5
+;SSE: movaps %xmm6
+;SSE: movaps %xmm7
+;SSE: movaps %xmm8
+;SSE: movaps %xmm9
+;SSE: movaps %xmm10
+;SSE: movaps %xmm11
+;SSE: movaps %xmm12
+;SSE: movaps %xmm13
+;SSE: movaps %xmm14
+;SSE: movaps %xmm15
+;SSE: movq [[REG]], %r11
+ %a0 = call i64 asm sideeffect "", "={rax}"() nounwind
+ %a1 = call i64 asm sideeffect "", "={rcx}"() nounwind
+ %a2 = call i64 asm sideeffect "", "={rdx}"() nounwind
+ %a3 = call i64 asm sideeffect "", "={r8}"() nounwind
+ %a4 = call i64 asm sideeffect "", "={r9}"() nounwind
+ %a5 = call i64 asm sideeffect "", "={r10}"() nounwind
+ %a6 = call i64 asm sideeffect "", "={r11}"() nounwind
+ %a10 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+ %a11 = call <2 x double> asm sideeffect "", "={xmm3}"() nounwind
+ %a12 = call <2 x double> asm sideeffect "", "={xmm4}"() nounwind
+ %a13 = call <2 x double> asm sideeffect "", "={xmm5}"() nounwind
+ %a14 = call <2 x double> asm sideeffect "", "={xmm6}"() nounwind
+ %a15 = call <2 x double> asm sideeffect "", "={xmm7}"() nounwind
+ %a16 = call <2 x double> asm sideeffect "", "={xmm8}"() nounwind
+ %a17 = call <2 x double> asm sideeffect "", "={xmm9}"() nounwind
+ %a18 = call <2 x double> asm sideeffect "", "={xmm10}"() nounwind
+ %a19 = call <2 x double> asm sideeffect "", "={xmm11}"() nounwind
+ %a20 = call <2 x double> asm sideeffect "", "={xmm12}"() nounwind
+ %a21 = call <2 x double> asm sideeffect "", "={xmm13}"() nounwind
+ %a22 = call <2 x double> asm sideeffect "", "={xmm14}"() nounwind
+ %a23 = call <2 x double> asm sideeffect "", "={xmm15}"() nounwind
+ call preserve_mostcc void @foo(i64 1, i64 2, double 3.0, double 4.0)
+ call void asm sideeffect "", "{rax},{rcx},{rdx},{r8},{r9},{r10},{r11},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},{xmm8},{xmm9},{xmm10},{xmm11},{xmm12},{xmm13},{xmm14},{xmm15}"(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, <2 x double> %a10, <2 x double> %a11, <2 x double> %a12, <2 x double> %a13, <2 x double> %a14, <2 x double> %a15, <2 x double> %a16, <2 x double> %a17, <2 x double> %a18, <2 x double> %a19, <2 x double> %a20, <2 x double> %a21, <2 x double> %a22, <2 x double> %a23)
+ ret void
+}
diff --git a/test/CodeGen/X86/private-2.ll b/test/CodeGen/X86/private-2.ll
index 4413cee..cf2d741 100644
--- a/test/CodeGen/X86/private-2.ll
+++ b/test/CodeGen/X86/private-2.ll
@@ -2,7 +2,7 @@
; Quote should be outside of private prefix.
; rdar://6855766x
-; CHECK: L__ZZ20
+; CHECK: "l__ZZ20-[Example1 whatever]E4C.91"
%struct.A = type { i32*, i32 }
@"_ZZ20-[Example1 whatever]E4C.91" = private constant %struct.A { i32* null, i32 1 } ; <%struct.A*> [#uses=1]
diff --git a/test/CodeGen/X86/ragreedy-bug.ll b/test/CodeGen/X86/ragreedy-bug.ll
new file mode 100644
index 0000000..df9b41d
--- /dev/null
+++ b/test/CodeGen/X86/ragreedy-bug.ll
@@ -0,0 +1,292 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
+
+; This test case is reduced from the 197.parser prune_match function.
+; We make sure register copies are not generated on isupper.exit blocks.
+
+; CHECK: isupper.exit
+; CHECK-NEXT: in Loop
+; CHECK-NEXT: testl
+; CHECK-NEXT: jne
+; CHECK: isupper.exit
+; CHECK-NEXT: in Loop
+; CHECK-NEXT: testl
+; CHECK-NEXT: je
+; CHECK: maskrune
+; CHECK: maskrune
+
+%struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* }
+%struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }
+%struct._RuneLocale = type { [8 x i8], [32 x i8], i32 (i8*, i64, i8**)*, i32 (i32, i8*, i64, i8**)*, i32, [256 x i32], [256 x i32], [256 x i32], %struct._RuneRange, %struct._RuneRange, %struct._RuneRange, i8*, i32, i32, %struct._RuneCharClass* }
+%struct._RuneRange = type { i32, %struct._RuneEntry* }
+%struct._RuneEntry = type { i32, i32, i32, i32* }
+%struct._RuneCharClass = type { [14 x i8], i32 }
+%struct.Exp_struct = type { i8, i8, i8, i8, %union.anon }
+%union.anon = type { %struct.E_list_struct* }
+%struct.E_list_struct = type { %struct.E_list_struct*, %struct.Exp_struct* }
+%struct.domain_struct = type { i8*, i32, %struct.List_o_links_struct*, i32, i32, %struct.d_tree_leaf_struct*, %struct.domain_struct* }
+%struct.d_tree_leaf_struct = type { %struct.domain_struct*, i32, %struct.d_tree_leaf_struct* }
+@_DefaultRuneLocale = external global %struct._RuneLocale
+declare i32 @__maskrune(i32, i64) #7
+define fastcc i32 @prune_match(%struct.Connector_struct* nocapture readonly %a, %struct.Connector_struct* nocapture readonly %b) #9 {
+entry:
+ %label56 = bitcast %struct.Connector_struct* %a to i16*
+ %0 = load i16* %label56, align 2
+ %label157 = bitcast %struct.Connector_struct* %b to i16*
+ %1 = load i16* %label157, align 2
+ %cmp = icmp eq i16 %0, %1
+ br i1 %cmp, label %if.end, label %return, !prof !988
+if.end:
+ %priority = getelementptr inbounds %struct.Connector_struct* %a, i64 0, i32 2
+ %2 = load i8* %priority, align 1
+ %priority5 = getelementptr inbounds %struct.Connector_struct* %b, i64 0, i32 2
+ %3 = load i8* %priority5, align 1
+ %string = getelementptr inbounds %struct.Connector_struct* %a, i64 0, i32 5
+ %4 = load i8** %string, align 8
+ %string7 = getelementptr inbounds %struct.Connector_struct* %b, i64 0, i32 5
+ %5 = load i8** %string7, align 8
+ br label %while.cond
+while.cond:
+ %lsr.iv27 = phi i64 [ %lsr.iv.next28, %if.end17 ], [ 0, %if.end ]
+ %scevgep55 = getelementptr i8* %4, i64 %lsr.iv27
+ %6 = load i8* %scevgep55, align 1
+ %idxprom.i.i = sext i8 %6 to i64
+ %isascii.i.i224 = icmp sgt i8 %6, -1
+ br i1 %isascii.i.i224, label %cond.true.i.i, label %cond.false.i.i, !prof !181
+cond.true.i.i:
+ %arrayidx.i.i = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i64 0, i32 5, i64 %idxprom.i.i
+ %7 = load i32* %arrayidx.i.i, align 4
+ %and.i.i = and i32 %7, 32768
+ br label %isupper.exit
+cond.false.i.i:
+ %8 = trunc i64 %idxprom.i.i to i8
+ %conv8 = sext i8 %8 to i32
+ %call3.i.i = tail call i32 @__maskrune(i32 %conv8, i64 32768) #3
+ br label %isupper.exit
+isupper.exit:
+ %tobool1.sink.i.in.i = phi i32 [ %and.i.i, %cond.true.i.i ], [ %call3.i.i, %cond.false.i.i ]
+ %tobool1.sink.i.i = icmp eq i32 %tobool1.sink.i.in.i, 0
+ br i1 %tobool1.sink.i.i, label %lor.rhs, label %while.body, !prof !989
+lor.rhs:
+ %sunkaddr = ptrtoint i8* %5 to i64
+ %sunkaddr58 = add i64 %sunkaddr, %lsr.iv27
+ %sunkaddr59 = inttoptr i64 %sunkaddr58 to i8*
+ %9 = load i8* %sunkaddr59, align 1
+ %idxprom.i.i214 = sext i8 %9 to i64
+ %isascii.i.i213225 = icmp sgt i8 %9, -1
+ br i1 %isascii.i.i213225, label %cond.true.i.i217, label %cond.false.i.i219, !prof !181
+cond.true.i.i217:
+ %arrayidx.i.i215 = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i64 0, i32 5, i64 %idxprom.i.i214
+ %10 = load i32* %arrayidx.i.i215, align 4
+ %and.i.i216 = and i32 %10, 32768
+ br label %isupper.exit223
+cond.false.i.i219:
+ %11 = trunc i64 %idxprom.i.i214 to i8
+ %conv9 = sext i8 %11 to i32
+ %call3.i.i218 = tail call i32 @__maskrune(i32 %conv9, i64 32768) #3
+ br label %isupper.exit223
+isupper.exit223:
+ %tobool1.sink.i.in.i220 = phi i32 [ %and.i.i216, %cond.true.i.i217 ], [ %call3.i.i218, %cond.false.i.i219 ]
+ %tobool1.sink.i.i221 = icmp eq i32 %tobool1.sink.i.in.i220, 0
+ br i1 %tobool1.sink.i.i221, label %while.end, label %while.body, !prof !990
+while.body:
+ %sunkaddr60 = ptrtoint i8* %4 to i64
+ %sunkaddr61 = add i64 %sunkaddr60, %lsr.iv27
+ %sunkaddr62 = inttoptr i64 %sunkaddr61 to i8*
+ %12 = load i8* %sunkaddr62, align 1
+ %sunkaddr63 = ptrtoint i8* %5 to i64
+ %sunkaddr64 = add i64 %sunkaddr63, %lsr.iv27
+ %sunkaddr65 = inttoptr i64 %sunkaddr64 to i8*
+ %13 = load i8* %sunkaddr65, align 1
+ %cmp14 = icmp eq i8 %12, %13
+ br i1 %cmp14, label %if.end17, label %return, !prof !991
+if.end17:
+ %lsr.iv.next28 = add i64 %lsr.iv27, 1
+ br label %while.cond
+while.end:
+ %14 = or i8 %3, %2
+ %15 = icmp eq i8 %14, 0
+ br i1 %15, label %if.then23, label %if.else88, !prof !992
+if.then23:
+ %sunkaddr66 = ptrtoint %struct.Connector_struct* %a to i64
+ %sunkaddr67 = add i64 %sunkaddr66, 16
+ %sunkaddr68 = inttoptr i64 %sunkaddr67 to i8**
+ %16 = load i8** %sunkaddr68, align 8
+ %17 = load i8* %16, align 1
+ %cmp26 = icmp eq i8 %17, 83
+ %sunkaddr69 = ptrtoint i8* %4 to i64
+ %sunkaddr70 = add i64 %sunkaddr69, %lsr.iv27
+ %sunkaddr71 = inttoptr i64 %sunkaddr70 to i8*
+ %18 = load i8* %sunkaddr71, align 1
+ br i1 %cmp26, label %land.lhs.true28, label %while.cond59.preheader, !prof !993
+land.lhs.true28:
+ switch i8 %18, label %land.rhs.preheader [
+ i8 112, label %land.lhs.true35
+ i8 0, label %return
+ ], !prof !994
+land.lhs.true35:
+ %sunkaddr72 = ptrtoint i8* %5 to i64
+ %sunkaddr73 = add i64 %sunkaddr72, %lsr.iv27
+ %sunkaddr74 = inttoptr i64 %sunkaddr73 to i8*
+ %19 = load i8* %sunkaddr74, align 1
+ switch i8 %19, label %land.rhs.preheader [
+ i8 112, label %land.lhs.true43
+ ], !prof !995
+land.lhs.true43:
+ %20 = ptrtoint i8* %16 to i64
+ %21 = sub i64 0, %20
+ %scevgep52 = getelementptr i8* %4, i64 %21
+ %scevgep53 = getelementptr i8* %scevgep52, i64 %lsr.iv27
+ %scevgep54 = getelementptr i8* %scevgep53, i64 -1
+ %cmp45 = icmp eq i8* %scevgep54, null
+ br i1 %cmp45, label %return, label %lor.lhs.false47, !prof !996
+lor.lhs.false47:
+ %22 = ptrtoint i8* %16 to i64
+ %23 = sub i64 0, %22
+ %scevgep47 = getelementptr i8* %4, i64 %23
+ %scevgep48 = getelementptr i8* %scevgep47, i64 %lsr.iv27
+ %scevgep49 = getelementptr i8* %scevgep48, i64 -2
+ %cmp50 = icmp eq i8* %scevgep49, null
+ br i1 %cmp50, label %land.lhs.true52, label %while.cond59.preheader, !prof !997
+land.lhs.true52:
+ %sunkaddr75 = ptrtoint i8* %4 to i64
+ %sunkaddr76 = add i64 %sunkaddr75, %lsr.iv27
+ %sunkaddr77 = add i64 %sunkaddr76, -1
+ %sunkaddr78 = inttoptr i64 %sunkaddr77 to i8*
+ %24 = load i8* %sunkaddr78, align 1
+ %cmp55 = icmp eq i8 %24, 73
+ %cmp61233 = icmp eq i8 %18, 0
+ %or.cond265 = or i1 %cmp55, %cmp61233
+ br i1 %or.cond265, label %return, label %land.rhs.preheader, !prof !998
+while.cond59.preheader:
+ %cmp61233.old = icmp eq i8 %18, 0
+ br i1 %cmp61233.old, label %return, label %land.rhs.preheader, !prof !999
+land.rhs.preheader:
+ %scevgep33 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep43 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs
+land.rhs:
+ %lsr.iv = phi i64 [ 0, %land.rhs.preheader ], [ %lsr.iv.next, %if.then83 ]
+ %25 = phi i8 [ %27, %if.then83 ], [ %18, %land.rhs.preheader ]
+ %scevgep34 = getelementptr i8* %scevgep33, i64 %lsr.iv
+ %26 = load i8* %scevgep34, align 1
+ %cmp64 = icmp eq i8 %26, 0
+ br i1 %cmp64, label %return, label %while.body66, !prof !1000
+while.body66:
+ %cmp68 = icmp eq i8 %25, 42
+ %cmp72 = icmp eq i8 %26, 42
+ %or.cond = or i1 %cmp68, %cmp72
+ br i1 %or.cond, label %if.then83, label %lor.lhs.false74, !prof !1001
+lor.lhs.false74:
+ %cmp77 = icmp ne i8 %25, %26
+ %cmp81 = icmp eq i8 %25, 94
+ %or.cond208 = or i1 %cmp77, %cmp81
+ br i1 %or.cond208, label %return, label %if.then83, !prof !1002
+if.then83:
+ %scevgep44 = getelementptr i8* %scevgep43, i64 %lsr.iv
+ %scevgep45 = getelementptr i8* %scevgep44, i64 1
+ %27 = load i8* %scevgep45, align 1
+ %cmp61 = icmp eq i8 %27, 0
+ %lsr.iv.next = add i64 %lsr.iv, 1
+ br i1 %cmp61, label %return, label %land.rhs, !prof !999
+if.else88:
+ %cmp89 = icmp eq i8 %2, 1
+ %cmp92 = icmp eq i8 %3, 2
+ %or.cond159 = and i1 %cmp89, %cmp92
+ br i1 %or.cond159, label %while.cond95.preheader, label %if.else123, !prof !1003
+while.cond95.preheader:
+ %sunkaddr79 = ptrtoint i8* %4 to i64
+ %sunkaddr80 = add i64 %sunkaddr79, %lsr.iv27
+ %sunkaddr81 = inttoptr i64 %sunkaddr80 to i8*
+ %28 = load i8* %sunkaddr81, align 1
+ %cmp97238 = icmp eq i8 %28, 0
+ br i1 %cmp97238, label %return, label %land.rhs99.preheader, !prof !1004
+land.rhs99.preheader:
+ %scevgep31 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep40 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs99
+land.rhs99:
+ %lsr.iv17 = phi i64 [ 0, %land.rhs99.preheader ], [ %lsr.iv.next18, %if.then117 ]
+ %29 = phi i8 [ %31, %if.then117 ], [ %28, %land.rhs99.preheader ]
+ %scevgep32 = getelementptr i8* %scevgep31, i64 %lsr.iv17
+ %30 = load i8* %scevgep32, align 1
+ %cmp101 = icmp eq i8 %30, 0
+ br i1 %cmp101, label %return, label %while.body104, !prof !1005
+while.body104:
+ %cmp107 = icmp eq i8 %29, %30
+ %cmp111 = icmp eq i8 %29, 42
+ %or.cond209 = or i1 %cmp107, %cmp111
+ %cmp115 = icmp eq i8 %30, 94
+ %or.cond210 = or i1 %or.cond209, %cmp115
+ br i1 %or.cond210, label %if.then117, label %return, !prof !1006
+if.then117:
+ %scevgep41 = getelementptr i8* %scevgep40, i64 %lsr.iv17
+ %scevgep42 = getelementptr i8* %scevgep41, i64 1
+ %31 = load i8* %scevgep42, align 1
+ %cmp97 = icmp eq i8 %31, 0
+ %lsr.iv.next18 = add i64 %lsr.iv17, 1
+ br i1 %cmp97, label %return, label %land.rhs99, !prof !1004
+if.else123:
+ %cmp124 = icmp eq i8 %3, 1
+ %cmp127 = icmp eq i8 %2, 2
+ %or.cond160 = and i1 %cmp124, %cmp127
+ br i1 %or.cond160, label %while.cond130.preheader, label %return, !prof !1007
+while.cond130.preheader:
+ %sunkaddr82 = ptrtoint i8* %4 to i64
+ %sunkaddr83 = add i64 %sunkaddr82, %lsr.iv27
+ %sunkaddr84 = inttoptr i64 %sunkaddr83 to i8*
+ %32 = load i8* %sunkaddr84, align 1
+ %cmp132244 = icmp eq i8 %32, 0
+ br i1 %cmp132244, label %return, label %land.rhs134.preheader, !prof !1008
+land.rhs134.preheader:
+ %scevgep29 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep37 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs134
+land.rhs134:
+ %lsr.iv22 = phi i64 [ 0, %land.rhs134.preheader ], [ %lsr.iv.next23, %if.then152 ]
+ %33 = phi i8 [ %35, %if.then152 ], [ %32, %land.rhs134.preheader ]
+ %scevgep30 = getelementptr i8* %scevgep29, i64 %lsr.iv22
+ %34 = load i8* %scevgep30, align 1
+ %cmp136 = icmp eq i8 %34, 0
+ br i1 %cmp136, label %return, label %while.body139, !prof !1009
+while.body139:
+ %cmp142 = icmp eq i8 %33, %34
+ %cmp146 = icmp eq i8 %34, 42
+ %or.cond211 = or i1 %cmp142, %cmp146
+ %cmp150 = icmp eq i8 %33, 94
+ %or.cond212 = or i1 %or.cond211, %cmp150
+ br i1 %or.cond212, label %if.then152, label %return, !prof !1010
+if.then152:
+ %scevgep38 = getelementptr i8* %scevgep37, i64 %lsr.iv22
+ %scevgep39 = getelementptr i8* %scevgep38, i64 1
+ %35 = load i8* %scevgep39, align 1
+ %cmp132 = icmp eq i8 %35, 0
+ %lsr.iv.next23 = add i64 %lsr.iv22, 1
+ br i1 %cmp132, label %return, label %land.rhs134, !prof !1008
+return:
+ %retval.0 = phi i32 [ 0, %entry ], [ 1, %land.lhs.true52 ], [ 1, %land.lhs.true43 ], [ 0, %if.else123 ], [ 1, %while.cond59.preheader ], [ 1, %while.cond95.preheader ], [ 1, %while.cond130.preheader ], [ 1, %land.lhs.true28 ], [ 1, %if.then83 ], [ 0, %lor.lhs.false74 ], [ 1, %land.rhs ], [ 1, %if.then117 ], [ 0, %while.body104 ], [ 1, %land.rhs99 ], [ 1, %if.then152 ], [ 0, %while.body139 ], [ 1, %land.rhs134 ], [ 0, %while.body ]
+ ret i32 %retval.0
+}
+!181 = metadata !{metadata !"branch_weights", i32 662038, i32 1}
+!988 = metadata !{metadata !"branch_weights", i32 12091450, i32 1916}
+!989 = metadata !{metadata !"branch_weights", i32 7564670, i32 4526781}
+!990 = metadata !{metadata !"branch_weights", i32 7484958, i32 13283499}
+!991 = metadata !{metadata !"branch_weights", i32 8677007, i32 4606493}
+!992 = metadata !{metadata !"branch_weights", i32 -1172426948, i32 145094705}
+!993 = metadata !{metadata !"branch_weights", i32 1468914, i32 5683688}
+!994 = metadata !{metadata !"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616}
+!995 = metadata !{metadata !"branch_weights", i32 1853716452, i32 -444717951, i32 932776759}
+!996 = metadata !{metadata !"branch_weights", i32 1004870, i32 20259}
+!997 = metadata !{metadata !"branch_weights", i32 20071, i32 189}
+!998 = metadata !{metadata !"branch_weights", i32 -1020255939, i32 572177766}
+!999 = metadata !{metadata !"branch_weights", i32 2666513, i32 3466431}
+!1000 = metadata !{metadata !"branch_weights", i32 5117635, i32 1859780}
+!1001 = metadata !{metadata !"branch_weights", i32 354902465, i32 -1444604407}
+!1002 = metadata !{metadata !"branch_weights", i32 -1762419279, i32 1592770684}
+!1003 = metadata !{metadata !"branch_weights", i32 1435905930, i32 -1951930624}
+!1004 = metadata !{metadata !"branch_weights", i32 1, i32 504888}
+!1005 = metadata !{metadata !"branch_weights", i32 94662, i32 504888}
+!1006 = metadata !{metadata !"branch_weights", i32 -1897793104, i32 160196332}
+!1007 = metadata !{metadata !"branch_weights", i32 2074643678, i32 -29579071}
+!1008 = metadata !{metadata !"branch_weights", i32 1, i32 226163}
+!1009 = metadata !{metadata !"branch_weights", i32 58357, i32 226163}
+!1010 = metadata !{metadata !"branch_weights", i32 -2072848646, i32 92907517}
diff --git a/test/CodeGen/X86/ragreedy-hoist-spill.ll b/test/CodeGen/X86/ragreedy-hoist-spill.ll
new file mode 100644
index 0000000..c6b28f7
--- /dev/null
+++ b/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -0,0 +1,389 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
+
+; This test case is reduced from the 254.gap SyFgets function.
+; We make sure a spill is not hoisted to a hotter outer loop.
+
+%struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] }
+%struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 }
+%struct.TMP.4 = type opaque
+%struct.TMP.3 = type { i8*, i32 }
+
+@syBuf = external global [16 x %struct.TMP.1], align 16
+@syHistory = external global [8192 x i8], align 16
+@SyFgets.yank = external global [512 x i8], align 16
+@syCTRO = external global i32, align 4
+
+; CHECK-LABEL: SyFgets
+define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
+entry:
+ %sub.ptr.rhs.cast646 = ptrtoint i8* %line to i64
+ %old = alloca [512 x i8], align 16
+ %0 = getelementptr inbounds [512 x i8]* %old, i64 0, i64 0
+ switch i64 %fid, label %if.then [
+ i64 2, label %if.end
+ i64 0, label %if.end
+ ]
+
+if.then:
+ br label %cleanup
+
+if.end:
+ switch i64 undef, label %if.end25 [
+ i64 0, label %if.then4
+ i64 1, label %land.lhs.true14
+ ]
+
+if.then4:
+ br i1 undef, label %SyTime.exit, label %if.then.i
+
+if.then.i:
+ unreachable
+
+SyTime.exit:
+ br i1 undef, label %SyTime.exit2681, label %if.then.i2673
+
+if.then.i2673:
+ unreachable
+
+SyTime.exit2681:
+ br label %cleanup
+
+land.lhs.true14:
+ unreachable
+
+if.end25:
+ br i1 undef, label %SyTime.exit2720, label %if.then.i2712
+
+if.then.i2712:
+ unreachable
+
+SyTime.exit2720:
+ %add.ptr = getelementptr [512 x i8]* %old, i64 0, i64 512
+ %cmp293427 = icmp ult i8* %0, %add.ptr
+ br i1 %cmp293427, label %for.body.lr.ph, label %while.body.preheader
+
+for.body.lr.ph:
+ call void @llvm.memset.p0i8.i64(i8* undef, i8 32, i64 512, i32 16, i1 false)
+ br label %while.body.preheader
+
+while.body.preheader:
+ %add.ptr1603 = getelementptr [512 x i8]* null, i64 0, i64 512
+ %echo.i3101 = getelementptr [16 x %struct.TMP.1]* @syBuf, i64 0, i64 %fid, i32 1
+ %1 = xor i64 %sub.ptr.rhs.cast646, -1
+ br label %do.body
+
+do.body:
+ %ch2.0 = phi i32 [ 0, %while.body.preheader ], [ %ch.12.ch2.12, %do.body ]
+ %rep.0 = phi i32 [ 1, %while.body.preheader ], [ %rep.6, %do.body ]
+ store i32 0, i32* @syCTRO, align 4, !tbaa !1
+ %ch.0.ch2.0 = select i1 undef, i32 14, i32 %ch2.0
+ %ch2.2 = select i1 undef, i32 0, i32 %ch.0.ch2.0
+ %ch.2.ch2.2 = select i1 undef, i32 0, i32 %ch2.2
+ %ch2.4 = select i1 undef, i32 278, i32 %ch.2.ch2.2
+ %ch2.5 = select i1 undef, i32 0, i32 %ch2.4
+ %rep.2 = select i1 undef, i32 undef, i32 %rep.0
+ %ch.5.ch2.5 = select i1 undef, i32 undef, i32 %ch2.5
+ %ch2.7 = select i1 undef, i32 0, i32 %ch.5.ch2.5
+ %rep.3 = select i1 undef, i32 undef, i32 %rep.2
+ %ch.7.ch2.7 = select i1 false, i32 0, i32 %ch2.7
+ %mul98.rep.3 = select i1 false, i32 0, i32 %rep.3
+ %ch2.9 = select i1 undef, i32 undef, i32 %ch.7.ch2.7
+ %rep.5 = select i1 undef, i32 undef, i32 %mul98.rep.3
+ %ch2.10 = select i1 false, i32 undef, i32 %ch2.9
+ %rep.6 = select i1 false, i32 undef, i32 %rep.5
+ %isdigittmp = add i32 %ch2.10, -48
+ %isdigit = icmp ult i32 %isdigittmp, 10
+ %cmp119 = icmp eq i32 undef, 22
+ %or.cond1875 = and i1 %isdigit, %cmp119
+ %ch.10.ch2.10 = select i1 %or.cond1875, i32 undef, i32 %ch2.10
+ %.ch.10 = select i1 %or.cond1875, i32 0, i32 undef
+ %ch2.12 = select i1 undef, i32 %.ch.10, i32 %ch.10.ch2.10
+ %ch.12 = select i1 undef, i32 0, i32 %.ch.10
+ %ch.12.ch2.12 = select i1 false, i32 %ch.12, i32 %ch2.12
+ %.ch.12 = select i1 false, i32 0, i32 %ch.12
+ %cmp147 = icmp eq i32 %.ch.12, 0
+ br i1 %cmp147, label %do.body, label %do.end
+
+do.end:
+ %cmp164 = icmp eq i32 %ch.12.ch2.12, 21
+ %mul167 = shl i32 %rep.6, 2
+ %rep.8 = select i1 %cmp164, i32 %mul167, i32 %rep.6
+ %..ch.19 = select i1 false, i32 2, i32 0
+ br i1 undef, label %while.body200, label %while.end1465
+
+while.body200:
+ %dec3386.in = phi i32 [ %dec3386, %while.cond197.backedge ], [ %rep.8, %do.end ]
+ %oldc.13384 = phi i32 [ %oldc.1.be, %while.cond197.backedge ], [ 0, %do.end ]
+ %ch.213379 = phi i32 [ %last.1.be, %while.cond197.backedge ], [ %..ch.19, %do.end ]
+ %last.13371 = phi i32 [ %last.1.be, %while.cond197.backedge ], [ 0, %do.end ]
+ %dec3386 = add i32 %dec3386.in, -1
+ switch i32 %ch.213379, label %sw.default [
+ i32 1, label %while.cond201.preheader
+ i32 322, label %sw.bb206
+ i32 354, label %sw.bb206
+ i32 2, label %sw.bb243
+ i32 364, label %sw.bb1077
+ i32 326, label %sw.bb256
+ i32 358, label %sw.bb256
+ i32 341, label %sw.bb979
+ i32 323, label %while.cond1037.preheader
+ i32 373, label %sw.bb979
+ i32 4, label %if.then1477
+ i32 332, label %sw.bb1077
+ i32 11, label %for.cond357
+ i32 355, label %while.cond1037.preheader
+ i32 324, label %sw.bb474
+ i32 356, label %sw.bb474
+ i32 20, label %sw.bb566
+ i32 -1, label %while.cond197.backedge
+ i32 268, label %sw.bb1134
+ i32 16, label %while.cond635.preheader
+ i32 18, label %sw.bb956
+ i32 316, label %while.cond864
+ ]
+
+while.cond1037.preheader:
+ %cmp10393273 = icmp eq i8 undef, 0
+ br i1 %cmp10393273, label %if.end1070, label %land.rhs1041
+
+while.cond635.preheader:
+ br i1 undef, label %for.body643.us, label %while.cond661
+
+for.body643.us:
+ br label %for.body643.us
+
+while.cond201.preheader:
+ %umax = select i1 false, i64 undef, i64 %1
+ %2 = xor i64 %umax, -1
+ %3 = inttoptr i64 %2 to i8*
+ br label %while.cond197.backedge
+
+sw.bb206:
+ br label %while.cond197.backedge
+
+sw.bb243:
+ br label %while.cond197.backedge
+
+sw.bb256:
+ br label %while.cond197.backedge
+
+while.cond197.backedge:
+ %last.1.be = phi i32 [ %ch.213379, %sw.default ], [ -1, %while.body200 ], [ %ch.213379, %sw.bb1077 ], [ %ch.213379, %sw.bb979 ], [ 18, %sw.bb956 ], [ 20, %sw.bb566 ], [ %ch.213379, %for.end552 ], [ %ch.213379, %sw.bb256 ], [ 2, %sw.bb243 ], [ 1, %while.cond201.preheader ], [ 268, %for.cond1145.preheader ], [ %ch.213379, %sw.bb206 ]
+ %oldc.1.be = phi i32 [ %oldc.13384, %sw.default ], [ %oldc.13384, %while.body200 ], [ %oldc.13384, %sw.bb1077 ], [ %oldc.13384, %sw.bb979 ], [ %oldc.13384, %sw.bb956 ], [ %oldc.13384, %sw.bb566 ], [ %oldc.13384, %for.end552 ], [ %oldc.13384, %sw.bb256 ], [ %oldc.13384, %sw.bb243 ], [ %oldc.13384, %while.cond201.preheader ], [ 0, %for.cond1145.preheader ], [ %oldc.13384, %sw.bb206 ]
+ %cmp198 = icmp sgt i32 %dec3386, 0
+ br i1 %cmp198, label %while.body200, label %while.end1465
+
+for.cond357:
+ br label %for.cond357
+
+sw.bb474:
+ %cmp476 = icmp eq i8 undef, 0
+ br i1 %cmp476, label %if.end517, label %do.body479.preheader
+
+do.body479.preheader:
+ %cmp4833314 = icmp eq i8 undef, 0
+ br i1 %cmp4833314, label %if.end517, label %land.rhs485
+
+land.rhs485:
+ %incdec.ptr4803316 = phi i8* [ %incdec.ptr480, %do.body479.backedge.land.rhs485_crit_edge ], [ undef, %do.body479.preheader ]
+ %isascii.i.i27763151 = icmp sgt i8 undef, -1
+ br i1 %isascii.i.i27763151, label %cond.true.i.i2780, label %cond.false.i.i2782
+
+cond.true.i.i2780:
+ br i1 undef, label %land.lhs.true490, label %lor.rhs500
+
+cond.false.i.i2782:
+ unreachable
+
+land.lhs.true490:
+ br i1 false, label %lor.rhs500, label %do.body479.backedge
+
+lor.rhs500:
+ ; CHECK: lor.rhs500
+ ; Make sure that we don't hoist the spill to outer loops.
+ ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+ ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+ ; CHECK: callq {{.*}}maskrune
+ %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
+ br i1 undef, label %land.lhs.true504, label %do.body479.backedge
+
+land.lhs.true504:
+ br i1 undef, label %do.body479.backedge, label %if.end517
+
+do.body479.backedge:
+ %incdec.ptr480 = getelementptr i8* %incdec.ptr4803316, i64 1
+ %cmp483 = icmp eq i8 undef, 0
+ br i1 %cmp483, label %if.end517, label %do.body479.backedge.land.rhs485_crit_edge
+
+do.body479.backedge.land.rhs485_crit_edge:
+ br label %land.rhs485
+
+if.end517:
+ %q.4 = phi i8* [ undef, %sw.bb474 ], [ undef, %do.body479.preheader ], [ %incdec.ptr480, %do.body479.backedge ], [ %incdec.ptr4803316, %land.lhs.true504 ]
+ switch i32 %last.13371, label %if.then532 [
+ i32 383, label %for.cond534
+ i32 356, label %for.cond534
+ i32 324, label %for.cond534
+ i32 24, label %for.cond534
+ i32 11, label %for.cond534
+ ]
+
+if.then532:
+ store i8 0, i8* getelementptr inbounds ([512 x i8]* @SyFgets.yank, i64 0, i64 0), align 16, !tbaa !5
+ br label %for.cond534
+
+for.cond534:
+ %cmp536 = icmp eq i8 undef, 0
+ br i1 %cmp536, label %for.cond542.preheader, label %for.cond534
+
+for.cond542.preheader:
+ br i1 undef, label %for.body545, label %for.end552
+
+for.body545:
+ br i1 undef, label %for.end552, label %for.body545
+
+for.end552:
+ %s.2.lcssa = phi i8* [ undef, %for.cond542.preheader ], [ %q.4, %for.body545 ]
+ %sub.ptr.lhs.cast553 = ptrtoint i8* %s.2.lcssa to i64
+ %sub.ptr.sub555 = sub i64 %sub.ptr.lhs.cast553, 0
+ %arrayidx556 = getelementptr i8* null, i64 %sub.ptr.sub555
+ store i8 0, i8* %arrayidx556, align 1, !tbaa !5
+ br label %while.cond197.backedge
+
+sw.bb566:
+ br label %while.cond197.backedge
+
+while.cond661:
+ br label %while.cond661
+
+while.cond864:
+ br label %while.cond864
+
+sw.bb956:
+ br i1 undef, label %if.then959, label %while.cond197.backedge
+
+if.then959:
+ br label %while.cond962
+
+while.cond962:
+ br label %while.cond962
+
+sw.bb979:
+ br label %while.cond197.backedge
+
+land.rhs1041:
+ unreachable
+
+if.end1070:
+ br label %sw.bb1077
+
+sw.bb1077:
+ br label %while.cond197.backedge
+
+sw.bb1134:
+ br i1 false, label %for.body1139, label %for.cond1145.preheader
+
+for.cond1145.preheader:
+ br i1 %cmp293427, label %for.body1150.lr.ph, label %while.cond197.backedge
+
+for.body1150.lr.ph:
+ unreachable
+
+for.body1139:
+ unreachable
+
+sw.default:
+ br label %while.cond197.backedge
+
+while.end1465:
+ %oldc.1.lcssa = phi i32 [ 0, %do.end ], [ %oldc.1.be, %while.cond197.backedge ]
+ %ch.21.lcssa = phi i32 [ %..ch.19, %do.end ], [ %last.1.be, %while.cond197.backedge ]
+ switch i32 %ch.21.lcssa, label %for.cond1480.preheader [
+ i32 -1, label %if.then1477
+ i32 15, label %if.then1477
+ i32 13, label %if.then1477
+ i32 10, label %if.then1477
+ ]
+
+for.cond1480.preheader:
+ br i1 undef, label %for.body1606.lr.ph, label %for.end1609
+
+if.then1477:
+ %p.1.lcssa3539 = phi i8* [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ %line, %while.body200 ]
+ %call1.i3057 = call i64 @"\01_write"(i32 undef, i8* undef, i64 1)
+ %sub.ptr.lhs.cast1717 = ptrtoint i8* %p.1.lcssa3539 to i64
+ %sub.ptr.sub1719 = sub i64 %sub.ptr.lhs.cast1717, %sub.ptr.rhs.cast646
+ %idx.neg1727 = sub i64 0, %sub.ptr.sub1719
+ br label %for.body1723
+
+for.body1606.lr.ph:
+ br label %for.end1609
+
+for.end1609:
+ br i1 undef, label %for.cond1659.preheader, label %land.lhs.true1614
+
+land.lhs.true1614:
+ br label %for.cond1659.preheader
+
+for.cond1659.preheader:
+ %cmp16623414 = icmp ult i8* undef, %add.ptr1603
+ br i1 %cmp16623414, label %for.body1664.lr.ph, label %while.body1703.lr.ph
+
+for.body1664.lr.ph:
+ %cmp16773405 = icmp slt i64 undef, undef
+ br i1 %cmp16773405, label %while.body1679, label %while.cond1683.preheader
+
+while.body1703.lr.ph:
+ unreachable
+
+while.cond1683.preheader:
+ br i1 undef, label %while.body1691, label %while.end1693
+
+while.body1679:
+ %oldc.43406 = phi i32 [ %inc, %syEchoch.exit3070 ], [ %oldc.1.lcssa, %for.body1664.lr.ph ]
+ %4 = load %struct.TMP.2** %echo.i3101, align 8, !tbaa !6
+ %call.i3062 = call i32 @fileno(%struct.TMP.2* %4)
+ br i1 undef, label %if.then.i3069, label %syEchoch.exit3070
+
+if.then.i3069:
+ br label %syEchoch.exit3070
+
+syEchoch.exit3070:
+ %inc = add i32 %oldc.43406, 1
+ %conv1672 = sext i32 %inc to i64
+ %cmp1677 = icmp slt i64 %conv1672, undef
+ br i1 %cmp1677, label %while.body1679, label %while.cond1683.preheader
+
+while.body1691:
+ unreachable
+
+while.end1693:
+ unreachable
+
+for.body1723:
+ %q.303203 = phi i8* [ getelementptr inbounds ([8192 x i8]* @syHistory, i64 0, i64 8189), %if.then1477 ], [ %incdec.ptr1730, %for.body1723 ]
+ %add.ptr1728 = getelementptr i8* %q.303203, i64 %idx.neg1727
+ %5 = load i8* %add.ptr1728, align 1, !tbaa !5
+ %incdec.ptr1730 = getelementptr i8* %q.303203, i64 -1
+ br label %for.body1723
+
+cleanup:
+ ret i8* undef
+}
+
+declare i32 @fileno(%struct.TMP.2* nocapture)
+declare i64 @"\01_write"(i32, i8*, i64)
+declare i32 @__maskrune(i32, i64)
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 (trunk 204257)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !3, metadata !3, i64 0}
+!6 = metadata !{metadata !7, metadata !8, i64 8}
+!7 = metadata !{metadata !"", metadata !8, i64 0, metadata !8, i64 8, metadata !3, i64 16}
+!8 = metadata !{metadata !"any pointer", metadata !3, i64 0}
diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
new file mode 100644
index 0000000..f3669fb
--- /dev/null
+++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -0,0 +1,168 @@
+; RUN: llc -regalloc=greedy -relocation-model=pic < %s 2>&1 | FileCheck %s
+; Without the last chance recoloring, this test fails with:
+; "ran out of registers".
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx"
+
+@fp_dh_36985b17790d59a27994eaab5dcb00ee = external constant [499 x i32]
+@fp_dh_18716afa4a5354de0a302c8edb3b0ee1 = external global i32
+@fp_dh_20a33cdeefab8f4c8887e82766cb9dcb = external global i8*
+@fp_dh_9d93c897906e39883c58b034c8e786b2 = external global [5419648 x i8], align 16
+
+; Function Attrs: nounwind ssp
+; CHECK-NOT: ran out of registers during register allocation
+define void @fp_dh_f870bf31fd8ffe068450366e3f05389a(i8* %arg) #0 {
+bb:
+ indirectbr i8* undef, [label %bb85, label %bb206]
+
+bb85: ; preds = %bb222, %bb85, %bb
+ store i8* blockaddress(@fp_dh_f870bf31fd8ffe068450366e3f05389a, %bb206), i8** undef, align 4
+ indirectbr i8* undef, [label %bb439, label %bb85]
+
+bb206: ; preds = %bb
+ %tmp = getelementptr [499 x i32]* @fp_dh_36985b17790d59a27994eaab5dcb00ee, i32 0, i32 undef
+ %tmp207 = load i32* %tmp
+ %tmp208 = add i32 %tmp207, 1
+ %tmp209 = inttoptr i32 %tmp208 to i8*
+ indirectbr i8* %tmp209, [label %bb213]
+
+bb213: ; preds = %bb206
+ %tmp214 = load i32* @fp_dh_18716afa4a5354de0a302c8edb3b0ee1, align 4
+ %tmp215 = load i8** @fp_dh_20a33cdeefab8f4c8887e82766cb9dcb, align 4
+ %tmp216 = urem i32 -717428541, %tmp214
+ %tmp217 = getelementptr i8* %tmp215, i32 %tmp216
+ %tmp218 = bitcast i8* %tmp217 to i32*
+ %tmp219 = load i32* %tmp218, align 4
+ store i32 %tmp219, i32* undef, align 4
+ %tmp220 = select i1 false, i32 359373646, i32 1677237955
+ %tmp221 = add i32 %tmp220, 0
+ indirectbr i8* undef, [label %bb432, label %bb222]
+
+bb222: ; preds = %bb213
+ %tmp224 = load i32* undef, align 4
+ %tmp225 = load i32* undef, align 4
+ %tmp226 = xor i32 %tmp225, %tmp224
+ %tmp227 = shl i32 %tmp226, 1
+ %tmp228 = and i32 %tmp227, -2048880334
+ %tmp229 = sub i32 0, %tmp228
+ %tmp230 = add i32 0, %tmp229
+ %tmp231 = xor i32 %tmp230, 1059356227
+ %tmp232 = mul i32 %tmp231, 1603744721
+ %tmp233 = urem i32 %tmp232, 259
+ %tmp234 = getelementptr [259 x i8]* bitcast (i8* getelementptr inbounds ([5419648 x i8]* @fp_dh_9d93c897906e39883c58b034c8e786b2, i32 0, i32 2039075) to [259 x i8]*), i32 0, i32 %tmp233
+ %tmp235 = load i8* %tmp234, align 1
+ %tmp236 = add i32 %tmp233, 2
+ %tmp237 = getelementptr [264 x i8]* bitcast (i8* getelementptr inbounds ([5419648 x i8]* @fp_dh_9d93c897906e39883c58b034c8e786b2, i32 0, i32 3388166) to [264 x i8]*), i32 0, i32 %tmp236
+ %tmp238 = load i8* %tmp237, align 1
+ %tmp239 = getelementptr [265 x i8]* bitcast (i8* getelementptr inbounds ([5419648 x i8]* @fp_dh_9d93c897906e39883c58b034c8e786b2, i32 0, i32 1325165) to [265 x i8]*), i32 0, i32 0
+ %tmp240 = load i8* %tmp239, align 1
+ %tmp241 = add i32 %tmp233, 6
+ %tmp242 = trunc i32 %tmp241 to i8
+ %tmp243 = mul i8 %tmp242, -3
+ %tmp244 = add i8 %tmp243, 3
+ %tmp245 = mul i8 %tmp242, -6
+ %tmp246 = and i8 %tmp245, 6
+ %tmp247 = sub i8 0, %tmp246
+ %tmp248 = add i8 %tmp244, %tmp247
+ %tmp249 = load i8* undef, align 1
+ %tmp250 = xor i8 %tmp235, 17
+ %tmp251 = xor i8 %tmp250, %tmp238
+ %tmp252 = xor i8 %tmp251, %tmp240
+ %tmp253 = xor i8 %tmp252, %tmp249
+ %tmp254 = xor i8 %tmp253, %tmp248
+ %tmp255 = zext i8 %tmp254 to i16
+ %tmp256 = shl nuw i16 %tmp255, 8
+ %tmp257 = load i8* null, align 1
+ %tmp258 = load i32* @fp_dh_18716afa4a5354de0a302c8edb3b0ee1, align 4
+ %tmp259 = load i8** @fp_dh_20a33cdeefab8f4c8887e82766cb9dcb, align 4
+ %tmp260 = urem i32 -717428541, %tmp258
+ %tmp261 = getelementptr i8* %tmp259, i32 %tmp260
+ %tmp262 = bitcast i8* %tmp261 to i32*
+ %tmp263 = load i32* %tmp262, align 4
+ %tmp264 = xor i32 %tmp263, 0
+ %tmp265 = shl i32 %tmp264, 1
+ %tmp266 = and i32 %tmp265, -1312119832
+ %tmp267 = sub i32 0, %tmp266
+ %tmp268 = add i32 0, %tmp267
+ %tmp269 = xor i32 %tmp268, 623994670
+ %tmp270 = mul i32 %tmp269, 1603744721
+ %tmp271 = urem i32 %tmp270, 259
+ %tmp274 = add i32 %tmp271, 3
+ %tmp275 = getelementptr [265 x i8]* bitcast (i8* getelementptr inbounds ([5419648 x i8]* @fp_dh_9d93c897906e39883c58b034c8e786b2, i32 0, i32 1325165) to [265 x i8]*), i32 0, i32 %tmp274
+ %tmp276 = load i8* %tmp275, align 1
+ %tmp277 = add i32 %tmp271, 6
+ %tmp278 = trunc i32 %tmp277 to i8
+ %tmp279 = mul i8 %tmp278, -3
+ %tmp280 = add i8 %tmp279, 31
+ %tmp281 = add i8 %tmp280, 0
+ %tmp282 = xor i8 %tmp257, 13
+ %tmp283 = xor i8 %tmp282, 0
+ %tmp284 = xor i8 %tmp283, 0
+ %tmp285 = xor i8 %tmp284, %tmp276
+ %tmp286 = xor i8 %tmp285, %tmp281
+ %tmp287 = zext i8 %tmp286 to i16
+ %tmp288 = or i16 %tmp287, %tmp256
+ %tmp289 = xor i16 %tmp288, 14330
+ %tmp290 = add i16 0, %tmp289
+ %tmp291 = add i16 %tmp290, -14330
+ %tmp292 = zext i16 %tmp291 to i32
+ %tmp293 = add i16 %tmp290, -14330
+ %tmp294 = lshr i16 %tmp293, 12
+ %tmp295 = zext i16 %tmp294 to i32
+ %tmp296 = sub i32 0, %tmp295
+ %tmp297 = xor i32 %tmp296, 16
+ %tmp298 = add i32 0, %tmp297
+ %tmp299 = and i32 %tmp298, 31
+ %tmp300 = and i32 %tmp292, 30864
+ %tmp301 = shl i32 %tmp300, %tmp299
+ %tmp302 = xor i32 0, %tmp301
+ %tmp303 = add i32 0, %tmp302
+ %tmp304 = and i32 %tmp298, 31
+ %tmp305 = and i32 %tmp303, 25568
+ %tmp306 = lshr i32 %tmp305, %tmp304
+ %tmp307 = xor i32 0, %tmp306
+ %tmp308 = add i32 0, %tmp307
+ %tmp309 = trunc i32 %tmp308 to i16
+ %tmp310 = shl i16 %tmp309, 1
+ %tmp311 = and i16 %tmp310, -4648
+ %tmp312 = shl i16 %tmp309, 1
+ %tmp313 = and i16 %tmp312, 4646
+ %tmp314 = xor i16 %tmp311, 17700
+ %tmp315 = xor i16 %tmp313, 17700
+ %tmp316 = add i16 %tmp314, %tmp315
+ %tmp317 = and i16 %tmp314, %tmp315
+ %tmp318 = shl nuw i16 %tmp317, 1
+ %tmp319 = sub i16 0, %tmp318
+ %tmp320 = add i16 %tmp316, %tmp319
+ %tmp321 = and i16 %tmp320, 29906
+ %tmp322 = xor i16 %tmp309, 14953
+ %tmp323 = add i16 0, %tmp322
+ %tmp324 = sub i16 0, %tmp321
+ %tmp325 = xor i16 %tmp324, %tmp323
+ %tmp326 = add i16 0, %tmp325
+ %tmp327 = add i32 %tmp221, 1161362661
+ %tmp333 = icmp eq i16 %tmp326, 14953
+ %tmp334 = add i32 %tmp327, -1456704142
+ %tmp335 = zext i1 %tmp333 to i32
+ %tmp336 = add i32 %tmp334, %tmp335
+ %tmp337 = getelementptr [499 x i32]* @fp_dh_36985b17790d59a27994eaab5dcb00ee, i32 0, i32 %tmp336
+ %tmp338 = load i32* %tmp337
+ %tmp339 = add i32 %tmp338, 1
+ %tmp340 = inttoptr i32 %tmp339 to i8*
+ indirectbr i8* %tmp340, [label %bb85, label %bb439]
+
+bb432: ; preds = %bb432, %bb213
+ %tmp433 = phi i32 [ %tmp221, %bb213 ], [ %tmp433, %bb432 ]
+ %tmp434 = add i32 %tmp433, 1022523279
+ %tmp435 = getelementptr [499 x i32]* @fp_dh_36985b17790d59a27994eaab5dcb00ee, i32 0, i32 %tmp434
+ %tmp436 = load i32* %tmp435
+ %tmp437 = add i32 %tmp436, 1
+ %tmp438 = inttoptr i32 %tmp437 to i8*
+ indirectbr i8* %tmp438, [label %bb432]
+
+bb439: ; preds = %bb222, %bb85
+ ret void
+}
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/rot16.ll b/test/CodeGen/X86/rot16.ll
index 0293f4e..6d7c702 100644
--- a/test/CodeGen/X86/rot16.ll
+++ b/test/CodeGen/X86/rot16.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
define i16 @foo(i16 %x, i16 %y, i16 %z) nounwind readnone {
entry:
diff --git a/test/CodeGen/X86/rotate3.ll b/test/CodeGen/X86/rotate3.ll
new file mode 100644
index 0000000..b92f7c2
--- /dev/null
+++ b/test/CodeGen/X86/rotate3.ll
@@ -0,0 +1,76 @@
+; Check that (or (shl x, y), (srl x, (sub 32, y))) is folded into (rotl x, y)
+; and (or (shl x, (sub 32, y)), (srl x, y)) into (rotr x, y) even if the
+; argument is zero extended. Fix for PR16726.
+
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+define zeroext i8 @rolbyte(i32 %nBits_arg, i8 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i8 %x_arg to i32
+ %tmp3 = shl i32 %tmp1, %nBits_arg
+ %tmp8 = sub i32 8, %nBits_arg
+ %tmp10 = lshr i32 %tmp1, %tmp8
+ %tmp11 = or i32 %tmp3, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i8
+ ret i8 %tmp12
+}
+; CHECK: rolb %cl, %{{[a-z0-9]+}}
+
+
+define zeroext i8 @rorbyte(i32 %nBits_arg, i8 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i8 %x_arg to i32
+ %tmp3 = lshr i32 %tmp1, %nBits_arg
+ %tmp8 = sub i32 8, %nBits_arg
+ %tmp10 = shl i32 %tmp1, %tmp8
+ %tmp11 = or i32 %tmp3, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i8
+ ret i8 %tmp12
+}
+; CHECK: rorb %cl, %{{[a-z0-9]+}}
+
+define zeroext i16 @rolword(i32 %nBits_arg, i16 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i16 %x_arg to i32
+ %tmp3 = shl i32 %tmp1, %nBits_arg
+ %tmp8 = sub i32 16, %nBits_arg
+ %tmp10 = lshr i32 %tmp1, %tmp8
+ %tmp11 = or i32 %tmp3, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i16
+ ret i16 %tmp12
+}
+; CHECK: rolw %cl, %{{[a-z0-9]+}}
+
+define zeroext i16 @rorword(i32 %nBits_arg, i16 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i16 %x_arg to i32
+ %tmp3 = lshr i32 %tmp1, %nBits_arg
+ %tmp8 = sub i32 16, %nBits_arg
+ %tmp10 = shl i32 %tmp1, %tmp8
+ %tmp11 = or i32 %tmp3, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i16
+ ret i16 %tmp12
+}
+; CHECK: rorw %cl, %{{[a-z0-9]+}}
+
+define i64 @roldword(i64 %nBits_arg, i32 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i32 %x_arg to i64
+ %tmp3 = shl i64 %tmp1, %nBits_arg
+ %tmp8 = sub i64 32, %nBits_arg
+ %tmp10 = lshr i64 %tmp1, %tmp8
+ %tmp11 = or i64 %tmp3, %tmp10
+ ret i64 %tmp11
+}
+; CHECK: roll %cl, %{{[a-z0-9]+}}
+
+define zeroext i64 @rordword(i64 %nBits_arg, i32 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i32 %x_arg to i64
+ %tmp3 = lshr i64 %tmp1, %nBits_arg
+ %tmp8 = sub i64 32, %nBits_arg
+ %tmp10 = shl i64 %tmp1, %tmp8
+ %tmp11 = or i64 %tmp3, %tmp10
+ ret i64 %tmp11
+}
+; CHECK: rorl %cl, %{{[a-z0-9]+}}
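
A minimal C sketch of the zero-extended rotate idiom exercised by @rolbyte above (function and variable names are illustrative, not taken from the test):

    /* Rotate an 8-bit value left by n bits, 0 < n < 8.  The value is
       zero extended to 32 bits before shifting, matching the IR above;
       with the fold in place this should lower to a single rolb. */
    unsigned char rol8(unsigned char x, unsigned int n) {
        unsigned int wide = (unsigned int)x;   /* zero extension */
        return (unsigned char)((wide << n) | (wide >> (8 - n)));
    }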
diff --git a/test/CodeGen/X86/rotate4.ll b/test/CodeGen/X86/rotate4.ll
new file mode 100644
index 0000000..5372612
--- /dev/null
+++ b/test/CodeGen/X86/rotate4.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=generic | FileCheck %s
+
+; Check that we recognize this idiom for rotation too:
+; a << (b & (OpSize-1)) | a >> ((0 - b) & (OpSize-1))
+
+define i32 @rotate_left_32(i32 %a, i32 %b) {
+; CHECK-LABEL: rotate_left_32:
+; CHECK-NOT: and
+; CHECK: roll
+entry:
+ %and = and i32 %b, 31
+ %shl = shl i32 %a, %and
+ %0 = sub i32 0, %b
+ %and3 = and i32 %0, 31
+ %shr = lshr i32 %a, %and3
+ %or = or i32 %shl, %shr
+ ret i32 %or
+}
+
+define i32 @rotate_right_32(i32 %a, i32 %b) {
+; CHECK-LABEL: rotate_right_32:
+; CHECK-NOT: and
+; CHECK: rorl
+entry:
+ %and = and i32 %b, 31
+ %shl = lshr i32 %a, %and
+ %0 = sub i32 0, %b
+ %and3 = and i32 %0, 31
+ %shr = shl i32 %a, %and3
+ %or = or i32 %shl, %shr
+ ret i32 %or
+}
+
+define i64 @rotate_left_64(i64 %a, i64 %b) {
+; CHECK-LABEL: rotate_left_64:
+; CHECK-NOT: and
+; CHECK: rolq
+entry:
+ %and = and i64 %b, 63
+ %shl = shl i64 %a, %and
+ %0 = sub i64 0, %b
+ %and3 = and i64 %0, 63
+ %shr = lshr i64 %a, %and3
+ %or = or i64 %shl, %shr
+ ret i64 %or
+}
+
+define i64 @rotate_right_64(i64 %a, i64 %b) {
+; CHECK-LABEL: rotate_right_64:
+; CHECK-NOT: and
+; CHECK: rorq
+entry:
+ %and = and i64 %b, 63
+ %shl = lshr i64 %a, %and
+ %0 = sub i64 0, %b
+ %and3 = and i64 %0, 63
+ %shr = shl i64 %a, %and3
+ %or = or i64 %shl, %shr
+ ret i64 %or
+}
+
+; Also check mem operand.
+
+define void @rotate_left_m32(i32 *%pa, i32 %b) {
+; CHECK-LABEL: rotate_left_m32:
+; CHECK-NOT: and
+; CHECK: roll
+; no store:
+; CHECK-NOT: mov
+entry:
+ %a = load i32* %pa, align 16
+ %and = and i32 %b, 31
+ %shl = shl i32 %a, %and
+ %0 = sub i32 0, %b
+ %and3 = and i32 %0, 31
+ %shr = lshr i32 %a, %and3
+ %or = or i32 %shl, %shr
+ store i32 %or, i32* %pa, align 32
+ ret void
+}
+
+define void @rotate_right_m32(i32 *%pa, i32 %b) {
+; CHECK-LABEL: rotate_right_m32:
+; CHECK-NOT: and
+; CHECK: rorl
+; no store:
+; CHECK-NOT: mov
+entry:
+ %a = load i32* %pa, align 16
+ %and = and i32 %b, 31
+ %shl = lshr i32 %a, %and
+ %0 = sub i32 0, %b
+ %and3 = and i32 %0, 31
+ %shr = shl i32 %a, %and3
+ %or = or i32 %shl, %shr
+ store i32 %or, i32* %pa, align 32
+ ret void
+}
+
+define void @rotate_left_m64(i64 *%pa, i64 %b) {
+; CHECK-LABEL: rotate_left_m64:
+; CHECK-NOT: and
+; CHECK: rolq
+; no store:
+; CHECK-NOT: mov
+entry:
+ %a = load i64* %pa, align 16
+ %and = and i64 %b, 63
+ %shl = shl i64 %a, %and
+ %0 = sub i64 0, %b
+ %and3 = and i64 %0, 63
+ %shr = lshr i64 %a, %and3
+ %or = or i64 %shl, %shr
+ store i64 %or, i64* %pa, align 64
+ ret void
+}
+
+define void @rotate_right_m64(i64 *%pa, i64 %b) {
+; CHECK-LABEL: rotate_right_m64:
+; CHECK-NOT: and
+; CHECK: rorq
+; no store:
+; CHECK-NOT: mov
+entry:
+ %a = load i64* %pa, align 16
+ %and = and i64 %b, 63
+ %shl = lshr i64 %a, %and
+ %0 = sub i64 0, %b
+ %and3 = and i64 %0, 63
+ %shr = shl i64 %a, %and3
+ %or = or i64 %shl, %shr
+ store i64 %or, i64* %pa, align 64
+ ret void
+}
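
The masked form of the idiom from the comment at the top of this file, as it would typically be written in C (a sketch assuming a 32-bit rotate width; names are illustrative):

    /* Well defined for every b, including 0, because both shift amounts
       are masked to 0..31; expected to lower to a single roll with no
       explicit and instruction. */
    unsigned int rotl32(unsigned int a, unsigned int b) {
        return (a << (b & 31)) | (a >> ((0u - b) & 31));
    }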
diff --git a/test/CodeGen/X86/saddo-redundant-add.ll b/test/CodeGen/X86/saddo-redundant-add.ll
new file mode 100644
index 0000000..c56c686
--- /dev/null
+++ b/test/CodeGen/X86/saddo-redundant-add.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define void @redundant_add(i64 %n) {
+; Check that we don't create two additions for the sadd.with.overflow.
+; CHECK-LABEL: redundant_add
+; CHECK-NOT: leaq
+; CHECK-NOT: addq
+; CHECK: incq
+; CHECK-NEXT: jno
+entry:
+ br label %exit_check
+
+exit_check:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+ %c = icmp slt i64 %i, %n
+ br i1 %c, label %loop, label %exit
+
+loop:
+ %i.o = tail call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %i, i64 1)
+ %i.next = extractvalue { i64, i1 } %i.o, 0
+ %o = extractvalue { i64, i1 } %i.o, 1
+ br i1 %o, label %overflow, label %exit_check
+
+exit:
+ ret void
+
+overflow:
+ tail call void @llvm.trap()
+ unreachable
+}
+
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
+declare void @llvm.trap()
+
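
Roughly the C-level loop behind the IR above, assuming the Clang/GCC checked-arithmetic builtin __builtin_add_overflow is available (a sketch, not the source the test was reduced from):

    #include <stdlib.h>

    void redundant_add(long n) {
        for (long i = 0; i < n; ) {
            long next;
            /* The add feeding the overflow check should be emitted once
               (incq + jno), not once for the check and again for the
               incremented value. */
            if (__builtin_add_overflow(i, 1L, &next))
                abort();                 /* plays the role of llvm.trap */
            i = next;
        }
    }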
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 08a98ef..c02152b 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -4,6 +4,7 @@
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-FreeBSD
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
; We used to crash with filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -filetype=obj
@@ -12,16 +13,14 @@
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -segmented-stacks -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -segmented-stacks -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -segmented-stacks -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks -filetype=obj
; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris -segmented-stacks 2> %t.log
; RUN: FileCheck %s -input-file=%t.log -check-prefix=X64-Solaris
-; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks 2> %t.log
-; RUN: FileCheck %s -input-file=%t.log -check-prefix=X64-MinGW
; RUN: not llc < %s -mcpu=generic -mtriple=i686-freebsd -segmented-stacks 2> %t.log
; RUN: FileCheck %s -input-file=%t.log -check-prefix=X32-FreeBSD
; X64-Solaris: Segmented stacks not supported on this platform
-; X64-MinGW: Segmented stacks not supported on this platform
; X32-FreeBSD: Segmented stacks not supported on FreeBSD i386
; Just to prevent the alloca from being optimized away
@@ -83,6 +82,16 @@ define void @test_basic() {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_basic:
+
+; X64-MinGW: cmpq %gs:40, %rsp
+; X64-MinGW-NEXT: ja .LBB0_2
+
+; X64-MinGW: movabsq $72, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+
; X64-FreeBSD-LABEL: test_basic:
; X64-FreeBSD: cmpq %fs:24, %rsp
@@ -145,6 +154,17 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_nested:
+; X64-MinGW: cmpq %gs:40, %rsp
+; X64-MinGW-NEXT: ja .LBB1_2
+
+; X64-MinGW: movq %r10, %rax
+; X64-MinGW-NEXT: movabsq $0, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+; X64-MinGW-NEXT: movq %rax, %r10
+
; X64-FreeBSD: cmpq %fs:24, %rsp
; X64-FreeBSD-NEXT: ja .LBB1_2
@@ -208,6 +228,16 @@ define void @test_large() {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_large:
+; X64-MinGW: leaq -40040(%rsp), %r11
+; X64-MinGW-NEXT: cmpq %gs:40, %r11
+; X64-MinGW-NEXT: ja .LBB2_2
+
+; X64-MinGW: movabsq $40040, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+
; X64-FreeBSD: leaq -40008(%rsp), %r11
; X64-FreeBSD-NEXT: cmpq %fs:24, %r11
; X64-FreeBSD-NEXT: ja .LBB2_2
@@ -275,6 +305,16 @@ define fastcc void @test_fastcc() {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_fastcc:
+
+; X64-MinGW: cmpq %gs:40, %rsp
+; X64-MinGW-NEXT: ja .LBB3_2
+
+; X64-MinGW: movabsq $72, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+
; X64-FreeBSD-LABEL: test_fastcc:
; X64-FreeBSD: cmpq %fs:24, %rsp
@@ -348,6 +388,17 @@ define fastcc void @test_fastcc_large() {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_fastcc_large:
+
+; X64-MinGW: leaq -40040(%rsp), %r11
+; X64-MinGW-NEXT: cmpq %gs:40, %r11
+; X64-MinGW-NEXT: ja .LBB4_2
+
+; X64-MinGW: movabsq $40040, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+
; X64-FreeBSD-LABEL: test_fastcc_large:
; X64-FreeBSD: leaq -40008(%rsp), %r11
diff --git a/test/CodeGen/X86/setjmp-spills.ll b/test/CodeGen/X86/setjmp-spills.ll
new file mode 100644
index 0000000..c35caae
--- /dev/null
+++ b/test/CodeGen/X86/setjmp-spills.ll
@@ -0,0 +1,141 @@
+; RUN: llc < %s -mtriple=i386-linux | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
+
+declare i32 @get_val()
+declare void @use_val(i32)
+declare i1 @setjmp()
+declare void @longjmp()
+declare void @personality()
+
+
+; Test that llc avoids reusing spill slots in functions that call
+; setjmp(), whether they use "call" or "invoke" for calling setjmp()
+; (PR18244).
+
+define void @setjmp_caller() {
+; X86-32-LABEL: setjmp_caller:
+; X86-64-LABEL: setjmp_caller:
+; This code keeps enough variables live across the setjmp() call that
+; they don't all fit in registers and the compiler will allocate a
+; spill slot.
+ %a1 = call i32 @get_val()
+ %a2 = call i32 @get_val()
+ %a3 = call i32 @get_val()
+ %a4 = call i32 @get_val()
+ %a5 = call i32 @get_val()
+ %a6 = call i32 @get_val()
+ %a7 = call i32 @get_val()
+ %a8 = call i32 @get_val()
+; X86-32: movl %eax, [[SPILL_SLOT:[0-9]+]](%esp)
+; X86-32: calll get_val
+; X86-64: movl %eax, [[SPILL_SLOT:[0-9]+]](%rsp)
+; X86-64: callq get_val
+
+ %setjmp_result = call i1 @setjmp() returns_twice
+ br i1 %setjmp_result, label %second, label %first
+; X86-32: calll setjmp
+; X86-64: callq setjmp
+
+; Again, keep enough variables live that they need spill slots. Since
+; this function calls a returns_twice function (setjmp()), the
+; compiler should not reuse those spill slots: longjmp() can return to
+; a point where the first set of spill slots is still live.
+first:
+ %b1 = call i32 @get_val()
+ %b2 = call i32 @get_val()
+ %b3 = call i32 @get_val()
+ %b4 = call i32 @get_val()
+ %b5 = call i32 @get_val()
+ %b6 = call i32 @get_val()
+ %b7 = call i32 @get_val()
+ %b8 = call i32 @get_val()
+ call void @use_val(i32 %b1)
+ call void @use_val(i32 %b2)
+ call void @use_val(i32 %b3)
+ call void @use_val(i32 %b4)
+ call void @use_val(i32 %b5)
+ call void @use_val(i32 %b6)
+ call void @use_val(i32 %b7)
+ call void @use_val(i32 %b8)
+ call void @longjmp()
+ unreachable
+; X86-32-NOT: movl {{.*}}, [[SPILL_SLOT]](%esp)
+; X86-64-NOT: movl {{.*}}, [[SPILL_SLOT]](%rsp)
+
+second:
+ call void @use_val(i32 %a1)
+ call void @use_val(i32 %a2)
+ call void @use_val(i32 %a3)
+ call void @use_val(i32 %a4)
+ call void @use_val(i32 %a5)
+ call void @use_val(i32 %a6)
+ call void @use_val(i32 %a7)
+ call void @use_val(i32 %a8)
+ ret void
+}
+
+
+; This is the same as above, but using "invoke" rather than "call" to
+; call setjmp().
+
+define void @setjmp_invoker() {
+; X86-32-LABEL: setjmp_invoker:
+; X86-64-LABEL: setjmp_invoker:
+ %a1 = call i32 @get_val()
+ %a2 = call i32 @get_val()
+ %a3 = call i32 @get_val()
+ %a4 = call i32 @get_val()
+ %a5 = call i32 @get_val()
+ %a6 = call i32 @get_val()
+ %a7 = call i32 @get_val()
+ %a8 = call i32 @get_val()
+; X86-32: movl %eax, [[SPILL_SLOT:[0-9]+]](%esp)
+; X86-32: calll get_val
+; X86-64: movl %eax, [[SPILL_SLOT:[0-9]+]](%rsp)
+; X86-64: callq get_val
+
+ %setjmp_result = invoke i1 @setjmp() returns_twice
+ to label %cont unwind label %lpad
+; X86-32: calll setjmp
+; X86-64: callq setjmp
+
+cont:
+ br i1 %setjmp_result, label %second, label %first
+
+lpad:
+ %lp = landingpad { i8*, i32 } personality void ()* @personality cleanup
+ unreachable
+
+first:
+ %b1 = call i32 @get_val()
+ %b2 = call i32 @get_val()
+ %b3 = call i32 @get_val()
+ %b4 = call i32 @get_val()
+ %b5 = call i32 @get_val()
+ %b6 = call i32 @get_val()
+ %b7 = call i32 @get_val()
+ %b8 = call i32 @get_val()
+ call void @use_val(i32 %b1)
+ call void @use_val(i32 %b2)
+ call void @use_val(i32 %b3)
+ call void @use_val(i32 %b4)
+ call void @use_val(i32 %b5)
+ call void @use_val(i32 %b6)
+ call void @use_val(i32 %b7)
+ call void @use_val(i32 %b8)
+ call void @longjmp()
+ unreachable
+; X86-32-NOT: movl {{.*}}, [[SPILL_SLOT]](%esp)
+; X86-64-NOT: movl {{.*}}, [[SPILL_SLOT]](%rsp)
+
+second:
+ call void @use_val(i32 %a1)
+ call void @use_val(i32 %a2)
+ call void @use_val(i32 %a3)
+ call void @use_val(i32 %a4)
+ call void @use_val(i32 %a5)
+ call void @use_val(i32 %a6)
+ call void @use_val(i32 %a7)
+ call void @use_val(i32 %a8)
+ ret void
+}
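
A C sketch of the situation both functions above guard against (names are made up; setjmp/longjmp come from <setjmp.h>): a1..a8 are live across setjmp(), so some are spilled; if their spill slots were reused for the later b values, a longjmp back to the setjmp point would observe clobbered a values.

    #include <setjmp.h>

    extern int get_val(void);
    extern void use_val(int);
    static jmp_buf env;

    void caller(void) {
        int a1 = get_val(), a2 = get_val(), a3 = get_val(), a4 = get_val();
        int a5 = get_val(), a6 = get_val(), a7 = get_val(), a8 = get_val();
        if (setjmp(env) == 0) {
            int b1 = get_val(), b2 = get_val();   /* more live values */
            use_val(b1); use_val(b2);
            longjmp(env, 1);                      /* returns to setjmp() */
        } else {
            use_val(a1); use_val(a2); use_val(a3); use_val(a4);
            use_val(a5); use_val(a6); use_val(a7); use_val(a8);
        }
    }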
diff --git a/test/CodeGen/X86/shift-combine-crash.ll b/test/CodeGen/X86/shift-combine-crash.ll
new file mode 100644
index 0000000..a69a907
--- /dev/null
+++ b/test/CodeGen/X86/shift-combine-crash.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 > /dev/null
+
+; Verify that DAGCombiner doesn't crash with an assertion failure in the
+; attempt to cast an ISD::UNDEF node to a ConstantSDNode.
+
+; During type legalization, the vector shift operation in function @test1 is
+; split into two legal shifts that work on <2 x i64> elements.
+; The first shift of the legalized sequence would be a shift by all undefs.
+; DAGCombiner will then try to simplify the vector shift and check if the
+; vector of shift counts is a splat. Make sure that llc doesn't crash
+; at that stage.
+
+
+define <4 x i64> @test1(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 undef, i64 1, i64 2>
+ ret <4 x i64> %shl
+}
+
+; Also, verify that DAGCombiner doesn't crash when trying to combine shifts
+; with different combinations of undef elements in the vector shift count.
+
+define <4 x i64> @test2(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 2, i64 3, i64 undef, i64 undef>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test3(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 2, i64 undef, i64 3, i64 undef>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test4(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 2, i64 undef, i64 3>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test5(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 2, i64 undef, i64 undef, i64 undef>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test6(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 undef, i64 3, i64 undef>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test7(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 undef, i64 undef, i64 3>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test8(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 undef, i64 undef, i64 undef>
+ ret <4 x i64> %shl
+}
+
+
diff --git a/test/CodeGen/X86/shift-double.ll b/test/CodeGen/X86/shift-double.ll
index 8d2b290..fd4ba81 100644
--- a/test/CodeGen/X86/shift-double.ll
+++ b/test/CodeGen/X86/shift-double.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
+; RUN: llc < %s -march=x86 -mcpu=generic -x86-asm-syntax=intel | \
; RUN: grep "sh[lr]d" | count 5
define i64 @test1(i64 %X, i8 %C) {
diff --git a/test/CodeGen/X86/shift-pcmp.ll b/test/CodeGen/X86/shift-pcmp.ll
new file mode 100644
index 0000000..365c731
--- /dev/null
+++ b/test/CodeGen/X86/shift-pcmp.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+avx | FileCheck %s
+
+define <8 x i16> @foo(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-LABEL: {{^_?foo:}}
+; CHECK-NOT: psll
+entry:
+ %icmp = icmp eq <8 x i16> %a, %b
+ %zext = zext <8 x i1> %icmp to <8 x i16>
+ %shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ ret <8 x i16> %shl
+}
+
+; Don't fail with an assert due to an undef in the buildvector
+define <8 x i16> @bar(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: bar
+entry:
+ %icmp = icmp eq <8 x i16> %a, %b
+ %zext = zext <8 x i1> %icmp to <8 x i16>
+ %shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 undef, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ ret <8 x i16> %shl
+}
diff --git a/test/CodeGen/X86/shl_undef.ll b/test/CodeGen/X86/shl_undef.ll
index 54b74cc..705af5b 100644
--- a/test/CodeGen/X86/shl_undef.ll
+++ b/test/CodeGen/X86/shl_undef.ll
@@ -1,15 +1,17 @@
-; RUN: llc < %s -O1 -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -O1 -mtriple=i386-apple-darwin -x86-asm-syntax=intel | FileCheck %s
;
; Interesting test case where %tmp1220 = xor i32 %tmp862, %tmp592 and
; %tmp1676 = xor i32 %tmp1634, %tmp1530 have zero demanded bits after
; DAGCombiner optimization pass. These are changed to undef and in turn
; the successor shl(s) become shl undef, 1. This pattern then matches
-; shl x, 1 -> add x, x. add undef, undef doesn't guarentee the low
+; shl x, 1 -> add x, x. add undef, undef doesn't guarantee the low
; order bit is zero and is incorrect.
;
; See rdar://9453156 and rdar://9487392.
;
+; Use Intel syntax; otherwise the "shl" check could match inside "pushl".
+
; CHECK-NOT: shl
define i32 @foo(i8* %a0, i32* %a2) nounwind {
entry:
diff --git a/test/CodeGen/X86/shrink-compare.ll b/test/CodeGen/X86/shrink-compare.ll
index bb89201..fc7ee06 100644
--- a/test/CodeGen/X86/shrink-compare.ll
+++ b/test/CodeGen/X86/shrink-compare.ll
@@ -2,7 +2,7 @@
declare void @bar()
-define void @test1(i32* nocapture %X) nounwind {
+define void @test1(i32* nocapture %X) nounwind minsize {
entry:
%tmp1 = load i32* %X, align 4
%and = and i32 %tmp1, 255
@@ -19,7 +19,7 @@ if.end:
; CHECK: cmpb $47, (%{{rdi|rcx}})
}
-define void @test2(i32 %X) nounwind {
+define void @test2(i32 %X) nounwind minsize {
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 47
@@ -35,7 +35,7 @@ if.end:
; CHECK: cmpb $47, %{{dil|cl}}
}
-define void @test3(i32 %X) nounwind {
+define void @test3(i32 %X) nounwind minsize {
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 255
@@ -70,7 +70,7 @@ lor.end: ; preds = %lor.rhs, %entry
@x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4
; PR16551
-define void @test5(i32 %X) nounwind {
+define void @test5(i32 %X) nounwind minsize {
entry:
%bf.load = load i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4
%bf.lshr = lshr i56 %bf.load, 32
diff --git a/test/CodeGen/X86/sibcall-5.ll b/test/CodeGen/X86/sibcall-5.ll
index c479030..c04af23 100644
--- a/test/CodeGen/X86/sibcall-5.ll
+++ b/test/CodeGen/X86/sibcall-5.ll
@@ -8,7 +8,7 @@
define double @foo(double %a) nounwind readonly ssp {
entry:
; X32-LABEL: foo:
-; X32: jmp _sin$stub
+; X32: jmp L_sin$stub
; X64-LABEL: foo:
; X64: jmp _sin
@@ -18,7 +18,7 @@ entry:
define float @bar(float %a) nounwind readonly ssp {
; X32-LABEL: bar:
-; X32: jmp _sinf$stub
+; X32: jmp L_sinf$stub
; X64-LABEL: bar:
; X64: jmp _sinf
@@ -27,6 +27,11 @@ entry:
ret float %0
}
+; X32-LABEL: L_sin$stub:
+; X32-NEXT: .indirect_symbol _sin
+; X32-LABEL: L_sinf$stub:
+; X32-NEXT: .indirect_symbol _sinf
+
declare float @sinf(float) nounwind readonly
declare double @sin(double) nounwind readonly
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index 589e9ec..28fc626 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -247,11 +247,11 @@ entry:
define void @t15(%struct.foo* noalias sret %agg.result) nounwind {
; 32-LABEL: t15:
; 32: calll {{_?}}f
-; 32: ret $4
+; 32: retl $4
; 64-LABEL: t15:
; 64: callq {{_?}}f
-; 64: ret
+; 64: retq
tail call fastcc void @f(%struct.foo* noalias sret %agg.result) nounwind
ret void
}
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
new file mode 100644
index 0000000..600ee1b
--- /dev/null
+++ b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
@@ -0,0 +1,423 @@
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+
+; Ensure that the backend selects SSE/AVX scalar fp instructions
+; from a packed fp instruction plus a vector insert.
+
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
new file mode 100644
index 0000000..3949a83
--- /dev/null
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -0,0 +1,310 @@
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+
+; Ensure that the backend no longer emits unnecessary vector insert
+; instructions immediately after SSE scalar fp instructions
+; like addss or mulss.
+
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %add = fadd float %2, %1
+ %3 = insertelement <4 x float> %a, float %add, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %sub = fsub float %2, %1
+ %3 = insertelement <4 x float> %a, float %sub, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %mul = fmul float %2, %1
+ %3 = insertelement <4 x float> %a, float %mul, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %div = fdiv float %2, %1
+ %3 = insertelement <4 x float> %a, float %div, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %add = fadd double %2, %1
+ %3 = insertelement <2 x double> %a, double %add, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %sub = fsub double %2, %1
+ %3 = insertelement <2 x double> %a, double %sub, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %mul = fmul double %2, %1
+ %3 = insertelement <2 x double> %a, double %mul, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %div = fdiv double %2, %1
+ %3 = insertelement <2 x double> %a, double %div, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %add = fadd float %1, %2
+ %3 = insertelement <4 x float> %b, float %add, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %sub = fsub float %2, %1
+ %3 = insertelement <4 x float> %b, float %sub, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %mul = fmul float %1, %2
+ %3 = insertelement <4 x float> %b, float %mul, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %div = fdiv float %2, %1
+ %3 = insertelement <4 x float> %b, float %div, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %add = fadd double %1, %2
+ %3 = insertelement <2 x double> %b, double %add, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %sub = fsub double %2, %1
+ %3 = insertelement <2 x double> %b, double %sub, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %mul = fmul double %1, %2
+ %3 = insertelement <2 x double> %b, double %mul, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %div = fdiv double %2, %1
+ %3 = insertelement <2 x double> %b, double %div, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %add = fadd float %2, %1
+ %add2 = fadd float %2, %add
+ %3 = insertelement <4 x float> %a, float %add2, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_add_ss
+; CHECK: addss
+; CHECK: addss
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %sub = fsub float %2, %1
+ %sub2 = fsub float %2, %sub
+ %3 = insertelement <4 x float> %a, float %sub2, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_sub_ss
+; CHECK: subss
+; CHECK: subss
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %mul = fmul float %2, %1
+ %mul2 = fmul float %2, %mul
+ %3 = insertelement <4 x float> %a, float %mul2, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_mul_ss
+; CHECK: mulss
+; CHECK: mulss
+; CHECK-NOT: movss
+; CHECK: ret
+
+define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %div = fdiv float %2, %1
+ %div2 = fdiv float %2, %div
+ %3 = insertelement <4 x float> %a, float %div2, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_div_ss
+; CHECK: divss
+; CHECK: divss
+; CHECK-NOT: movss
+; CHECK: ret
+
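
The extract/insert form of the same computation can be written in C with the GCC/Clang vector subscript extension (assumed available; a sketch, not the tests' source):

    #include <xmmintrin.h>

    __m128 add_first_elt(__m128 a, __m128 b) {
        a[0] = a[0] + b[0];   /* only lane 0 changes; expect a bare addss
                                 with no trailing movss/insert */
        return a;
    }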
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index 47c6429..183297e 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -43,3 +43,17 @@ entry:
; CHECK-NOT: shufps $16
; CHECK: ret
}
+
+; We used to get stuck in type legalization for this example when lowering the
+; vselect. With SSE1 v4f32 is a legal type but v4i1 (or any vector integer type)
+; is not. We used to ping-pong between splitting the vselect for the v4i1
+; condition operand and widening the resulting vselect for the v4f32 result.
+; PR18036
+
+; CHECK-LABEL: vselect
+define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
+entry:
+ %a1 = icmp eq <4 x i32> %q, zeroinitializer
+ %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+0> , <4 x float> zeroinitializer
+ ret <4 x float> %a14
+}
diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
index 1ac9832..c63ff72 100644
--- a/test/CodeGen/X86/sse2-blend.ll
+++ b/test/CodeGen/X86/sse2-blend.ll
@@ -1,22 +1,22 @@
; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
-; CHECK: vsel_float
-; CHECK: pandn
-; CHECK: pand
-; CHECK: por
+; CHECK-LABEL: vsel_float
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
; CHECK: ret
define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
%A = load <4 x float>* %v1
%B = load <4 x float>* %v2
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
+ %vsel = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
store <4 x float > %vsel, <4 x float>* %v1
ret void
}
-; CHECK: vsel_i32
-; CHECK: pandn
-; CHECK: pand
-; CHECK: por
+; CHECK-LABEL: vsel_i32
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
; CHECK: ret
define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
%A = load <4 x i32>* %v1
@@ -27,7 +27,7 @@ define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
}
; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK: vsel_i64
+; CHECK-LABEL: vsel_i64
; CHECK: andnps
; CHECK: orps
; CHECK: ret
@@ -41,7 +41,7 @@ define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) {
}
; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK: vsel_double
+; CHECK-LABEL: vsel_double
; CHECK: andnps
; CHECK: orps
; CHECK: ret
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index ff6c10b..cfc892d 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -710,3 +710,10 @@ define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define void @test_x86_sse2_pause() {
+ ; CHECK: pause
+ tail call void @llvm.x86.sse2.pause()
+ ret void
+}
+declare void @llvm.x86.sse2.pause() nounwind
diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll b/test/CodeGen/X86/sse2-vector-shifts.ll
index 462def9..7c8d5e5 100644
--- a/test/CodeGen/X86/sse2-vector-shifts.ll
+++ b/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -9,8 +9,8 @@ entry:
}
; CHECK-LABEL: test_sllw_1:
-; CHECK: psllw $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psllw $0, %xmm0
+; CHECK: ret
define <8 x i16> @test_sllw_2(<8 x i16> %InVec) {
entry:
@@ -24,12 +24,12 @@ entry:
define <8 x i16> @test_sllw_3(<8 x i16> %InVec) {
entry:
- %shl = shl <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = shl <8 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <8 x i16> %shl
}
; CHECK-LABEL: test_sllw_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psllw $15, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_slld_1(<4 x i32> %InVec) {
@@ -39,8 +39,8 @@ entry:
}
; CHECK-LABEL: test_slld_1:
-; CHECK: pslld $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: pslld $0, %xmm0
+; CHECK: ret
define <4 x i32> @test_slld_2(<4 x i32> %InVec) {
entry:
@@ -54,12 +54,12 @@ entry:
define <4 x i32> @test_slld_3(<4 x i32> %InVec) {
entry:
- %shl = shl <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ %shl = shl <4 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31>
ret <4 x i32> %shl
}
; CHECK-LABEL: test_slld_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: pslld $31, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_sllq_1(<2 x i64> %InVec) {
@@ -69,8 +69,8 @@ entry:
}
; CHECK-LABEL: test_sllq_1:
-; CHECK: psllq $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psllq $0, %xmm0
+; CHECK: ret
define <2 x i64> @test_sllq_2(<2 x i64> %InVec) {
entry:
@@ -84,12 +84,12 @@ entry:
define <2 x i64> @test_sllq_3(<2 x i64> %InVec) {
entry:
- %shl = shl <2 x i64> %InVec, <i64 64, i64 64>
+ %shl = shl <2 x i64> %InVec, <i64 63, i64 63>
ret <2 x i64> %shl
}
; CHECK-LABEL: test_sllq_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psllq $63, %xmm0
; CHECK-NEXT: ret
; SSE2 Arithmetic Shift
@@ -101,8 +101,8 @@ entry:
}
; CHECK-LABEL: test_sraw_1:
-; CHECK: psraw $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psraw $0, %xmm0
+; CHECK: ret
define <8 x i16> @test_sraw_2(<8 x i16> %InVec) {
entry:
@@ -116,7 +116,7 @@ entry:
define <8 x i16> @test_sraw_3(<8 x i16> %InVec) {
entry:
- %shl = ashr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = ashr <8 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <8 x i16> %shl
}
@@ -131,8 +131,8 @@ entry:
}
; CHECK-LABEL: test_srad_1:
-; CHECK: psrad $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psrad $0, %xmm0
+; CHECK: ret
define <4 x i32> @test_srad_2(<4 x i32> %InVec) {
entry:
@@ -146,7 +146,7 @@ entry:
define <4 x i32> @test_srad_3(<4 x i32> %InVec) {
entry:
- %shl = ashr <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ %shl = ashr <4 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31>
ret <4 x i32> %shl
}
@@ -163,8 +163,8 @@ entry:
}
; CHECK-LABEL: test_srlw_1:
-; CHECK: psrlw $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psrlw $0, %xmm0
+; CHECK: ret
define <8 x i16> @test_srlw_2(<8 x i16> %InVec) {
entry:
@@ -178,12 +178,12 @@ entry:
define <8 x i16> @test_srlw_3(<8 x i16> %InVec) {
entry:
- %shl = lshr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = lshr <8 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <8 x i16> %shl
}
; CHECK-LABEL: test_srlw_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psrlw $15, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srld_1(<4 x i32> %InVec) {
@@ -193,8 +193,8 @@ entry:
}
; CHECK-LABEL: test_srld_1:
-; CHECK: psrld $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psrld $0, %xmm0
+; CHECK: ret
define <4 x i32> @test_srld_2(<4 x i32> %InVec) {
entry:
@@ -208,12 +208,12 @@ entry:
define <4 x i32> @test_srld_3(<4 x i32> %InVec) {
entry:
- %shl = lshr <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ %shl = lshr <4 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31>
ret <4 x i32> %shl
}
; CHECK-LABEL: test_srld_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psrld $31, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_srlq_1(<2 x i64> %InVec) {
@@ -223,8 +223,8 @@ entry:
}
; CHECK-LABEL: test_srlq_1:
-; CHECK: psrlq $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psrlq $0, %xmm0
+; CHECK: ret
define <2 x i64> @test_srlq_2(<2 x i64> %InVec) {
entry:
@@ -238,10 +238,130 @@ entry:
define <2 x i64> @test_srlq_3(<2 x i64> %InVec) {
entry:
- %shl = lshr <2 x i64> %InVec, <i64 64, i64 64>
+ %shl = lshr <2 x i64> %InVec, <i64 63, i64 63>
ret <2 x i64> %shl
}
; CHECK-LABEL: test_srlq_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psrlq $63, %xmm0
+; CHECK-NEXT: ret
+
+
+; CHECK-LABEL: sra_sra_v4i32:
+; CHECK: psrad $6, %xmm0
+; CHECK-NEXT: retq
+define <4 x i32> @sra_sra_v4i32(<4 x i32> %x) nounwind {
+ %sra0 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+ %sra1 = ashr <4 x i32> %sra0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %sra1
+}
+
+; CHECK-LABEL: @srl_srl_v4i32
+; CHECK: psrld $6, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @srl_srl_v4i32(<4 x i32> %x) nounwind {
+ %srl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+ %srl1 = lshr <4 x i32> %srl0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %srl1
+}
+
+; CHECK-LABEL: @srl_shl_v4i32
+; CHECK: andps
+; CHECK-NEXT: retq
+define <4 x i32> @srl_shl_v4i32(<4 x i32> %x) nounwind {
+ %srl0 = shl <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
+ %srl1 = lshr <4 x i32> %srl0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %srl1
+}
+
+; CHECK-LABEL: @srl_sra_31_v4i32
+; CHECK: psrld $31, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @srl_sra_31_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+ %sra = ashr <4 x i32> %x, %y
+ %srl1 = lshr <4 x i32> %sra, <i32 31, i32 31, i32 31, i32 31>
+ ret <4 x i32> %srl1
+}
+
+; CHECK-LABEL: @shl_shl_v4i32
+; CHECK: pslld $6, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @shl_shl_v4i32(<4 x i32> %x) nounwind {
+ %shl0 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+ %shl1 = shl <4 x i32> %shl0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %shl1
+}
+
+; CHECK-LABEL: @shl_sra_v4i32
+; CHECK: andps
+; CHECK-NEXT: ret
+define <4 x i32> @shl_sra_v4i32(<4 x i32> %x) nounwind {
+ %shl0 = ashr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
+ %shl1 = shl <4 x i32> %shl0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %shl1
+}
+
+; CHECK-LABEL: @shl_srl_v4i32
+; CHECK: pslld $3, %xmm0
+; CHECK-NEXT: pand
+; CHECK-NEXT: ret
+define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
+ %shl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+ %shl1 = shl <4 x i32> %shl0, <i32 5, i32 5, i32 5, i32 5>
+ ret <4 x i32> %shl1
+}
+
+; CHECK-LABEL: @shl_zext_srl_v4i32
+; CHECK: andps
; CHECK-NEXT: ret
+define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
+ %srl = lshr <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
+ %zext = zext <4 x i16> %srl to <4 x i32>
+ %shl = shl <4 x i32> %zext, <i32 2, i32 2, i32 2, i32 2>
+ ret <4 x i32> %shl
+}
+
+; CHECK: @sra_trunc_srl_v4i32
+; CHECK: psrad $19, %xmm0
+; CHECK-NEXT: retq
+define <4 x i16> @sra_trunc_srl_v4i32(<4 x i32> %x) nounwind {
+ %srl = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+ %trunc = trunc <4 x i32> %srl to <4 x i16>
+ %sra = ashr <4 x i16> %trunc, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %sra
+}
+
+; CHECK-LABEL: @shl_zext_shl_v4i32
+; CHECK: pand
+; CHECK-NEXT: pslld $19, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind {
+ %shl0 = shl <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
+ %ext = zext <4 x i16> %shl0 to <4 x i32>
+ %shl1 = shl <4 x i32> %ext, <i32 17, i32 17, i32 17, i32 17>
+ ret <4 x i32> %shl1
+}
+
+; CHECK-LABEL: @sra_v4i32
+; CHECK: psrad $3, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @sra_v4i32(<4 x i32> %x) nounwind {
+ %sra = ashr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %sra
+}
+
+; CHECK-LABEL: @srl_v4i32
+; CHECK: psrld $3, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @srl_v4i32(<4 x i32> %x) nounwind {
+ %sra = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %sra
+}
+
+; CHECK-LABEL: @shl_v4i32
+; CHECK: pslld $3, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @shl_v4i32(<4 x i32> %x) nounwind {
+ %sra = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %sra
+}
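
Note: the srl_shl_v4i32 and shl_sra_v4i32 cases above are expected to lower to a single andps because a shift in one direction followed by the opposite shift by the same amount only zeroes a fixed run of bits, so the DAG combiner can fold the pair into a constant mask. A minimal sketch of the equivalent IR (the function name is illustrative, not part of the patch):

define <4 x i32> @srl_shl_equiv_sketch(<4 x i32> %x) nounwind {
  ; shl by 4 followed by lshr by 4 clears the top four bits of each lane,
  ; which is the same as masking with 0x0FFFFFFF.
  %masked = and <4 x i32> %x, <i32 268435455, i32 268435455, i32 268435455, i32 268435455>
  ret <4 x i32> %masked
}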
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index 9147c22..628dba0 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -9,10 +9,10 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
ret void
; CHECK-LABEL: test1:
-; CHECK: movl 8(%esp), %eax
-; CHECK-NEXT: movapd (%eax), %xmm0
+; CHECK: movl 4(%esp), %eax
+; CHECK-NEXT: movl 8(%esp), %ecx
+; CHECK-NEXT: movapd (%ecx), %xmm0
; CHECK-NEXT: movlpd 12(%esp), %xmm0
-; CHECK-NEXT: movl 4(%esp), %eax
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: ret
}
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index a32f5de..4681fde 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -4,7 +4,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
ret <4 x float> %vsel
}
@@ -13,7 +13,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK: blendvps
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
ret <4 x i8> %vsel
}
@@ -21,7 +21,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
;CHECK: blendvps
;CHECK: ret
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i16> %v1, <4 x i16> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
ret <4 x i16> %vsel
}
@@ -30,13 +30,13 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
;CHECK: blendvps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
ret <4 x i32> %vsel
}
;CHECK-LABEL: vsel_double:
-;CHECK: blendvpd
+;CHECK: movsd
;CHECK: ret
define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -45,7 +45,7 @@ define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) {
;CHECK-LABEL: vsel_i64:
-;CHECK: blendvpd
+;CHECK: movsd
;CHECK: ret
define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v1, <4 x i64> %v2
diff --git a/test/CodeGen/X86/ssp-data-layout.ll b/test/CodeGen/X86/ssp-data-layout.ll
new file mode 100644
index 0000000..e76ad7b
--- /dev/null
+++ b/test/CodeGen/X86/ssp-data-layout.ll
@@ -0,0 +1,510 @@
+; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s
+; This test is fairly fragile. The goal is to ensure that "large" stack
+; objects are allocated closest to the stack protector (i.e., farthest away
+; from the Stack Pointer). In standard SSP mode this means that large (>=
+; ssp-buffer-size) arrays and structures containing such arrays are
+; closest to the protector. With sspstrong and sspreq this means large
+; arrays/structures-with-arrays are closest, followed by small (< ssp-buffer-size)
+; arrays/structures-with-arrays, and then addr-taken variables.
+;
+; Ideally, we only want to verify that the objects appear in the correct groups
+; and that the groups have the correct relative stack offset. The ordering
+; within a group is not relevant to this test. Unfortunately, there is not
+; an elegant way to do this, so just match the offset for each object.
+; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-unknown-unknown -O0 -mcpu=corei7 -o - \
+; RUN: | FileCheck --check-prefix=FAST-NON-LIN %s
+; FastISel was not setting the StackProtectorIndex when lowering
+; Intrinsic::stackprotector and as a result the stack re-arrangement code was
+; never applied. This problem only shows up on non-Linux platforms because on
+; Linux the stack protector cookie is loaded from a special address space which
+; always triggers standard ISel. Run a basic test to ensure that at -O0
+; on a non-Linux target the data layout rules are triggered.
+
+%struct.struct_large_char = type { [8 x i8] }
+%struct.struct_small_char = type { [2 x i8] }
+%struct.struct_large_nonchar = type { [8 x i32] }
+%struct.struct_small_nonchar = type { [2 x i16] }
+
+define void @layout_ssp() ssp {
+entry:
+; Expected stack layout for ssp is
+; -16 large_char . Group 1, nested arrays, arrays >= ssp-buffer-size
+; -24 struct_large_char .
+; -28 scalar1 | Everything else
+; -32 scalar2
+; -36 scalar3
+; -40 addr-of
+; -44 small_nonchar
+; -80 large_nonchar
+; -82 small_char
+; -88 struct_small_char
+; -120 struct_large_nonchar
+; -128 struct_small_nonchar
+
+; CHECK: layout_ssp:
+; CHECK: call{{l|q}} get_scalar1
+; CHECK: movl %eax, -28(
+; CHECK: call{{l|q}} end_scalar1
+
+; CHECK: call{{l|q}} get_scalar2
+; CHECK: movl %eax, -32(
+; CHECK: call{{l|q}} end_scalar2
+
+; CHECK: call{{l|q}} get_scalar3
+; CHECK: movl %eax, -36(
+; CHECK: call{{l|q}} end_scalar3
+
+; CHECK: call{{l|q}} get_addrof
+; CHECK: movl %eax, -40(
+; CHECK: call{{l|q}} end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: movw %ax, -44(
+; CHECK: call{{l|q}} end_small_nonchar
+
+; CHECK: call{{l|q}} get_large_nonchar
+; CHECK: movl %eax, -80(
+; CHECK: call{{l|q}} end_large_nonchar
+
+; CHECK: call{{l|q}} get_small_char
+; CHECK: movb %al, -82(
+; CHECK: call{{l|q}} end_small_char
+
+; CHECK: call{{l|q}} get_large_char
+; CHECK: movb %al, -16(
+; CHECK: call{{l|q}} end_large_char
+
+; CHECK: call{{l|q}} get_struct_large_char
+; CHECK: movb %al, -24(
+; CHECK: call{{l|q}} end_struct_large_char
+
+; CHECK: call{{l|q}} get_struct_small_char
+; CHECK: movb %al, -88(
+; CHECK: call{{l|q}} end_struct_small_char
+
+; CHECK: call{{l|q}} get_struct_large_nonchar
+; CHECK: movl %eax, -120(
+; CHECK: call{{l|q}} end_struct_large_nonchar
+
+; CHECK: call{{l|q}} get_struct_small_nonchar
+; CHECK: movw %ax, -128(
+; CHECK: call{{l|q}} end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @layout_sspstrong() nounwind uwtable sspstrong {
+entry:
+; Expected stack layout for sspstrong is
+; -48 large_nonchar . Group 1, nested arrays,
+; -56 large_char . arrays >= ssp-buffer-size
+; -64 struct_large_char .
+; -96 struct_large_nonchar .
+; -100 small_non_char | Group 2, nested arrays,
+; -102 small_char | arrays < ssp-buffer-size
+; -104 struct_small_char |
+; -112 struct_small_nonchar |
+; -116 addrof * Group 3, addr-of local
+; -120 scalar + Group 4, everything else
+; -124 scalar +
+; -128 scalar +
+;
+; CHECK: layout_sspstrong:
+; CHECK: call{{l|q}} get_scalar1
+; CHECK: movl %eax, -120(
+; CHECK: call{{l|q}} end_scalar1
+
+; CHECK: call{{l|q}} get_scalar2
+; CHECK: movl %eax, -124(
+; CHECK: call{{l|q}} end_scalar2
+
+; CHECK: call{{l|q}} get_scalar3
+; CHECK: movl %eax, -128(
+; CHECK: call{{l|q}} end_scalar3
+
+; CHECK: call{{l|q}} get_addrof
+; CHECK: movl %eax, -116(
+; CHECK: call{{l|q}} end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: movw %ax, -100(
+; CHECK: call{{l|q}} end_small_nonchar
+
+; CHECK: call{{l|q}} get_large_nonchar
+; CHECK: movl %eax, -48(
+; CHECK: call{{l|q}} end_large_nonchar
+
+; CHECK: call{{l|q}} get_small_char
+; CHECK: movb %al, -102(
+; CHECK: call{{l|q}} end_small_char
+
+; CHECK: call{{l|q}} get_large_char
+; CHECK: movb %al, -56(
+; CHECK: call{{l|q}} end_large_char
+
+; CHECK: call{{l|q}} get_struct_large_char
+; CHECK: movb %al, -64(
+; CHECK: call{{l|q}} end_struct_large_char
+
+; CHECK: call{{l|q}} get_struct_small_char
+; CHECK: movb %al, -104(
+; CHECK: call{{l|q}} end_struct_small_char
+
+; CHECK: call{{l|q}} get_struct_large_nonchar
+; CHECK: movl %eax, -96(
+; CHECK: call{{l|q}} end_struct_large_nonchar
+
+; CHECK: call{{l|q}} get_struct_small_nonchar
+; CHECK: movw %ax, -112(
+; CHECK: call{{l|q}} end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @layout_sspreq() nounwind uwtable sspreq {
+entry:
+; Expected stack layout for sspreq is the same as sspstrong
+;
+; CHECK: layout_sspreq:
+; CHECK: call{{l|q}} get_scalar1
+; CHECK: movl %eax, -120(
+; CHECK: call{{l|q}} end_scalar1
+
+; CHECK: call{{l|q}} get_scalar2
+; CHECK: movl %eax, -124(
+; CHECK: call{{l|q}} end_scalar2
+
+; CHECK: call{{l|q}} get_scalar3
+; CHECK: movl %eax, -128(
+; CHECK: call{{l|q}} end_scalar3
+
+; CHECK: call{{l|q}} get_addrof
+; CHECK: movl %eax, -116(
+; CHECK: call{{l|q}} end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: movw %ax, -100(
+; CHECK: call{{l|q}} end_small_nonchar
+
+; CHECK: call{{l|q}} get_large_nonchar
+; CHECK: movl %eax, -48(
+; CHECK: call{{l|q}} end_large_nonchar
+
+; CHECK: call{{l|q}} get_small_char
+; CHECK: movb %al, -102(
+; CHECK: call{{l|q}} end_small_char
+
+; CHECK: call{{l|q}} get_large_char
+; CHECK: movb %al, -56(
+; CHECK: call{{l|q}} end_large_char
+
+; CHECK: call{{l|q}} get_struct_large_char
+; CHECK: movb %al, -64(
+; CHECK: call{{l|q}} end_struct_large_char
+
+; CHECK: call{{l|q}} get_struct_small_char
+; CHECK: movb %al, -104(
+; CHECK: call{{l|q}} end_struct_small_char
+
+; CHECK: call{{l|q}} get_struct_large_nonchar
+; CHECK: movl %eax, -96(
+; CHECK: call{{l|q}} end_struct_large_nonchar
+
+; CHECK: call{{l|q}} get_struct_small_nonchar
+; CHECK: movw %ax, -112(
+; CHECK: call{{l|q}} end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @fast_non_linux() ssp {
+entry:
+; FAST-NON-LIN: fast_non_linux:
+; FAST-NON-LIN: call{{l|q}} get_scalar1
+; FAST-NON-LIN: movl %eax, -20(
+; FAST-NON-LIN: call{{l|q}} end_scalar1
+
+; FAST-NON-LIN: call{{l|q}} get_large_char
+; FAST-NON-LIN: movb %al, -16(
+; FAST-NON-LIN: call{{l|q}} end_large_char
+ %x = alloca i32, align 4
+ %large = alloca [8 x i8], align 1
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call signext i8 @get_large_char()
+ %arrayidx = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call1, i8* %arrayidx, align 1
+ call void @end_large_char()
+ %0 = load i32* %x, align 4
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ call void @takes_two(i32 %0, i8* %arraydecay)
+ ret void
+}
+
+declare i32 @get_scalar1()
+declare void @end_scalar1()
+
+declare i32 @get_scalar2()
+declare void @end_scalar2()
+
+declare i32 @get_scalar3()
+declare void @end_scalar3()
+
+declare i32 @get_addrof()
+declare void @end_addrof()
+
+declare signext i16 @get_small_nonchar()
+declare void @end_small_nonchar()
+
+declare i32 @get_large_nonchar()
+declare void @end_large_nonchar()
+
+declare signext i8 @get_small_char()
+declare void @end_small_char()
+
+declare signext i8 @get_large_char()
+declare void @end_large_char()
+
+declare signext i8 @get_struct_large_char()
+declare void @end_struct_large_char()
+
+declare signext i8 @get_struct_small_char()
+declare void @end_struct_small_char()
+
+declare i32 @get_struct_large_nonchar()
+declare void @end_struct_large_nonchar()
+
+declare signext i16 @get_struct_small_nonchar()
+declare void @end_struct_small_nonchar()
+
+declare void @takes_all(i64, i16, %struct.struct_large_nonchar* byval align 8, i32, i8*, i8*, i32*, i16*, i32*, i32, i32, i32)
+declare void @takes_two(i32, i8*)
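
As a compact illustration of the rule the three layout_* functions above encode: with plain ssp, a char buffer of at least ssp-buffer-size bytes is expected adjacent to the stack protector, while ordinary scalars land farther away. A hedged, self-contained sketch (function and callee names are made up for illustration):

define void @layout_sketch() ssp {
entry:
  %buf = alloca [8 x i8], align 1   ; large char array: expected closest to the guard
  %n = alloca i32, align 4          ; plain scalar: expected farther from the guard
  %p = getelementptr inbounds [8 x i8]* %buf, i32 0, i32 0
  call void @use_pair(i8* %p, i32* %n)
  ret void
}

declare void @use_pair(i8*, i32*)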
diff --git a/test/CodeGen/X86/stack-align-memcpy.ll b/test/CodeGen/X86/stack-align-memcpy.ll
index 87bb85f..0cc3aa8 100644
--- a/test/CodeGen/X86/stack-align-memcpy.ll
+++ b/test/CodeGen/X86/stack-align-memcpy.ll
@@ -2,6 +2,9 @@
%struct.foo = type { [88 x i8] }
+declare void @bar(i8* nocapture, %struct.foo* align 4 byval) nounwind
+declare void @baz(i8*) nounwind
+
; PR15249
; We can't use rep;movsl here because it clobbers the base pointer in %esi.
define void @test1(%struct.foo* nocapture %x, i32 %y) nounwind {
@@ -15,4 +18,26 @@ define void @test1(%struct.foo* nocapture %x, i32 %y) nounwind {
; CHECK-NOT: rep;movsl
}
-declare void @bar(i8* nocapture, %struct.foo* align 4 byval) nounwind
+; PR19012
+; Also don't clobber %esi if the dynamic alloca comes after the memcpy.
+define void @test2(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
+ call void @bar(i8* %z, %struct.foo* align 4 byval %x)
+ %dynalloc = alloca i8, i32 %y, align 1
+ call void @baz(i8* %dynalloc)
+ ret void
+
+; CHECK-LABEL: test2:
+; CHECK: movl %esp, %esi
+; CHECK-NOT: rep;movsl
+}
+
+; Check that we do use rep movs if we make the alloca static.
+define void @test3(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
+ call void @bar(i8* %z, %struct.foo* align 4 byval %x)
+ %statalloc = alloca i8, i32 8, align 1
+ call void @baz(i8* %statalloc)
+ ret void
+
+; CHECK-LABEL: test3:
+; CHECK: rep;movsl
+}
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index bd27ac3..fb7e2db 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -30,7 +30,7 @@ attributes #0 = { sspreq }
!2 = metadata !{metadata !3}
!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{i32 0}
+!5 = metadata !{}
!6 = metadata !{metadata !7}
!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
!8 = metadata !{metadata !9}
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
new file mode 100644
index 0000000..9ce5254
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -0,0 +1,245 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-stackmap-liveness| FileCheck -check-prefix=STACK %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness| FileCheck -check-prefix=PATCH %s
+;
+; Note: Print verbose stackmaps using -debug-only=stackmaps.
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 2
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 5
+
+; Functions and stack size
+; CHECK-NEXT: .quad _stackmap_liveness
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _mixed_liveness
+; CHECK-NEXT: .quad 8
+
+define void @stackmap_liveness() {
+entry:
+ %a1 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+; StackMap 1 (no liveness information available)
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 0
+; CHECK-NEXT: .short 0
+; Align
+; CHECK-NEXT: .align 3
+
+; StackMap 1 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 2
+; STACK-NEXT: .short 2
+; LiveOut Entry 1: %RSP (8 bytes)
+; STACK-NEXT: .short 7
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 8
+; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; STACK-NEXT: .short 19
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 16
+; Align
+; STACK-NEXT: .align 3
+
+; StackMap 1 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 0
+; PATCH-NEXT: .short 0
+; Align
+; PATCH-NEXT: .align 3
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 5)
+ %a2 = call i64 asm sideeffect "", "={r8}"() nounwind
+ %a3 = call i8 asm sideeffect "", "={ah}"() nounwind
+ %a4 = call <4 x double> asm sideeffect "", "={ymm0}"() nounwind
+ %a5 = call <4 x double> asm sideeffect "", "={ymm1}"() nounwind
+
+; StackMap 2 (no liveness information available)
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 0
+; CHECK-NEXT: .short 0
+; Align
+; CHECK-NEXT: .align 3
+
+; StackMap 2 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 6
+; STACK-NEXT: .short 6
+; LiveOut Entry 1: %RAX (1 bytes) --> %AL or %AH
+; STACK-NEXT: .short 0
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 1
+; LiveOut Entry 2: %RSP (8 bytes)
+; STACK-NEXT: .short 7
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 8
+; LiveOut Entry 3: %R8 (8 bytes)
+; STACK-NEXT: .short 8
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 8
+; LiveOut Entry 4: %YMM0 (32 bytes)
+; STACK-NEXT: .short 17
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 32
+; LiveOut Entry 5: %YMM1 (32 bytes)
+; STACK-NEXT: .short 18
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 32
+; LiveOut Entry 6: %YMM2 (16 bytes) --> %XMM2
+; STACK-NEXT: .short 19
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 16
+; Align
+; STACK-NEXT: .align 3
+
+; StackMap 2 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 0
+; PATCH-NEXT: .short 0
+; Align
+; PATCH-NEXT: .align 3
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 2, i32 5)
+ call void asm sideeffect "", "{r8},{ah},{ymm0},{ymm1}"(i64 %a2, i8 %a3, <4 x double> %a4, <4 x double> %a5) nounwind
+
+; StackMap 3 (no liveness information available)
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 0
+; CHECK-NEXT: .short 0
+; Align
+; CHECK-NEXT: .align 3
+
+; StackMap 3 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 2
+; STACK-NEXT: .short 2
+; LiveOut Entry 1: %RSP (8 bytes)
+; STACK-NEXT: .short 7
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 8
+; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; STACK-NEXT: .short 19
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 16
+; Align
+; STACK-NEXT: .align 3
+
+; StackMap 3 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 0
+; PATCH-NEXT: .short 0
+; Align
+; PATCH-NEXT: .align 3
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 5)
+ call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
+ ret void
+}
+
+define void @mixed_liveness() {
+entry:
+ %a1 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+; StackMap 4 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_mixed_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 1
+; STACK-NEXT: .short 1
+; LiveOut Entry 1: %YMM2 (16 bytes) --> %XMM2
+; STACK-NEXT: .short 19
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 16
+; Align
+; STACK-NEXT: .align 3
+
+
+; StackMap 5 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_mixed_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 0
+; STACK-NEXT: .short 0
+; Align
+; STACK-NEXT: .align 3
+
+; StackMap 4 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_mixed_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 0
+; PATCH-NEXT: .short 0
+; Align
+; PATCH-NEXT: .align 3
+
+; StackMap 5 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_mixed_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 2
+; PATCH-NEXT: .short 2
+; LiveOut Entry 1: %RSP (8 bytes)
+; PATCH-NEXT: .short 7
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 8
+; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; PATCH-NEXT: .short 19
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 16
+; Align
+; PATCH-NEXT: .align 3
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 5)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 0, i8* null, i32 0)
+ call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll
new file mode 100644
index 0000000..5a78f24
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-nops.ll
@@ -0,0 +1,230 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+
+define void @nop_test() {
+entry:
+; CHECK-LABEL: nop_test:
+; CHECK: nop
+; CHECK: xchgw %ax, %ax
+; CHECK: nopl (%rax)
+; CHECK: nopl 8(%rax)
+; CHECK: nopl 8(%rax,%rax)
+; CHECK: nopw 8(%rax,%rax)
+; CHECK: nopl 512(%rax)
+; CHECK: nopl 512(%rax,%rax)
+; CHECK: nopw 512(%rax,%rax)
+; CHECK: nopw %cs:512(%rax,%rax)
+
+; 11
+; CHECK: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 12
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 13
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 14
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 15
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 16
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nop
+
+; 17
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: xchgw %ax, %ax
+
+; 18
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl (%rax)
+
+; 19
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl 8(%rax)
+
+; 20
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl 8(%rax,%rax)
+
+; 21
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopw 8(%rax,%rax)
+
+; 22
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl 512(%rax)
+
+; 23
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl 512(%rax,%rax)
+
+; 24
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopw 512(%rax,%rax)
+
+; 25
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 26
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 27
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 28
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+;29
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 30
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 1)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 2, i32 2)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 3)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 4)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 5, i32 5)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 6, i32 6)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 7, i32 7)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 8, i32 8)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 9, i32 9)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 10, i32 10)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 11, i32 11)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 12)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 13)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 14)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 15)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 16)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 17)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 18, i32 18)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 19, i32 19)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 20, i32 20)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 21, i32 21)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 22, i32 22)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 23, i32 23)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 24, i32 24)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 25, i32 25)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 26, i32 26)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 27, i32 27)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30)
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
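
The shadow sizes 0 through 30 requested above are padded with the shortest canonical x86 nop sequences; per the checks, a 3-byte shadow becomes a single nopl (%rax) and a 5-byte shadow a single nopl 8(%rax,%rax) when nothing else fills the shadow. A standalone sketch of one such call site (the ID and function name are illustrative):

define void @nop_shadow_sketch() {
entry:
  ; Request a 3-byte shadow; per the checks above this is emitted as one "nopl (%rax)"
  ; when no following instructions occupy the shadow.
  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 100, i32 3)
  ret void
}

declare void @llvm.experimental.stackmap(i64, i32, ...)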
diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
index ed95583..8567037 100644
--- a/test/CodeGen/X86/stackmap.ll
+++ b/test/CodeGen/X86/stackmap.ll
@@ -1,27 +1,74 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
;
; Note: Print verbose stackmaps using -debug-only=stackmaps.
; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
; CHECK-NEXT: __LLVM_StackMaps:
-; CHECK-NEXT: .long 0
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 15
; Num LargeConstants
-; CHECK-NEXT: .long 1
-; CHECK-NEXT: .quad 4294967296
+; CHECK-NEXT: .long 3
; Num Callsites
-; CHECK-NEXT: .long 11
+; CHECK-NEXT: .long 19
+
+; Functions and stack size
+; CHECK-NEXT: .quad _constantargs
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _osrinline
+; CHECK-NEXT: .quad 24
+; CHECK-NEXT: .quad _osrcold
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _propertyRead
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _propertyWrite
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _jsVoidCall
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _jsIntCall
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _spilledValue
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _spilledStackMapValue
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _spillSubReg
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _subRegOffset
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _liveConstant
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _directFrameIdx
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _longid
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _clobberScratch
+; CHECK-NEXT: .quad 56
+
+; Large Constants
+; CHECK-NEXT: .quad 2147483648
+; CHECK-NEXT: .quad 4294967295
+; CHECK-NEXT: .quad 4294967296
+; Callsites
; Constant arguments
;
-; CHECK-NEXT: .long 1
+; CHECK-NEXT: .quad 1
; CHECK-NEXT: .long L{{.*}}-_constantargs
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 12
; SmallConstant
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 65535
+; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
; SmallConstant
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 8
@@ -31,24 +78,58 @@
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2000000000
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2147483647
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
; LargeConstant at index 0
; CHECK-NEXT: .byte 5
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
+; LargeConstant at index 1
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 1
+; LargeConstant at index 2
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
define void @constantargs() {
entry:
%0 = inttoptr i64 12345 to i8*
- tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 1, i32 15, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 15, i8* %0, i32 0, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
ret void
}
; Inline OSR Exit
;
-; CHECK-NEXT: .long 3
-; CHECK-NEXT: .long L{{.*}}-_osrinline
+; CHECK-LABEL: .long L{{.*}}-_osrinline
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 2
; CHECK-NEXT: .byte 1
@@ -64,7 +145,7 @@ entry:
; Runtime void->void call.
call void inttoptr (i64 -559038737 to void ()*)()
; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
- call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 3, i32 12, i64 %a, i64 %b)
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
ret void
}
@@ -72,8 +153,7 @@ entry:
;
; 2 live variables in register.
;
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long L{{.*}}-_osrcold
+; CHECK-LABEL: .long L{{.*}}-_osrcold
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 2
; CHECK-NEXT: .byte 1
@@ -83,7 +163,7 @@ entry:
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 0
define void @osrcold(i64 %a, i64 %b) {
entry:
%test = icmp slt i64 %a, %b
@@ -91,40 +171,48 @@ entry:
cold:
; OSR patchpoint with 12-byte nop-slide and 2 live vars.
%thunk = inttoptr i64 -559038737 to i8*
- call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
unreachable
ret:
ret void
}
; Property Read
-; CHECK-NEXT: .long 5
-; CHECK-NEXT: .long L{{.*}}-_propertyRead
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 0
-;
-; FIXME: There are currently no stackmap entries. After moving to
-; AnyRegCC, we will have entries for the object and return value.
+; CHECK-LABEL: .long L{{.*}}-_propertyRead
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
define i64 @propertyRead(i64* %obj) {
entry:
%resolveRead = inttoptr i64 -559038737 to i8*
- %result = call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
+ %result = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
%add = add i64 %result, 3
ret i64 %add
}
; Property Write
-; CHECK-NEXT: .long 6
-; CHECK-NEXT: .long L{{.*}}-_propertyWrite
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 0
-;
-; FIXME: There are currently no stackmap entries. After moving to
-; AnyRegCC, we will have entries for the object and return value.
+; CHECK-LABEL: .long L{{.*}}-_propertyWrite
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
entry:
%resolveWrite = inttoptr i64 -559038737 to i8*
- call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
ret void
}
@@ -132,8 +220,7 @@ entry:
;
; 2 live variables in registers.
;
-; CHECK-NEXT: .long 7
-; CHECK-NEXT: .long L{{.*}}-_jsVoidCall
+; CHECK-LABEL: .long L{{.*}}-_jsVoidCall
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 2
; CHECK-NEXT: .byte 1
@@ -147,7 +234,7 @@ entry:
define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 -559038737 to i8*
- call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
ret void
}
@@ -155,8 +242,7 @@ entry:
;
; 2 live variables in registers.
;
-; CHECK: .long 8
-; CHECK-NEXT: .long L{{.*}}-_jsIntCall
+; CHECK-LABEL: .long L{{.*}}-_jsIntCall
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 2
; CHECK-NEXT: .byte 1
@@ -170,7 +256,7 @@ entry:
define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 -559038737 to i8*
- %result = call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
%add = add i64 %result, 3
ret i64 %add
}
@@ -179,19 +265,18 @@ entry:
;
; Verify 17 stack map entries.
;
-; CHECK: .long 11
-; CHECK-NEXT: .long L{{.*}}-_spilledValue
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 17
+; CHECK-LABEL: .long L{{.*}}-_spilledValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 17
;
; Check that at least one is a spilled entry from RBP.
; Location: Indirect RBP + ...
-; CHECK: .byte 3
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short 6
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
entry:
- call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
ret void
}
@@ -199,35 +284,33 @@ entry:
;
; Verify 17 stack map entries.
;
-; CHECK: .long 12
-; CHECK-LABEL: .long L{{.*}}-_spilledStackMapValue
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 17
+; CHECK-LABEL: .long L{{.*}}-_spilledStackMapValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 17
;
; Check that at least one is a spilled entry from RBP.
; Location: Indirect RBP + ...
-; CHECK: .byte 3
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short 6
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
entry:
- call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
ret void
}
; Spill a subregister stackmap operand.
;
-; CHECK: .long 13
-; CHECK-LABEL: .long L{{.*}}-_spillSubReg
-; CHECK-NEXT: .short 0
+; CHECK-LABEL: .long L{{.*}}-_spillSubReg
+; CHECK-NEXT: .short 0
; 4 locations
-; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 1
;
; Check that the subregister operand is a 4-byte spill.
; Location: Indirect, 4-byte, RBP + ...
-; CHECK: .byte 3
-; CHECK-NEXT: .byte 4
-; CHECK-NEXT: .short 6
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short 6
define void @spillSubReg(i64 %arg) #0 {
bb:
br i1 undef, label %bb1, label %bb2
@@ -248,7 +331,7 @@ bb17:
bb60:
tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
- tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 13, i32 5, i32 %tmp32)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 5, i32 %tmp32)
unreachable
bb61:
@@ -258,24 +341,23 @@ bb61:
; Map a single byte subregister. There is no DWARF register number, so
; we expect the register to be encoded with the proper size and spill offset. We don't know which
;
-; CHECK: .long 14
-; CHECK-LABEL: .long L{{.*}}-_subRegOffset
-; CHECK-NEXT: .short 0
+; CHECK-LABEL: .long L{{.*}}-_subRegOffset
+; CHECK-NEXT: .short 0
; 2 locations
-; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 2
;
; Check that the subregister operands are 1-byte spills.
; Location 0: Register, 4-byte, AL
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
;
; Location 1: Register, 4-byte, BL
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .short 3
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 3
+; CHECK-NEXT: .long 0
define void @subRegOffset(i16 %arg) {
%v = mul i16 %arg, 5
%a0 = trunc i16 %v to i8
@@ -283,10 +365,105 @@ define void @subRegOffset(i16 %arg) {
%arghi = lshr i16 %v, 8
%a1 = trunc i16 %arghi to i8
tail call void asm sideeffect "nop", "~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
- tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 5, i8 %a0, i8 %a1)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 5, i8 %a0, i8 %a1)
+ ret void
+}
+
+; Map a constant value.
+;
+; CHECK-LABEL: .long L{{.*}}-_liveConstant
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 33
+
+define void @liveConstant() {
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
+ ret void
+}
+
+; Directly map an alloca's address.
+;
+; Callsite 16
+; CHECK-LABEL: .long L{{.*}}-_directFrameIdx
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+
+; Callsite 17
+; CHECK-LABEL: .long L{{.*}}-_directFrameIdx
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 1: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+define void @directFrameIdx() {
+entry:
+ %metadata1 = alloca i64, i32 3, align 8
+ store i64 11, i64* %metadata1
+ store i64 12, i64* %metadata1
+ store i64 13, i64* %metadata1
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
+ %metadata2 = alloca i8, i32 4, align 8
+ %metadata3 = alloca i16, i32 4, align 8
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 17, i32 5, i8* null, i32 0, i8* %metadata2, i16* %metadata3)
+ ret void
+}
+
+; Test a 64-bit ID.
+;
+; CHECK: .quad 4294967295
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad 4294967296
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad 9223372036854775807
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad -1
+; CHECK-LABEL: .long L{{.*}}-_longid
+define void @longid() {
+entry:
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4294967295, i32 0, i8* null, i32 0)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4294967296, i32 0, i8* null, i32 0)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 9223372036854775807, i32 0, i8* null, i32 0)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 -1, i32 0, i8* null, i32 0)
+ ret void
+}
+
+; Map a value when R11 is the only free register.
+; The scratch register should not be used for a live stackmap value.
+;
+; CHECK-LABEL: .long L{{.*}}-_clobberScratch
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: Indirect fp - offset
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long -{{[0-9]+}}
+define void @clobberScratch(i32 %a) {
+ tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r12},~{r13},~{r14},~{r15}"() nounwind
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
ret void
}
-declare void @llvm.experimental.stackmap(i32, i32, ...)
-declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
-declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/stdcall-notailcall.ll b/test/CodeGen/X86/stdcall-notailcall.ll
index 8f522cd..448db4c 100644
--- a/test/CodeGen/X86/stdcall-notailcall.ll
+++ b/test/CodeGen/X86/stdcall-notailcall.ll
@@ -4,10 +4,18 @@
define x86_stdcallcc void @bar(%struct.I* nocapture %this) ssp align 2 {
; CHECK-LABEL: bar:
; CHECK-NOT: jmp
-; CHECK: ret $4
+; CHECK: retl $4
entry:
tail call void @foo()
ret void
}
+define x86_thiscallcc void @test2(%struct.I* %this, i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK: calll _foo
+; CHECK: retl $4
+ tail call void @foo()
+ ret void
+}
+
declare void @foo()
diff --git a/test/CodeGen/X86/stdcall.ll b/test/CodeGen/X86/stdcall.ll
index 73826ed..3cefe14 100644
--- a/test/CodeGen/X86/stdcall.ll
+++ b/test/CodeGen/X86/stdcall.ll
@@ -6,14 +6,14 @@
define internal x86_stdcallcc void @MyFunc() nounwind {
entry:
; CHECK: MyFunc@0:
-; CHECK: ret
+; CHECK: retl
ret void
}
; PR14410
define x86_stdcallcc i32 @"\01DoNotMangle"(i32 %a) {
; CHECK: DoNotMangle:
-; CHECK: ret $4
+; CHECK: retl $4
entry:
ret i32 %a
}
diff --git a/test/CodeGen/X86/stores-merging.ll b/test/CodeGen/X86/stores-merging.ll
new file mode 100644
index 0000000..61dea08
--- /dev/null
+++ b/test/CodeGen/X86/stores-merging.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%structTy = type { i8, i32, i32 }
+
+@e = common global %structTy zeroinitializer, align 4
+
+; CHECK-LABEL: f
+define void @f() {
+entry:
+
+; CHECK: movabsq $528280977409, %rax
+; CHECK: movq %rax, e+4(%rip)
+; CHECK: movl $456, e+8(%rip)
+
+ store i32 1, i32* getelementptr inbounds (%structTy* @e, i64 0, i32 1), align 4
+ store i32 123, i32* getelementptr inbounds (%structTy* @e, i64 0, i32 2), align 4
+ store i32 456, i32* getelementptr inbounds (%structTy* @e, i64 0, i32 2), align 4
+ ret void
+}
+
diff --git a/test/CodeGen/X86/sunkaddr-ext.ll b/test/CodeGen/X86/sunkaddr-ext.ll
new file mode 100644
index 0000000..6d23867
--- /dev/null
+++ b/test/CodeGen/X86/sunkaddr-ext.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+
+; Test to make sure that if math that can roll over has been used we don't
+; use the potential overflow as the basis for an address calculation later by
+; sinking it into a different basic block.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @test_sink(i8* %arg1, i32 %arg2, i8 %arg3) #0 {
+ %tmp1 = add i32 -2147483648, %arg2
+ %tmp2 = add i32 -2147483648, %tmp1
+ %tmp3 = getelementptr i8* %arg1, i32 %arg2
+ br label %bb1
+
+bb1:
+ %tmp4 = getelementptr i8* %arg1, i32 %tmp2
+ store i8 %arg3, i8* %tmp4
+ ret void;
+}
+
+; CHECK-LABEL: test_sink:
+; CHECK: movslq %esi, [[TEMP:%[a-z0-9]+]]
+; CHECK: movb %dl, (%rdi,[[TEMP]])
+; CHECK: retq
diff --git a/test/CodeGen/X86/tbm-intrinsics-x86_64.ll b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
index 1bc6175..1beee72 100644
--- a/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
@@ -34,7 +34,7 @@ declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone
define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
entry:
- ; CHECK-LABEl: test_x86_tbm_bextri_u64_m:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
; CHECK-NOT: mov
; CHECK: bextr $
%tmp1 = load i64* %a, align 8
diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll
index f2bebf5..dab5e7b 100644
--- a/test/CodeGen/X86/v2f32.ll
+++ b/test/CodeGen/X86/v2f32.ll
@@ -24,9 +24,9 @@ define void @test1(<2 x float> %Q, float *%P2) nounwind {
; W64-NEXT: ret
; X32-LABEL: test1:
+; X32-NEXT: movl 4(%esp), %eax
; X32-NEXT: pshufd $1, %xmm0, %xmm1
; X32-NEXT: addss %xmm0, %xmm1
-; X32-NEXT: movl 4(%esp), %eax
; X32-NEXT: movss %xmm1, (%eax)
; X32-NEXT: ret
}
diff --git a/test/CodeGen/X86/v4i32load-crash.ll b/test/CodeGen/X86/v4i32load-crash.ll
index 052c4c3..3e7f9e6 100644
--- a/test/CodeGen/X86/v4i32load-crash.ll
+++ b/test/CodeGen/X86/v4i32load-crash.ll
@@ -1,10 +1,11 @@
-; RUN: llc --mcpu=x86-64 --mattr=ssse3 < %s
+; RUN: llc --march=x86 --mcpu=x86-64 --mattr=ssse3 < %s
+; RUN: llc --march=x86-64 --mcpu=x86-64 --mattr=ssse3 < %s
;PR18045:
;Issue of selection for 'v4i32 load'.
;This instruction is not legal for X86 CPUs with sse < 'sse4.1'.
;This node was generated by X86ISelLowering.cpp, EltsFromConsecutiveLoads
-;static function after legilize stage.
+;static function after legalize stage.
@e = external global [4 x i32], align 4
@f = external global [4 x i32], align 4
diff --git a/test/CodeGen/X86/vaargs.ll b/test/CodeGen/X86/vaargs.ll
new file mode 100644
index 0000000..ddeb7a3
--- /dev/null
+++ b/test/CodeGen/X86/vaargs.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mcpu=corei7-avx %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=NO-FLAGS
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+; Check that vastart gets the right thing.
+define i32 @sum(i32 %count, ...) nounwind optsize ssp uwtable {
+; CHECK: testb %al, %al
+; CHECK-NEXT: je
+; CHECK-NEXT: ## BB#{{[0-9]+}}:
+; CHECK-NEXT: vmovaps %xmm0, 48(%rsp)
+; CHECK-NEXT: vmovaps %xmm1, 64(%rsp)
+; CHECK-NEXT: vmovaps %xmm2, 80(%rsp)
+; CHECK-NEXT: vmovaps %xmm3, 96(%rsp)
+; CHECK-NEXT: vmovaps %xmm4, 112(%rsp)
+; CHECK-NEXT: vmovaps %xmm5, 128(%rsp)
+; CHECK-NEXT: vmovaps %xmm6, 144(%rsp)
+; CHECK-NEXT: vmovaps %xmm7, 160(%rsp)
+
+; Check that [EFLAGS] hasn't been pulled in.
+; NO-FLAGS-NOT: %flags
+
+ %ap = alloca [1 x %struct.__va_list_tag], align 16
+ %1 = bitcast [1 x %struct.__va_list_tag]* %ap to i8*
+ call void @llvm.va_start(i8* %1)
+ %2 = icmp sgt i32 %count, 0
+ br i1 %2, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %3 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 0
+ %4 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 2
+ %.pre = load i32* %3, align 16
+ br label %5
+
+; <label>:5 ; preds = %.lr.ph, %13
+ %6 = phi i32 [ %.pre, %.lr.ph ], [ %14, %13 ]
+ %.01 = phi i32 [ %count, %.lr.ph ], [ %15, %13 ]
+ %7 = icmp ult i32 %6, 41
+ br i1 %7, label %8, label %10
+
+; <label>:8 ; preds = %5
+ %9 = add i32 %6, 8
+ store i32 %9, i32* %3, align 16
+ br label %13
+
+; <label>:10 ; preds = %5
+ %11 = load i8** %4, align 8
+ %12 = getelementptr i8* %11, i64 8
+ store i8* %12, i8** %4, align 8
+ br label %13
+
+; <label>:13 ; preds = %10, %8
+ %14 = phi i32 [ %6, %10 ], [ %9, %8 ]
+ %15 = add nsw i32 %.01, 1
+ %16 = icmp sgt i32 %15, 0
+ br i1 %16, label %5, label %._crit_edge
+
+._crit_edge: ; preds = %13, %0
+ %.0.lcssa = phi i32 [ %count, %0 ], [ %15, %13 ]
+ call void @llvm.va_end(i8* %1)
+ ret i32 %.0.lcssa
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @llvm.va_end(i8*) nounwind
diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll
new file mode 100644
index 0000000..6017753
--- /dev/null
+++ b/test/CodeGen/X86/vastart-defs-eflags.ll
@@ -0,0 +1,23 @@
+; RUN: llc %s -o - | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Check that vastart handling doesn't get between testb and je for the branch.
+define i32 @check_flag(i32 %flags, ...) nounwind {
+entry:
+; CHECK: {{^}} testb $2, %bh
+; CHECK-NOT: test
+; CHECK: {{^}} je
+ %and = and i32 %flags, 512
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %hasflag = phi i32 [ 1, %if.then ], [ 0, %entry ]
+ ret i32 %hasflag
+}
+
diff --git a/test/CodeGen/X86/vbinop-simplify-bug.ll b/test/CodeGen/X86/vbinop-simplify-bug.ll
new file mode 100644
index 0000000..3a89cd7
--- /dev/null
+++ b/test/CodeGen/X86/vbinop-simplify-bug.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 -mcpu=corei7 -o /dev/null
+
+; Revision 199135 introduced a wrong check in method
+; DAGCombiner::SimplifyVBinOp in an attempt to refactor some code
+; using the new method 'BuildVectorSDNode::isConstant' when possible.
+;
+; However the modified code in method SimplifyVBinOp now wrongly
+; checks that the operands of a vector bin-op are both constants.
+;
+; With that wrong change, this test started failing because of a
+; 'fatal error in the backend':
+; Cannot select: 0x2e329d0: v4i32 = BUILD_VECTOR 0x2e2ea00, 0x2e2ea00, 0x2e2ea00, 0x2e2ea00
+; 0x2e2ea00: i32 = Constant<1> [ID=4]
+; 0x2e2ea00: i32 = Constant<1> [ID=4]
+; 0x2e2ea00: i32 = Constant<1> [ID=4]
+; 0x2e2ea00: i32 = Constant<1> [ID=4]
+
+define <8 x i32> @reduced_test_case() {
+ %Shuff = shufflevector <8 x i32> zeroinitializer, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 undef, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %B23 = sub <8 x i32> %Shuff, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <8 x i32> %B23
+}
+
diff --git a/test/CodeGen/X86/vec_round.ll b/test/CodeGen/X86/vec_round.ll
index baa2f58..9258f9e 100644
--- a/test/CodeGen/X86/vec_round.ll
+++ b/test/CodeGen/X86/vec_round.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu"
declare void @use(<2 x double>)
; CHECK-LABEL: @test
-; CHECK callq round
+; CHECK: callq round
; Function Attrs: nounwind uwtable
define void @test() {
diff --git a/test/CodeGen/X86/vec_setcc-2.ll b/test/CodeGen/X86/vec_setcc-2.ll
new file mode 100644
index 0000000..ef916dc
--- /dev/null
+++ b/test/CodeGen/X86/vec_setcc-2.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -o - -mcpu=generic -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=generic -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s
+
+; For a setult against a constant, turn it into a setule and lower via psubusw.
+
+define void @loop_no_const_reload(<2 x i64>* %in, <2 x i64>* %out, i32 %n) {
+; CHECK: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-LABEL: loop_no_const_reload:
+; CHECK: psubusw
+
+; Constant is no longer clobbered so no need to reload it in the loop.
+
+; CHECK-NOT: movdqa {{%xmm[0-9]+}}, {{%xmm[0-9]+}}
+
+entry:
+ %cmp9 = icmp eq i32 %n, 0
+ br i1 %cmp9, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx1 = getelementptr inbounds <2 x i64>* %in, i64 %indvars.iv
+ %arrayidx1.val = load <2 x i64>* %arrayidx1, align 16
+ %0 = bitcast <2 x i64> %arrayidx1.val to <8 x i16>
+ %cmp.i.i = icmp ult <8 x i16> %0, <i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26>
+ %sext.i.i = sext <8 x i1> %cmp.i.i to <8 x i16>
+ %1 = bitcast <8 x i16> %sext.i.i to <2 x i64>
+ %arrayidx5 = getelementptr inbounds <2 x i64>* %out, i64 %indvars.iv
+ store <2 x i64> %1, <2 x i64>* %arrayidx5, align 16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Be careful if decrementing the constant would underflow.
+
+define void @loop_const_folding_underflow(<2 x i64>* %in, <2 x i64>* %out, i32 %n) {
+; CHECK-NOT: .short 25
+; CHECK-LABEL: loop_const_folding_underflow:
+; CHECK-NOT: psubusw
+entry:
+ %cmp9 = icmp eq i32 %n, 0
+ br i1 %cmp9, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx1 = getelementptr inbounds <2 x i64>* %in, i64 %indvars.iv
+ %arrayidx1.val = load <2 x i64>* %arrayidx1, align 16
+ %0 = bitcast <2 x i64> %arrayidx1.val to <8 x i16>
+ %cmp.i.i = icmp ult <8 x i16> %0, <i16 0, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26>
+ %sext.i.i = sext <8 x i1> %cmp.i.i to <8 x i16>
+ %1 = bitcast <8 x i16> %sext.i.i to <2 x i64>
+ %arrayidx5 = getelementptr inbounds <2 x i64>* %out, i64 %indvars.iv
+ store <2 x i64> %1, <2 x i64>* %arrayidx5, align 16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Test for PSUBUSB
+
+define <16 x i8> @test_ult_byte(<16 x i8> %a) {
+; CHECK: .space 16,10
+; CHECK-LABEL: test_ult_byte:
+; CHECK: psubus
+entry:
+ %icmp = icmp ult <16 x i8> %a, <i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11>
+ %sext = sext <16 x i1> %icmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+
+; Only do this when we can turn the comparison into a setule. I.e. not for
+; register operands.
+
+define <8 x i16> @test_ult_register(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_ult_register:
+; CHECK-NOT: psubus
+entry:
+ %icmp = icmp ult <8 x i16> %a, %b
+ %sext = sext <8 x i1> %icmp to <8 x i16>
+ ret <8 x i16> %sext
+}
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index fc8a56d..322dbae 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -42,12 +42,9 @@ define <8 x i16> @v8i16_icmp_uge(<8 x i16> %a, <8 x i16> %b) nounwind readnone s
%2 = sext <8 x i1> %1 to <8 x i16>
ret <8 x i16> %2
; SSE2-LABEL: v8i16_icmp_uge:
-; SSE2: movdqa {{.*}}(%rip), %xmm2
-; SEE2: pxor %xmm2, %xmm0
-; SSE2: pxor %xmm1, %xmm2
-; SSE2: pcmpgtw %xmm0, %xmm2
-; SSE2: pcmpeqd %xmm0, %xmm0
-; SSE2: pxor %xmm2, %xmm0
+; SSE2: psubusw %xmm0, %xmm1
+; SSE2: pxor %xmm0, %xmm0
+; SSE2: pcmpeqw %xmm1, %xmm0
; SSE41-LABEL: v8i16_icmp_uge:
; SSE41: pmaxuw %xmm0, %xmm1
@@ -63,12 +60,9 @@ define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone s
%2 = sext <8 x i1> %1 to <8 x i16>
ret <8 x i16> %2
; SSE2-LABEL: v8i16_icmp_ule:
-; SSE2: movdqa {{.*}}(%rip), %xmm2
-; SSE2: pxor %xmm2, %xmm1
-; SSE2: pxor %xmm2, %xmm0
-; SSE2: pcmpgtw %xmm1, %xmm0
-; SSE2: pcmpeqd %xmm1, %xmm1
-; SSE2: pxor %xmm0, %xmm1
+; SSE2: psubusw %xmm1, %xmm0
+; SSE2: pxor %xmm1, %xmm1
+; SSE2: pcmpeqw %xmm0, %xmm1
; SSE2: movdqa %xmm1, %xmm0
; SSE41-LABEL: v8i16_icmp_ule:
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
index e2fe45c..b266a69 100644
--- a/test/CodeGen/X86/vec_shift4.ll
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/vec_shift5.ll b/test/CodeGen/X86/vec_shift5.ll
new file mode 100644
index 0000000..2e98003
--- /dev/null
+++ b/test/CodeGen/X86/vec_shift5.ll
@@ -0,0 +1,160 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we correctly fold target-specific packed vector shifts by
+; immediate count into a simple build_vector when the elements of the input
+; vector to the packed shift are all constants or undef.
+
+define <8 x i16> @test1() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test2() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test3() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test4() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test5() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test6() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test7() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test8() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test9() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test10() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test11() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test12() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test13() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test14() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test15() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test16() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)
+
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
new file mode 100644
index 0000000..df2d9cb
--- /dev/null
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512
+
+
+; Verify that we don't scalarize a packed vector shift left of 16-bit
+; signed integers if the amount is a constant build_vector.
+; Check that we produce a SSE2 packed integer multiply (pmullw) instead.
+
+define <8 x i16> @test1(<8 x i16> %a) {
+ %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+ ret <8 x i16> %shl
+}
+; CHECK-LABEL: test1
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+
+define <8 x i16> @test2(<8 x i16> %a) {
+ %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
+ ret <8 x i16> %shl
+}
+; CHECK-LABEL: test2
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+
+; Verify that a vector shift left of 32-bit signed integers is simply expanded
+; into a SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
+; counts is a constant build_vector.
+
+define <4 x i32> @test3(<4 x i32> %a) {
+ %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
+ ret <4 x i32> %shl
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: cvttps2dq
+; SSE: pmulld
+; AVX2: vpsllvd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a) {
+ %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
+ ret <4 x i32> %shl
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: cvttps2dq
+; SSE: pmulld
+; AVX2: vpsllvd
+; CHECK-NEXT: ret
+
+
+; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
+; into two pmullw instructions. With AVX2, the test case below would produce
+; a single vpmullw.
+
+define <16 x i16> @test5(<16 x i16> %a) {
+ %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+ ret <16 x i16> %shl
+}
+; CHECK-LABEL: test5
+; SSE: pmullw
+; SSE-NEXT: pmullw
+; AVX2: vpmullw
+; AVX2-NOT: vpmullw
+; CHECK: ret
+
+
+; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
+; into two pmulld instructions. With AVX2, the test case below would produce
+; a single vpsllvd instead.
+
+define <8 x i32> @test6(<8 x i32> %a) {
+ %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
+ ret <8 x i32> %shl
+}
+; CHECK-LABEL: test6
+; SSE: pmulld
+; SSE-NEXT: pmulld
+; AVX2: vpsllvd
+; CHECK: ret
+
+
+; With AVX2 and AVX512, the test case below should produce a sequence of
+; two vpmullw instructions. On SSE2 instead, we split the shift in four
+; parts and then we convert each part into a pmullw.
+
+define <32 x i16> @test7(<32 x i16> %a) {
+ %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+ ret <32 x i16> %shl
+}
+; CHECK-LABEL: test7
+; SSE: pmullw
+; SSE-NEXT: pmullw
+; SSE-NEXT: pmullw
+; SSE-NEXT: pmullw
+; AVX2: vpmullw
+; AVX2-NEXT: vpmullw
+; CHECK: ret
+
+
+; Similar to test7; the difference is that with AVX512 support
+; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
+
+define <16 x i32> @test8(<16 x i32> %a) {
+ %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
+ ret <16 x i32> %shl
+}
+; CHECK-LABEL: test8
+; SSE: pmulld
+; SSE-NEXT: pmulld
+; SSE-NEXT: pmulld
+; SSE-NEXT: pmulld
+; AVX2ONLY: vpsllvd
+; AVX2ONLY-NEXT: vpsllvd
+; AVX512: vpsllvd
+; AVX512-NOT: vpsllvd
+; CHECK: ret
+
+
+; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512f support.
+
+define <8 x i64> @test9(<8 x i64> %a) {
+ %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
+ ret <8 x i64> %shl
+}
+; CHECK-LABEL: test9
+; AVX2ONLY: vpsllvq
+; AVX2ONLY-NEXT: vpsllvq
+; AVX512: vpsllvq
+; AVX512-NOT: vpsllvq
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/vec_shuf-insert.ll b/test/CodeGen/X86/vec_shuf-insert.ll
new file mode 100644
index 0000000..2e1a1d6
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuf-insert.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s
+
+; These tests check that an insert_subvector which replaces one of the halves
+; of a concat_vectors is optimized into a single vinsertf128.
+
+
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
+
+define <8 x float> @lower_half(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) {
+ %1 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %1, <4 x float> %v3, i8 0)
+ ret <8 x float> %2
+
+; CHECK-LABEL: lower_half
+; CHECK-NOT: vinsertf128
+; CHECK: vinsertf128 $1, %xmm1, %ymm2, %ymm0
+; CHECK-NEXT: ret
+}
+
+define <8 x float> @upper_half(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) {
+ %1 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %1, <4 x float> %v3, i8 1)
+ ret <8 x float> %2
+
+; CHECK-LABEL: upper_half
+; CHECK-NOT: vinsertf128
+; CHECK: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/vec_shuffle-40.ll b/test/CodeGen/X86/vec_shuffle-40.ll
new file mode 100644
index 0000000..75b45e3
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-40.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+
+define void @shuffle_v16i16(<16 x i16>* %a) {
+; CHECK-LABEL: shuffle_v16i16:
+; CHECK: vpshufb {{.*}}%ymm
+; CHECK-NOT: vpshufb {{.*}}%xmm
+entry:
+ %0 = load <16 x i16>* %a, align 32
+ %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ store <16 x i16> %shuffle, <16 x i16>* %a, align 32
+ ret void
+}
+
+define void @shuffle_v16i16_lanecrossing(<16 x i16>* %a) {
+; CHECK-LABEL: shuffle_v16i16_lanecrossing:
+; CHECK-NOT: vpshufb {{.*}}%ymm
+entry:
+ %0 = load <16 x i16>* %a, align 32
+ %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 13, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ store <16 x i16> %shuffle, <16 x i16>* %a, align 32
+ ret void
+}
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index b87d844..9c68f44 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -4,22 +4,26 @@
;CHECK-LABEL: AGEP0:
define <4 x i32*> @AGEP0(i32* %ptr) nounwind {
entry:
+;CHECK-LABEL: AGEP0
+;CHECK: vbroadcast
+;CHECK-NEXT: vpaddd
+;CHECK-NEXT: ret
%vecinit.i = insertelement <4 x i32*> undef, i32* %ptr, i32 0
%vecinit2.i = insertelement <4 x i32*> %vecinit.i, i32* %ptr, i32 1
%vecinit4.i = insertelement <4 x i32*> %vecinit2.i, i32* %ptr, i32 2
%vecinit6.i = insertelement <4 x i32*> %vecinit4.i, i32* %ptr, i32 3
-;CHECK: padd
%A2 = getelementptr <4 x i32*> %vecinit6.i, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-;CHECK: padd
%A3 = getelementptr <4 x i32*> %A2, <4 x i32> <i32 10, i32 14, i32 19, i32 233>
ret <4 x i32*> %A3
-;CHECK: ret
}
;CHECK-LABEL: AGEP1:
define i32 @AGEP1(<4 x i32*> %param) nounwind {
entry:
-;CHECK: padd
+;CHECK-LABEL: AGEP1
+;CHECK: vpaddd
+;CHECK-NEXT: vpextrd
+;CHECK-NEXT: movl
%A2 = getelementptr <4 x i32*> %param, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
%k = extractelement <4 x i32*> %A2, i32 3
%v = load i32* %k
@@ -30,8 +34,9 @@ entry:
;CHECK-LABEL: AGEP2:
define i32 @AGEP2(<4 x i32*> %param, <4 x i32> %off) nounwind {
entry:
-;CHECK: pslld $2
-;CHECK: padd
+;CHECK-LABEL: AGEP2
+;CHECK: vpslld $2
+;CHECK-NEXT: vpadd
%A2 = getelementptr <4 x i32*> %param, <4 x i32> %off
%k = extractelement <4 x i32*> %A2, i32 3
%v = load i32* %k
@@ -42,8 +47,9 @@ entry:
;CHECK-LABEL: AGEP3:
define <4 x i32*> @AGEP3(<4 x i32*> %param, <4 x i32> %off) nounwind {
entry:
-;CHECK: pslld $2
-;CHECK: padd
+;CHECK-LABEL: AGEP3
+;CHECK: vpslld $2
+;CHECK-NEXT: vpadd
%A2 = getelementptr <4 x i32*> %param, <4 x i32> %off
%v = alloca i32
%k = insertelement <4 x i32*> %A2, i32* %v, i32 3
@@ -54,10 +60,11 @@ entry:
;CHECK-LABEL: AGEP4:
define <4 x i16*> @AGEP4(<4 x i16*> %param, <4 x i32> %off) nounwind {
entry:
+;CHECK-LABEL: AGEP4
; Multiply offset by two (add it to itself).
-;CHECK: padd
+;CHECK: vpadd
; add the base to the offset
-;CHECK: padd
+;CHECK-NEXT: vpadd
%A = getelementptr <4 x i16*> %param, <4 x i32> %off
ret <4 x i16*> %A
;CHECK: ret
@@ -66,7 +73,8 @@ entry:
;CHECK-LABEL: AGEP5:
define <4 x i8*> @AGEP5(<4 x i8*> %param, <4 x i8> %off) nounwind {
entry:
-;CHECK: paddd
+;CHECK-LABEL: AGEP5
+;CHECK: vpaddd
%A = getelementptr <4 x i8*> %param, <4 x i8> %off
ret <4 x i8*> %A
;CHECK: ret
@@ -77,6 +85,7 @@ entry:
;CHECK-LABEL: AGEP6:
define <4 x i8*> @AGEP6(<4 x i8*> %param, <4 x i32> %off) nounwind {
entry:
+;CHECK-LABEL: AGEP6
;CHECK-NOT: pslld
%A = getelementptr <4 x i8*> %param, <4 x i32> %off
ret <4 x i8*> %A
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index 0be00da..d9f2cb0 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -march=x86-64 -mcpu=x86-64 | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSSE3
; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2
+; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck %s -check-prefix=AVX512
define <4 x i32> @test1(<4 x i32> %a) nounwind {
; SSE2-LABEL: test1:
@@ -17,6 +18,10 @@ define <4 x i32> @test1(<4 x i32> %a) nounwind {
; AVX2-LABEL: test1:
; AVX2: vpabsd
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test1:
+; AVX512: vpabsd
+; AVX512-NEXT: ret
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
%abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
@@ -38,6 +43,10 @@ define <4 x i32> @test2(<4 x i32> %a) nounwind {
; AVX2-LABEL: test2:
; AVX2: vpabsd
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test2:
+; AVX512: vpabsd
+; AVX512-NEXT: ret
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sge <4 x i32> %a, zeroinitializer
%abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
@@ -59,6 +68,10 @@ define <8 x i16> @test3(<8 x i16> %a) nounwind {
; AVX2-LABEL: test3:
; AVX2: vpabsw
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test3:
+; AVX512: vpabsw
+; AVX512-NEXT: ret
%tmp1neg = sub <8 x i16> zeroinitializer, %a
%b = icmp sgt <8 x i16> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
@@ -80,6 +93,10 @@ define <16 x i8> @test4(<16 x i8> %a) nounwind {
; AVX2-LABEL: test4:
; AVX2: vpabsb
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test4:
+; AVX512: vpabsb
+; AVX512-NEXT: ret
%tmp1neg = sub <16 x i8> zeroinitializer, %a
%b = icmp slt <16 x i8> %a, zeroinitializer
%abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
@@ -101,6 +118,10 @@ define <4 x i32> @test5(<4 x i32> %a) nounwind {
; AVX2-LABEL: test5:
; AVX2: vpabsd
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test5:
+; AVX512: vpabsd
+; AVX512-NEXT: ret
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sle <4 x i32> %a, zeroinitializer
%abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a
@@ -116,6 +137,10 @@ define <8 x i32> @test6(<8 x i32> %a) nounwind {
; AVX2-LABEL: test6:
; AVX2: vpabsd {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test6:
+; AVX512: vpabsd {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sgt <8 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
@@ -131,6 +156,10 @@ define <8 x i32> @test7(<8 x i32> %a) nounwind {
; AVX2-LABEL: test7:
; AVX2: vpabsd {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test7:
+; AVX512: vpabsd {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sge <8 x i32> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
@@ -146,6 +175,10 @@ define <16 x i16> @test8(<16 x i16> %a) nounwind {
; AVX2-LABEL: test8:
; AVX2: vpabsw {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test8:
+; AVX512: vpabsw {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <16 x i16> zeroinitializer, %a
%b = icmp sgt <16 x i16> %a, zeroinitializer
%abs = select <16 x i1> %b, <16 x i16> %a, <16 x i16> %tmp1neg
@@ -161,6 +194,10 @@ define <32 x i8> @test9(<32 x i8> %a) nounwind {
; AVX2-LABEL: test9:
; AVX2: vpabsb {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test9:
+; AVX512: vpabsb {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <32 x i8> zeroinitializer, %a
%b = icmp slt <32 x i8> %a, zeroinitializer
%abs = select <32 x i1> %b, <32 x i8> %tmp1neg, <32 x i8> %a
@@ -176,8 +213,58 @@ define <8 x i32> @test10(<8 x i32> %a) nounwind {
; AVX2-LABEL: test10:
; AVX2: vpabsd {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test10:
+; AVX512: vpabsd {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sle <8 x i32> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i32> %tmp1neg, <8 x i32> %a
ret <8 x i32> %abs
}
+
+define <16 x i32> @test11(<16 x i32> %a) nounwind {
+; AVX2-LABEL: test11:
+; AVX2: vpabsd
+; AVX2: vpabsd
+; AVX2-NEXT: ret
+
+; AVX512-LABEL: test11:
+; AVX512: vpabsd {{.*}}%zmm
+; AVX512-NEXT: ret
+ %tmp1neg = sub <16 x i32> zeroinitializer, %a
+ %b = icmp sle <16 x i32> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i32> %tmp1neg, <16 x i32> %a
+ ret <16 x i32> %abs
+}
+
+define <8 x i64> @test12(<8 x i64> %a) nounwind {
+; AVX2-LABEL: test12:
+; AVX2: vpxor
+; AVX2: vpxor
+; AVX2-NEXT: ret
+
+; AVX512-LABEL: test12:
+; AVX512: vpabsq {{.*}}%zmm
+; AVX512-NEXT: ret
+ %tmp1neg = sub <8 x i64> zeroinitializer, %a
+ %b = icmp sle <8 x i64> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
+ ret <8 x i64> %abs
+}
+
+define <8 x i64> @test13(<8 x i64>* %a.ptr) nounwind {
+; AVX2-LABEL: test13:
+; AVX2: vpxor
+; AVX2: vpxor
+; AVX2-NEXT: ret
+
+; AVX512-LABEL: test13:
+; AVX512: vpabsq (%
+; AVX512-NEXT: ret
+ %a = load <8 x i64>* %a.ptr, align 8
+ %tmp1neg = sub <8 x i64> zeroinitializer, %a
+ %b = icmp sle <8 x i64> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
+ ret <8 x i64> %abs
+}
diff --git a/test/CodeGen/X86/vselect-2.ll b/test/CodeGen/X86/vselect-2.ll
new file mode 100644
index 0000000..50da32c
--- /dev/null
+++ b/test/CodeGen/X86/vselect-2.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=sse2 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+ %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
+ ret <4 x i32> %select
+}
+; CHECK-LABEL: test1
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+ %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
+ ret <4 x i32> %select
+}
+; CHECK-LABEL: test2
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
+ %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
+ ret <4 x float> %select
+}
+; CHECK-LABEL: test3
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
+ %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
+ ret <4 x float> %select
+}
+; CHECK-LABEL: test4
+; CHECK: movsd
+; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
new file mode 100644
index 0000000..0cf03fc
--- /dev/null
+++ b/test/CodeGen/X86/vselect.ll
@@ -0,0 +1,264 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we don't emit packed vector shift instructions if the
+; condition used by the vector select is a vector of constants.
+
+
+define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+; Fold (vselect (build_vector AllOnes), N1, N2) -> N1
+
+define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: pcmpeq
+; CHECK: ret
+
+define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: pcmpeq
+; CHECK: ret
+
+; Fold (vselect (build_vector AllZeros), N1, N2) -> N2
+
+define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: ret
+
+define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test17
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: ret
+
+define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test18
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+ %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test19
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
+ %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+ %1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test21
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test22
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test23
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
+ %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test24
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
+ %1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test25
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll
index 4363cd9..a060cf8 100644
--- a/test/CodeGen/X86/vshift-4.ll
+++ b/test/CodeGen/X86/vshift-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
; test vector shifts converted to proper SSE2 vector shifts when the shift
; amounts are the same when using a shuffle splat.
diff --git a/test/CodeGen/X86/vshift-6.ll b/test/CodeGen/X86/vshift-6.ll
new file mode 100644
index 0000000..f50d9a6
--- /dev/null
+++ b/test/CodeGen/X86/vshift-6.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mcpu=corei7 -march=x86-64 -mattr=+sse2 | FileCheck %s
+
+; This test makes sure that the compiler does not crash with an
+; assertion failure when trying to fold a vector shift left
+; by immediate count if the type of the input vector is different
+; to the result type.
+;
+; This happens for example when lowering a shift left of a MVT::v16i8 vector.
+; This is custom lowered into the following sequence:
+; count << 5
+; A = VSHLI(MVT::v8i16, r & (char16)15, 4)
+; B = BITCAST MVT::v16i8, A
+; VSELECT(r, B, count);
+; count += count
+; C = VSHLI(MVT::v8i16, r & (char16)63, 2)
+; D = BITCAST MVT::v16i8, C
+; r = VSELECT(r, C, count);
+; count += count
+; VSELECT(r, r+r, count);
+; count = count << 5;
+;
+; Where 'r' is a vector of type MVT::v16i8, and
+; 'count' is the vector shift count.
+
+define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
+entry:
+ store i8 %5, i8* %0
+ %L5 = load i8* %0
+ %I8 = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 %L5, i32 7
+ %B51 = shl <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, %I8
+ ret <16 x i8> %B51
+}
+
+; CHECK-LABEL: do_not_crash
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/warn-stack.ll b/test/CodeGen/X86/warn-stack.ll
index 5979f45..a76fd28 100644
--- a/test/CodeGen/X86/warn-stack.ll
+++ b/test/CodeGen/X86/warn-stack.ll
@@ -12,7 +12,7 @@ entry:
ret void
}
-; CHECK: warning: Stack size limit exceeded (104) in warn.
+; CHECK: warning: stack size limit exceeded (104) in warn
define void @warn() nounwind ssp {
entry:
%buffer = alloca [80 x i8], align 1
diff --git a/test/CodeGen/X86/weak_def_can_be_hidden.ll b/test/CodeGen/X86/weak_def_can_be_hidden.ll
index f78f357..b17f372 100644
--- a/test/CodeGen/X86/weak_def_can_be_hidden.ll
+++ b/test/CodeGen/X86/weak_def_can_be_hidden.ll
@@ -1,26 +1,51 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin11 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin10 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin9 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
+; RUN: llc -mtriple=i686-apple-darwin9 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
+; RUN: llc -mtriple=i686-apple-darwin8 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
-@v1 = linkonce_odr global i32 32
+@v1 = linkonce_odr constant i32 32
; CHECK: .globl _v1
; CHECK: .weak_def_can_be_hidden _v1
+; CHECK-D89: .globl _v1
+; CHECK-D89: .weak_definition _v1
+
define i32 @f1() {
%x = load i32 * @v1
ret i32 %x
}
-@v2 = linkonce_odr global i32 32
+@v2 = linkonce_odr constant i32 32
; CHECK: .globl _v2
; CHECK: .weak_definition _v2
-@v3 = linkonce_odr unnamed_addr global i32 32
-; CHECK: .globl _v3
-; CHECK: .weak_def_can_be_hidden _v3
+; CHECK-D89: .globl _v2
+; CHECK-D89: .weak_definition _v2
define i32* @f2() {
ret i32* @v2
}
+@v3 = linkonce_odr unnamed_addr global i32 32
+; CHECK: .globl _v3
+; CHECK: .weak_def_can_be_hidden _v3
+
+; CHECK-D89: .globl _v3
+; CHECK-D89: .weak_definition _v3
+
define i32* @f3() {
ret i32* @v3
}
+
+@v4 = linkonce_odr global i32 32
+; CHECK: .globl _v4
+; CHECK: .weak_definition _v4
+
+; CHECK-D89: .globl _v4
+; CHECK-D89: .weak_definition _v4
+
+define i32 @f4() {
+ %x = load i32 * @v4
+ ret i32 %x
+}
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 26815a4..41bea85 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -149,9 +149,9 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
; CHECK: movdqa
; CHECK: paddb
; CHECK: paddb
-; CHECK: movq
; CHECK: pextrb
; CHECK: pextrw
+; CHECK: movq
; CHECK: ret
%a = load %i8vec31* %ap, align 16
%b = load %i8vec31* %bp, align 16
diff --git a/test/CodeGen/X86/win32_sret.ll b/test/CodeGen/X86/win32_sret.ll
index a24963a..d8ecd44 100644
--- a/test/CodeGen/X86/win32_sret.ll
+++ b/test/CodeGen/X86/win32_sret.ll
@@ -1,11 +1,11 @@
; We specify -mcpu explicitly to avoid instruction reordering that happens on
; some setups (e.g., Atom) from affecting the output.
; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
-; RUN: llc < %s -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
-; RUN: llc < %s -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
+; RUN: llc < %s -mcpu=core2 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
-; RUN: llc < %s -O0 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
-; RUN: llc < %s -O0 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
+; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
; The SysV ABI used by most Unixes and Mingw on x86 specifies that an sret pointer
; is callee-cleanup. However, in MSVC's cdecl calling convention, sret pointer
@@ -13,16 +13,16 @@
define void @sret1(i8* sret %x) nounwind {
entry:
-; WIN32: sret1
+; WIN32-LABEL: _sret1:
; WIN32: movb $42, (%eax)
; WIN32-NOT: popl %eax
-; WIN32: {{ret$}}
+; WIN32: {{retl$}}
-; MINGW_X86: sret1
-; MINGW_X86: ret $4
+; MINGW_X86-LABEL: _sret1:
+; MINGW_X86: {{retl$}}
-; LINUX: sret1
-; LINUX: ret $4
+; LINUX-LABEL: sret1:
+; LINUX: retl $4
store i8 42, i8* %x, align 4
ret void
@@ -30,16 +30,16 @@ entry:
define void @sret2(i8* sret %x, i8 %y) nounwind {
entry:
-; WIN32: sret2
+; WIN32-LABEL: _sret2:
; WIN32: movb {{.*}}, (%eax)
; WIN32-NOT: popl %eax
-; WIN32: {{ret$}}
+; WIN32: {{retl$}}
-; MINGW_X86: sret2
-; MINGW_X86: ret $4
+; MINGW_X86-LABEL: _sret2:
+; MINGW_X86: {{retl$}}
-; LINUX: sret2
-; LINUX: ret $4
+; LINUX-LABEL: sret2:
+; LINUX: retl $4
store i8 %y, i8* %x
ret void
@@ -47,17 +47,17 @@ entry:
define void @sret3(i8* sret %x, i8* %y) nounwind {
entry:
-; WIN32: sret3
+; WIN32-LABEL: _sret3:
; WIN32: movb $42, (%eax)
; WIN32-NOT: movb $13, (%eax)
; WIN32-NOT: popl %eax
-; WIN32: {{ret$}}
+; WIN32: {{retl$}}
-; MINGW_X86: sret3
-; MINGW_X86: ret $4
+; MINGW_X86-LABEL: _sret3:
+; MINGW_X86: {{retl$}}
-; LINUX: sret3
-; LINUX: ret $4
+; LINUX-LABEL: sret3:
+; LINUX: retl $4
store i8 42, i8* %x
store i8 13, i8* %y
@@ -69,16 +69,16 @@ entry:
define void @sret4(%struct.S4* noalias sret %agg.result) {
entry:
-; WIN32: sret4
+; WIN32-LABEL: _sret4:
; WIN32: movl $42, (%eax)
; WIN32-NOT: popl %eax
-; WIN32: {{ret$}}
+; WIN32: {{retl$}}
-; MINGW_X86: sret4
-; MINGW_X86: ret $4
+; MINGW_X86-LABEL: _sret4:
+; MINGW_X86: {{retl$}}
-; LINUX: sret4
-; LINUX: ret $4
+; LINUX-LABEL: sret4:
+; LINUX: retl $4
%x = getelementptr inbounds %struct.S4* %agg.result, i32 0, i32 0
store i32 42, i32* %x, align 4
@@ -96,14 +96,16 @@ entry:
%x = getelementptr inbounds %struct.S5* %agg.result, i32 0, i32 0
store i32 42, i32* %x, align 4
ret void
-; WIN32: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
+; WIN32-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
+; MINGW_X86-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
+; LINUX-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
; The address of the return structure is passed as an implicit parameter.
; In the -O0 build, %eax is spilled at the beginning of the function, hence we
; should match both 4(%esp) and 8(%esp).
; WIN32: {{[48]}}(%esp), %eax
; WIN32: movl $42, (%eax)
-; WIN32: ret $4
+; WIN32: retl $4
}
define void @call_foo5() {
@@ -111,7 +113,10 @@ entry:
%c = alloca %class.C5, align 1
%s = alloca %struct.S5, align 4
call x86_thiscallcc void @"\01?foo@C5@@QAE?AUS5@@XZ"(%struct.S5* sret %s, %class.C5* %c)
-; WIN32: {{^}}_call_foo5:
+; WIN32-LABEL: {{^}}_call_foo5:
+; MINGW_X86-LABEL: {{^}}_call_foo5:
+; LINUX-LABEL: {{^}}call_foo5:
+
; Load the address of the result and put it onto stack
; (through %ecx in the -O0 build).
@@ -121,6 +126,35 @@ entry:
; The this pointer goes to ECX.
; WIN32-NEXT: leal {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: calll "?foo@C5@@QAE?AUS5@@XZ"
-; WIN32: ret
+; WIN32: retl
+ ret void
+}
+
+
+%struct.test6 = type { i32, i32, i32 }
+define void @test6_f(%struct.test6* %x) nounwind {
+; WIN32-LABEL: _test6_f:
+; MINGW_X86-LABEL: _test6_f:
+; LINUX-LABEL: test6_f:
+
+; The %x argument is moved to %ecx. It will be the this pointer.
+; WIN32: movl 8(%ebp), %ecx
+
+; The %x argument is moved to (%esp). It will be the this pointer. With -O0
+; we copy esp to ecx and use (ecx) instead of (esp).
+; MINGW_X86: movl 8(%ebp), %eax
+; MINGW_X86: movl %eax, (%e{{([a-d]x)|(sp)}})
+
+; The sret pointer is (%esp)
+; WIN32: leal 8(%esp), %[[REG:e[a-d]x]]
+; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+
+; The sret pointer is %ecx
+; MINGW_X86-NEXT: leal 8(%esp), %ecx
+; MINGW_X86-NEXT: calll _test6_g
+
+ %tmp = alloca %struct.test6, align 4
+ call x86_thiscallcc void @test6_g(%struct.test6* sret %tmp, %struct.test6* %x)
ret void
}
+declare x86_thiscallcc void @test6_g(%struct.test6* sret, %struct.test6*)
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index aff5305..a6b6536 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -12,11 +12,11 @@ entry:
%buf0 = alloca i8, i64 4096, align 1
-; ___chkstk must adjust %rsp.
+; ___chkstk_ms does not adjust %rsp.
; M64: movq %rsp, %rbp
; M64: $4096, %rax
-; M64: callq ___chkstk
-; M64-NOT: %rsp
+; M64: callq ___chkstk_ms
+; M64: subq %rax, %rsp
; __chkstk does not adjust %rsp.
; W64: movq %rsp, %rbp
diff --git a/test/CodeGen/X86/win_chkstk.ll b/test/CodeGen/X86/win_chkstk.ll
index 3f522ea..0c02c1a 100644
--- a/test/CodeGen/X86/win_chkstk.ll
+++ b/test/CodeGen/X86/win_chkstk.ll
@@ -17,7 +17,7 @@ entry:
; WIN_X32: calll __chkstk
; WIN_X64: callq __chkstk
; MINGW_X32: calll __alloca
-; MINGW_X64: callq ___chkstk
+; MINGW_X64: callq ___chkstk_ms
; LINUX-NOT: call __chkstk
%array4096 = alloca [4096 x i8], align 16 ; <[4096 x i8]*> [#uses=0]
ret i32 0
@@ -36,7 +36,7 @@ entry:
; WIN_X64: ret
; MINGW_X64: # BB#0:
-; MINGW_X64-NOT: callq _alloca
+; MINGW_X64-NOT: callq ___chkstk_ms
; MINGW_X64: ret
; LINUX: # BB#0:
@@ -53,7 +53,7 @@ entry:
; WIN_X32: calll __chkstk
; WIN_X64: callq __chkstk
; MINGW_X32: calll __alloca
-; MINGW_X64: callq ___chkstk
+; MINGW_X64: callq ___chkstk_ms
; LINUX-NOT: call __chkstk
%array4096 = alloca [4096 x i8], align 16 ; <[4096 x i8]*> [#uses=0]
ret i32 0
diff --git a/test/CodeGen/X86/x86-64-double-precision-shift-left.ll b/test/CodeGen/X86/x86-64-double-precision-shift-left.ll
new file mode 100644
index 0000000..f2380f2
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-double-precision-shift-left.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that for the architectures that are known to have poor latency
+; double precision shift instructions, we generate an alternative sequence
+; of instructions with lower latencies instead of the shld instruction.
+
+;uint64_t lshift1(uint64_t a, uint64_t b)
+;{
+; return (a << 1) | (b >> 63);
+;}
+
+; CHECK: lshift1:
+; CHECK: addq {{.*}},{{.*}}
+; CHECK-NEXT: shrq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 1
+ %shr = lshr i64 %b, 63
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t lshift2(uint64_t a, uint64_t b)
+;{
+; return (a << 2) | (b >> 62);
+;}
+
+; CHECK: lshift2:
+; CHECK: shlq $2, {{.*}}
+; CHECK-NEXT: shrq $62, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 2
+ %shr = lshr i64 %b, 62
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t lshift7(uint64_t a, uint64_t b)
+;{
+; return (a << 7) | (b >> 57);
+;}
+
+; CHECK: lshift7:
+; CHECK: shlq $7, {{.*}}
+; CHECK-NEXT: shrq $57, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 7
+ %shr = lshr i64 %b, 57
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t lshift63(uint64_t a, uint64_t b)
+;{
+; return (a << 63) | (b >> 1);
+;}
+
+; CHECK: lshift63:
+; CHECK: shlq $63, {{.*}}
+; CHECK-NEXT: shrq {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 63
+ %shr = lshr i64 %b, 1
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
diff --git a/test/CodeGen/X86/x86-64-double-precision-shift-right.ll b/test/CodeGen/X86/x86-64-double-precision-shift-right.ll
new file mode 100644
index 0000000..5edaad8
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-double-precision-shift-right.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that for the architectures that are known to have poor latency
+; double precision shift instructions, we generate an alternative sequence
+; of instructions with lower latencies instead of the shrd instruction.
+
+;uint64_t rshift1(uint64_t a, uint64_t b)
+;{
+; return (a >> 1) | (b << 63);
+;}
+
+; CHECK: rshift1:
+; CHECK: shrq {{.*}}
+; CHECK-NEXT: shlq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @rshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 1
+ %2 = shl i64 %b, 63
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift2(uint64_t a, uint64_t b)
+;{
+; return (a >> 2) | (b << 62);
+;}
+
+; CHECK: rshift2:
+; CHECK: shrq $2, {{.*}}
+; CHECK-NEXT: shlq $62, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 2
+ %2 = shl i64 %b, 62
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift7(uint64_t a, uint64_t b)
+;{
+; return (a >> 7) | (b << 57);
+;}
+
+; CHECK: rshift7:
+; CHECK: shrq $7, {{.*}}
+; CHECK-NEXT: shlq $57, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 7
+ %2 = shl i64 %b, 57
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift63(uint64_t a, uint64_t b)
+;{
+; return (a >> 63) | (b << 1);
+;}
+
+; CHECK: rshift63:
+; CHECK: shrq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+; CHECK-NEXT: orq {{.*}}, {{.*}}
+
+define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 63
+ %2 = shl i64 %b, 1
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
diff --git a/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
new file mode 100644
index 0000000..5d7a10b
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+
+; clang -Oz -c test1.cpp -emit-llvm -S -o
+; Verify that we generate the shld instruction when we are optimizing for size,
+; even for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift10(uint64_t a, uint64_t b)
+; {
+; return (a << 10) | (b >> 54);
+; }
+
+; Function Attrs: minsize nounwind optsize readnone uwtable
+define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
+entry:
+; CHECK: shldq $10
+ %shl = shl i64 %a, 10
+ %shr = lshr i64 %b, 54
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #0 = { minsize nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
+; clang -Os -c test2.cpp -emit-llvm -S
+; Verify that we generate the shld instruction when we are optimizing for size,
+; even for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift11(uint64_t a, uint64_t b)
+; {
+; return (a << 11) | (b >> 53);
+; }
+
+; Function Attrs: nounwind optsize readnone uwtable
+define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
+entry:
+; CHECK: shldq $11
+ %shl = shl i64 %a, 11
+ %shr = lshr i64 %b, 53
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; clang -O2 -c test2.cpp -emit-llvm -S
+; Verify that we do not generate the shld instruction when we are not optimizing
+; for size for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift12(uint64_t a, uint64_t b)
+; {
+; return (a << 12) | (b >> 52);
+; }
+
+; Function Attrs: nounwind readnone uwtable
+define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
+entry:
+; CHECK: shlq $12
+; CHECK-NEXT: shrq $52
+ %shl = shl i64 %a, 12
+ %shr = lshr i64 %b, 52
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
diff --git a/test/CodeGen/X86/x86-64-double-shifts-var.ll b/test/CodeGen/X86/x86-64-double-shifts-var.ll
new file mode 100644
index 0000000..5bab434
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=x86-64 -mcpu=athlon | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-tbird | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-4 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-xp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-mp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-fx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=amdfam10 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
+
+; Verify that for the X86_64 processors that are known to have poor latency
+; double precision shift instructions we do not generate 'shld' or 'shrd'
+; instructions.
+
+;uint64_t lshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a << c) | (b >> (64-c));
+;}
+
+define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shld
+ %sh_prom = zext i32 %c to i64
+ %shl = shl i64 %a, %sh_prom
+ %sub = sub nsw i32 64, %c
+ %sh_prom1 = zext i32 %sub to i64
+ %shr = lshr i64 %b, %sh_prom1
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t rshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a >> c) | (b << (64-c));
+;}
+
+define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shrd
+ %sh_prom = zext i32 %c to i64
+ %shr = lshr i64 %a, %sh_prom
+ %sub = sub nsw i32 64, %c
+ %sh_prom1 = zext i32 %sub to i64
+ %shl = shl i64 %b, %sh_prom1
+ %or = or i64 %shl, %shr
+ ret i64 %or
+}
+
+
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index 2f3adb8..ec47933 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -100,7 +100,7 @@ entry:
ret <8 x i16> %K
}
-; non splat test
+; non-splat test
define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
diff --git a/test/CodeGen/X86/zlib-longest-match.ll b/test/CodeGen/X86/zlib-longest-match.ll
new file mode 100644
index 0000000..d1598dc
--- /dev/null
+++ b/test/CodeGen/X86/zlib-longest-match.ll
@@ -0,0 +1,240 @@
+; RUN: llc -march=x86-64 < %s -block-placement-exit-block-bias=20 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; This is longest_match, the hot function from zlib's deflate implementation.
+
+%struct.internal_state = type { %struct.z_stream_s*, i32, i8*, i64, i8*, i32, i32, %struct.gz_header_s*, i32, i8, i32, i32, i32, i32, i8*, i64, i16*, i16*, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [573 x %struct.ct_data_s], [61 x %struct.ct_data_s], [39 x %struct.ct_data_s], %struct.tree_desc_s, %struct.tree_desc_s, %struct.tree_desc_s, [16 x i16], [573 x i32], i32, i32, [573 x i8], i8*, i32, i32, i16*, i64, i64, i32, i32, i16, i32, i64 }
+%struct.z_stream_s = type { i8*, i32, i64, i8*, i32, i64, i8*, %struct.internal_state*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8*, i32, i64, i64 }
+%struct.gz_header_s = type { i32, i64, i32, i32, i8*, i32, i32, i8*, i32, i8*, i32, i32, i32 }
+%struct.ct_data_s = type { %union.anon, %union.anon.0 }
+%union.anon = type { i16 }
+%union.anon.0 = type { i16 }
+%struct.tree_desc_s = type { %struct.ct_data_s*, i32, %struct.static_tree_desc_s* }
+%struct.static_tree_desc_s = type { i32 }
+
+; CHECK-LABEL: longest_match:
+
+; Verify that there are no spills or reloads in the loop exit block. This loop
+; is mostly cold, only %do.cond125 and %land.rhs131 are hot.
+; CHECK: %do.cond125
+; CHECK-NOT: {{Spill|Reload}}
+; CHECK: jbe
+
+; Verify that block placement doesn't destroy source order. It's important that
+; the two hot blocks are laid out close to each other.
+; CHECK-NEXT: %land.rhs131
+; CHECK: jne
+; CHECK: jmp
+define i32 @longest_match(%struct.internal_state* nocapture %s, i32 %cur_match) nounwind {
+entry:
+ %max_chain_length = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 31
+ %0 = load i32* %max_chain_length, align 4
+ %window = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 14
+ %1 = load i8** %window, align 8
+ %strstart = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 27
+ %2 = load i32* %strstart, align 4
+ %idx.ext = zext i32 %2 to i64
+ %add.ptr = getelementptr inbounds i8* %1, i64 %idx.ext
+ %prev_length = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 30
+ %3 = load i32* %prev_length, align 4
+ %nice_match1 = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 36
+ %4 = load i32* %nice_match1, align 4
+ %w_size = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 11
+ %5 = load i32* %w_size, align 4
+ %sub = add i32 %5, -262
+ %cmp = icmp ugt i32 %2, %sub
+ %sub6 = sub i32 %2, %sub
+ %sub6. = select i1 %cmp, i32 %sub6, i32 0
+ %prev7 = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 16
+ %6 = load i16** %prev7, align 8
+ %w_mask = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 13
+ %7 = load i32* %w_mask, align 4
+ %add.ptr11.sum = add i64 %idx.ext, 258
+ %add.ptr12 = getelementptr inbounds i8* %1, i64 %add.ptr11.sum
+ %sub13 = add nsw i32 %3, -1
+ %idxprom = sext i32 %sub13 to i64
+ %add.ptr.sum = add i64 %idxprom, %idx.ext
+ %arrayidx = getelementptr inbounds i8* %1, i64 %add.ptr.sum
+ %8 = load i8* %arrayidx, align 1
+ %idxprom14 = sext i32 %3 to i64
+ %add.ptr.sum213 = add i64 %idxprom14, %idx.ext
+ %arrayidx15 = getelementptr inbounds i8* %1, i64 %add.ptr.sum213
+ %9 = load i8* %arrayidx15, align 1
+ %good_match = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 35
+ %10 = load i32* %good_match, align 4
+ %cmp17 = icmp ult i32 %3, %10
+ %shr = lshr i32 %0, 2
+ %chain_length.0 = select i1 %cmp17, i32 %0, i32 %shr
+ %lookahead = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 29
+ %11 = load i32* %lookahead, align 4
+ %cmp18 = icmp ugt i32 %4, %11
+ %. = select i1 %cmp18, i32 %11, i32 %4
+ %match_start = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 28
+ %add.ptr.sum217 = add i64 %idx.ext, 1
+ %arrayidx44 = getelementptr inbounds i8* %1, i64 %add.ptr.sum217
+ %add.ptr.sum218 = add i64 %idx.ext, 2
+ %add.ptr50 = getelementptr inbounds i8* %1, i64 %add.ptr.sum218
+ %sub.ptr.lhs.cast = ptrtoint i8* %add.ptr12 to i64
+ br label %do.body
+
+do.body: ; preds = %land.rhs131, %entry
+ %best_len.0 = phi i32 [ %best_len.1, %land.rhs131 ], [ %3, %entry ]
+ %chain_length.1 = phi i32 [ %dec, %land.rhs131 ], [ %chain_length.0, %entry ]
+ %cur_match.addr.0 = phi i32 [ %conv128, %land.rhs131 ], [ %cur_match, %entry ]
+ %scan_end1.0 = phi i8 [ %scan_end1.1, %land.rhs131 ], [ %8, %entry ]
+ %scan_end.0 = phi i8 [ %scan_end.1, %land.rhs131 ], [ %9, %entry ]
+ %idx.ext23 = zext i32 %cur_match.addr.0 to i64
+ %add.ptr24 = getelementptr inbounds i8* %1, i64 %idx.ext23
+ %idxprom25 = sext i32 %best_len.0 to i64
+ %add.ptr24.sum = add i64 %idx.ext23, %idxprom25
+ %arrayidx26 = getelementptr inbounds i8* %1, i64 %add.ptr24.sum
+ %12 = load i8* %arrayidx26, align 1
+ %cmp28 = icmp eq i8 %12, %scan_end.0
+ br i1 %cmp28, label %lor.lhs.false, label %do.cond125
+
+lor.lhs.false: ; preds = %do.body
+ %sub30 = add nsw i32 %best_len.0, -1
+ %idxprom31 = sext i32 %sub30 to i64
+ %add.ptr24.sum214 = add i64 %idx.ext23, %idxprom31
+ %arrayidx32 = getelementptr inbounds i8* %1, i64 %add.ptr24.sum214
+ %13 = load i8* %arrayidx32, align 1
+ %cmp35 = icmp eq i8 %13, %scan_end1.0
+ br i1 %cmp35, label %lor.lhs.false37, label %do.cond125
+
+lor.lhs.false37: ; preds = %lor.lhs.false
+ %14 = load i8* %add.ptr24, align 1
+ %15 = load i8* %add.ptr, align 1
+ %cmp40 = icmp eq i8 %14, %15
+ br i1 %cmp40, label %lor.lhs.false42, label %do.cond125
+
+lor.lhs.false42: ; preds = %lor.lhs.false37
+ %add.ptr24.sum215 = add i64 %idx.ext23, 1
+ %incdec.ptr = getelementptr inbounds i8* %1, i64 %add.ptr24.sum215
+ %16 = load i8* %incdec.ptr, align 1
+ %17 = load i8* %arrayidx44, align 1
+ %cmp46 = icmp eq i8 %16, %17
+ br i1 %cmp46, label %if.end49, label %do.cond125
+
+if.end49: ; preds = %lor.lhs.false42
+ %incdec.ptr.sum = add i64 %idx.ext23, 2
+ %incdec.ptr51 = getelementptr inbounds i8* %1, i64 %incdec.ptr.sum
+ br label %do.cond
+
+do.cond: ; preds = %land.lhs.true100, %if.end49
+ %match.0 = phi i8* [ %incdec.ptr51, %if.end49 ], [ %incdec.ptr103, %land.lhs.true100 ]
+ %scan.1 = phi i8* [ %add.ptr50, %if.end49 ], [ %incdec.ptr101, %land.lhs.true100 ]
+ %incdec.ptr53 = getelementptr inbounds i8* %scan.1, i64 1
+ %18 = load i8* %incdec.ptr53, align 1
+ %incdec.ptr55 = getelementptr inbounds i8* %match.0, i64 1
+ %19 = load i8* %incdec.ptr55, align 1
+ %cmp57 = icmp eq i8 %18, %19
+ br i1 %cmp57, label %land.lhs.true, label %do.end
+
+land.lhs.true: ; preds = %do.cond
+ %incdec.ptr59 = getelementptr inbounds i8* %scan.1, i64 2
+ %20 = load i8* %incdec.ptr59, align 1
+ %incdec.ptr61 = getelementptr inbounds i8* %match.0, i64 2
+ %21 = load i8* %incdec.ptr61, align 1
+ %cmp63 = icmp eq i8 %20, %21
+ br i1 %cmp63, label %land.lhs.true65, label %do.end
+
+land.lhs.true65: ; preds = %land.lhs.true
+ %incdec.ptr66 = getelementptr inbounds i8* %scan.1, i64 3
+ %22 = load i8* %incdec.ptr66, align 1
+ %incdec.ptr68 = getelementptr inbounds i8* %match.0, i64 3
+ %23 = load i8* %incdec.ptr68, align 1
+ %cmp70 = icmp eq i8 %22, %23
+ br i1 %cmp70, label %land.lhs.true72, label %do.end
+
+land.lhs.true72: ; preds = %land.lhs.true65
+ %incdec.ptr73 = getelementptr inbounds i8* %scan.1, i64 4
+ %24 = load i8* %incdec.ptr73, align 1
+ %incdec.ptr75 = getelementptr inbounds i8* %match.0, i64 4
+ %25 = load i8* %incdec.ptr75, align 1
+ %cmp77 = icmp eq i8 %24, %25
+ br i1 %cmp77, label %land.lhs.true79, label %do.end
+
+land.lhs.true79: ; preds = %land.lhs.true72
+ %incdec.ptr80 = getelementptr inbounds i8* %scan.1, i64 5
+ %26 = load i8* %incdec.ptr80, align 1
+ %incdec.ptr82 = getelementptr inbounds i8* %match.0, i64 5
+ %27 = load i8* %incdec.ptr82, align 1
+ %cmp84 = icmp eq i8 %26, %27
+ br i1 %cmp84, label %land.lhs.true86, label %do.end
+
+land.lhs.true86: ; preds = %land.lhs.true79
+ %incdec.ptr87 = getelementptr inbounds i8* %scan.1, i64 6
+ %28 = load i8* %incdec.ptr87, align 1
+ %incdec.ptr89 = getelementptr inbounds i8* %match.0, i64 6
+ %29 = load i8* %incdec.ptr89, align 1
+ %cmp91 = icmp eq i8 %28, %29
+ br i1 %cmp91, label %land.lhs.true93, label %do.end
+
+land.lhs.true93: ; preds = %land.lhs.true86
+ %incdec.ptr94 = getelementptr inbounds i8* %scan.1, i64 7
+ %30 = load i8* %incdec.ptr94, align 1
+ %incdec.ptr96 = getelementptr inbounds i8* %match.0, i64 7
+ %31 = load i8* %incdec.ptr96, align 1
+ %cmp98 = icmp eq i8 %30, %31
+ br i1 %cmp98, label %land.lhs.true100, label %do.end
+
+land.lhs.true100: ; preds = %land.lhs.true93
+ %incdec.ptr101 = getelementptr inbounds i8* %scan.1, i64 8
+ %32 = load i8* %incdec.ptr101, align 1
+ %incdec.ptr103 = getelementptr inbounds i8* %match.0, i64 8
+ %33 = load i8* %incdec.ptr103, align 1
+ %cmp105 = icmp eq i8 %32, %33
+ %cmp107 = icmp ult i8* %incdec.ptr101, %add.ptr12
+ %or.cond = and i1 %cmp105, %cmp107
+ br i1 %or.cond, label %do.cond, label %do.end
+
+do.end: ; preds = %land.lhs.true100, %land.lhs.true93, %land.lhs.true86, %land.lhs.true79, %land.lhs.true72, %land.lhs.true65, %land.lhs.true, %do.cond
+ %scan.2 = phi i8* [ %incdec.ptr101, %land.lhs.true100 ], [ %incdec.ptr94, %land.lhs.true93 ], [ %incdec.ptr87, %land.lhs.true86 ], [ %incdec.ptr80, %land.lhs.true79 ], [ %incdec.ptr73, %land.lhs.true72 ], [ %incdec.ptr66, %land.lhs.true65 ], [ %incdec.ptr59, %land.lhs.true ], [ %incdec.ptr53, %do.cond ]
+ %sub.ptr.rhs.cast = ptrtoint i8* %scan.2 to i64
+ %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+ %conv109 = trunc i64 %sub.ptr.sub to i32
+ %sub110 = sub nsw i32 258, %conv109
+ %cmp112 = icmp sgt i32 %sub110, %best_len.0
+ br i1 %cmp112, label %if.then114, label %do.cond125
+
+if.then114: ; preds = %do.end
+ store i32 %cur_match.addr.0, i32* %match_start, align 4
+ %cmp115 = icmp slt i32 %sub110, %.
+ br i1 %cmp115, label %if.end118, label %do.end135
+
+if.end118: ; preds = %if.then114
+ %sub119 = add nsw i32 %sub110, -1
+ %idxprom120 = sext i32 %sub119 to i64
+ %add.ptr111.sum = add i64 %idxprom120, %idx.ext
+ %arrayidx121 = getelementptr inbounds i8* %1, i64 %add.ptr111.sum
+ %34 = load i8* %arrayidx121, align 1
+ %idxprom122 = sext i32 %sub110 to i64
+ %add.ptr111.sum216 = add i64 %idxprom122, %idx.ext
+ %arrayidx123 = getelementptr inbounds i8* %1, i64 %add.ptr111.sum216
+ %35 = load i8* %arrayidx123, align 1
+ br label %do.cond125
+
+do.cond125: ; preds = %if.end118, %do.end, %lor.lhs.false42, %lor.lhs.false37, %lor.lhs.false, %do.body
+ %best_len.1 = phi i32 [ %best_len.0, %do.body ], [ %best_len.0, %lor.lhs.false ], [ %best_len.0, %lor.lhs.false37 ], [ %best_len.0, %lor.lhs.false42 ], [ %sub110, %if.end118 ], [ %best_len.0, %do.end ]
+ %scan_end1.1 = phi i8 [ %scan_end1.0, %do.body ], [ %scan_end1.0, %lor.lhs.false ], [ %scan_end1.0, %lor.lhs.false37 ], [ %scan_end1.0, %lor.lhs.false42 ], [ %34, %if.end118 ], [ %scan_end1.0, %do.end ]
+ %scan_end.1 = phi i8 [ %scan_end.0, %do.body ], [ %scan_end.0, %lor.lhs.false ], [ %scan_end.0, %lor.lhs.false37 ], [ %scan_end.0, %lor.lhs.false42 ], [ %35, %if.end118 ], [ %scan_end.0, %do.end ]
+ %and = and i32 %cur_match.addr.0, %7
+ %idxprom126 = zext i32 %and to i64
+ %arrayidx127 = getelementptr inbounds i16* %6, i64 %idxprom126
+ %36 = load i16* %arrayidx127, align 2
+ %conv128 = zext i16 %36 to i32
+ %cmp129 = icmp ugt i32 %conv128, %sub6.
+ br i1 %cmp129, label %land.rhs131, label %do.end135
+
+land.rhs131: ; preds = %do.cond125
+ %dec = add i32 %chain_length.1, -1
+ %cmp132 = icmp eq i32 %dec, 0
+ br i1 %cmp132, label %do.end135, label %do.body
+
+do.end135: ; preds = %land.rhs131, %do.cond125, %if.then114
+ %best_len.2 = phi i32 [ %best_len.1, %land.rhs131 ], [ %best_len.1, %do.cond125 ], [ %sub110, %if.then114 ]
+ %cmp137 = icmp ugt i32 %best_len.2, %11
+ %.best_len.2 = select i1 %cmp137, i32 %11, i32 %best_len.2
+ ret i32 %.best_len.2
+}
diff --git a/test/CodeGen/XCore/align.ll b/test/CodeGen/XCore/align.ll
new file mode 100644
index 0000000..2878a64
--- /dev/null
+++ b/test/CodeGen/XCore/align.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+; CHECK: .align 4
+; CHECK-LABEL: f:
+define void @f() nounwind {
+entry:
+ ret void
+}
+
+; CHECK: .align 2
+; CHECK-LABEL: g:
+define void @g() nounwind optsize {
+entry:
+ ret void
+}
diff --git a/test/CodeGen/XCore/atomic.ll b/test/CodeGen/XCore/atomic.ll
index 95fca9a..58ef38b 100644
--- a/test/CodeGen/XCore/atomic.ll
+++ b/test/CodeGen/XCore/atomic.ll
@@ -14,3 +14,79 @@ entry:
fence seq_cst
ret void
}
+
+@pool = external global i64
+
+define void @atomicloadstore() nounwind {
+entry:
+; CHECK-LABEL: atomicloadstore
+
+; CHECK: ldw r[[R0:[0-9]+]], dp[pool]
+; CHECK-NEXT: #MEMBARRIER
+ %0 = load atomic i32* bitcast (i64* @pool to i32*) acquire, align 4
+
+; CHECK-NEXT: ldaw r[[R1:[0-9]+]], dp[pool]
+; CHECK-NEXT: ldc r[[R2:[0-9]+]], 0
+
+; CHECK-NEXT: ld16s r3, r[[R1]][r[[R2]]]
+; CHECK-NEXT: #MEMBARRIER
+ %1 = load atomic i16* bitcast (i64* @pool to i16*) acquire, align 2
+
+; CHECK-NEXT: ld8u r11, r[[R1]][r[[R2]]]
+; CHECK-NEXT: #MEMBARRIER
+ %2 = load atomic i8* bitcast (i64* @pool to i8*) acquire, align 1
+
+; CHECK-NEXT: ldw r4, dp[pool]
+; CHECK-NEXT: #MEMBARRIER
+ %3 = load atomic i32* bitcast (i64* @pool to i32*) seq_cst, align 4
+
+; CHECK-NEXT: ld16s r5, r[[R1]][r[[R2]]]
+; CHECK-NEXT: #MEMBARRIER
+ %4 = load atomic i16* bitcast (i64* @pool to i16*) seq_cst, align 2
+
+; CHECK-NEXT: ld8u r6, r[[R1]][r[[R2]]]
+; CHECK-NEXT: #MEMBARRIER
+ %5 = load atomic i8* bitcast (i64* @pool to i8*) seq_cst, align 1
+
+; CHECK-NEXT: #MEMBARRIER
+; CHECK-NEXT: stw r[[R0]], dp[pool]
+ store atomic i32 %0, i32* bitcast (i64* @pool to i32*) release, align 4
+
+; CHECK-NEXT: #MEMBARRIER
+; CHECK-NEXT: st16 r3, r[[R1]][r[[R2]]]
+ store atomic i16 %1, i16* bitcast (i64* @pool to i16*) release, align 2
+
+; CHECK-NEXT: #MEMBARRIER
+; CHECK-NEXT: st8 r11, r[[R1]][r[[R2]]]
+ store atomic i8 %2, i8* bitcast (i64* @pool to i8*) release, align 1
+
+; CHECK-NEXT: #MEMBARRIER
+; CHECK-NEXT: stw r4, dp[pool]
+; CHECK-NEXT: #MEMBARRIER
+ store atomic i32 %3, i32* bitcast (i64* @pool to i32*) seq_cst, align 4
+
+; CHECK-NEXT: #MEMBARRIER
+; CHECK-NEXT: st16 r5, r[[R1]][r[[R2]]]
+; CHECK-NEXT: #MEMBARRIER
+ store atomic i16 %4, i16* bitcast (i64* @pool to i16*) seq_cst, align 2
+
+; CHECK-NEXT: #MEMBARRIER
+; CHECK-NEXT: st8 r6, r[[R1]][r[[R2]]]
+; CHECK-NEXT: #MEMBARRIER
+ store atomic i8 %5, i8* bitcast (i64* @pool to i8*) seq_cst, align 1
+
+; CHECK-NEXT: ldw r[[R0]], dp[pool]
+; CHECK-NEXT: stw r[[R0]], dp[pool]
+; CHECK-NEXT: ld16s r[[R0]], r[[R1]][r[[R2]]]
+; CHECK-NEXT: st16 r[[R0]], r[[R1]][r[[R2]]]
+; CHECK-NEXT: ld8u r[[R0]], r[[R1]][r[[R2]]]
+; CHECK-NEXT: st8 r[[R0]], r[[R1]][r[[R2]]]
+ %6 = load atomic i32* bitcast (i64* @pool to i32*) monotonic, align 4
+ store atomic i32 %6, i32* bitcast (i64* @pool to i32*) monotonic, align 4
+ %7 = load atomic i16* bitcast (i64* @pool to i16*) monotonic, align 2
+ store atomic i16 %7, i16* bitcast (i64* @pool to i16*) monotonic, align 2
+ %8 = load atomic i8* bitcast (i64* @pool to i8*) monotonic, align 1
+ store atomic i8 %8, i8* bitcast (i64* @pool to i8*) monotonic, align 1
+
+ ret void
+}
diff --git a/test/CodeGen/XCore/bigstructret.ll b/test/CodeGen/XCore/bigstructret.ll
index 877c571..567b372 100644
--- a/test/CodeGen/XCore/bigstructret.ll
+++ b/test/CodeGen/XCore/bigstructret.ll
@@ -3,8 +3,8 @@
%0 = type { i32, i32, i32, i32 }
%1 = type { i32, i32, i32, i32, i32 }
-; Structs of 4 words can be returned in registers
-define internal fastcc %0 @ReturnBigStruct() nounwind readnone {
+; Structs of 4 words are returned in registers
+define internal %0 @ReturnBigStruct() nounwind readnone {
entry:
%0 = insertvalue %0 zeroinitializer, i32 12, 0
%1 = insertvalue %0 %0, i32 24, 1
@@ -19,8 +19,39 @@ entry:
; CHECK: ldc r3, 24601
; CHECK: retsp 0
-; Structs bigger than 4 words are returned via a hidden hidden sret-parameter
-define internal fastcc %1 @ReturnBigStruct2() nounwind readnone {
+; Structs of more than 4 words are partially returned in memory so long as the
+; function is not variadic.
+define { i32, i32, i32, i32, i32} @f(i32, i32, i32, i32, i32) nounwind readnone {
+; CHECK-LABEL: f:
+; CHECK: ldc [[REGISTER:r[0-9]+]], 5
+; CHECK-NEXT: stw [[REGISTER]], sp[2]
+; CHECK-NEXT: retsp 0
+body:
+ ret { i32, i32, i32, i32, i32} { i32 undef, i32 undef, i32 undef, i32 undef, i32 5}
+}
+
+@x = external global i32
+@y = external global i32
+
+; Check that we call a function returning more than 4 words correctly.
+define i32 @g() nounwind {
+; CHECK-LABEL: g:
+; CHECK: entsp 3
+; CHECK: ldc [[REGISTER:r[0-9]+]], 0
+; CHECK: stw [[REGISTER]], sp[1]
+; CHECK: bl f
+; CHECK-NEXT: ldw r0, sp[2]
+; CHECK-NEXT: retsp 3
+;
+body:
+ %0 = call { i32, i32, i32, i32, i32 } @f(i32 0, i32 0, i32 0, i32 0, i32 0)
+ %1 = extractvalue { i32, i32, i32, i32, i32 } %0, 4
+ ret i32 %1
+}
+
+; Variadic functions return structs bigger than 4 words via a hidden
+; sret-parameter
+define internal %1 @ReturnBigStruct2(i32 %dummy, ...) nounwind readnone {
entry:
%0 = insertvalue %1 zeroinitializer, i32 12, 0
%1 = insertvalue %1 %0, i32 24, 1
diff --git a/test/CodeGen/XCore/byVal.ll b/test/CodeGen/XCore/byVal.ll
index e9612fd..df6c6d3 100644
--- a/test/CodeGen/XCore/byVal.ll
+++ b/test/CodeGen/XCore/byVal.ll
@@ -20,7 +20,7 @@ entry:
; CHECK: ldaw r5, sp[1]
; CHECK: ldc r2, 40
; CHECK: mov r0, r5
-; CHECK: bl memcpy
+; CHECK: bl __memcpy_4
; CHECK: mov r0, r5
; CHECK: bl f1
; CHECK: mov r0, r4
diff --git a/test/CodeGen/XCore/call.ll b/test/CodeGen/XCore/call.ll
new file mode 100644
index 0000000..06a12f1
--- /dev/null
+++ b/test/CodeGen/XCore/call.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+; CHECK-LABEL: bl_imm:
+; CHECK: ldw [[R0:r[0-9]+]], cp
+; CHECK: bla [[R0]]
+define void @bl_imm() nounwind {
+entry:
+ tail call void inttoptr (i64 65536 to void ()*)() nounwind
+ ret void
+}
diff --git a/test/CodeGen/XCore/codemodel.ll b/test/CodeGen/XCore/codemodel.ll
new file mode 100644
index 0000000..0245893
--- /dev/null
+++ b/test/CodeGen/XCore/codemodel.ll
@@ -0,0 +1,213 @@
+
+; RUN: not llc < %s -march=xcore -code-model=medium 2>&1 | FileCheck %s -check-prefix=BAD_CM
+; RUN: not llc < %s -march=xcore -code-model=kernel 2>&1 | FileCheck %s -check-prefix=BAD_CM
+; BAD_CM: Target only supports CodeModel Small or Large
+
+
+; RUN: llc < %s -march=xcore -code-model=default | FileCheck %s
+; RUN: llc < %s -march=xcore -code-model=small | FileCheck %s
+; RUN: llc < %s -march=xcore -code-model=large | FileCheck %s -check-prefix=LARGE
+
+
+; CHECK-LABEL: test:
+; CHECK: zext r0, 1
+; CHECK: bt r0, [[JUMP:.LBB[0-9_]*]]
+; CHECK: ldaw r0, dp[A2]
+; CHECK: retsp 0
+; CHECK: [[JUMP]]
+; CHECK: ldaw r0, dp[A1]
+; CHECK: retsp 0
+; LARGE-LABEL: test:
+; LARGE: zext r0, 1
+; LARGE: ldaw r11, cp[.LCPI{{[0-9_]*}}]
+; LARGE: mov r1, r11
+; LARGE: ldaw r11, cp[.LCPI{{[0-9_]*}}]
+; LARGE: bt r0, [[JUMP:.LBB[0-9_]*]]
+; LARGE: mov r11, r1
+; LARGE: [[JUMP]]
+; LARGE: ldw r0, r11[0]
+; LARGE: retsp 0
+@A1 = external global [50000 x i32]
+@A2 = external global [50000 x i32]
+define [50000 x i32]* @test(i1 %bool) nounwind {
+entry:
+ %Addr = select i1 %bool, [50000 x i32]* @A1, [50000 x i32]* @A2
+ ret [50000 x i32]* %Addr
+}
+
+
+; CHECK: .section .cp.rodata.cst4,"aMc",@progbits,4
+; CHECK: .long 65536
+; CHECK: .text
+; CHECK-LABEL: f:
+; CHECK: ldc r1, 65532
+; CHECK: add r1, r0, r1
+; CHECK: ldw r1, r1[0]
+; CHECK: ldw r2, cp[.LCPI{{[0-9_]*}}]
+; CHECK: add r0, r0, r2
+; CHECK: ldw r0, r0[0]
+; CHECK: add r0, r1, r0
+; CHECK: ldw r1, dp[l]
+; CHECK: add r0, r0, r1
+; CHECK: ldw r1, dp[l+4]
+; CHECK: add r0, r0, r1
+; CHECK: ldw r1, dp[l+392]
+; CHECK: add r0, r0, r1
+; CHECK: ldw r1, dp[l+396]
+; CHECK: add r0, r0, r1
+; CHECK: ldw r1, dp[s]
+; CHECK: add r0, r0, r1
+; CHECK: ldw r1, dp[s+36]
+; CHECK: add r0, r0, r1
+; CHECK: retsp 0
+;
+; LARGE: .section .cp.rodata.cst4,"aMc",@progbits,4
+; LARGE: .long 65536
+; LARGE: .section .cp.rodata,"ac",@progbits
+; LARGE: .long l
+; LARGE: .long l+4
+; LARGE: .long l+392
+; LARGE: .long l+396
+; LARGE: .text
+; LARGE-LABEL: f:
+; LARGE: ldc r1, 65532
+; LARGE: add r1, r0, r1
+; LARGE: ldw r1, r1[0]
+; LARGE: ldw r2, cp[.LCPI{{[0-9_]*}}]
+; LARGE: add r0, r0, r2
+; LARGE: ldw r0, r0[0]
+; LARGE: add r0, r1, r0
+; LARGE: ldw r1, cp[.LCPI{{[0-9_]*}}]
+; LARGE: ldw r1, r1[0]
+; LARGE: add r0, r0, r1
+; LARGE: ldw r1, cp[.LCPI{{[0-9_]*}}]
+; LARGE: ldw r1, r1[0]
+; LARGE: add r0, r0, r1
+; LARGE: ldw r1, cp[.LCPI{{[0-9_]*}}]
+; LARGE: ldw r1, r1[0]
+; LARGE: add r0, r0, r1
+; LARGE: ldw r1, cp[.LCPI{{[0-9_]*}}]
+; LARGE: ldw r1, r1[0]
+; LARGE: add r0, r0, r1
+; LARGE: ldw r1, dp[s]
+; LARGE: add r0, r0, r1
+; LARGE: ldw r1, dp[s+36]
+; LARGE: add r0, r0, r1
+; LARGE: retsp 0
+define i32 @f(i32* %i) {
+entry:
+ %0 = getelementptr inbounds i32* %i, i32 16383
+ %1 = load i32* %0
+ %2 = getelementptr inbounds i32* %i, i32 16384
+ %3 = load i32* %2
+ %4 = add nsw i32 %1, %3
+ %5 = load i32* getelementptr inbounds ([100 x i32]* @l, i32 0, i32 0)
+ %6 = add nsw i32 %4, %5
+ %7 = load i32* getelementptr inbounds ([100 x i32]* @l, i32 0, i32 1)
+ %8 = add nsw i32 %6, %7
+ %9 = load i32* getelementptr inbounds ([100 x i32]* @l, i32 0, i32 98)
+ %10 = add nsw i32 %8, %9
+ %11 = load i32* getelementptr inbounds ([100 x i32]* @l, i32 0, i32 99)
+ %12 = add nsw i32 %10, %11
+ %13 = load i32* getelementptr inbounds ([10 x i32]* @s, i32 0, i32 0)
+ %14 = add nsw i32 %12, %13
+ %15 = load i32* getelementptr inbounds ([10 x i32]* @s, i32 0, i32 9)
+ %16 = add nsw i32 %14, %15
+ ret i32 %16
+}
+
+
+; CHECK-LABEL: UnknownSize:
+; CHECK: ldw r0, dp[NoSize+40]
+; CHECK-NEXT: retsp 0
+;
+; LARGE: .section .cp.rodata,"ac",@progbits
+; LARGE: .LCPI{{[0-9_]*}}
+; LARGE-NEXT: .long NoSize
+; LARGE-NEXT: .text
+; LARGE-LABEL: UnknownSize:
+; LARGE: ldw r0, cp[.LCPI{{[0-9_]*}}]
+; LARGE-NEXT: ldw r0, r0[0]
+; LARGE-NEXT: retsp 0
+@NoSize = external global [0 x i32]
+define i32 @UnknownSize() nounwind {
+entry:
+ %0 = load i32* getelementptr inbounds ([0 x i32]* @NoSize, i32 0, i32 10)
+ ret i32 %0
+}
+
+
+; CHECK-LABEL: UnknownStruct:
+; CHECK: ldaw r0, dp[Unknown]
+; CHECK-NEXT: retsp 0
+;
+; LARGE: .section .cp.rodata,"ac",@progbits
+; LARGE: .LCPI{{[0-9_]*}}
+; LARGE-NEXT: .long Unknown
+; LARGE-NEXT: .text
+; LARGE-LABEL: UnknownStruct:
+; LARGE: ldw r0, cp[.LCPI{{[0-9_]*}}]
+; LARGE-NEXT: retsp 0
+%Struct = type opaque
+@Unknown = external global %Struct
+define %Struct* @UnknownStruct() nounwind {
+entry:
+ ret %Struct* @Unknown
+}
+
+
+; CHECK: .section .dp.bss,"awd",@nobits
+; CHECK-LABEL: l:
+; CHECK: .space 400
+; LARGE: .section .dp.bss.large,"awd",@nobits
+; LARGE-LABEL: l:
+; LARGE: .space 400
+@l = global [100 x i32] zeroinitializer
+
+; CHECK-LABEL: s:
+; CHECK: .space 40
+; LARGE: .section .dp.bss,"awd",@nobits
+; LARGE-LABEL: s:
+; LARGE: .space 40
+@s = global [10 x i32] zeroinitializer
+
+; CHECK: .section .dp.rodata,"awd",@progbits
+; CHECK-LABEL: cl:
+; CHECK: .space 400
+; LARGE: .section .dp.rodata.large,"awd",@progbits
+; LARGE-LABEL: cl:
+; LARGE: .space 400
+@cl = constant [100 x i32] zeroinitializer
+
+; CHECK-LABEL: cs:
+; CHECK: .space 40
+; LARGE: .section .dp.rodata,"awd",@progbits
+; LARGE-LABEL: cs:
+; LARGE: .space 40
+@cs = constant [10 x i32] zeroinitializer
+
+; CHECK: .section .cp.rodata,"ac",@progbits
+; CHECK-LABEL: icl:
+; CHECK: .space 400
+; LARGE: .section .cp.rodata.large,"ac",@progbits
+; LARGE-LABEL: icl:
+; LARGE: .space 400
+@icl = internal constant [100 x i32] zeroinitializer
+
+; CHECK-LABEL: ics:
+; CHECK: .space 40
+; LARGE: .section .cp.rodata,"ac",@progbits
+; LARGE-LABEL: ics:
+; LARGE: .space 40
+@ics = internal constant [10 x i32] zeroinitializer
+
+; CHECK: .section .cp.namedsection,"ac",@progbits
+; CHECK-LABEL: cpsec:
+; CHECK: .long 0
+@cpsec = constant i32 0, section ".cp.namedsection"
+
+; CHECK: .section .dp.namedsection,"awd",@progbits
+; CHECK-LABEL: dpsec:
+; CHECK: .long 0
+@dpsec = global i32 0, section ".dp.namedsection"
+
diff --git a/test/CodeGen/XCore/epilogue_prologue.ll b/test/CodeGen/XCore/epilogue_prologue.ll
index 185565f..14f04a3 100644
--- a/test/CodeGen/XCore/epilogue_prologue.ll
+++ b/test/CodeGen/XCore/epilogue_prologue.ll
@@ -1,5 +1,20 @@
; RUN: llc < %s -march=xcore | FileCheck %s
+; RUN: llc < %s -march=xcore -disable-fp-elim | FileCheck %s -check-prefix=CHECKFP
+; When using SP for small frames, we don't need any scratch registers (SR).
+; When using SP for large frames, we may need two scratch registers.
+; When using FP, for large or small frames, we may need one scratch register.
+
+; FP + small frame: spill FP+SR = entsp 2
+; CHECKFP-LABEL: f1
+; CHECKFP: entsp 2
+; CHECKFP-NEXT: stw r10, sp[1]
+; CHECKFP-NEXT: ldaw r10, sp[0]
+; CHECKFP: set sp, r10
+; CHECKFP-NEXT: ldw r10, sp[1]
+; CHECKFP-NEXT: retsp 2
+;
+; !FP + small frame: no spills = no stack adjustment needed
; CHECK-LABEL: f1
; CHECK: stw lr, sp[0]
; CHECK: ldw lr, sp[0]
@@ -10,17 +25,212 @@ entry:
ret void
}
+
+; FP + small frame: spill FP+SR+R0+LR = entsp 3 + extsp 1
+; CHECKFP-LABEL:f3
+; CHECKFP: entsp 3
+; CHECKFP-NEXT: stw r10, sp[1]
+; CHECKFP-NEXT: ldaw r10, sp[0]
+; CHECKFP-NEXT: stw [[REG:r[4-9]+]], r10[2]
+; CHECKFP-NEXT: mov [[REG]], r0
+; CHECKFP-NEXT: extsp 1
+; CHECKFP-NEXT: bl f2
+; CHECKFP-NEXT: ldaw sp, sp[1]
+; CHECKFP-NEXT: mov r0, [[REG]]
+; CHECKFP-NEXT: ldw [[REG]], r10[2]
+; CHECKFP-NEXT: set sp, r10
+; CHECKFP-NEXT: ldw r10, sp[1]
+; CHECKFP-NEXT: retsp 3
+;
+; !FP + small frame: spill R0+LR = entsp 2
; CHECK-LABEL: f3
; CHECK: entsp 2
-; CHECK: stw [[REG:r[4-9]+]], sp[1]
-; CHECK: mov [[REG]], r0
-; CHECK: bl f2
-; CHECK: mov r0, [[REG]]
-; CHECK: ldw [[REG]], sp[1]
-; CHECK: retsp 2
+; CHECK-NEXT: stw [[REG:r[4-9]+]], sp[1]
+; CHECK-NEXT: mov [[REG]], r0
+; CHECK-NEXT: bl f2
+; CHECK-NEXT: mov r0, [[REG]]
+; CHECK-NEXT: ldw [[REG]], sp[1]
+; CHECK-NEXT: retsp 2
declare void @f2()
define i32 @f3(i32 %i) nounwind {
entry:
call void @f2()
ret i32 %i
}
+
+
+; FP + large frame: spill FP+SR = entsp 2 + 100000
+; CHECKFP-LABEL: f4
+; CHECKFP: entsp 65535
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_def_cfa_offset 262140
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_offset 15, 0
+; CHECKFP-NEXT: extsp 34467
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_def_cfa_offset 400008
+; CHECKFP-NEXT: stw r10, sp[1]
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_offset 10, -400004
+; CHECKFP-NEXT: ldaw r10, sp[0]
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_def_cfa_register 10
+; CHECKFP-NEXT: set sp, r10
+; CHECKFP-NEXT: ldw r10, sp[1]
+; CHECKFP-NEXT: ldaw sp, sp[65535]
+; CHECKFP-NEXT: retsp 34467
+;
+; !FP + large frame: spill SR+SR = entsp 2 + 100000
+; CHECK-LABEL: f4
+; CHECK: entsp 65535
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_def_cfa_offset 262140
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_offset 15, 0
+; CHECK-NEXT: extsp 34467
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_def_cfa_offset 400008
+; CHECK-NEXT: ldaw sp, sp[65535]
+; CHECK-NEXT: retsp 34467
+define void @f4() {
+entry:
+ %0 = alloca [100000 x i32]
+ ret void
+}
+
+
+; FP + large frame: spill FP+SR+R4+LR = entsp 3 + 200000 + extsp 1
+; CHECKFP: .section .cp.rodata.cst4,"aMc",@progbits,4
+; CHECKFP-NEXT: .align 4
+; CHECKFP-NEXT: .LCPI[[CNST0:[0-9_]+]]:
+; CHECKFP-NEXT: .long 200002
+; CHECKFP-NEXT: .LCPI[[CNST1:[0-9_]+]]:
+; CHECKFP-NEXT: .long 200001
+; CHECKFP-NEXT: .text
+; CHECKFP-LABEL: f6
+; CHECKFP: entsp 65535
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_def_cfa_offset 262140
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_offset 15, 0
+; CHECKFP-NEXT: extsp 65535
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_def_cfa_offset 524280
+; CHECKFP-NEXT: extsp 65535
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_def_cfa_offset 786420
+; CHECKFP-NEXT: extsp 3398
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_def_cfa_offset 800012
+; CHECKFP-NEXT: stw r10, sp[1]
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_offset 10, -800008
+; CHECKFP-NEXT: ldaw r10, sp[0]
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_def_cfa_register 10
+; CHECKFP-NEXT: ldw r1, cp[.LCPI[[CNST0]]]
+; CHECKFP-NEXT: stw [[REG:r[4-9]+]], r10[r1]
+; CHECKFP-NEXT: .Ltmp{{[0-9]+}}
+; CHECKFP-NEXT: .cfi_offset 4, -4
+; CHECKFP-NEXT: mov [[REG]], r0
+; CHECKFP-NEXT: extsp 1
+; CHECKFP-NEXT: ldaw r0, r10[2]
+; CHECKFP-NEXT: bl f5
+; CHECKFP-NEXT: ldaw sp, sp[1]
+; CHECKFP-NEXT: ldw r1, cp[.LCPI3_1]
+; CHECKFP-NEXT: ldaw r0, r10[r1]
+; CHECKFP-NEXT: extsp 1
+; CHECKFP-NEXT: bl f5
+; CHECKFP-NEXT: ldaw sp, sp[1]
+; CHECKFP-NEXT: mov r0, [[REG]]
+; CHECKFP-NEXT: ldw r1, cp[.LCPI[[CNST0]]]
+; CHECKFP-NEXT: ldw [[REG]], r10[r1]
+; CHECKFP-NEXT: set sp, r10
+; CHECKFP-NEXT: ldw r10, sp[1]
+; CHECKFP-NEXT: ldaw sp, sp[65535]
+; CHECKFP-NEXT: ldaw sp, sp[65535]
+; CHECKFP-NEXT: ldaw sp, sp[65535]
+; CHECKFP-NEXT: retsp 3398
+;
+; !FP + large frame: spill SR+SR+R4+LR = entsp 4 + 200000
+; CHECK: .section .cp.rodata.cst4,"aMc",@progbits,4
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: .LCPI[[CNST0:[0-9_]+]]:
+; CHECK-NEXT: .long 200003
+; CHECK-NEXT: .LCPI[[CNST1:[0-9_]+]]:
+; CHECK-NEXT: .long 200002
+; CHECK-NEXT: .text
+; CHECK-LABEL: f6
+; CHECK: entsp 65535
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_def_cfa_offset 262140
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_offset 15, 0
+; CHECK-NEXT: extsp 65535
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_def_cfa_offset 524280
+; CHECK-NEXT: extsp 65535
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_def_cfa_offset 786420
+; CHECK-NEXT: extsp 3399
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_def_cfa_offset 800016
+; CHECK-NEXT: ldaw r1, sp[0]
+; CHECK-NEXT: ldw r2, cp[.LCPI[[CNST0]]]
+; CHECK-NEXT: stw [[REG:r[4-9]+]], r1[r2]
+; CHECK-NEXT: .Ltmp{{[0-9]+}}
+; CHECK-NEXT: .cfi_offset 4, -4
+; CHECK-NEXT: mov [[REG]], r0
+; CHECK-NEXT: ldaw r0, sp[3]
+; CHECK-NEXT: bl f5
+; CHECK-NEXT: ldaw r0, sp[0]
+; CHECK-NEXT: ldw r1, cp[.LCPI[[CNST1]]]
+; CHECK-NEXT: ldaw r0, r0[r1]
+; CHECK-NEXT: bl f5
+; CHECK-NEXT: mov r0, [[REG]]
+; CHECK-NEXT: ldaw [[REG]], sp[0]
+; CHECK-NEXT: ldw r1, cp[.LCPI[[CNST0]]]
+; CHECK-NEXT: ldw [[REG]], [[REG]][r1]
+; CHECK-NEXT: ldaw sp, sp[65535]
+; CHECK-NEXT: ldaw sp, sp[65535]
+; CHECK-NEXT: ldaw sp, sp[65535]
+; CHECK-NEXT: retsp 3399
+declare void @f5(i32*)
+define i32 @f6(i32 %i) {
+entry:
+ %0 = alloca [200000 x i32]
+ %1 = getelementptr inbounds [200000 x i32]* %0, i32 0, i32 0
+ call void @f5(i32* %1)
+ %2 = getelementptr inbounds [200000 x i32]* %0, i32 0, i32 199999
+ call void @f5(i32* %2)
+ ret i32 %i
+}
+
+
+; FP + large frame: spill FP+SR+LR = entsp 2 + 32768 + extsp 1
+; CHECKFP-LABEL:f8
+; CHECKFP: entsp 32770
+; CHECKFP-NEXT: stw r10, sp[1]
+; CHECKFP-NEXT: ldaw r10, sp[0]
+; CHECKFP-NEXT: mkmsk r1, 15
+; CHECKFP-NEXT: ldaw r0, r10[r1]
+; CHECKFP-NEXT: extsp 1
+; CHECKFP-NEXT: bl f5
+; CHECKFP-NEXT: ldaw sp, sp[1]
+; CHECKFP-NEXT: set sp, r10
+; CHECKFP-NEXT: ldw r10, sp[1]
+; CHECKFP-NEXT: retsp 32770
+;
+; !FP + large frame: spill SR+SR+LR = entsp 3 + 32768
+; CHECK-LABEL:f8
+; CHECK: entsp 32771
+; CHECK-NEXT: ldaw r0, sp[32768]
+; CHECK-NEXT: bl f5
+; CHECK-NEXT: retsp 32771
+define void @f8() nounwind {
+entry:
+ %0 = alloca [32768 x i32]
+ %1 = getelementptr inbounds [32768 x i32]* %0, i32 0, i32 32765
+ call void @f5(i32* %1)
+ ret void
+}
diff --git a/test/CodeGen/XCore/exception.ll b/test/CodeGen/XCore/exception.ll
index 8018cdc..3179fcd 100644
--- a/test/CodeGen/XCore/exception.ll
+++ b/test/CodeGen/XCore/exception.ll
@@ -29,9 +29,8 @@ entry:
; CHECK: .cfi_offset 15, 0
; CHECK: ldc r0, 4
; CHECK: bl __cxa_allocate_exception
-; CHECK: ldaw r11, cp[_ZTIi]
+; CHECK: ldaw r1, dp[_ZTIi]
; CHECK: ldc r2, 0
-; CHECK: mov r1, r11
; CHECK: bl __cxa_throw
define void @fn_throw() {
entry:
diff --git a/test/CodeGen/XCore/globals.ll b/test/CodeGen/XCore/globals.ll
index b3a872b..04e135c 100644
--- a/test/CodeGen/XCore/globals.ll
+++ b/test/CodeGen/XCore/globals.ll
@@ -17,11 +17,18 @@ entry:
define i32 *@addr_G3() {
entry:
; CHECK-LABEL: addr_G3:
-; CHECK: ldaw r11, cp[G3]
-; CHECK: mov r0, r11
+; CHECK: ldaw r0, dp[G3]
ret i32* @G3
}
+define i32 *@addr_iG3() {
+entry:
+; CHECK-LABEL: addr_iG3:
+; CHECK: ldaw r11, cp[iG3]
+; CHECK: mov r0, r11
+ ret i32* @iG3
+}
+
define i32 **@addr_G4() {
entry:
; CHECK-LABEL: addr_G4:
@@ -32,11 +39,18 @@ entry:
define i32 **@addr_G5() {
entry:
; CHECK-LABEL: addr_G5:
-; CHECK: ldaw r11, cp[G5]
-; CHECK: mov r0, r11
+; CHECK: ldaw r0, dp[G5]
ret i32** @G5
}
+define i32 **@addr_iG5() {
+entry:
+; CHECK-LABEL: addr_iG5:
+; CHECK: ldaw r11, cp[iG5]
+; CHECK: mov r0, r11
+ ret i32** @iG5
+}
+
define i32 **@addr_G6() {
entry:
; CHECK-LABEL: addr_G6:
@@ -47,11 +61,18 @@ entry:
define i32 **@addr_G7() {
entry:
; CHECK-LABEL: addr_G7:
-; CHECK: ldaw r11, cp[G7]
-; CHECK: mov r0, r11
+; CHECK: ldaw r0, dp[G7]
ret i32** @G7
}
+define i32 **@addr_iG7() {
+entry:
+; CHECK-LABEL: addr_iG7:
+; CHECK: ldaw r11, cp[iG7]
+; CHECK: mov r0, r11
+ ret i32** @iG7
+}
+
define i32 *@addr_G8() {
entry:
; CHECK-LABEL: addr_G8:
@@ -68,26 +89,38 @@ entry:
; CHECK: G2:
@G3 = unnamed_addr constant i32 9401
-; CHECK: .section .cp.rodata.cst4,"aMc",@progbits,4
+; CHECK: .section .dp.rodata,"awd",@progbits
; CHECK: G3:
+@iG3 = internal constant i32 9401
+; CHECK: .section .cp.rodata,"ac",@progbits
+; CHECK: iG3:
+
@G4 = global i32* @G1
; CHECK: .section .dp.data,"awd",@progbits
; CHECK: G4:
@G5 = unnamed_addr constant i32* @G1
-; CHECK: .section .cp.rodata,"ac",@progbits
+; CHECK: .section .dp.rodata,"awd",@progbits
; CHECK: G5:
+@iG5 = internal unnamed_addr constant i32* @G1
+; CHECK: .section .cp.rodata,"ac",@progbits
+; CHECK: iG5:
+
@G6 = global i32* @G8
; CHECK: .section .dp.data,"awd",@progbits
; CHECK: G6:
@G7 = unnamed_addr constant i32* @G8
-; CHECK: .section .cp.rodata,"ac",@progbits
+; CHECK: .section .dp.rodata,"awd",@progbits
; CHECK: G7:
-@G8 = internal global i32 9312
+@iG7 = internal unnamed_addr constant i32* @G8
+; CHECK: .section .cp.rodata,"ac",@progbits
+; CHECK: iG7:
+
+@G8 = global i32 9312
; CHECK: .section .dp.data,"awd",@progbits
; CHECK: G8:
diff --git a/test/CodeGen/XCore/inline-asm.ll b/test/CodeGen/XCore/inline-asm.ll
index af3edd1..e9f5b57 100644
--- a/test/CodeGen/XCore/inline-asm.ll
+++ b/test/CodeGen/XCore/inline-asm.ll
@@ -30,3 +30,24 @@ entry:
tail call void asm sideeffect "foo ${0:n}", "i"(i32 99) nounwind
ret void
}
+
+@x = external global i32
+@y = external global i32, section ".cp.rodata"
+
+; CHECK-LABEL: f5:
+; CHECK: ldw r0, dp[x]
+; CHECK: retsp 0
+define i32 @f5() nounwind {
+entry:
+ %asmtmp = call i32 asm "ldw $0, $1", "=r,*m"(i32* @x) nounwind
+ ret i32 %asmtmp
+}
+
+; CHECK-LABEL: f6:
+; CHECK: ldw r0, cp[y]
+; CHECK: retsp 0
+define i32 @f6() nounwind {
+entry:
+ %asmtmp = call i32 asm "ldw $0, $1", "=r,*m"(i32* @y) nounwind
+ ret i32 %asmtmp
+}
diff --git a/test/CodeGen/XCore/linkage.ll b/test/CodeGen/XCore/linkage.ll
index 7a1179b..7384fe7 100644
--- a/test/CodeGen/XCore/linkage.ll
+++ b/test/CodeGen/XCore/linkage.ll
@@ -25,9 +25,21 @@ define protected void @test_protected() {
; CHECK: .weak array
@array = weak global [2 x i32] zeroinitializer
+; CHECK: .globl ac.globound
+; CHECK: ac.globound = 2
+; CHECK: .weak ac.globound
+; CHECK: .globl ac
+; CHECK: .weak ac
+@ac = common global [2 x i32] zeroinitializer
+
+; CHECK: .globl gd
; CHECK: .weak gd
@gd = weak global i32 0
+; CHECK: .globl gc
+; CHECK: .weak gc
+@gc = common global i32 0
+
; CHECK-NOT: .hidden test_hidden_declaration
; CHECK: .weak gr
diff --git a/test/CodeGen/XCore/llvm-intrinsics.ll b/test/CodeGen/XCore/llvm-intrinsics.ll
new file mode 100644
index 0000000..e0acd66
--- /dev/null
+++ b/test/CodeGen/XCore/llvm-intrinsics.ll
@@ -0,0 +1,363 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+; RUN: llc < %s -march=xcore -disable-fp-elim | FileCheck %s -check-prefix=CHECKFP
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+declare i8* @llvm.returnaddress(i32) nounwind
+declare i8* @llvm.eh.dwarf.cfa(i32) nounwind
+declare void @llvm.eh.return.i32(i32, i8*) nounwind
+declare void @llvm.eh.unwind.init() nounwind
+
+define i8* @FA0() nounwind {
+entry:
+; CHECK-LABEL: FA0
+; CHECK: ldaw r0, sp[0]
+; CHECK-NEXT: retsp 0
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
+}
+
+define i8* @FA1() nounwind {
+entry:
+; CHECK-LABEL: FA1
+; CHECK: entsp 100
+; CHECK-NEXT: ldaw r0, sp[0]
+; CHECK-NEXT: retsp 100
+ %0 = alloca [100 x i32]
+ %1 = call i8* @llvm.frameaddress(i32 0)
+ ret i8* %1
+}
+
+define i8* @RA0() nounwind {
+entry:
+; CHECK-LABEL: RA0
+; CHECK: stw lr, sp[0]
+; CHECK-NEXT: ldw r0, sp[0]
+; CHECK-NEXT: ldw lr, sp[0]
+; CHECK-NEXT: retsp 0
+ %0 = call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+}
+
+define i8* @RA1() nounwind {
+entry:
+; CHECK-LABEL: RA1
+; CHECK: entsp 100
+; CHECK-NEXT: ldw r0, sp[100]
+; CHECK-NEXT: retsp 100
+ %0 = alloca [100 x i32]
+ %1 = call i8* @llvm.returnaddress(i32 0)
+ ret i8* %1
+}
+
+; test FRAME_TO_ARGS_OFFSET lowering
+define i8* @FTAO0() nounwind {
+entry:
+; CHECK-LABEL: FTAO0
+; CHECK: ldc r0, 0
+; CHECK-NEXT: ldaw r1, sp[0]
+; CHECK-NEXT: add r0, r1, r0
+; CHECK-NEXT: retsp 0
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ ret i8* %0
+}
+
+define i8* @FTAO1() nounwind {
+entry:
+; CHECK-LABEL: FTAO1
+; CHECK: entsp 100
+; CHECK-NEXT: ldc r0, 400
+; CHECK-NEXT: ldaw r1, sp[0]
+; CHECK-NEXT: add r0, r1, r0
+; CHECK-NEXT: retsp 100
+ %0 = alloca [100 x i32]
+ %1 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ ret i8* %1
+}
+
+define i8* @EH0(i32 %offset, i8* %handler) {
+entry:
+; CHECK-LABEL: EH0
+; CHECK: entsp 2
+; CHECK: .cfi_def_cfa_offset 8
+; CHECK: .cfi_offset 15, 0
+; CHECK: .cfi_offset 1, -8
+; CHECK: .cfi_offset 0, -4
+; CHECK: ldc r2, 8
+; CHECK-NEXT: ldaw r3, sp[0]
+; CHECK-NEXT: add r2, r3, r2
+; CHECK-NEXT: add r2, r2, r0
+; CHECK-NEXT: mov r3, r1
+; CHECK-NEXT: ldw r1, sp[0]
+; CHECK-NEXT: ldw r0, sp[1]
+; CHECK-NEXT: set sp, r2
+; CHECK-NEXT: bau r3
+ call void @llvm.eh.return.i32(i32 %offset, i8* %handler)
+ unreachable
+}
+
+declare void @foo(...)
+define i8* @EH1(i32 %offset, i8* %handler) {
+entry:
+; CHECK-LABEL: EH1
+; CHECK: entsp 5
+; CHECK: .cfi_def_cfa_offset 20
+; CHECK: .cfi_offset 15, 0
+; CHECK: .cfi_offset 1, -16
+; CHECK: .cfi_offset 0, -12
+; CHECK: stw r4, sp[4]
+; CHECK: .cfi_offset 4, -4
+; CHECK: stw r5, sp[3]
+; CHECK: .cfi_offset 5, -8
+; CHECK: mov r4, r1
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: ldc r0, 20
+; CHECK-NEXT: ldaw r1, sp[0]
+; CHECK-NEXT: add r0, r1, r0
+; CHECK-NEXT: add r2, r0, r5
+; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: ldw r5, sp[3]
+; CHECK-NEXT: ldw r4, sp[4]
+; CHECK-NEXT: ldw r1, sp[1]
+; CHECK-NEXT: ldw r0, sp[2]
+; CHECK-NEXT: set sp, r2
+; CHECK-NEXT: bau r3
+ call void (...)* @foo()
+ call void @llvm.eh.return.i32(i32 %offset, i8* %handler)
+ unreachable
+}
+
+@offset = external constant i32
+@handler = external constant i8
+define i8* @EH2(i32 %r0, i32 %r1, i32 %r2, i32 %r3) {
+entry:
+; CHECK-LABEL: EH2
+; CHECK: entsp 3
+; CHECK: bl foo
+; CHECK-NEXT: ldw r0, dp[offset]
+; CHECK-NEXT: ldc r1, 12
+; CHECK-NEXT: ldaw r2, sp[0]
+; CHECK-NEXT: add r1, r2, r1
+; CHECK-NEXT: add r2, r1, r0
+; CHECK-NEXT: ldaw r3, dp[handler]
+; CHECK-NEXT: ldw r1, sp[1]
+; CHECK-NEXT: ldw r0, sp[2]
+; CHECK-NEXT: set sp, r2
+; CHECK-NEXT: bau r3
+ call void (...)* @foo()
+ %0 = load i32* @offset
+ call void @llvm.eh.return.i32(i32 %0, i8* @handler)
+ unreachable
+}
+
+
+; FP: spill FP+SR+R0:1+R4:9 = entsp 2+2+6
+; But we don't actually spill or restore R0:1
+; CHECKFP-LABEL: Unwind0:
+; CHECKFP: entsp 10
+; CHECKFP: stw r10, sp[1]
+; CHECKFP: ldaw r10, sp[0]
+; CHECKFP: stw r4, r10[9]
+; CHECKFP: stw r5, r10[8]
+; CHECKFP: stw r6, r10[7]
+; CHECKFP: stw r7, r10[6]
+; CHECKFP: stw r8, r10[5]
+; CHECKFP: stw r9, r10[4]
+; CHECKFP: ldw r9, r10[4]
+; CHECKFP: ldw r8, r10[5]
+; CHECKFP: ldw r7, r10[6]
+; CHECKFP: ldw r6, r10[7]
+; CHECKFP: ldw r5, r10[8]
+; CHECKFP: ldw r4, r10[9]
+; CHECKFP: set sp, r10
+; CHECKFP: ldw r10, sp[1]
+; CHECKFP: retsp 10
+;
+; !FP: spill R0:1+R4:10 = entsp 2+7
+; But we don't actually spill or restore R0:1
+; CHECK-LABEL: Unwind0:
+; CHECK: entsp 9
+; CHECK: stw r4, sp[8]
+; CHECK: stw r5, sp[7]
+; CHECK: stw r6, sp[6]
+; CHECK: stw r7, sp[5]
+; CHECK: stw r8, sp[4]
+; CHECK: stw r9, sp[3]
+; CHECK: stw r10, sp[2]
+; CHECK: ldw r10, sp[2]
+; CHECK: ldw r9, sp[3]
+; CHECK: ldw r8, sp[4]
+; CHECK: ldw r7, sp[5]
+; CHECK: ldw r6, sp[6]
+; CHECK: ldw r5, sp[7]
+; CHECK: ldw r4, sp[8]
+; CHECK: retsp 9
+define void @Unwind0() {
+ call void @llvm.eh.unwind.init()
+ ret void
+}
+
+
+; FP: spill FP+SR+R0:1+R4:9+LR = entsp 2+2+6 + extsp 1
+; But we don't actually spill or restore R0:1
+; CHECKFP-LABEL: Unwind1:
+; CHECKFP: entsp 10
+; CHECKFP: stw r10, sp[1]
+; CHECKFP: ldaw r10, sp[0]
+; CHECKFP: stw r4, r10[9]
+; CHECKFP: stw r5, r10[8]
+; CHECKFP: stw r6, r10[7]
+; CHECKFP: stw r7, r10[6]
+; CHECKFP: stw r8, r10[5]
+; CHECKFP: stw r9, r10[4]
+; CHECKFP: extsp 1
+; CHECKFP: bl foo
+; CHECKFP: ldaw sp, sp[1]
+; CHECKFP: ldw r9, r10[4]
+; CHECKFP: ldw r8, r10[5]
+; CHECKFP: ldw r7, r10[6]
+; CHECKFP: ldw r6, r10[7]
+; CHECKFP: ldw r5, r10[8]
+; CHECKFP: ldw r4, r10[9]
+; CHECKFP: set sp, r10
+; CHECKFP: ldw r10, sp[1]
+; CHECKFP: retsp 10
+;
+; !FP: spill R0:1+R4:10+LR = entsp 2+7+1
+; But we don't actually spill or restore R0:1
+; CHECK-LABEL: Unwind1:
+; CHECK: entsp 10
+; CHECK: stw r4, sp[9]
+; CHECK: stw r5, sp[8]
+; CHECK: stw r6, sp[7]
+; CHECK: stw r7, sp[6]
+; CHECK: stw r8, sp[5]
+; CHECK: stw r9, sp[4]
+; CHECK: stw r10, sp[3]
+; CHECK: bl foo
+; CHECK: ldw r10, sp[3]
+; CHECK: ldw r9, sp[4]
+; CHECK: ldw r8, sp[5]
+; CHECK: ldw r7, sp[6]
+; CHECK: ldw r6, sp[7]
+; CHECK: ldw r5, sp[8]
+; CHECK: ldw r4, sp[9]
+; CHECK: retsp 10
+define void @Unwind1() {
+ call void (...)* @foo()
+ call void @llvm.eh.unwind.init()
+ ret void
+}
+
+; FP: spill FP+SR+R0:1+R4:9 = entsp 2+2+6
+; We don't spill R0:1
+; We only restore R0:1 during eh.return
+; CHECKFP-LABEL: UnwindEH:
+; CHECKFP: entsp 10
+; CHECKFP: .cfi_def_cfa_offset 40
+; CHECKFP: .cfi_offset 15, 0
+; CHECKFP: stw r10, sp[1]
+; CHECKFP: .cfi_offset 10, -36
+; CHECKFP: ldaw r10, sp[0]
+; CHECKFP: .cfi_def_cfa_register 10
+; CHECKFP: .cfi_offset 1, -32
+; CHECKFP: .cfi_offset 0, -28
+; CHECKFP: stw r4, r10[9]
+; CHECKFP: .cfi_offset 4, -4
+; CHECKFP: stw r5, r10[8]
+; CHECKFP: .cfi_offset 5, -8
+; CHECKFP: stw r6, r10[7]
+; CHECKFP: .cfi_offset 6, -12
+; CHECKFP: stw r7, r10[6]
+; CHECKFP: .cfi_offset 7, -16
+; CHECKFP: stw r8, r10[5]
+; CHECKFP: .cfi_offset 8, -20
+; CHECKFP: stw r9, r10[4]
+; CHECKFP: .cfi_offset 9, -24
+; CHECKFP: bt r0, .LBB{{[0-9_]+}}
+; CHECKFP: ldw r9, r10[4]
+; CHECKFP-NEXT: ldw r8, r10[5]
+; CHECKFP-NEXT: ldw r7, r10[6]
+; CHECKFP-NEXT: ldw r6, r10[7]
+; CHECKFP-NEXT: ldw r5, r10[8]
+; CHECKFP-NEXT: ldw r4, r10[9]
+; CHECKFP-NEXT: set sp, r10
+; CHECKFP-NEXT: ldw r10, sp[1]
+; CHECKFP-NEXT: retsp 10
+; CHECKFP: .LBB{{[0-9_]+}}
+; CHECKFP-NEXT: ldc r2, 40
+; CHECKFP-NEXT: add r2, r10, r2
+; CHECKFP-NEXT: add r0, r2, r0
+; CHECKFP-NEXT: mov r3, r1
+; CHECKFP-NEXT: mov r2, r0
+; CHECKFP-NEXT: ldw r9, r10[4]
+; CHECKFP-NEXT: ldw r8, r10[5]
+; CHECKFP-NEXT: ldw r7, r10[6]
+; CHECKFP-NEXT: ldw r6, r10[7]
+; CHECKFP-NEXT: ldw r5, r10[8]
+; CHECKFP-NEXT: ldw r4, r10[9]
+; CHECKFP-NEXT: ldw r1, sp[2]
+; CHECKFP-NEXT: ldw r0, sp[3]
+; CHECKFP-NEXT: set sp, r2
+; CHECKFP-NEXT: bau r3
+;
+; !FP: spill R0:1+R4:10 = entsp 2+7
+; We don't spill R0:1
+; We only restore R0:1 during eh.return
+; CHECK-LABEL: UnwindEH:
+; CHECK: entsp 9
+; CHECK: .cfi_def_cfa_offset 36
+; CHECK: .cfi_offset 15, 0
+; CHECK: .cfi_offset 1, -36
+; CHECK: .cfi_offset 0, -32
+; CHECK: stw r4, sp[8]
+; CHECK: .cfi_offset 4, -4
+; CHECK: stw r5, sp[7]
+; CHECK: .cfi_offset 5, -8
+; CHECK: stw r6, sp[6]
+; CHECK: .cfi_offset 6, -12
+; CHECK: stw r7, sp[5]
+; CHECK: .cfi_offset 7, -16
+; CHECK: stw r8, sp[4]
+; CHECK: .cfi_offset 8, -20
+; CHECK: stw r9, sp[3]
+; CHECK: .cfi_offset 9, -24
+; CHECK: stw r10, sp[2]
+; CHECK: .cfi_offset 10, -28
+; CHECK: bt r0, .LBB{{[0-9_]+}}
+; CHECK: ldw r10, sp[2]
+; CHECK-NEXT: ldw r9, sp[3]
+; CHECK-NEXT: ldw r8, sp[4]
+; CHECK-NEXT: ldw r7, sp[5]
+; CHECK-NEXT: ldw r6, sp[6]
+; CHECK-NEXT: ldw r5, sp[7]
+; CHECK-NEXT: ldw r4, sp[8]
+; CHECK-NEXT: retsp 9
+; CHECK: .LBB{{[0-9_]+}}
+; CHECK-NEXT: ldc r2, 36
+; CHECK-NEXT: ldaw r3, sp[0]
+; CHECK-NEXT: add r2, r3, r2
+; CHECK-NEXT: add r0, r2, r0
+; CHECK-NEXT: mov r3, r1
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: ldw r10, sp[2]
+; CHECK-NEXT: ldw r9, sp[3]
+; CHECK-NEXT: ldw r8, sp[4]
+; CHECK-NEXT: ldw r7, sp[5]
+; CHECK-NEXT: ldw r6, sp[6]
+; CHECK-NEXT: ldw r5, sp[7]
+; CHECK-NEXT: ldw r4, sp[8]
+; CHECK-NEXT: ldw r1, sp[0]
+; CHECK-NEXT: ldw r0, sp[1]
+; CHECK-NEXT: set sp, r2
+; CHECK-NEXT: bau r3
+define void @UnwindEH(i32 %offset, i8* %handler) {
+ call void @llvm.eh.unwind.init()
+ %cmp = icmp eq i32 %offset, 0
+ br i1 %cmp, label %normal, label %eh
+eh:
+ call void @llvm.eh.return.i32(i32 %offset, i8* %handler)
+ unreachable
+normal:
+ ret void
+}
diff --git a/test/CodeGen/XCore/load.ll b/test/CodeGen/XCore/load.ll
index 0622f1c..c7fc2a3 100644
--- a/test/CodeGen/XCore/load.ll
+++ b/test/CodeGen/XCore/load.ll
@@ -40,7 +40,7 @@ entry:
ret i32 %2
}
-@GConst = external constant i32
+@GConst = internal constant i32 42
define i32 @load_cp() nounwind {
entry:
; CHECK-LABEL: load_cp:
diff --git a/test/CodeGen/XCore/memcpy.ll b/test/CodeGen/XCore/memcpy.ll
new file mode 100644
index 0000000..fe424c5
--- /dev/null
+++ b/test/CodeGen/XCore/memcpy.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+; Optimize memcpy to __memcpy_4 if src, dst, and size are all 4-byte aligned.
+define void @f1(i8* %dst, i8* %src, i32 %n) nounwind {
+; CHECK-LABEL: f1:
+; CHECK: bl __memcpy_4
+entry:
+ %0 = shl i32 %n, 2
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %0, i32 4, i1 false)
+ ret void
+}
+
+; Can't optimize - size is not a multiple of 4.
+define void @f2(i8* %dst, i8* %src, i32 %n) nounwind {
+; CHECK-LABEL: f2:
+; CHECK: bl memcpy
+entry:
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %n, i32 4, i1 false)
+ ret void
+}
+
+; Can't optimize - alignment is not a multiple of 4.
+define void @f3(i8* %dst, i8* %src, i32 %n) nounwind {
+; CHECK-LABEL: f3:
+; CHECK: bl memcpy
+entry:
+ %0 = shl i32 %n, 2
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %0, i32 2, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/XCore/resources.ll b/test/CodeGen/XCore/resources.ll
index 5385010..87bf3c2 100644
--- a/test/CodeGen/XCore/resources.ll
+++ b/test/CodeGen/XCore/resources.ll
@@ -15,12 +15,14 @@ declare void @llvm.xcore.setd.p1i8(i8 addrspace(1)* %r, i32 %value)
declare void @llvm.xcore.setc.p1i8(i8 addrspace(1)* %r, i32 %value)
declare i32 @llvm.xcore.inshr.p1i8(i8 addrspace(1)* %r, i32 %value)
declare i32 @llvm.xcore.outshr.p1i8(i8 addrspace(1)* %r, i32 %value)
+declare void @llvm.xcore.clrpt.p1i8(i8 addrspace(1)* %r)
declare void @llvm.xcore.setpt.p1i8(i8 addrspace(1)* %r, i32 %value)
declare i32 @llvm.xcore.getts.p1i8(i8 addrspace(1)* %r)
declare void @llvm.xcore.syncr.p1i8(i8 addrspace(1)* %r)
declare void @llvm.xcore.settw.p1i8(i8 addrspace(1)* %r, i32 %value)
declare void @llvm.xcore.setv.p1i8(i8 addrspace(1)* %r, i8* %p)
declare void @llvm.xcore.setev.p1i8(i8 addrspace(1)* %r, i8* %p)
+declare void @llvm.xcore.edu.p1i8(i8 addrspace(1)* %r)
declare void @llvm.xcore.eeu.p1i8(i8 addrspace(1)* %r)
declare void @llvm.xcore.setclk.p1i8.p1i8(i8 addrspace(1)* %a, i8 addrspace(1)* %b)
declare void @llvm.xcore.setrdy.p1i8.p1i8(i8 addrspace(1)* %a, i8 addrspace(1)* %b)
@@ -140,6 +142,13 @@ define i32 @outshr(i32 %value, i8 addrspace(1)* %r) {
ret i32 %result
}
+define void @clrpt(i8 addrspace(1)* %r) {
+; CHECK-LABEL: clrpt:
+; CHECK: clrpt res[r0]
+ call void @llvm.xcore.clrpt.p1i8(i8 addrspace(1)* %r)
+ ret void
+}
+
define void @setpt(i8 addrspace(1)* %r, i32 %value) {
; CHECK-LABEL: setpt:
; CHECK: setpt res[r0], r1
@@ -184,6 +193,13 @@ define void @setev(i8 addrspace(1)* %r, i8* %p) {
ret void
}
+define void @edu(i8 addrspace(1)* %r) {
+; CHECK-LABEL: edu:
+; CHECK: edu res[r0]
+ call void @llvm.xcore.edu.p1i8(i8 addrspace(1)* %r)
+ ret void
+}
+
define void @eeu(i8 addrspace(1)* %r) {
; CHECK-LABEL: eeu:
; CHECK: eeu res[r0]
diff --git a/test/CodeGen/XCore/resources_combine.ll b/test/CodeGen/XCore/resources_combine.ll
new file mode 100644
index 0000000..20c184a5
--- /dev/null
+++ b/test/CodeGen/XCore/resources_combine.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=xcore < %s | FileCheck %s
+
+declare i32 @llvm.xcore.int.p1i8(i8 addrspace(1)* %r)
+declare i32 @llvm.xcore.inct.p1i8(i8 addrspace(1)* %r)
+declare i32 @llvm.xcore.testct.p1i8(i8 addrspace(1)* %r)
+declare i32 @llvm.xcore.testwct.p1i8(i8 addrspace(1)* %r)
+declare i32 @llvm.xcore.getts.p1i8(i8 addrspace(1)* %r)
+declare void @llvm.xcore.outt.p1i8(i8 addrspace(1)* %r, i32 %value)
+declare void @llvm.xcore.outct.p1i8(i8 addrspace(1)* %r, i32 %value)
+declare void @llvm.xcore.chkct.p1i8(i8 addrspace(1)* %r, i32 %value)
+declare void @llvm.xcore.setpt.p1i8(i8 addrspace(1)* %r, i32 %value)
+
+define i32 @int(i8 addrspace(1)* %r) nounwind {
+; CHECK-LABEL: int:
+; CHECK: int r0, res[r0]
+; CHECK-NEXT: retsp 0
+ %result = call i32 @llvm.xcore.int.p1i8(i8 addrspace(1)* %r)
+ %trunc = and i32 %result, 255
+ ret i32 %trunc
+}
+
+define i32 @inct(i8 addrspace(1)* %r) nounwind {
+; CHECK-LABEL: inct:
+; CHECK: inct r0, res[r0]
+; CHECK-NEXT: retsp 0
+ %result = call i32 @llvm.xcore.inct.p1i8(i8 addrspace(1)* %r)
+ %trunc = and i32 %result, 255
+ ret i32 %trunc
+}
+
+define i32 @testct(i8 addrspace(1)* %r) nounwind {
+; CHECK-LABEL: testct:
+; CHECK: testct r0, res[r0]
+; CHECK-NEXT: retsp 0
+ %result = call i32 @llvm.xcore.testct.p1i8(i8 addrspace(1)* %r)
+ %trunc = and i32 %result, 1
+ ret i32 %trunc
+}
+
+define i32 @testwct(i8 addrspace(1)* %r) nounwind {
+; CHECK-LABEL: testwct:
+; CHECK: testwct r0, res[r0]
+; CHECK-NEXT: retsp 0
+ %result = call i32 @llvm.xcore.testwct.p1i8(i8 addrspace(1)* %r)
+ %trunc = and i32 %result, 7
+ ret i32 %trunc
+}
+
+define i32 @getts(i8 addrspace(1)* %r) nounwind {
+; CHECK-LABEL: getts:
+; CHECK: getts r0, res[r0]
+; CHECK-NEXT: retsp 0
+ %result = call i32 @llvm.xcore.getts.p1i8(i8 addrspace(1)* %r)
+ %trunc = and i32 %result, 65535
+ ret i32 %trunc
+}
+
+define void @outt(i8 addrspace(1)* %r, i32 %value) nounwind {
+; CHECK-LABEL: outt:
+; CHECK-NOT: zext
+; CHECK: outt res[r0], r1
+; CHECK-NEXT: retsp 0
+ %trunc = and i32 %value, 255
+ call void @llvm.xcore.outt.p1i8(i8 addrspace(1)* %r, i32 %trunc)
+ ret void
+}
+
+define void @outct(i8 addrspace(1)* %r, i32 %value) nounwind {
+; CHECK-LABEL: outct:
+; CHECK-NOT: zext
+; CHECK: outct res[r0], r1
+ %trunc = and i32 %value, 255
+ call void @llvm.xcore.outct.p1i8(i8 addrspace(1)* %r, i32 %trunc)
+ ret void
+}
+
+define void @chkct(i8 addrspace(1)* %r, i32 %value) nounwind {
+; CHECK-LABEL: chkct:
+; CHECK-NOT: zext
+; CHECK: chkct res[r0], r1
+ %trunc = and i32 %value, 255
+ call void @llvm.xcore.chkct.p1i8(i8 addrspace(1)* %r, i32 %trunc)
+ ret void
+}
+
+define void @setpt(i8 addrspace(1)* %r, i32 %value) nounwind {
+; CHECK-LABEL: setpt:
+; CHECK-NOT: zext
+; CHECK: setpt res[r0], r1
+ %trunc = and i32 %value, 65535
+ call void @llvm.xcore.setpt.p1i8(i8 addrspace(1)* %r, i32 %trunc)
+ ret void
+}
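These combines all rely on the same fact: the resource instructions only produce or consume the low 8 bits (tokens and data for int/inct/outt/outct/chkct), 1 or 3 bits (testct/testwct results), or 16 bits (getts/setpt timestamps), so the explicit and masks in the IR above are redundant and can be dropped during selection. As a hypothetical counter-example, not part of the patch: a mask narrower than what the instruction can produce should not be foldable.

; sketch only (assumption): 127 is narrower than the 8-bit value INT can return,
; so this AND should survive instruction selection
define i32 @int_low7_sketch(i8 addrspace(1)* %r) nounwind {
  %result = call i32 @llvm.xcore.int.p1i8(i8 addrspace(1)* %r)
  %masked = and i32 %result, 127
  ret i32 %masked
}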
diff --git a/test/CodeGen/XCore/scavenging.ll b/test/CodeGen/XCore/scavenging.ll
index 5b612d0..a0c8a2e 100644
--- a/test/CodeGen/XCore/scavenging.ll
+++ b/test/CodeGen/XCore/scavenging.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=xcore
+; RUN: llc < %s -march=xcore | FileCheck %s
+
@size = global i32 0 ; <i32*> [#uses=1]
@g0 = external global i32 ; <i32*> [#uses=2]
@g1 = external global i32 ; <i32*> [#uses=2]
@@ -48,5 +49,69 @@ entry:
call void @g(i32* %x1, i32* %1) nounwind
ret void
}
-
declare void @g(i32*, i32*)
+
+
+; CHECK: .section .cp.rodata.cst4,"aMc",@progbits,4
+; CHECK: .align 4
+; CHECK: [[ARG5:.LCPI[0-9_]+]]:
+; CHECK: .long 100003
+; CHECK: [[INDEX0:.LCPI[0-9_]+]]:
+; CHECK: .long 80002
+; CHECK: [[INDEX1:.LCPI[0-9_]+]]:
+; CHECK: .long 81002
+; CHECK: [[INDEX2:.LCPI[0-9_]+]]:
+; CHECK: .long 82002
+; CHECK: [[INDEX3:.LCPI[0-9_]+]]:
+; CHECK: .long 83002
+; CHECK: [[INDEX4:.LCPI[0-9_]+]]:
+; CHECK: .long 84002
+; CHECK: .text
+; !FP + large frame: spill SR+SR = entsp 2 + 100000
+; CHECK-LABEL: ScavengeSlots:
+; CHECK: entsp 65535
+; CHECK: extsp 34467
+; scavenge r11
+; CHECK: ldaw r11, sp[0]
+; scavenge r4 using SR spill slot
+; CHECK: stw r4, sp[1]
+; CHECK: ldw r4, cp{{\[}}[[ARG5]]{{\]}}
+; r11 used to load 5th argument
+; CHECK: ldw r11, r11[r4]
+; CHECK: ldaw r4, sp[0]
+; scavenge r5 using SR spill slot
+; CHECK: stw r5, sp[0]
+; CHECK: ldw r5, cp{{\[}}[[INDEX0]]{{\]}}
+; r4 & r5 used by InsertSPConstInst() to emit STW_l3r instruction.
+; CHECK: stw r0, r4[r5]
+; CHECK: ldaw r0, sp[0]
+; CHECK: ldw r5, cp{{\[}}[[INDEX1]]{{\]}}
+; CHECK: stw r1, r0[r5]
+; CHECK: ldaw r0, sp[0]
+; CHECK: ldw r1, cp{{\[}}[[INDEX2]]{{\]}}
+; CHECK: stw r2, r0[r1]
+; CHECK: ldaw r0, sp[0]
+; CHECK: ldw r1, cp{{\[}}[[INDEX3]]{{\]}}
+; CHECK: stw r3, r0[r1]
+; CHECK: ldaw r0, sp[0]
+; CHECK: ldw r1, cp{{\[}}[[INDEX4]]{{\]}}
+; CHECK: stw r11, r0[r1]
+; CHECK: ldaw sp, sp[65535]
+; CHECK: ldw r4, sp[1]
+; CHECK: ldw r5, sp[0]
+; CHECK: retsp 34467
+define void @ScavengeSlots(i32 %r0, i32 %r1, i32 %r2, i32 %r3, i32 %r4) nounwind {
+entry:
+ %Data = alloca [100000 x i32]
+ %i0 = getelementptr inbounds [100000 x i32]* %Data, i32 0, i32 80000
+ store volatile i32 %r0, i32* %i0
+ %i1 = getelementptr inbounds [100000 x i32]* %Data, i32 0, i32 81000
+ store volatile i32 %r1, i32* %i1
+ %i2 = getelementptr inbounds [100000 x i32]* %Data, i32 0, i32 82000
+ store volatile i32 %r2, i32* %i2
+ %i3 = getelementptr inbounds [100000 x i32]* %Data, i32 0, i32 83000
+ store volatile i32 %r3, i32* %i3
+ %i4 = getelementptr inbounds [100000 x i32]* %Data, i32 0, i32 84000
+ store volatile i32 %r4, i32* %i4
+ ret void
+}
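Reading the checks above (a reconstruction of the arithmetic, not text from the patch): the frame is the 100000-word %Data array plus the two scavenging spill slots at sp[0] and sp[1], 100002 words in total. Since entsp's immediate appears to top out at 65535, the prologue splits the allocation and the epilogue undoes it in the same two steps:

  frame    = 100000 (array) + 2 (spill slots) = 100002 words
  prologue = entsp 65535 + extsp 34467        (65535 + 34467 = 100002)
  epilogue = ldaw sp, sp[65535], then retsp 34467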