author    Stephen Hines <srhines@google.com>  2014-05-29 02:49:00 -0700
committer Stephen Hines <srhines@google.com>  2014-05-29 02:49:00 -0700
commit    dce4a407a24b04eebc6a376f8e62b41aaa7b071f (patch)
tree      dcebc53f2b182f145a2e659393bf9a0472cedf23 /test/CodeGen
parent    220b921aed042f9e520c26cffd8282a94c66c3d5 (diff)
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'test/CodeGen')
-rw-r--r--test/CodeGen/AArch64/128bit_load_store.ll20
-rw-r--r--test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll (renamed from test/CodeGen/AArch64/neon-v1i1-setcc.ll)13
-rw-r--r--test/CodeGen/AArch64/adc.ll4
-rw-r--r--test/CodeGen/AArch64/addsub-shifted.ll18
-rw-r--r--test/CodeGen/AArch64/addsub.ll22
-rw-r--r--test/CodeGen/AArch64/addsub_ext.ll8
-rw-r--r--test/CodeGen/AArch64/alloca.ll121
-rw-r--r--test/CodeGen/AArch64/analyze-branch.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll (renamed from test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll (renamed from test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll (renamed from test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll (renamed from test/CodeGen/ARM64/2011-04-21-CPSRBug.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll (renamed from test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll (renamed from test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll (renamed from test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll (renamed from test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll (renamed from test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll (renamed from test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll (renamed from test/CodeGen/ARM64/2012-06-06-FPToUI.ll)10
-rw-r--r--test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll (renamed from test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll (renamed from test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll (renamed from test/CodeGen/ARM64/2013-01-23-frem-crash.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll (renamed from test/CodeGen/ARM64/2013-01-23-sext-crash.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll (renamed from test/CodeGen/ARM64/2013-02-12-shufv8i8.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll23
-rw-r--r--test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll19
-rw-r--r--test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll23
-rw-r--r--test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll (renamed from test/CodeGen/ARM64/AdvSIMD-Scalar.ll)33
-rw-r--r--test/CodeGen/AArch64/arm64-aapcs.ll (renamed from test/CodeGen/ARM64/aapcs.ll)27
-rw-r--r--test/CodeGen/AArch64/arm64-abi-varargs.ll (renamed from test/CodeGen/ARM64/abi-varargs.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-abi.ll (renamed from test/CodeGen/ARM64/abi.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-abi_align.ll (renamed from test/CodeGen/ARM64/abi_align.ll)55
-rw-r--r--test/CodeGen/AArch64/arm64-addp.ll (renamed from test/CodeGen/ARM64/addp.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-addr-mode-folding.ll (renamed from test/CodeGen/ARM64/addr-mode-folding.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-addr-type-promotion.ll (renamed from test/CodeGen/ARM64/addr-type-promotion.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-addrmode.ll (renamed from test/CodeGen/ARM64/addrmode.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll (renamed from test/CodeGen/ARM64/alloc-no-stack-realign.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll (renamed from test/CodeGen/ARM64/alloca-frame-pointer-offset.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll (renamed from test/CodeGen/ARM64/andCmpBrToTBZ.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-ands-bad-peephole.ll31
-rw-r--r--test/CodeGen/AArch64/arm64-anyregcc-crash.ll (renamed from test/CodeGen/ARM64/anyregcc-crash.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-anyregcc.ll (renamed from test/CodeGen/ARM64/anyregcc.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-arith-saturating.ll (renamed from test/CodeGen/ARM64/arith-saturating.ll)62
-rw-r--r--test/CodeGen/AArch64/arm64-arith.ll (renamed from test/CodeGen/ARM64/arith.ll)32
-rw-r--r--test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll16
-rw-r--r--test/CodeGen/AArch64/arm64-atomic-128.ll (renamed from test/CodeGen/ARM64/atomic-128.ll)114
-rw-r--r--test/CodeGen/AArch64/arm64-atomic.ll (renamed from test/CodeGen/ARM64/atomic.ll)78
-rw-r--r--test/CodeGen/AArch64/arm64-basic-pic.ll (renamed from test/CodeGen/ARM64/basic-pic.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll1101
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-eh.ll73
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-varargs.ll58
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll848
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll1100
-rw-r--r--test/CodeGen/AArch64/arm64-big-imm-offsets.ll (renamed from test/CodeGen/ARM64/big-imm-offsets.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-big-stack.ll (renamed from test/CodeGen/ARM64/big-stack.ll)6
-rw-r--r--test/CodeGen/AArch64/arm64-bitfield-extract.ll (renamed from test/CodeGen/ARM64/bitfield-extract.ll)192
-rw-r--r--test/CodeGen/AArch64/arm64-blockaddress.ll (renamed from test/CodeGen/ARM64/blockaddress.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-build-vector.ll (renamed from test/CodeGen/ARM64/build-vector.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-call-tailcalls.ll (renamed from test/CodeGen/ARM64/call-tailcalls.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-cast-opt.ll (renamed from test/CodeGen/ARM64/cast-opt.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-ccmp-heuristics.ll (renamed from test/CodeGen/ARM64/ccmp-heuristics.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-ccmp.ll (renamed from test/CodeGen/ARM64/ccmp.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-clrsb.ll36
-rw-r--r--test/CodeGen/AArch64/arm64-coalesce-ext.ll (renamed from test/CodeGen/ARM64/coalesce-ext.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-code-model-large-abs.ll (renamed from test/CodeGen/ARM64/code-model-large-abs.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll (renamed from test/CodeGen/ARM64/collect-loh-garbage-crash.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-collect-loh-str.ll (renamed from test/CodeGen/ARM64/collect-loh-str.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-collect-loh.ll (renamed from test/CodeGen/ARM64/collect-loh.ll)8
-rw-r--r--test/CodeGen/AArch64/arm64-complex-copy-noneon.ll21
-rw-r--r--test/CodeGen/AArch64/arm64-complex-ret.ll (renamed from test/CodeGen/ARM64/complex-ret.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-const-addr.ll23
-rw-r--r--test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll (renamed from test/CodeGen/ARM64/convert-v2f64-v2i32.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll (renamed from test/CodeGen/ARM64/convert-v2i32-v2f64.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-copy-tuple.ll (renamed from test/CodeGen/ARM64/copy-tuple.ll)94
-rw-r--r--test/CodeGen/AArch64/arm64-crc32.ll (renamed from test/CodeGen/ARM64/crc32.ll)34
-rw-r--r--test/CodeGen/AArch64/arm64-crypto.ll (renamed from test/CodeGen/ARM64/crypto.ll)62
-rw-r--r--test/CodeGen/AArch64/arm64-cse.ll (renamed from test/CodeGen/ARM64/cse.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-csel.ll (renamed from test/CodeGen/ARM64/csel.ll)60
-rw-r--r--test/CodeGen/AArch64/arm64-cvt.ll (renamed from test/CodeGen/ARM64/cvt.ll)162
-rw-r--r--test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll (renamed from test/CodeGen/ARM64/dagcombiner-convergence.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll29
-rw-r--r--test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll46
-rw-r--r--test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll (renamed from test/CodeGen/ARM64/dagcombiner-load-slicing.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-dead-def-frame-index.ll18
-rw-r--r--test/CodeGen/AArch64/arm64-dead-register-def-bug.ll32
-rw-r--r--test/CodeGen/AArch64/arm64-dup.ll (renamed from test/CodeGen/ARM64/dup.ll)7
-rw-r--r--test/CodeGen/AArch64/arm64-early-ifcvt.ll (renamed from test/CodeGen/ARM64/early-ifcvt.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-elf-calls.ll (renamed from test/CodeGen/ARM64/elf-calls.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-elf-constpool.ll (renamed from test/CodeGen/ARM64/elf-constpool.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-elf-globals.ll (renamed from test/CodeGen/ARM64/elf-globals.ll)8
-rw-r--r--test/CodeGen/AArch64/arm64-ext.ll (renamed from test/CodeGen/ARM64/ext.ll)19
-rw-r--r--test/CodeGen/AArch64/arm64-extend-int-to-fp.ll (renamed from test/CodeGen/ARM64/extend-int-to-fp.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-extend.ll (renamed from test/CodeGen/ARM64/extend.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-extern-weak.ll (renamed from test/CodeGen/ARM64/extern-weak.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-extload-knownzero.ll (renamed from test/CodeGen/ARM64/extload-knownzero.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-extract.ll (renamed from test/CodeGen/ARM64/extract.ll)6
-rw-r--r--test/CodeGen/AArch64/arm64-extract_subvector.ll (renamed from test/CodeGen/ARM64/extract_subvector.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll (renamed from test/CodeGen/ARM64/fast-isel-addr-offset.ll)10
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-alloca.ll (renamed from test/CodeGen/ARM64/fast-isel-alloca.ll)1
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-br.ll (renamed from test/CodeGen/ARM64/fast-isel-br.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-call.ll (renamed from test/CodeGen/ARM64/fast-isel-call.ll)13
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-conversion.ll (renamed from test/CodeGen/ARM64/fast-isel-conversion.ll)56
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll (renamed from test/CodeGen/ARM64/fast-isel-fcmp.ll)66
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-gv.ll (renamed from test/CodeGen/ARM64/fast-isel-gv.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-icmp.ll (renamed from test/CodeGen/ARM64/fast-isel-icmp.ll)40
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll (renamed from test/CodeGen/ARM64/fast-isel-indirectbr.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll (renamed from test/CodeGen/ARM64/fast-isel-intrinsic.ll)22
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-materialize.ll (renamed from test/CodeGen/ARM64/fast-isel-materialize.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll (renamed from test/CodeGen/ARM64/fast-isel-noconvert.ll)34
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-rem.ll44
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-ret.ll (renamed from test/CodeGen/ARM64/fast-isel-ret.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-select.ll (renamed from test/CodeGen/ARM64/fast-isel-select.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel.ll (renamed from test/CodeGen/ARM64/fast-isel.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-fastcc-tailcall.ll (renamed from test/CodeGen/ARM64/fastcc-tailcall.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll (renamed from test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fcmp-opt.ll (renamed from test/CodeGen/ARM64/fcmp-opt.ll)113
-rw-r--r--test/CodeGen/AArch64/arm64-fcopysign.ll (renamed from test/CodeGen/ARM64/fcopysign.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll (renamed from test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll)6
-rw-r--r--test/CodeGen/AArch64/arm64-fmadd.ll (renamed from test/CodeGen/ARM64/fmadd.ll)20
-rw-r--r--test/CodeGen/AArch64/arm64-fmax.ll34
-rw-r--r--test/CodeGen/AArch64/arm64-fminv.ll101
-rw-r--r--test/CodeGen/AArch64/arm64-fmuladd.ll (renamed from test/CodeGen/ARM64/fmuladd.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-fold-address.ll (renamed from test/CodeGen/ARM64/fold-address.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fold-lsl.ll (renamed from test/CodeGen/ARM64/fold-lsl.ll)14
-rw-r--r--test/CodeGen/AArch64/arm64-fp-contract-zero.ll14
-rw-r--r--test/CodeGen/AArch64/arm64-fp-imm.ll (renamed from test/CodeGen/ARM64/fp-imm.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fp.ll (renamed from test/CodeGen/ARM64/fp.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fp128-folding.ll (renamed from test/CodeGen/ARM64/fp128-folding.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-fp128.ll (renamed from test/CodeGen/ARM64/fp128.ll)23
-rw-r--r--test/CodeGen/AArch64/arm64-frame-index.ll (renamed from test/CodeGen/ARM64/frame-index.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-frameaddr.ll (renamed from test/CodeGen/ARM64/frameaddr.ll)8
-rw-r--r--test/CodeGen/AArch64/arm64-global-address.ll (renamed from test/CodeGen/ARM64/global-address.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-hello.ll (renamed from test/CodeGen/ARM64/hello.ll)20
-rw-r--r--test/CodeGen/AArch64/arm64-i16-subreg-extract.ll (renamed from test/CodeGen/ARM64/i16-subreg-extract.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-icmp-opt.ll (renamed from test/CodeGen/ARM64/icmp-opt.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-illegal-float-ops.ll (renamed from test/CodeGen/ARM64/illegal-float-ops.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-indexed-memory.ll (renamed from test/CodeGen/ARM64/indexed-memory.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll40
-rw-r--r--test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll6174
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-I.ll (renamed from test/CodeGen/ARM64/inline-asm-error-I.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-J.ll (renamed from test/CodeGen/ARM64/inline-asm-error-J.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-K.ll (renamed from test/CodeGen/ARM64/inline-asm-error-K.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-L.ll (renamed from test/CodeGen/ARM64/inline-asm-error-L.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-M.ll (renamed from test/CodeGen/ARM64/inline-asm-error-M.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-N.ll (renamed from test/CodeGen/ARM64/inline-asm-error-N.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll (renamed from test/CodeGen/ARM64/inline-asm-zero-reg-error.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm.ll (renamed from test/CodeGen/ARM64/inline-asm.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-join-reserved.ll (renamed from test/CodeGen/ARM64/join-reserved.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-jumptable.ll (renamed from test/CodeGen/ARM64/jumptable.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-large-frame.ll69
-rw-r--r--test/CodeGen/AArch64/arm64-ld1.ll (renamed from test/CodeGen/ARM64/ld1.ll)398
-rw-r--r--test/CodeGen/AArch64/arm64-ldp.ll (renamed from test/CodeGen/ARM64/ldp.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-ldur.ll (renamed from test/CodeGen/ARM64/ldur.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-ldxr-stxr.ll270
-rw-r--r--test/CodeGen/AArch64/arm64-leaf.ll (renamed from test/CodeGen/ARM64/leaf.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-long-shift.ll (renamed from test/CodeGen/ARM64/long-shift.ll)40
-rw-r--r--test/CodeGen/AArch64/arm64-memcpy-inline.ll (renamed from test/CodeGen/ARM64/memcpy-inline.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-memset-inline.ll (renamed from test/CodeGen/ARM64/memset-inline.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-memset-to-bzero.ll (renamed from test/CodeGen/ARM64/memset-to-bzero.ll)17
-rw-r--r--test/CodeGen/AArch64/arm64-misched-basic-A53.ll (renamed from test/CodeGen/AArch64/misched-basic-A53.ll)28
-rw-r--r--test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll21
-rw-r--r--test/CodeGen/AArch64/arm64-movi.ll (renamed from test/CodeGen/ARM64/movi.ll)114
-rw-r--r--test/CodeGen/AArch64/arm64-mul.ll (renamed from test/CodeGen/ARM64/mul.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-named-reg-alloc.ll14
-rw-r--r--test/CodeGen/AArch64/arm64-named-reg-notareg.ll13
-rw-r--r--test/CodeGen/AArch64/arm64-neg.ll (renamed from test/CodeGen/ARM64/neg.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-neon-2velem-high.ll (renamed from test/CodeGen/AArch64/neon-2velem-high.ll)162
-rw-r--r--test/CodeGen/AArch64/arm64-neon-2velem.ll (renamed from test/CodeGen/AArch64/neon-2velem.ll)906
-rw-r--r--test/CodeGen/AArch64/arm64-neon-3vdiff.ll (renamed from test/CodeGen/AArch64/neon-3vdiff.ll)618
-rw-r--r--test/CodeGen/AArch64/arm64-neon-aba-abd.ll (renamed from test/CodeGen/AArch64/neon-aba-abd.ll)90
-rw-r--r--test/CodeGen/AArch64/arm64-neon-across.ll460
-rw-r--r--test/CodeGen/AArch64/arm64-neon-add-pairwise.ll100
-rw-r--r--test/CodeGen/AArch64/arm64-neon-add-sub.ll (renamed from test/CodeGen/AArch64/neon-add-sub.ll)64
-rw-r--r--test/CodeGen/AArch64/arm64-neon-compare-instructions.ll (renamed from test/CodeGen/ARM64/neon-compare-instructions.ll)28
-rw-r--r--test/CodeGen/AArch64/arm64-neon-copy.ll (renamed from test/CodeGen/AArch64/neon-copy.ll)379
-rw-r--r--test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll48
-rw-r--r--test/CodeGen/AArch64/arm64-neon-mul-div.ll797
-rw-r--r--test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll (renamed from test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll)52
-rw-r--r--test/CodeGen/AArch64/arm64-neon-select_cc.ll (renamed from test/CodeGen/AArch64/neon-select_cc.ll)144
-rw-r--r--test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll482
-rw-r--r--test/CodeGen/AArch64/arm64-neon-simd-shift.ll663
-rw-r--r--test/CodeGen/AArch64/arm64-neon-simd-vget.ll (renamed from test/CodeGen/AArch64/neon-simd-vget.ll)30
-rw-r--r--test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll74
-rw-r--r--test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll (renamed from test/CodeGen/AArch64/neon-vector-list-spill.ll)90
-rw-r--r--test/CodeGen/AArch64/arm64-patchpoint.ll (renamed from test/CodeGen/ARM64/patchpoint.ll)54
-rw-r--r--test/CodeGen/AArch64/arm64-pic-local-symbol.ll22
-rw-r--r--test/CodeGen/AArch64/arm64-platform-reg.ll (renamed from test/CodeGen/ARM64/platform-reg.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-popcnt.ll (renamed from test/CodeGen/ARM64/popcnt.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-prefetch.ll (renamed from test/CodeGen/ARM64/prefetch.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-promote-const.ll (renamed from test/CodeGen/ARM64/promote-const.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-redzone.ll (renamed from test/CodeGen/ARM64/redzone.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-reg-copy-noneon.ll20
-rw-r--r--test/CodeGen/AArch64/arm64-register-offset-addressing.ll145
-rw-r--r--test/CodeGen/AArch64/arm64-register-pairing.ll (renamed from test/CodeGen/ARM64/register-pairing.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll (renamed from test/CodeGen/ARM64/regress-f128csel-flags.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-regress-interphase-shift.ll (renamed from test/CodeGen/ARM64/regress-interphase-shift.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-return-vector.ll (renamed from test/CodeGen/ARM64/return-vector.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-returnaddr.ll (renamed from test/CodeGen/ARM64/returnaddr.ll)10
-rw-r--r--test/CodeGen/AArch64/arm64-rev.ll (renamed from test/CodeGen/ARM64/rev.ll)18
-rw-r--r--test/CodeGen/AArch64/arm64-rounding.ll (renamed from test/CodeGen/ARM64/rounding.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-scaled_iv.ll (renamed from test/CodeGen/ARM64/scaled_iv.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-scvt.ll (renamed from test/CodeGen/ARM64/scvt.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-shifted-sext.ll (renamed from test/CodeGen/ARM64/shifted-sext.ll)38
-rw-r--r--test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll22
-rw-r--r--test/CodeGen/AArch64/arm64-simplest-elf.ll (renamed from test/CodeGen/ARM64/simplest-elf.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-sincos.ll (renamed from test/CodeGen/ARM64/sincos.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll (renamed from test/CodeGen/ARM64/sitofp-combine-chains.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-sli-sri-opt.ll (renamed from test/CodeGen/ARM64/sli-sri-opt.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-smaxv.ll (renamed from test/CodeGen/ARM64/smaxv.ll)26
-rw-r--r--test/CodeGen/AArch64/arm64-sminv.ll (renamed from test/CodeGen/ARM64/sminv.ll)26
-rw-r--r--test/CodeGen/AArch64/arm64-spill-lr.ll (renamed from test/CodeGen/ARM64/spill-lr.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-spill.ll (renamed from test/CodeGen/ARM64/spill.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-st1.ll (renamed from test/CodeGen/ARM64/st1.ll)290
-rw-r--r--test/CodeGen/AArch64/arm64-stack-no-frame.ll (renamed from test/CodeGen/ARM64/stack-no-frame.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-stackmap.ll (renamed from test/CodeGen/ARM64/stackmap.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-stackpointer.ll24
-rw-r--r--test/CodeGen/AArch64/arm64-stacksave.ll (renamed from test/CodeGen/ARM64/stacksave.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-stp.ll (renamed from test/CodeGen/ARM64/stp.ll)6
-rw-r--r--test/CodeGen/AArch64/arm64-strict-align.ll (renamed from test/CodeGen/ARM64/strict-align.ll)7
-rw-r--r--test/CodeGen/AArch64/arm64-stur.ll (renamed from test/CodeGen/ARM64/stur.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-subsections.ll5
-rw-r--r--test/CodeGen/AArch64/arm64-subvector-extend.ll (renamed from test/CodeGen/ARM64/subvector-extend.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll (renamed from test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-tbl.ll132
-rw-r--r--test/CodeGen/AArch64/arm64-this-return.ll (renamed from test/CodeGen/ARM64/this-return.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-tls-darwin.ll (renamed from test/CodeGen/ARM64/tls-darwin.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-tls-dynamic-together.ll (renamed from test/CodeGen/ARM64/tls-dynamic-together.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-tls-dynamics.ll (renamed from test/CodeGen/ARM64/tls-dynamics.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-tls-execs.ll (renamed from test/CodeGen/ARM64/tls-execs.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-trap.ll (renamed from test/CodeGen/ARM64/trap.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-trn.ll (renamed from test/CodeGen/ARM64/trn.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-trunc-store.ll (renamed from test/CodeGen/ARM64/trunc-store.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-umaxv.ll (renamed from test/CodeGen/ARM64/umaxv.ll)18
-rw-r--r--test/CodeGen/AArch64/arm64-uminv.ll (renamed from test/CodeGen/ARM64/uminv.ll)18
-rw-r--r--test/CodeGen/AArch64/arm64-umov.ll (renamed from test/CodeGen/ARM64/umov.ll)10
-rw-r--r--test/CodeGen/AArch64/arm64-unaligned_ldst.ll (renamed from test/CodeGen/ARM64/unaligned_ldst.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-uzp.ll (renamed from test/CodeGen/ARM64/uzp.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vaargs.ll (renamed from test/CodeGen/ARM64/vaargs.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-vabs.ll (renamed from test/CodeGen/ARM64/vabs.ll)226
-rw-r--r--test/CodeGen/AArch64/arm64-vadd.ll (renamed from test/CodeGen/ARM64/vadd.ll)158
-rw-r--r--test/CodeGen/AArch64/arm64-vaddlv.ll (renamed from test/CodeGen/ARM64/vaddlv.ll)10
-rw-r--r--test/CodeGen/AArch64/arm64-vaddv.ll (renamed from test/CodeGen/ARM64/vaddv.ll)86
-rw-r--r--test/CodeGen/AArch64/arm64-variadic-aapcs.ll (renamed from test/CodeGen/ARM64/variadic-aapcs.ll)6
-rw-r--r--test/CodeGen/AArch64/arm64-vbitwise.ll (renamed from test/CodeGen/ARM64/vbitwise.ll)10
-rw-r--r--test/CodeGen/AArch64/arm64-vclz.ll (renamed from test/CodeGen/ARM64/vclz.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vcmp.ll (renamed from test/CodeGen/ARM64/vcmp.ll)51
-rw-r--r--test/CodeGen/AArch64/arm64-vcnt.ll56
-rw-r--r--test/CodeGen/AArch64/arm64-vcombine.ll (renamed from test/CodeGen/ARM64/vcombine.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt.ll (renamed from test/CodeGen/ARM64/vcvt.ll)164
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt_f.ll (renamed from test/CodeGen/ARM64/vcvt_f.ll)18
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll (renamed from test/CodeGen/ARM64/vcvt_f32_su32.ll)14
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt_n.ll49
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll (renamed from test/CodeGen/ARM64/vcvt_su32_f32.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll (renamed from test/CodeGen/ARM64/vcvtxd_f32_f64.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-vecCmpBr.ll (renamed from test/CodeGen/ARM64/vecCmpBr.ll)26
-rw-r--r--test/CodeGen/AArch64/arm64-vecFold.ll (renamed from test/CodeGen/ARM64/vecFold.ll)40
-rw-r--r--test/CodeGen/AArch64/arm64-vector-ext.ll (renamed from test/CodeGen/ARM64/vector-ext.ll)4
-rw-r--r--test/CodeGen/AArch64/arm64-vector-imm.ll (renamed from test/CodeGen/ARM64/vector-imm.ll)26
-rw-r--r--test/CodeGen/AArch64/arm64-vector-insertion.ll33
-rw-r--r--test/CodeGen/AArch64/arm64-vector-ldst.ll (renamed from test/CodeGen/ARM64/vector-ldst.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vext.ll (renamed from test/CodeGen/ARM64/vext.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vext_reverse.ll172
-rw-r--r--test/CodeGen/AArch64/arm64-vfloatintrinsics.ll (renamed from test/CodeGen/ARM64/vfloatintrinsics.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vhadd.ll (renamed from test/CodeGen/ARM64/vhadd.ll)98
-rw-r--r--test/CodeGen/AArch64/arm64-vhsub.ll (renamed from test/CodeGen/ARM64/vhsub.ll)50
-rw-r--r--test/CodeGen/AArch64/arm64-virtual_base.ll (renamed from test/CodeGen/ARM64/virtual_base.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-vmax.ll (renamed from test/CodeGen/ARM64/vmax.ll)270
-rw-r--r--test/CodeGen/AArch64/arm64-vminmaxnm.ll68
-rw-r--r--test/CodeGen/AArch64/arm64-vmovn.ll (renamed from test/CodeGen/ARM64/vmovn.ll)56
-rw-r--r--test/CodeGen/AArch64/arm64-vmul.ll (renamed from test/CodeGen/ARM64/vmul.ll)403
-rw-r--r--test/CodeGen/AArch64/arm64-volatile.ll (renamed from test/CodeGen/ARM64/volatile.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-vpopcnt.ll68
-rw-r--r--test/CodeGen/AArch64/arm64-vqadd.ll (renamed from test/CodeGen/ARM64/vqadd.ll)140
-rw-r--r--test/CodeGen/AArch64/arm64-vqsub.ll147
-rw-r--r--test/CodeGen/AArch64/arm64-vselect.ll (renamed from test/CodeGen/ARM64/vselect.ll)11
-rw-r--r--test/CodeGen/AArch64/arm64-vsetcc_fp.ll (renamed from test/CodeGen/ARM64/vsetcc_fp.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vshift.ll (renamed from test/CodeGen/ARM64/vshift.ll)492
-rw-r--r--test/CodeGen/AArch64/arm64-vshr.ll (renamed from test/CodeGen/ARM64/vshr.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vshuffle.ll (renamed from test/CodeGen/ARM64/vshuffle.ll)8
-rw-r--r--test/CodeGen/AArch64/arm64-vsqrt.ll (renamed from test/CodeGen/ARM64/vsqrt.ll)98
-rw-r--r--test/CodeGen/AArch64/arm64-vsra.ll (renamed from test/CodeGen/ARM64/vsra.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-vsub.ll (renamed from test/CodeGen/ARM64/vsub.ll)50
-rw-r--r--test/CodeGen/AArch64/arm64-weak-reference.ll (renamed from test/CodeGen/ARM64/weak-reference.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-xaluo.ll (renamed from test/CodeGen/ARM64/xaluo.ll)40
-rw-r--r--test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll (renamed from test/CodeGen/ARM64/zero-cycle-regmov.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll (renamed from test/CodeGen/ARM64/zero-cycle-zeroing.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-zext.ll (renamed from test/CodeGen/ARM64/zext.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-zextload-unscaled.ll (renamed from test/CodeGen/ARM64/zextload-unscaled.ll)0
-rw-r--r--test/CodeGen/AArch64/arm64-zip.ll (renamed from test/CodeGen/ARM64/zip.ll)2
-rw-r--r--test/CodeGen/AArch64/asm-large-immediate.ll10
-rw-r--r--test/CodeGen/AArch64/assertion-rc-mismatch.ll2
-rw-r--r--test/CodeGen/AArch64/atomic-ops.ll474
-rw-r--r--test/CodeGen/AArch64/basic-pic.ll12
-rw-r--r--test/CodeGen/AArch64/bitfield-insert-0.ll2
-rw-r--r--test/CodeGen/AArch64/bitfield-insert.ll25
-rw-r--r--test/CodeGen/AArch64/bitfield.ll10
-rw-r--r--test/CodeGen/AArch64/blockaddress.ll2
-rw-r--r--test/CodeGen/AArch64/bool-loads.ll22
-rw-r--r--test/CodeGen/AArch64/breg.ll4
-rw-r--r--test/CodeGen/AArch64/callee-save.ll18
-rw-r--r--test/CodeGen/AArch64/code-model-large-abs.ll2
-rw-r--r--test/CodeGen/AArch64/compare-branch.ll2
-rw-r--r--test/CodeGen/AArch64/concatvector-bugs.ll68
-rw-r--r--test/CodeGen/AArch64/cond-sel.ll38
-rw-r--r--test/CodeGen/AArch64/cpus.ll9
-rw-r--r--test/CodeGen/AArch64/directcond.ll24
-rw-r--r--test/CodeGen/AArch64/dp-3source.ll2
-rw-r--r--test/CodeGen/AArch64/dp1.ll2
-rw-r--r--test/CodeGen/AArch64/dp2.ll24
-rw-r--r--test/CodeGen/AArch64/eliminate-trunc.ll39
-rw-r--r--test/CodeGen/AArch64/extern-weak.ll28
-rw-r--r--test/CodeGen/AArch64/extract.ll6
-rw-r--r--test/CodeGen/AArch64/fastcc-reserved.ll16
-rw-r--r--test/CodeGen/AArch64/fastcc.ll47
-rw-r--r--test/CodeGen/AArch64/fcmp.ll2
-rw-r--r--test/CodeGen/AArch64/fcvt-fixed.ll6
-rw-r--r--test/CodeGen/AArch64/fcvt-int.ll2
-rw-r--r--test/CodeGen/AArch64/flags-multiuse.ll4
-rw-r--r--test/CodeGen/AArch64/floatdp_1source.ll2
-rw-r--r--test/CodeGen/AArch64/floatdp_2source.ll2
-rw-r--r--test/CodeGen/AArch64/fp-cond-sel.ll23
-rw-r--r--test/CodeGen/AArch64/fp-dp3.ll4
-rw-r--r--test/CodeGen/AArch64/fp128-folding.ll4
-rw-r--r--test/CodeGen/AArch64/fp128.ll279
-rw-r--r--test/CodeGen/AArch64/fpimm.ll6
-rw-r--r--test/CodeGen/AArch64/frameaddr.ll4
-rw-r--r--test/CodeGen/AArch64/free-zext.ll14
-rw-r--r--test/CodeGen/AArch64/func-argpassing.ll64
-rw-r--r--test/CodeGen/AArch64/func-calls.ll61
-rw-r--r--test/CodeGen/AArch64/global-alignment.ll26
-rw-r--r--test/CodeGen/AArch64/got-abuse.ll6
-rw-r--r--test/CodeGen/AArch64/i1-contents.ll55
-rw-r--r--test/CodeGen/AArch64/i128-align.ll6
-rw-r--r--test/CodeGen/AArch64/i128-shift.ll43
-rw-r--r--test/CodeGen/AArch64/illegal-float-ops.ll2
-rw-r--r--test/CodeGen/AArch64/init-array.ll4
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints-badI.ll4
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints-badK.ll2
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints-badK2.ll2
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints-badL.ll2
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints.ll137
-rw-r--r--test/CodeGen/AArch64/inline-asm-modifiers.ll147
-rw-r--r--test/CodeGen/AArch64/jump-table.ll10
-rw-r--r--test/CodeGen/AArch64/large-consts.ll9
-rw-r--r--test/CodeGen/AArch64/large-frame.ll119
-rw-r--r--test/CodeGen/AArch64/ldst-opt.ll301
-rw-r--r--test/CodeGen/AArch64/ldst-regoffset.ll80
-rw-r--r--test/CodeGen/AArch64/ldst-unscaledimm.ll6
-rw-r--r--test/CodeGen/AArch64/ldst-unsignedimm.ll62
-rw-r--r--test/CodeGen/AArch64/lit.local.cfg7
-rw-r--r--test/CodeGen/AArch64/literal_pools.ll103
-rw-r--r--test/CodeGen/AArch64/literal_pools_float.ll46
-rw-r--r--test/CodeGen/AArch64/local_vars.ll21
-rw-r--r--test/CodeGen/AArch64/logical-imm.ll2
-rw-r--r--test/CodeGen/AArch64/logical_shifted_reg.ll6
-rw-r--r--test/CodeGen/AArch64/mature-mc-support.ll8
-rw-r--r--test/CodeGen/AArch64/movw-consts.ll32
-rw-r--r--test/CodeGen/AArch64/movw-shift-encoding.ll9
-rw-r--r--test/CodeGen/AArch64/mul-lohi.ll4
-rw-r--r--test/CodeGen/AArch64/neon-across.ll472
-rw-r--r--test/CodeGen/AArch64/neon-add-pairwise.ll101
-rw-r--r--test/CodeGen/AArch64/neon-bitwise-instructions.ll519
-rw-r--r--test/CodeGen/AArch64/neon-bsl.ll235
-rw-r--r--test/CodeGen/AArch64/neon-compare-instructions.ll957
-rw-r--r--test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll47
-rw-r--r--test/CodeGen/AArch64/neon-crypto.ll144
-rw-r--r--test/CodeGen/AArch64/neon-diagnostics.ll2
-rw-r--r--test/CodeGen/AArch64/neon-extract.ll106
-rw-r--r--test/CodeGen/AArch64/neon-facge-facgt.ll56
-rw-r--r--test/CodeGen/AArch64/neon-frsqrt-frecp.ll54
-rw-r--r--test/CodeGen/AArch64/neon-halving-add-sub.ll207
-rw-r--r--test/CodeGen/AArch64/neon-idiv.ll13
-rw-r--r--test/CodeGen/AArch64/neon-load-store-v1i32.ll29
-rw-r--r--test/CodeGen/AArch64/neon-max-min-pairwise.ll346
-rw-r--r--test/CodeGen/AArch64/neon-max-min.ll310
-rw-r--r--test/CodeGen/AArch64/neon-misc-scalar.ll60
-rw-r--r--test/CodeGen/AArch64/neon-misc.ll2014
-rw-r--r--test/CodeGen/AArch64/neon-mov.ll127
-rw-r--r--test/CodeGen/AArch64/neon-mul-div.ll754
-rw-r--r--test/CodeGen/AArch64/neon-perm.ll838
-rw-r--r--test/CodeGen/AArch64/neon-rounding-halving-add.ll105
-rw-r--r--test/CodeGen/AArch64/neon-rounding-shift.ll121
-rw-r--r--test/CodeGen/AArch64/neon-saturating-add-sub.ll241
-rw-r--r--test/CodeGen/AArch64/neon-saturating-rounding-shift.ll121
-rw-r--r--test/CodeGen/AArch64/neon-saturating-shift.ll121
-rw-r--r--test/CodeGen/AArch64/neon-scalar-abs.ll61
-rw-r--r--test/CodeGen/AArch64/neon-scalar-add-sub.ll50
-rw-r--r--test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll28
-rw-r--r--test/CodeGen/AArch64/neon-scalar-compare.ll343
-rw-r--r--test/CodeGen/AArch64/neon-scalar-copy.ll64
-rw-r--r--test/CodeGen/AArch64/neon-scalar-cvt.ll133
-rw-r--r--test/CodeGen/AArch64/neon-scalar-ext.ll113
-rw-r--r--test/CodeGen/AArch64/neon-scalar-extract-narrow.ll104
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fabd.ll20
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fcvt.ll233
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fp-compare.ll282
-rw-r--r--test/CodeGen/AArch64/neon-scalar-mul.ll143
-rw-r--r--test/CodeGen/AArch64/neon-scalar-neg.ll61
-rw-r--r--test/CodeGen/AArch64/neon-scalar-recip.ll92
-rw-r--r--test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll215
-rw-r--r--test/CodeGen/AArch64/neon-scalar-rounding-shift.ll39
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll242
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll94
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-shift.ll88
-rw-r--r--test/CodeGen/AArch64/neon-scalar-shift-imm.ll531
-rw-r--r--test/CodeGen/AArch64/neon-scalar-shift.ll236
-rw-r--r--test/CodeGen/AArch64/neon-shift.ll171
-rw-r--r--test/CodeGen/AArch64/neon-shl-ashr-lshr.ll333
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll2314
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-one.ll2299
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst.ll164
-rw-r--r--test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll354
-rw-r--r--test/CodeGen/AArch64/neon-simd-post-ldst-one.ll319
-rw-r--r--test/CodeGen/AArch64/neon-simd-shift.ll1556
-rw-r--r--test/CodeGen/AArch64/neon-simd-tbl.ll828
-rw-r--r--test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll30
-rw-r--r--test/CodeGen/AArch64/neon-truncStore-extLoad.ll8
-rw-r--r--test/CodeGen/AArch64/nzcv-save.ll18
-rw-r--r--test/CodeGen/AArch64/pic-eh-stubs.ll6
-rw-r--r--test/CodeGen/AArch64/ragreedy-csr.ll6
-rw-r--r--test/CodeGen/AArch64/regress-bitcast-formals.ll2
-rw-r--r--test/CodeGen/AArch64/regress-f128csel-flags.ll2
-rw-r--r--test/CodeGen/AArch64/regress-fp128-livein.ll2
-rw-r--r--test/CodeGen/AArch64/regress-tail-livereg.ll2
-rw-r--r--test/CodeGen/AArch64/regress-tblgen-chains.ll13
-rw-r--r--test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll15
-rw-r--r--test/CodeGen/AArch64/regress-wzr-allocatable.ll41
-rw-r--r--test/CodeGen/AArch64/returnaddr.ll2
-rw-r--r--test/CodeGen/AArch64/setcc-takes-i32.ll2
-rw-r--r--test/CodeGen/AArch64/sext_inreg.ll198
-rw-r--r--test/CodeGen/AArch64/sibling-call.ll14
-rw-r--r--test/CodeGen/AArch64/sincos-expansion.ll2
-rw-r--r--test/CodeGen/AArch64/sincospow-vector-expansion.ll2
-rw-r--r--test/CodeGen/AArch64/tail-call.ll34
-rw-r--r--test/CodeGen/AArch64/tls-dynamic-together.ll18
-rw-r--r--test/CodeGen/AArch64/tls-dynamics.ll121
-rw-r--r--test/CodeGen/AArch64/tls-execs.ll63
-rw-r--r--test/CodeGen/AArch64/tst-br.ll12
-rw-r--r--test/CodeGen/AArch64/variadic.ll241
-rw-r--r--test/CodeGen/AArch64/zero-reg.ll9
-rw-r--r--test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll2
-rw-r--r--test/CodeGen/ARM/2010-08-04-StackVariable.ll2
-rw-r--r--test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll2
-rw-r--r--test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll50
-rw-r--r--test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll27
-rw-r--r--test/CodeGen/ARM/Windows/chkstk.ll24
-rw-r--r--test/CodeGen/ARM/Windows/frame-register.ll22
-rw-r--r--test/CodeGen/ARM/Windows/integer-floating-point-conversion.ll74
-rw-r--r--test/CodeGen/ARM/Windows/memset.ll18
-rw-r--r--test/CodeGen/ARM/Windows/mov32t-bundling.ll28
-rw-r--r--test/CodeGen/ARM/Windows/movw-movt-relocations.ll27
-rw-r--r--test/CodeGen/ARM/Windows/no-aeabi.ll22
-rw-r--r--test/CodeGen/ARM/Windows/pic.ll16
-rw-r--r--test/CodeGen/ARM/Windows/read-only-data.ll15
-rw-r--r--test/CodeGen/ARM/aapcs-hfa-code.ll111
-rw-r--r--test/CodeGen/ARM/aapcs-hfa.ll164
-rw-r--r--test/CodeGen/ARM/aliases.ll2
-rw-r--r--test/CodeGen/ARM/argaddr.ll2
-rw-r--r--test/CodeGen/ARM/atomic-64bit.ll131
-rw-r--r--test/CodeGen/ARM/atomic-ops-v8.ll108
-rw-r--r--test/CodeGen/ARM/available_externally.ll6
-rw-r--r--test/CodeGen/ARM/big-endian-eh-unwind.ll73
-rw-r--r--test/CodeGen/ARM/big-endian-neon-bitconv.ll392
-rw-r--r--test/CodeGen/ARM/big-endian-vector-callee.ll1172
-rw-r--r--test/CodeGen/ARM/big-endian-vector-caller.ll1369
-rw-r--r--test/CodeGen/ARM/bswap16.ll42
-rw-r--r--test/CodeGen/ARM/build-attributes.ll10
-rw-r--r--test/CodeGen/ARM/dagcombine-concatvector.ll11
-rw-r--r--test/CodeGen/ARM/debug-frame-vararg.ll9
-rw-r--r--test/CodeGen/ARM/debug-frame.ll11
-rw-r--r--test/CodeGen/ARM/debug-segmented-stacks.ll8
-rw-r--r--test/CodeGen/ARM/dwarf-eh.ll71
-rw-r--r--test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll61
-rw-r--r--test/CodeGen/ARM/ehabi-handlerdata.ll59
-rw-r--r--test/CodeGen/ARM/ehabi.ll231
-rw-r--r--test/CodeGen/ARM/frame-register.ll38
-rw-r--r--test/CodeGen/ARM/func-argpassing-endian.ll122
-rw-r--r--test/CodeGen/ARM/hfa-in-contiguous-registers.ll94
-rw-r--r--test/CodeGen/ARM/hints.ll69
-rw-r--r--test/CodeGen/ARM/ifcvt-branch-weight-bug.ll3
-rw-r--r--test/CodeGen/ARM/indirect-hidden.ll22
-rw-r--r--test/CodeGen/ARM/interrupt-attr.ll38
-rw-r--r--test/CodeGen/ARM/intrinsics-overflow.ll57
-rw-r--r--test/CodeGen/ARM/intrinsics-v8.ll4
-rw-r--r--test/CodeGen/ARM/longMAC.ll21
-rw-r--r--test/CodeGen/ARM/long_shift.ll58
-rw-r--r--test/CodeGen/ARM/memcpy-inline.ll28
-rw-r--r--test/CodeGen/ARM/misched-copy-arm.ll2
-rw-r--r--test/CodeGen/ARM/movt.ll2
-rw-r--r--test/CodeGen/ARM/mul.ll14
-rw-r--r--test/CodeGen/ARM/mvn.ll3
-rw-r--r--test/CodeGen/ARM/named-reg-alloc.ll14
-rw-r--r--test/CodeGen/ARM/named-reg-notareg.ll13
-rw-r--r--test/CodeGen/ARM/phi.ll1
-rw-r--r--test/CodeGen/ARM/ret_i64_arg2.ll2
-rw-r--r--test/CodeGen/ARM/ret_i64_arg3.ll2
-rw-r--r--test/CodeGen/ARM/segmented-stacks-dynamic.ll16
-rw-r--r--test/CodeGen/ARM/segmented-stacks.ll36
-rw-r--r--test/CodeGen/ARM/smml.ll3
-rw-r--r--test/CodeGen/ARM/stack-frame.ll8
-rw-r--r--test/CodeGen/ARM/stackpointer.ll25
-rw-r--r--test/CodeGen/ARM/sub.ll21
-rw-r--r--test/CodeGen/ARM/t2-imm.ll2
-rw-r--r--test/CodeGen/ARM/thumb2-it-block.ll4
-rw-r--r--test/CodeGen/ARM/trap.ll1
-rw-r--r--test/CodeGen/ARM/undefined.ll14
-rw-r--r--test/CodeGen/ARM/vcombine.ll39
-rw-r--r--test/CodeGen/ARM/vfp-libcalls.ll11
-rw-r--r--test/CodeGen/ARM/vrev.ll8
-rw-r--r--test/CodeGen/ARM/zextload_demandedbits.ll2
-rw-r--r--test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S17
-rw-r--r--test/CodeGen/ARM64/fast-isel-rem.ll33
-rw-r--r--test/CodeGen/ARM64/fmax.ll21
-rw-r--r--test/CodeGen/ARM64/fminv.ll101
-rw-r--r--test/CodeGen/ARM64/ldxr-stxr.ll143
-rw-r--r--test/CodeGen/ARM64/leaf-compact-unwind.ll161
-rw-r--r--test/CodeGen/ARM64/lit.local.cfg11
-rw-r--r--test/CodeGen/ARM64/register-offset-addressing.ll12
-rw-r--r--test/CodeGen/ARM64/simd-scalar-to-vector.ll22
-rw-r--r--test/CodeGen/ARM64/tbl.ll132
-rw-r--r--test/CodeGen/ARM64/vcnt.ll56
-rw-r--r--test/CodeGen/ARM64/vcvt_n.ll49
-rw-r--r--test/CodeGen/ARM64/vminmaxnm.ll68
-rw-r--r--test/CodeGen/ARM64/vqsub.ll147
-rw-r--r--test/CodeGen/Hexagon/hwloop-dbg.ll3
-rw-r--r--test/CodeGen/MSP430/fp.ll2
-rw-r--r--test/CodeGen/Mips/2010-07-20-Switch.ll8
-rw-r--r--test/CodeGen/Mips/Fast-ISel/nullvoid.ll9
-rw-r--r--test/CodeGen/Mips/Fast-ISel/simplestore.ll15
-rw-r--r--test/CodeGen/Mips/Fast-ISel/simplestorei.ll65
-rw-r--r--test/CodeGen/Mips/abicalls.ll1
-rw-r--r--test/CodeGen/Mips/cconv/arguments-float.ll222
-rw-r--r--test/CodeGen/Mips/cconv/arguments-fp128.ll51
-rw-r--r--test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll157
-rw-r--r--test/CodeGen/Mips/cconv/arguments-hard-float.ll211
-rw-r--r--test/CodeGen/Mips/cconv/arguments-hard-fp128.ll49
-rw-r--r--test/CodeGen/Mips/cconv/arguments.ll170
-rw-r--r--test/CodeGen/Mips/cconv/callee-saved-float.ll111
-rw-r--r--test/CodeGen/Mips/cconv/callee-saved.ll167
-rw-r--r--test/CodeGen/Mips/cconv/memory-layout.ll140
-rw-r--r--test/CodeGen/Mips/cconv/reserved-space.ll39
-rw-r--r--test/CodeGen/Mips/cconv/return-float.ll48
-rw-r--r--test/CodeGen/Mips/cconv/return-hard-float.ll46
-rw-r--r--test/CodeGen/Mips/cconv/return-hard-fp128.ll31
-rw-r--r--test/CodeGen/Mips/cconv/return.ll66
-rw-r--r--test/CodeGen/Mips/cconv/stack-alignment.ll28
-rw-r--r--test/CodeGen/Mips/cmov.ll3
-rw-r--r--test/CodeGen/Mips/eh-dwarf-cfa.ll2
-rw-r--r--test/CodeGen/Mips/eh-return64.ll1
-rw-r--r--test/CodeGen/Mips/elf_eflags.ll3
-rw-r--r--test/CodeGen/Mips/elf_st_other.ll12
-rw-r--r--test/CodeGen/Mips/fabs.ll50
-rw-r--r--test/CodeGen/Mips/fcopysign-f32-f64.ll1
-rw-r--r--test/CodeGen/Mips/fcopysign.ll1
-rw-r--r--test/CodeGen/Mips/fmadd1.ll15
-rw-r--r--test/CodeGen/Mips/fneg.ll27
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll2
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll2
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll2
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll2
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll2
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll2
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll16
-rw-r--r--test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll4
-rw-r--r--test/CodeGen/Mips/inlineasm-operand-code.ll28
-rw-r--r--test/CodeGen/Mips/inlineasm_constraint.ll30
-rw-r--r--test/CodeGen/Mips/int-to-float-conversion.ll1
-rw-r--r--test/CodeGen/Mips/largeimmprinting.ll4
-rw-r--r--test/CodeGen/Mips/load-store-left-right.ll434
-rw-r--r--test/CodeGen/Mips/longbranch.ll154
-rw-r--r--test/CodeGen/Mips/micromips-directives.ll16
-rw-r--r--test/CodeGen/Mips/micromips-long-branch.ll16437
-rw-r--r--test/CodeGen/Mips/mips32r6/compatibility.ll9
-rw-r--r--test/CodeGen/Mips/mips64-f128.ll4
-rw-r--r--test/CodeGen/Mips/mips64-sret.ll25
-rw-r--r--test/CodeGen/Mips/mips64countleading.ll11
-rw-r--r--test/CodeGen/Mips/mips64directive.ll1
-rw-r--r--test/CodeGen/Mips/mips64ext.ll3
-rw-r--r--test/CodeGen/Mips/mips64fpimm0.ll1
-rw-r--r--test/CodeGen/Mips/mips64fpldst.ll2
-rw-r--r--test/CodeGen/Mips/mips64imm.ll1
-rw-r--r--test/CodeGen/Mips/mips64instrs.ll18
-rw-r--r--test/CodeGen/Mips/mips64intldst.ll2
-rw-r--r--test/CodeGen/Mips/mips64lea.ll1
-rw-r--r--test/CodeGen/Mips/mips64load-store-left-right.ll73
-rw-r--r--test/CodeGen/Mips/mips64muldiv.ll1
-rw-r--r--test/CodeGen/Mips/mips64r6/compatibility.ll9
-rw-r--r--test/CodeGen/Mips/msa/basic_operations.ll358
-rw-r--r--test/CodeGen/Mips/msa/basic_operations_float.ll117
-rw-r--r--test/CodeGen/Mips/optimize-fp-math.ll1
-rw-r--r--test/CodeGen/Mips/remat-immed-load.ll1
-rw-r--r--test/CodeGen/Mips/sint-fp-store_pattern.ll1
-rw-r--r--test/CodeGen/Mips/start-asm-file.ll91
-rw-r--r--test/CodeGen/Mips/tls-alias.ll2
-rw-r--r--test/CodeGen/Mips/unalignedload.ll82
-rw-r--r--test/CodeGen/NVPTX/access-non-generic.ll91
-rw-r--r--test/CodeGen/NVPTX/addrspacecast-gvar.ll9
-rw-r--r--test/CodeGen/NVPTX/addrspacecast.ll4
-rw-r--r--test/CodeGen/NVPTX/local-stack-frame.ll18
-rw-r--r--test/CodeGen/NVPTX/surf-read.ll20
-rw-r--r--test/CodeGen/NVPTX/surf-write.ll16
-rw-r--r--test/CodeGen/NVPTX/tex-read.ll20
-rw-r--r--test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll1
-rw-r--r--test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll1
-rw-r--r--test/CodeGen/PowerPC/aa-tbaa.ll2
-rw-r--r--test/CodeGen/PowerPC/alias.ll31
-rw-r--r--test/CodeGen/PowerPC/cc.ll70
-rw-r--r--test/CodeGen/PowerPC/ctrloop-le.ll3
-rw-r--r--test/CodeGen/PowerPC/ctrloop-lt.ll3
-rw-r--r--test/CodeGen/PowerPC/ctrloop-sh.ll72
-rw-r--r--test/CodeGen/PowerPC/dbg.ll3
-rw-r--r--test/CodeGen/PowerPC/indexed-load.ll22
-rw-r--r--test/CodeGen/PowerPC/mcm-10.ll3
-rw-r--r--test/CodeGen/PowerPC/mcm-11.ll3
-rw-r--r--test/CodeGen/PowerPC/mcm-obj-2.ll4
-rw-r--r--test/CodeGen/PowerPC/named-reg-alloc-r0.ll15
-rw-r--r--test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll18
-rw-r--r--test/CodeGen/PowerPC/named-reg-alloc-r1.ll20
-rw-r--r--test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll18
-rw-r--r--test/CodeGen/PowerPC/named-reg-alloc-r13.ll18
-rw-r--r--test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll17
-rw-r--r--test/CodeGen/PowerPC/named-reg-alloc-r2.ll18
-rw-r--r--test/CodeGen/PowerPC/rlwimi-dyn-and.ll48
-rw-r--r--test/CodeGen/PowerPC/splat-bug.ll18
-rw-r--r--test/CodeGen/R600/32-bit-local-address-space.ll8
-rw-r--r--test/CodeGen/R600/64bit-kernel-args.ll4
-rw-r--r--test/CodeGen/R600/add.ll25
-rw-r--r--test/CodeGen/R600/add_i64.ll2
-rw-r--r--test/CodeGen/R600/address-space.ll6
-rw-r--r--test/CodeGen/R600/array-ptr-calc-i32.ll2
-rw-r--r--test/CodeGen/R600/array-ptr-calc-i64.ll2
-rw-r--r--test/CodeGen/R600/call.ll33
-rw-r--r--test/CodeGen/R600/extload.ll11
-rw-r--r--test/CodeGen/R600/extract_vector_elt_i16.ll29
-rw-r--r--test/CodeGen/R600/fabs.ll11
-rw-r--r--test/CodeGen/R600/fconst64.ll4
-rw-r--r--test/CodeGen/R600/fneg.ll13
-rw-r--r--test/CodeGen/R600/fp_to_uint.f64.ll9
-rw-r--r--test/CodeGen/R600/gep-address-space.ll4
-rw-r--r--test/CodeGen/R600/gv-const-addrspace-fail.ll58
-rw-r--r--test/CodeGen/R600/gv-const-addrspace.ll57
-rw-r--r--test/CodeGen/R600/infinite-loop.ll2
-rw-r--r--test/CodeGen/R600/insert_vector_elt.ll28
-rw-r--r--test/CodeGen/R600/insert_vector_elt_f64.ll2
-rw-r--r--test/CodeGen/R600/kernel-args.ll32
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll388
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll514
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.imad24.ll21
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.imul24.ll15
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.umad24.ll19
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.umul24.ll17
-rw-r--r--test/CodeGen/R600/llvm.SI.tbuffer.store.ll8
-rw-r--r--test/CodeGen/R600/llvm.cos.ll43
-rw-r--r--test/CodeGen/R600/llvm.rint.f64.ll37
-rw-r--r--test/CodeGen/R600/llvm.rint.ll49
-rw-r--r--test/CodeGen/R600/llvm.sin.ll44
-rw-r--r--test/CodeGen/R600/llvm.sqrt.ll2
-rw-r--r--test/CodeGen/R600/load-i1.ll107
-rw-r--r--test/CodeGen/R600/local-64.ll52
-rw-r--r--test/CodeGen/R600/local-memory-two-objects.ll4
-rw-r--r--test/CodeGen/R600/loop-idiom.ll2
-rw-r--r--test/CodeGen/R600/mad_int24.ll17
-rw-r--r--test/CodeGen/R600/mad_uint24.ll67
-rw-r--r--test/CodeGen/R600/mubuf.ll16
-rw-r--r--test/CodeGen/R600/mul.ll63
-rw-r--r--test/CodeGen/R600/mul_int24.ll17
-rw-r--r--test/CodeGen/R600/mul_uint24.ll61
-rw-r--r--test/CodeGen/R600/mulhu.ll2
-rw-r--r--test/CodeGen/R600/or.ll4
-rw-r--r--test/CodeGen/R600/private-memory.ll8
-rw-r--r--test/CodeGen/R600/pv.ll2
-rw-r--r--test/CodeGen/R600/register-count-comments.ll2
-rw-r--r--test/CodeGen/R600/salu-to-valu.ll42
-rw-r--r--test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll2
-rw-r--r--test/CodeGen/R600/selectcc.ll19
-rw-r--r--test/CodeGen/R600/setcc.ll26
-rw-r--r--test/CodeGen/R600/setcc64.ll26
-rw-r--r--test/CodeGen/R600/seto.ll3
-rw-r--r--test/CodeGen/R600/setuo.ll3
-rw-r--r--test/CodeGen/R600/sext-in-reg.ll371
-rw-r--r--test/CodeGen/R600/sgpr-control-flow.ll27
-rw-r--r--test/CodeGen/R600/sgpr-copy-duplicate-operand.ll2
-rw-r--r--test/CodeGen/R600/sgpr-copy.ll2
-rw-r--r--test/CodeGen/R600/si-annotate-cf-assertion.ll2
-rw-r--r--test/CodeGen/R600/simplify-demanded-bits-build-pair.ll36
-rw-r--r--test/CodeGen/R600/smrd.ll28
-rw-r--r--test/CodeGen/R600/store-v3i64.ll2
-rw-r--r--test/CodeGen/R600/store-vector-ptrs.ll2
-rw-r--r--test/CodeGen/R600/store.ll66
-rw-r--r--test/CodeGen/R600/sub.ll55
-rw-r--r--test/CodeGen/R600/trunc-store-i1.ll2
-rw-r--r--test/CodeGen/R600/trunc.ll7
-rw-r--r--test/CodeGen/R600/uaddo.ll17
-rw-r--r--test/CodeGen/R600/udivrem64.ll82
-rw-r--r--test/CodeGen/R600/uint_to_fp.f64.ll9
-rw-r--r--test/CodeGen/R600/unaligned-load-store.ll2
-rw-r--r--test/CodeGen/R600/v_cndmask.ll3
-rw-r--r--test/CodeGen/R600/valu-i1.ll39
-rw-r--r--test/CodeGen/R600/work-item-intrinsics.ll16
-rw-r--r--test/CodeGen/R600/xor.ll18
-rw-r--r--test/CodeGen/R600/zero_extend.ll16
-rw-r--r--test/CodeGen/SPARC/2011-01-11-FrameAddr.ll6
-rw-r--r--test/CodeGen/SPARC/2011-01-19-DelaySlot.ll3
-rw-r--r--test/CodeGen/SPARC/64abi.ll4
-rw-r--r--test/CodeGen/SPARC/64bit.ll12
-rw-r--r--test/CodeGen/SPARC/64cond.ll6
-rw-r--r--test/CodeGen/SPARC/atomics.ll6
-rw-r--r--test/CodeGen/SPARC/exception.ll34
-rw-r--r--test/CodeGen/SPARC/leafproc.ll6
-rw-r--r--test/CodeGen/SPARC/parts.ll6
-rw-r--r--test/CodeGen/SPARC/sret-secondary.ll8
-rw-r--r--test/CodeGen/SystemZ/alias-01.ll3
-rw-r--r--test/CodeGen/Thumb/2009-06-18-ThumbCommuteMul.ll4
-rw-r--r--test/CodeGen/Thumb/2010-06-18-SibCallCrash.ll2
-rw-r--r--test/CodeGen/Thumb/2010-07-15-debugOrdering.ll2
-rw-r--r--test/CodeGen/Thumb/DbgValueOtherTargets.test2
-rw-r--r--test/CodeGen/Thumb/barrier.ll6
-rw-r--r--test/CodeGen/Thumb/dyn-stackalloc.ll7
-rw-r--r--test/CodeGen/Thumb/fpconv.ll2
-rw-r--r--test/CodeGen/Thumb/fpow.ll2
-rw-r--r--test/CodeGen/Thumb/inlineasm-imm-thumb.ll2
-rw-r--r--test/CodeGen/Thumb/inlineasm-thumb.ll3
-rw-r--r--test/CodeGen/Thumb/ispositive.ll2
-rw-r--r--test/CodeGen/Thumb/ldr_ext.ll4
-rw-r--r--test/CodeGen/Thumb/ldr_frame.ll2
-rw-r--r--test/CodeGen/Thumb/long-setcc.ll9
-rw-r--r--test/CodeGen/Thumb/long.ll20
-rw-r--r--test/CodeGen/Thumb/long_shift.ll2
-rw-r--r--test/CodeGen/Thumb/mul.ll14
-rw-r--r--test/CodeGen/Thumb/rev.ll2
-rw-r--r--test/CodeGen/Thumb/segmented-stacks-dynamic.ll12
-rw-r--r--test/CodeGen/Thumb/segmented-stacks.ll36
-rw-r--r--test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll2
-rw-r--r--test/CodeGen/Thumb/stack-frame.ll5
-rw-r--r--test/CodeGen/Thumb/thumb-imm.ll6
-rw-r--r--test/CodeGen/Thumb/thumb-ldm.ll42
-rw-r--r--test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll37
-rw-r--r--test/CodeGen/Thumb/trap.ll2
-rw-r--r--test/CodeGen/Thumb/tst_teq.ll5
-rw-r--r--test/CodeGen/Thumb/vargs.ll11
-rw-r--r--test/CodeGen/Thumb2/bfi.ll2
-rw-r--r--test/CodeGen/Thumb2/bfx.ll2
-rw-r--r--test/CodeGen/Thumb2/carry.ll2
-rw-r--r--test/CodeGen/Thumb2/div.ll8
-rw-r--r--test/CodeGen/Thumb2/ifcvt-neon.ll2
-rw-r--r--test/CodeGen/Thumb2/longMACt.ll2
-rw-r--r--test/CodeGen/Thumb2/mul_const.ll2
-rw-r--r--test/CodeGen/Thumb2/segmented-stacks.ll8
-rw-r--r--test/CodeGen/Thumb2/thumb2-adc.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add3.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add4.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add5.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-add6.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-and.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-and2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-asr.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-asr2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-bcc.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-bfc.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-bic.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-clz.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmn.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmn2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmp.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cmp2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-eor.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-eor2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-jtb.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldm.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr_ext.ll18
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr_post.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldr_pre.ll12
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldrb.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldrh.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsl.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsl2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsr.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsr2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-lsr3.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mla.ll6
-rw-r--r--test/CodeGen/Thumb2/thumb2-mls.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mov.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mul.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mulhi.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mvn.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-mvn2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-neg.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-orn.ll3
-rw-r--r--test/CodeGen/Thumb2/thumb2-orn2.ll3
-rw-r--r--test/CodeGen/Thumb2/thumb2-orr.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-orr2.ll3
-rw-r--r--test/CodeGen/Thumb2/thumb2-pack.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-rev.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ror.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-rsb.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-rsb2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sbc.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-select.ll3
-rw-r--r--test/CodeGen/Thumb2/thumb2-select_xform.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-shifter.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-smla.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-smul.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-str.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-str_post.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-str_pre.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-strb.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-strh.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub3.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub4.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sub5.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sxt-uxt.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-sxt_rot.ll3
-rw-r--r--test/CodeGen/Thumb2/thumb2-teq.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-teq2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-tst.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-tst2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-uxt_rot.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-uxtb.ll4
-rw-r--r--test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll10
-rw-r--r--test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll6
-rw-r--r--test/CodeGen/X86/2010-08-04-StackVariable.ll2
-rw-r--r--test/CodeGen/X86/MergeConsecutiveStores.ll1
-rw-r--r--test/CodeGen/X86/aliases.ll2
-rw-r--r--test/CodeGen/X86/atom-bypass-slow-division-64.ll4
-rw-r--r--test/CodeGen/X86/avoid_complex_am.ll40
-rw-r--r--test/CodeGen/X86/avx-blend.ll59
-rw-r--r--test/CodeGen/X86/avx-shuffle.ll26
-rw-r--r--test/CodeGen/X86/avx.ll136
-rw-r--r--test/CodeGen/X86/avx1-logical-load-folding.ll60
-rw-r--r--test/CodeGen/X86/avx2-blend.ll11
-rw-r--r--test/CodeGen/X86/avx2-vector-shifts.ll10
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll32
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll206
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll38
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll14
-rw-r--r--test/CodeGen/X86/avx512-mov.ll28
-rw-r--r--test/CodeGen/X86/avx512-shuffle.ll19
-rw-r--r--test/CodeGen/X86/blend-msb.ll22
-rw-r--r--test/CodeGen/X86/bmi.ll17
-rw-r--r--test/CodeGen/X86/br-fold.ll18
-rw-r--r--test/CodeGen/X86/bswap-vector.ll137
-rw-r--r--test/CodeGen/X86/cdecl-method-return.ll69
-rw-r--r--test/CodeGen/X86/cfi.ll27
-rw-r--r--test/CodeGen/X86/cmp.ll45
-rw-r--r--test/CodeGen/X86/codegen-prepare-addrmode-sext.ll20
-rw-r--r--test/CodeGen/X86/codegen-prepare-crash.ll14
-rw-r--r--test/CodeGen/X86/codegen-prepare.ll1
-rw-r--r--test/CodeGen/X86/combine-avx-intrinsics.ll119
-rw-r--r--test/CodeGen/X86/combine-avx2-intrinsics.ll164
-rw-r--r--test/CodeGen/X86/combine-sse2-intrinsics.ll53
-rw-r--r--test/CodeGen/X86/combine-sse41-intrinsics.ll182
-rw-r--r--test/CodeGen/X86/constant-hoisting-shift-immediate.ll25
-rw-r--r--test/CodeGen/X86/divide-by-constant.ll8
-rw-r--r--test/CodeGen/X86/dllexport-x86_64.ll68
-rw-r--r--test/CodeGen/X86/dllexport.ll85
-rw-r--r--test/CodeGen/X86/expand-opaque-const.ll21
-rw-r--r--test/CodeGen/X86/f16c-intrinsics.ll14
-rw-r--r--test/CodeGen/X86/fma-do-not-commute.ll30
-rw-r--r--test/CodeGen/X86/fold-load-vec.ll2
-rw-r--r--test/CodeGen/X86/gcc_except_table.ll4
-rw-r--r--test/CodeGen/X86/global-sections.ll80
-rw-r--r--test/CodeGen/X86/indirect-hidden.ll43
-rw-r--r--test/CodeGen/X86/isel-sink.ll1
-rw-r--r--test/CodeGen/X86/lit.local.cfg2
-rw-r--r--test/CodeGen/X86/live-out-reg-info.ll2
-rw-r--r--test/CodeGen/X86/lower-bitcast.ll155
-rw-r--r--test/CodeGen/X86/lower-vec-shift.ll125
-rw-r--r--test/CodeGen/X86/lzcnt-tzcnt.ll447
-rw-r--r--test/CodeGen/X86/masked-iv-safe.ll6
-rw-r--r--test/CodeGen/X86/merge_store.ll1
-rw-r--r--test/CodeGen/X86/mod128.ll26
-rw-r--r--test/CodeGen/X86/musttail-indirect.ll124
-rw-r--r--test/CodeGen/X86/musttail-thiscall.ll31
-rw-r--r--test/CodeGen/X86/musttail.ll90
-rw-r--r--test/CodeGen/X86/named-reg-alloc.ll14
-rw-r--r--test/CodeGen/X86/named-reg-notareg.ll13
-rw-r--r--test/CodeGen/X86/no-cfi.ll34
-rw-r--r--test/CodeGen/X86/peep-test-4.ll76
-rw-r--r--test/CodeGen/X86/peephole-multiple-folds.ll4
-rw-r--r--test/CodeGen/X86/ragreedy-last-chance-recoloring.ll13
-rw-r--r--test/CodeGen/X86/rdtsc.ll53
-rw-r--r--test/CodeGen/X86/remat-invalid-liveness.ll85
-rw-r--r--test/CodeGen/X86/ret-mmx.ll1
-rw-r--r--test/CodeGen/X86/rotate3.ll76
-rw-r--r--test/CodeGen/X86/segmented-stacks-dynamic.ll12
-rw-r--r--test/CodeGen/X86/segmented-stacks.ll89
-rw-r--r--test/CodeGen/X86/sse2.ll18
-rw-r--r--test/CodeGen/X86/sse3.ll2
-rw-r--r--test/CodeGen/X86/sse41-blend.ll40
-rw-r--r--test/CodeGen/X86/sse41.ll447
-rw-r--r--test/CodeGen/X86/stack-protector-dbginfo.ll2
-rw-r--r--test/CodeGen/X86/stack-protector.ll597
-rw-r--r--test/CodeGen/X86/stackpointer.ll28
-rw-r--r--test/CodeGen/X86/tls.ll87
-rw-r--r--test/CodeGen/X86/vec_shuffle-41.ll21
-rw-r--r--test/CodeGen/X86/vec_splat.ll16
-rw-r--r--test/CodeGen/X86/vector-idiv.ll217
-rw-r--r--test/CodeGen/X86/win32_sret.ll83
-rw-r--r--test/CodeGen/X86/x86-64-sret-return-2.ll18
-rw-r--r--test/CodeGen/XCore/epilogue_prologue.ll37
-rw-r--r--test/CodeGen/XCore/llvm-intrinsics.ll6
906 files changed, 36630 insertions, 43367 deletions
diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll
index 502fd70..a6f0776 100644
--- a/test/CodeGen/AArch64/128bit_load_store.ll
+++ b/test/CodeGen/AArch64/128bit_load_store.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK
define void @test_store_f128(fp128* %ptr, fp128 %val) #0 {
-; CHECK: test_store_f128
+; CHECK-LABEL: test_store_f128
; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}]
entry:
store fp128 %val, fp128* %ptr, align 16
@@ -9,7 +9,7 @@ entry:
}
define fp128 @test_load_f128(fp128* readonly %ptr) #2 {
-; CHECK: test_load_f128
+; CHECK-LABEL: test_load_f128
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
entry:
%0 = load fp128* %ptr, align 16
@@ -17,9 +17,9 @@ entry:
}
define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
-; CHECK: test_vstrq_p128
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #8]
-; CHECK-NEXT: str {{x[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-LABEL: test_vstrq_p128
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}]
+
entry:
%0 = bitcast i128* %ptr to fp128*
%1 = bitcast i128 %val to fp128
@@ -28,9 +28,9 @@ entry:
}
define i128 @test_vldrq_p128(i128* readonly %ptr) #2 {
-; CHECK: test_vldrq_p128
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
-; CHECK-NEXT: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #8]
+; CHECK-LABEL: test_vldrq_p128
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}]
+
entry:
%0 = bitcast i128* %ptr to fp128*
%1 = load fp128* %0, align 16
@@ -39,7 +39,7 @@ entry:
}
define void @test_ld_st_p128(i128* nocapture %ptr) #0 {
-; CHECK: test_ld_st_p128
+; CHECK-LABEL: test_ld_st_p128
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
; CHECK-NEXT: str {{q[0-9]+}}, [{{x[0-9]+}}, #16]
entry:
diff --git a/test/CodeGen/AArch64/neon-v1i1-setcc.ll b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
index 6c7d009..c932253 100644
--- a/test/CodeGen/AArch64/neon-v1i1-setcc.ll
+++ b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; arm64 has a separate copy as aarch64-neon-v1i1-setcc.ll
; This file tests the DAG node like "v1i1 SETCC v1i64, v1i64". As the v1i1 type
; is illegal in AArch64 backend, the legalizer tries to scalarize this node.
@@ -10,7 +11,7 @@
define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) {
; CHECK-LABEL: test_sext_extr_cmp_0:
-; CHECK: cmge d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}
%1 = icmp sge <1 x i64> %v1, %v2
%2 = extractelement <1 x i1> %1, i32 0
%vget_lane = sext i1 %2 to i64
@@ -19,7 +20,7 @@ define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) {
define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) {
; CHECK-LABEL: test_sext_extr_cmp_1:
-; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}}
%1 = fcmp oeq <1 x double> %v1, %v2
%2 = extractelement <1 x i1> %1, i32 0
%vget_lane = sext i1 %2 to i64
@@ -29,7 +30,7 @@ define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) {
define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
; CHECK-LABEL: test_select_v1i1_0:
; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
%1 = icmp eq <1 x i64> %v1, %v2
%res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
ret <1 x i64> %res
@@ -38,7 +39,7 @@ define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3
define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) {
; CHECK-LABEL: test_select_v1i1_1:
; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
%1 = fcmp oeq <1 x double> %v1, %v2
%res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
ret <1 x i64> %res
@@ -47,7 +48,7 @@ define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i6
define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) {
; CHECK-LABEL: test_select_v1i1_2:
; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
%1 = icmp eq <1 x i64> %v1, %v2
%res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3
ret <1 x double> %res
diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll
index 29637d3..892573b 100644
--- a/test/CodeGen/AArch64/adc.ll
+++ b/test/CodeGen/AArch64/adc.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
define i128 @test_simple(i128 %a, i128 %b, i128 %c) {
; CHECK-LABEL: test_simple:
diff --git a/test/CodeGen/AArch64/addsub-shifted.ll b/test/CodeGen/AArch64/addsub-shifted.ll
index 269c1e8..0a93edd 100644
--- a/test/CodeGen/AArch64/addsub-shifted.ll
+++ b/test/CodeGen/AArch64/addsub-shifted.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs %s -o - -mtriple=arm64-apple-ios7.0 | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
@@ -35,7 +35,7 @@ define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift4a = shl i32 %lhs4a, 15
%val4a = sub i32 0, %shift4a
store volatile i32 %val4a, i32* @var32
-; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, lsl #15
+; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, lsl #15
%rhs5 = load volatile i64* @var64
%shift5 = shl i64 %rhs5, 18
@@ -66,7 +66,7 @@ define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift8a = shl i64 %lhs8a, 60
%val8a = sub i64 0, %shift8a
store volatile i64 %val8a, i64* @var64
-; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, lsl #60
+; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, lsl #60
ret void
; CHECK: ret
@@ -99,7 +99,7 @@ define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift4a = lshr i32 %lhs32, 15
%val4a = sub i32 0, %shift4a
store volatile i32 %val4a, i32* @var32
-; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, lsr #15
+; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, lsr #15
%shift5 = lshr i64 %rhs64, 18
%val5 = add i64 %lhs64, %shift5
@@ -125,7 +125,7 @@ define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift8a = lshr i64 %lhs64, 45
%val8a = sub i64 0, %shift8a
store volatile i64 %val8a, i64* @var64
-; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, lsr #45
+; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, lsr #45
ret void
; CHECK: ret
@@ -158,7 +158,7 @@ define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift4a = ashr i32 %lhs32, 15
%val4a = sub i32 0, %shift4a
store volatile i32 %val4a, i32* @var32
-; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, asr #15
+; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, asr #15
%shift5 = ashr i64 %rhs64, 18
%val5 = add i64 %lhs64, %shift5
@@ -184,7 +184,7 @@ define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift8a = ashr i64 %lhs64, 45
%val8a = sub i64 0, %shift8a
store volatile i64 %val8a, i64* @var64
-; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, asr #45
+; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, asr #45
ret void
; CHECK: ret
@@ -245,7 +245,7 @@ define i32 @test_cmn(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
br i1 %tst1, label %t2, label %end
; Important that this isn't lowered to a cmn instruction because if %rhs32 ==
; 0 then the results will differ.
-; CHECK: sub [[RHS:w[0-9]+]], wzr, {{w[0-9]+}}, lsl #13
+; CHECK: neg [[RHS:w[0-9]+]], {{w[0-9]+}}, lsl #13
; CHECK: cmp {{w[0-9]+}}, [[RHS]]
t2:
@@ -268,7 +268,7 @@ t4:
%tst4 = icmp slt i64 %lhs64, %val4
br i1 %tst4, label %t5, label %end
; Again, it's important that cmn isn't used here in case %rhs64 == 0.
-; CHECK: sub [[RHS:x[0-9]+]], xzr, {{x[0-9]+}}, lsl #43
+; CHECK: neg [[RHS:x[0-9]+]], {{x[0-9]+}}, lsl #43
; CHECK: cmp {{x[0-9]+}}, [[RHS]]
t5:
diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll
index 4d46d04..b85fdbb 100644
--- a/test/CodeGen/AArch64/addsub.ll
+++ b/test/CodeGen/AArch64/addsub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu | FileCheck %s
; Note that this should be refactored (for efficiency if nothing else)
; when the PCS is implemented so we don't have to worry about the
@@ -28,12 +28,12 @@ define void @add_small() {
define void @add_med() {
; CHECK-LABEL: add_med:
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{#3567, lsl #12|#14610432}}
%val32 = load i32* @var_i32
%newval32 = add i32 %val32, 14610432 ; =0xdef000
store i32 %newval32, i32* @var_i32
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #4095, lsl #12
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{#4095, lsl #12|#16773120}}
%val64 = load i64* @var_i64
%newval64 = add i64 %val64, 16773120 ; =0xfff000
store i64 %newval64, i64* @var_i64
@@ -62,12 +62,12 @@ define void @sub_small() {
define void @sub_med() {
; CHECK-LABEL: sub_med:
-; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{#3567, lsl #12|#14610432}}
%val32 = load i32* @var_i32
%newval32 = sub i32 %val32, 14610432 ; =0xdef000
store i32 %newval32, i32* @var_i32
-; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, #4095, lsl #12
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{#4095, lsl #12|#16773120}}
%val64 = load i64* @var_i64
%newval64 = sub i64 %val64, 16773120 ; =0xfff000
store i64 %newval64, i64* @var_i64
@@ -80,13 +80,13 @@ define void @testing() {
%val = load i32* @var_i32
; CHECK: cmp {{w[0-9]+}}, #4095
-; CHECK: b.ne .LBB4_6
+; CHECK: b.ne [[RET:.?LBB[0-9]+_[0-9]+]]
%cmp_pos_small = icmp ne i32 %val, 4095
br i1 %cmp_pos_small, label %ret, label %test2
test2:
-; CHECK: cmp {{w[0-9]+}}, #3567, lsl #12
-; CHECK: b.lo .LBB4_6
+; CHECK: cmp {{w[0-9]+}}, {{#3567, lsl #12|#14610432}}
+; CHECK: b.lo [[RET]]
%newval2 = add i32 %val, 1
store i32 %newval2, i32* @var_i32
%cmp_pos_big = icmp ult i32 %val, 14610432
@@ -94,7 +94,7 @@ test2:
test3:
; CHECK: cmp {{w[0-9]+}}, #123
-; CHECK: b.lt .LBB4_6
+; CHECK: b.lt [[RET]]
%newval3 = add i32 %val, 2
store i32 %newval3, i32* @var_i32
%cmp_pos_slt = icmp slt i32 %val, 123
@@ -102,7 +102,7 @@ test3:
test4:
; CHECK: cmp {{w[0-9]+}}, #321
-; CHECK: b.gt .LBB4_6
+; CHECK: b.gt [[RET]]
%newval4 = add i32 %val, 3
store i32 %newval4, i32* @var_i32
%cmp_pos_sgt = icmp sgt i32 %val, 321
@@ -110,7 +110,7 @@ test4:
test5:
; CHECK: cmn {{w[0-9]+}}, #444
-; CHECK: b.gt .LBB4_6
+; CHECK: b.gt [[RET]]
%newval5 = add i32 %val, 4
store i32 %newval5, i32* @var_i32
%cmp_neg_uge = icmp sgt i32 %val, -444
diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll
index f0e11c6..a2266b1 100644
--- a/test/CodeGen/AArch64/addsub_ext.ll
+++ b/test/CodeGen/AArch64/addsub_ext.ll
@@ -1,11 +1,11 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s
@var8 = global i8 0
@var16 = global i16 0
@var32 = global i32 0
@var64 = global i64 0
-define void @addsub_i8rhs() {
+define void @addsub_i8rhs() minsize {
; CHECK-LABEL: addsub_i8rhs:
%val8_tmp = load i8* @var8
%lhs32 = load i32* @var32
@@ -80,7 +80,7 @@ end:
ret void
}
-define void @addsub_i16rhs() {
+define void @addsub_i16rhs() minsize {
; CHECK-LABEL: addsub_i16rhs:
%val16_tmp = load i16* @var16
%lhs32 = load i32* @var32
@@ -158,7 +158,7 @@ end:
; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for
; example), but the remaining instructions are probably not idiomatic
; in the face of "add/sub (shifted register)" so I don't intend to.
-define void @addsub_i32rhs() {
+define void @addsub_i32rhs() minsize {
; CHECK-LABEL: addsub_i32rhs:
%val32_tmp = load i32* @var32
%lhs64 = load i64* @var64
diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll
index 1d3c0a0..f93efbc 100644
--- a/test/CodeGen/AArch64/alloca.ll
+++ b/test/CodeGen/AArch64/alloca.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s
declare void @use_addr(i8*)
@@ -8,23 +8,22 @@ define void @test_simple_alloca(i64 %n) {
%buf = alloca i8, i64 %n
; Make sure we align the stack change to 16 bytes:
-; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15
-; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0
+; CHECK: {{mov|add}} x29
+; CHECK: mov [[TMP:x[0-9]+]], sp
+; CHECK: add [[SPDELTA_TMP:x[0-9]+]], x0, #15
+; CHECK: and [[SPDELTA:x[0-9]+]], [[SPDELTA_TMP]], #0xfffffffffffffff0
; Make sure we change SP. It would be surprising if anything but x0 were used
; for the final sp, but it could be if it was then moved into x0.
-; CHECK-DAG: mov [[TMP:x[0-9]+]], sp
-; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]]
-; CHECK: mov sp, x0
+; CHECK: sub [[NEWSP:x[0-9]+]], [[TMP]], [[SPDELTA]]
+; CHECK: mov sp, [[NEWSP]]
call void @use_addr(i8* %buf)
; CHECK: bl use_addr
ret void
; Make sure epilogue restores sp from fp
-; CHECK: sub sp, x29, #16
-; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: add sp, sp, #32
+; CHECK: {{sub|mov}} sp, x29
; CHECK: ret
}
@@ -32,57 +31,70 @@ declare void @use_addr_loc(i8*, i64*)
define i64 @test_alloca_with_local(i64 %n) {
; CHECK-LABEL: test_alloca_with_local:
-; CHECK: sub sp, sp, #32
-; CHECK: stp x29, x30, [sp, #16]
+; CHECK-DAG: sub sp, sp, [[LOCAL_STACK:#[0-9]+]]
+; CHECK-DAG: {{mov|add}} x29, sp
%loc = alloca i64
%buf = alloca i8, i64 %n
; Make sure we align the stack change to 16 bytes:
-; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15
-; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0
+; CHECK: mov [[TMP:x[0-9]+]], sp
+; CHECK: add [[SPDELTA_TMP:x[0-9]+]], x0, #15
+; CHECK: and [[SPDELTA:x[0-9]+]], [[SPDELTA_TMP]], #0xfffffffffffffff0
; Make sure we change SP. It would be surprising if anything but x0 were used
; for the final sp, but it could be if it was then moved into x0.
-; CHECK-DAG: mov [[TMP:x[0-9]+]], sp
-; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]]
-; CHECK: mov sp, x0
+; CHECK: sub [[NEWSP:x[0-9]+]], [[TMP]], [[SPDELTA]]
+; CHECK: mov sp, [[NEWSP]]
- ; Obviously suboptimal code here, but it to get &local in x1
-; CHECK: sub [[TMP:x[0-9]+]], x29, [[LOC_FROM_FP:#[0-9]+]]
-; CHECK: add x1, [[TMP]], #0
+; CHECK: sub {{x[0-9]+}}, x29, #[[LOC_FROM_FP:[0-9]+]]
call void @use_addr_loc(i8* %buf, i64* %loc)
; CHECK: bl use_addr
%val = load i64* %loc
-; CHECK: sub x[[TMP:[0-9]+]], x29, [[LOC_FROM_FP]]
-; CHECK: ldr x0, [x[[TMP]]]
+
+; CHECK: ldur x0, [x29, #-[[LOC_FROM_FP]]]
ret i64 %val
; Make sure epilogue restores sp from fp
-; CHECK: sub sp, x29, #16
-; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: add sp, sp, #32
+; CHECK: {{sub|mov}} sp, x29
; CHECK: ret
}
define void @test_variadic_alloca(i64 %n, ...) {
-; CHECK: test_variadic_alloca:
-
-; CHECK: sub sp, sp, #208
-; CHECK: stp x29, x30, [sp, #192]
-; CHECK: add x29, sp, #192
-; CHECK: sub [[TMP:x[0-9]+]], x29, #192
-; CHECK: add x8, [[TMP]], #0
-; CHECK-FP: str q7, [x8, #112]
+; CHECK-LABEL: test_variadic_alloca:
+
; [...]
-; CHECK-FP: str q1, [x8, #16]
-; CHECK-NOFP: sub sp, sp, #80
-; CHECK-NOFP: stp x29, x30, [sp, #64]
-; CHECK-NOFP: add x29, sp, #64
-; CHECK-NOFP: sub [[TMP:x[0-9]+]], x29, #64
-; CHECK-NOFP: add x8, [[TMP]], #0
+
+; CHECK-NOFP-AARCH64: sub sp, sp, #80
+; CHECK-NOFP-AARCH64: stp x29, x30, [sp, #64]
+; CHECK-NOFP-AARCH64: add x29, sp, #64
+; CHECK-NOFP-AARCH64: sub [[TMP:x[0-9]+]], x29, #64
+; CHECK-NOFP-AARCH64: add x8, [[TMP]], #0
+
+
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #192
+; CHECK: stp q6, q7, [x29, #-96]
+; [...]
+; CHECK: stp q0, q1, [x29, #-192]
+
+; CHECK: stp x6, x7, [x29, #-16]
+; [...]
+; CHECK: stp x2, x3, [x29, #-48]
+
+; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-NOFP-ARM64: mov x29, sp
+; CHECK-NOFP-ARM64: sub sp, sp, #64
+; CHECK-NOFP-ARM64: stp x6, x7, [x29, #-16]
+; [...]
+; CHECK-NOFP-ARM64: stp x4, x5, [x29, #-32]
+; [...]
+; CHECK-NOFP-ARM64: stp x2, x3, [x29, #-48]
+; [...]
+; CHECK-NOFP-ARM64: mov x8, sp
%addr = alloca i8, i64 %n
@@ -90,23 +102,24 @@ define void @test_variadic_alloca(i64 %n, ...) {
; CHECK: bl use_addr
ret void
-; CHECK: sub sp, x29, #192
-; CHECK: ldp x29, x30, [sp, #192]
-; CHECK: add sp, sp, #208
-; CHECK-NOFP: sub sp, x29, #64
-; CHECK-NOFP: ldp x29, x30, [sp, #64]
-; CHECK-NOFP: add sp, sp, #80
+; CHECK-NOFP-AARCH64: sub sp, x29, #64
+; CHECK-NOFP-AARCH64: ldp x29, x30, [sp, #64]
+; CHECK-NOFP-AARCH64: add sp, sp, #80
+
+; CHECK-NOFP-ARM64: mov sp, x29
+; CHECK-NOFP-ARM64: ldp x29, x30, [sp], #16
}
define void @test_alloca_large_frame(i64 %n) {
; CHECK-LABEL: test_alloca_large_frame:
-; CHECK: sub sp, sp, #496
-; CHECK: stp x29, x30, [sp, #480]
-; CHECK: add x29, sp, #480
-; CHECK: sub sp, sp, #48
-; CHECK: sub sp, sp, #1953, lsl #12
+
+; CHECK: stp x20, x19, [sp, #-32]!
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; CHECK: sub sp, sp, #1953, lsl #12
+; CHECK: sub sp, sp, #512
%addr1 = alloca i8, i64 %n
%addr2 = alloca i64, i64 1000000
@@ -114,9 +127,10 @@ define void @test_alloca_large_frame(i64 %n) {
call void @use_addr_loc(i8* %addr1, i64* %addr2)
ret void
-; CHECK: sub sp, x29, #480
-; CHECK: ldp x29, x30, [sp, #480]
-; CHECK: add sp, sp, #496
+
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
}
declare i8* @llvm.stacksave()
@@ -124,7 +138,6 @@ declare void @llvm.stackrestore(i8*)
define void @test_scoped_alloca(i64 %n) {
; CHECK-LABEL: test_scoped_alloca:
-; CHECK: sub sp, sp, #32
%sp = call i8* @llvm.stacksave()
; CHECK: mov [[SAVED_SP:x[0-9]+]], sp
diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll
index 36bc2e0..6616b27 100644
--- a/test/CodeGen/AArch64/analyze-branch.ll
+++ b/test/CodeGen/AArch64/analyze-branch.ll
@@ -168,7 +168,7 @@ define void @test_TBZ_fallthrough_nottaken(i64 %in) nounwind {
%tst = icmp eq i64 %bit, 0
br i1 %tst, label %true, label %false, !prof !1
-; CHECK: tbz {{x[0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK: tbz {{[wx][0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: // BB#
; CHECK-NEXT: bl test_false
@@ -213,7 +213,7 @@ define void @test_TBNZ_fallthrough_nottaken(i64 %in) nounwind {
%tst = icmp ne i64 %bit, 0
br i1 %tst, label %true, label %false, !prof !1
-; CHECK: tbnz {{x[0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK: tbnz {{[wx][0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: // BB#
; CHECK-NEXT: bl test_false
diff --git a/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll
index 6fb7c3f..6fb7c3f 100644
--- a/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll
diff --git a/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
index 2b083d8..2b083d8 100644
--- a/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
diff --git a/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll
index 6f0ec34..6f0ec34 100644
--- a/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll
diff --git a/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll
index 88232fc..88232fc 100644
--- a/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll
+++ b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll
diff --git a/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
index ea1cd02..8f99bc3 100644
--- a/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll
+++ b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
@@ -14,7 +14,7 @@ for.body:
; CHECK: for.body
; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
; CHECK: add x[[REG:[0-9]+]],
-; CHECK: x[[REG]], #4096
+; CHECK: x[[REG]], #1, lsl #12
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%0 = shl nsw i64 %indvars.iv, 12
%add = add nsw i64 %0, 34628173824
diff --git a/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll
index d47dbb2..d47dbb2 100644
--- a/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll
+++ b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll
diff --git a/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll
index a4d37e4..a4d37e4 100644
--- a/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll
+++ b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll
diff --git a/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
index d59b0d0..d59b0d0 100644
--- a/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll
+++ b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
diff --git a/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
index d1840d3..d1840d3 100644
--- a/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll
+++ b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
diff --git a/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
index 4b037db..4b037db 100644
--- a/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll
+++ b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
diff --git a/test/CodeGen/ARM64/2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
index dda4ff5..168e921 100644
--- a/test/CodeGen/ARM64/2012-06-06-FPToUI.ll
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
@@ -7,8 +7,9 @@
@.str3 = private unnamed_addr constant [7 x i8] c"%f %u\0A\00", align 1
define void @testDouble(double %d) ssp {
-; CHECK: fcvtzu x{{.}}, d{{.}}
-; CHECK: fcvtzu w{{.}}, d{{.}}
+; CHECK-LABEL: testDouble:
+; CHECK: fcvtzu x{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}}
entry:
%d.addr = alloca double, align 8
store double %d, double* %d.addr, align 8
@@ -26,8 +27,9 @@ entry:
declare i32 @printf(i8*, ...)
define void @testFloat(float %f) ssp {
-; CHECK: fcvtzu x{{.}}, s{{.}}
-; CHECK: fcvtzu w{{.}}, s{{.}}
+; CHECK-LABEL: testFloat:
+; CHECK: fcvtzu x{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: fcvtzu w{{[0-9]+}}, s{{[0-9]+}}
entry:
%f.addr = alloca float, align 4
store float %f, float* %f.addr, align 4
diff --git a/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll
index 55ecfb5..55ecfb5 100644
--- a/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll
+++ b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll
diff --git a/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll
index b40a581..e2c43d9 100644
--- a/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll
+++ b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
target triple = "arm64-apple-ios7.0.0"
diff --git a/test/CodeGen/ARM64/2013-01-23-frem-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll
index 9451124..9451124 100644
--- a/test/CodeGen/ARM64/2013-01-23-frem-crash.ll
+++ b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll
diff --git a/test/CodeGen/ARM64/2013-01-23-sext-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll
index 404027b..404027b 100644
--- a/test/CodeGen/ARM64/2013-01-23-sext-crash.ll
+++ b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll
diff --git a/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
index 70e745f..a350ba1 100644
--- a/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll
+++ b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple
;CHECK-LABEL: Shuff:
;CHECK: tbl.8b
diff --git a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
new file mode 100644
index 0000000..a73b707
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=arm64
+
+; This test case tests an infinite loop bug in the DAG combiner.
+; It just tries to perform the following replacements endlessly:
+; (1) Replacing.3 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4]
+; With: 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4]
+;
+; (2) Replacing.2 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4]
+; With: 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4]
+; Since the (2) optimization from SIGN_EXTEND to ANY_EXTEND merely
+; replaces unused bits with undefined bits, we remove the (1)
+; optimization (it doesn't make sense to replace undefined bits
+; with signed bits).
+
+define <4 x i32> @infiniteLoop(<4 x i32> %in0, <4 x i16> %in1) {
+entry:
+ %cmp.i = icmp sge <4 x i16> %in1, <i16 32767, i16 32767, i16 -1, i16 -32768>
+ %sext.i = sext <4 x i1> %cmp.i to <4 x i32>
+ %mul.i = mul <4 x i32> %in0, %sext.i
+ %sext = shl <4 x i32> %mul.i, <i32 16, i32 16, i32 16, i32 16>
+ %vmovl.i.i = ashr <4 x i32> %sext, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %vmovl.i.i
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
new file mode 100644
index 0000000..3949b85
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -verify-machineinstrs -march=arm64 | FileCheck %s
+
+; Check if sqshl/uqshl with a constant shift amount can be selected.
+define i64 @test_vqshld_s64_i(i64 %a) {
+; CHECK-LABEL: test_vqshld_s64_i:
+; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
+ %1 = tail call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 36)
+ ret i64 %1
+}
+
+define i64 @test_vqshld_u64_i(i64 %a) {
+; CHECK-LABEL: test_vqshld_u64_i:
+; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
+ %1 = tail call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 36)
+ ret i64 %1
+}
+
+declare i64 @llvm.aarch64.neon.uqshl.i64(i64, i64)
+declare i64 @llvm.aarch64.neon.sqshl.i64(i64, i64)
diff --git a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
new file mode 100644
index 0000000..1b2d543
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; The following test cases test shufflevector with a leading UNDEF mask.
+define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) {
+;CHECK-LABEL: test_vext_undef_traverse:
+;CHECK: {{ext.16b.*v0, #4}}
+ %vext = shufflevector <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0>, <8 x i16> %in, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9>
+ ret <8 x i16> %vext
+}
+
+define <8 x i16> @test_vext_undef_traverse2(<8 x i16> %in) {
+;CHECK-LABEL: test_vext_undef_traverse2:
+;CHECK: {{ext.16b.*v0, #6}}
+ %vext = shufflevector <8 x i16> %in, <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2>
+ ret <8 x i16> %vext
+}
+
+define <8 x i8> @test_vext_undef_traverse3(<8 x i8> %in) {
+;CHECK-LABEL: test_vext_undef_traverse3:
+;CHECK: {{ext.8b.*v0, #6}}
+ %vext = shufflevector <8 x i8> %in, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5>
+ ret <8 x i8> %vext
+}
diff --git a/test/CodeGen/ARM64/AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 6397ac5..c4597d5 100644
--- a/test/CodeGen/ARM64/AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -1,10 +1,15 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -arm64-simd-scalar=true -asm-verbose=false | FileCheck %s
-;
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
+
define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
+; GENERIC-LABEL: bar:
+; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
+; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
%add = add <2 x i64> %a, %b
%vgetq_lane = extractelement <2 x i64> %add, i32 0
%vgetq_lane2 = extractelement <2 x i64> %b, i32 0
@@ -19,6 +24,9 @@ define double @subdd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: subdd_su64:
; CHECK: sub d0, d1, d0
; CHECK-NEXT: ret
+; GENERIC-LABEL: subdd_su64:
+; GENERIC: sub d0, d1, d0
+; GENERIC-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 0
%vecext1 = extractelement <2 x i64> %b, i32 0
%sub.i = sub nsw i64 %vecext1, %vecext
@@ -30,9 +38,30 @@ define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: vaddd_su64:
; CHECK: add d0, d1, d0
; CHECK-NEXT: ret
+; GENERIC-LABEL: vaddd_su64:
+; GENERIC: add d0, d1, d0
+; GENERIC-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 0
%vecext1 = extractelement <2 x i64> %b, i32 0
%add.i = add nsw i64 %vecext1, %vecext
%retval = bitcast i64 %add.i to double
ret double %retval
}
+
+; sub MI doesn't access dsub register.
+define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: add_sub_su64:
+; CHECK: add d0, d1, d0
+; CHECK: sub d0, {{d[0-9]+}}, d0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: add_sub_su64:
+; GENERIC: add d0, d1, d0
+; GENERIC: sub d0, {{d[0-9]+}}, d0
+; GENERIC-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %add.i = add i64 %vecext1, %vecext
+ %sub.i = sub i64 0, %add.i
+ %retval = bitcast i64 %sub.i to double
+ ret double %retval
+}
diff --git a/test/CodeGen/ARM64/aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index 27d2aa7..b713f0d 100644
--- a/test/CodeGen/ARM64/aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -21,7 +21,7 @@ define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
%ext_bool = zext i1 %bool to i64
store volatile i64 %ext_bool, i64* @var64, align 8
-; CHECK: ldr w[[EXT:[0-9]+]], [sp]
+; CHECK: ldrb w[[EXT:[0-9]+]], [sp]
; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1
; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64]
@@ -37,7 +37,7 @@ define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
%ext_int = zext i32 %int to i64
store volatile i64 %ext_int, i64* @var64, align 8
-; CHECK: ldr w[[EXT:[0-9]+]], [sp, #24]
+; CHECK: ldr{{b?}} w[[EXT:[0-9]+]], [sp, #24]
; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
store volatile i64 %long, i64* @var64, align 8
@@ -57,7 +57,7 @@ define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) {
%ext_char = sext i8 %char to i64
store volatile i64 %ext_char, i64* @var64
-; CHECK: sxtb [[EXT:x[0-9]+]], x1
+; CHECK: sxtb [[EXT:x[0-9]+]], w1
; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
%ext_short = zext i16 %short to i64
@@ -67,7 +67,7 @@ define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) {
%ext_int = zext i32 %int to i64
store volatile i64 %ext_int, i64* @var64
-; CHECK: uxtw [[EXT:x[0-9]+]], x3
+; CHECK: ubfx [[EXT:x[0-9]+]], x3, #0, #32
; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
ret void
@@ -80,7 +80,24 @@ declare void @variadic(i32 %a, ...)
define void @test_variadic() {
call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0)
; CHECK: fmov d0, #2.0
-; CHECK: orr x1, xzr, #0x1
+; CHECK: orr w1, wzr, #0x1
; CHECK: bl variadic
ret void
}
+
+; We weren't marking x7 as used after deciding that the i128 didn't fit into
+; registers and putting the first half on the stack, so the *second* half went
+; into x7. Yuck!
+define i128 @test_i128_shadow([7 x i64] %x0_x6, i128 %sp) {
+; CHECK-LABEL: test_i128_shadow:
+; CHECK: ldp x0, x1, [sp]
+
+ ret i128 %sp
+}
+
+; This test checks that fp128 is correctly handled on the stack.
+define fp128 @test_fp128([8 x float] %arg0, fp128 %arg1) {
+; CHECK-LABEL: test_fp128:
+; CHECK: ldr {{q[0-9]+}}, [sp]
+ ret fp128 %arg1
+}
diff --git a/test/CodeGen/ARM64/abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
index 92db392..92db392 100644
--- a/test/CodeGen/ARM64/abi-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
diff --git a/test/CodeGen/ARM64/abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index a7693b6..e2de434 100644
--- a/test/CodeGen/ARM64/abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -77,6 +77,7 @@ entry:
; CHECK: fixed_4i
; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
; FAST: fixed_4i
+; FAST: sub sp, sp, #64
; FAST: mov x[[ADDR:[0-9]+]], sp
; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
%0 = load <4 x i32>* %in, align 16
@@ -130,6 +131,7 @@ entry:
; CHECK: test3
; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
; FAST: test3
+; FAST: sub sp, sp, #32
; FAST: mov x[[ADDR:[0-9]+]], sp
; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
%0 = load <2 x i32>* %in, align 8
diff --git a/test/CodeGen/ARM64/abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
index 61c661e..44c5a07 100644
--- a/test/CodeGen/ARM64/abi_align.ll
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -74,7 +74,7 @@ define i32 @caller38_stack() #1 {
entry:
; CHECK: caller38_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
-; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
%0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
%1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
@@ -128,7 +128,7 @@ entry:
; CHECK: caller39_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
%0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
%1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
@@ -184,7 +184,7 @@ entry:
; CHECK: caller40_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
-; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
%0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
%1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4
@@ -238,7 +238,7 @@ entry:
; CHECK: caller41_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
%0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
%1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16
@@ -294,7 +294,7 @@ entry:
; FAST: sub sp, sp, #96
; Space for s1 is allocated at fp-24 = sp+72
; Space for s2 is allocated at sp+48
-; FAST: sub x[[A:[0-9]+]], fp, #24
+; FAST: sub x[[A:[0-9]+]], x29, #24
; FAST: add x[[A:[0-9]+]], sp, #48
; Call memcpy with size = 24 (0x18)
; FAST: orr {{x[0-9]+}}, xzr, #0x18
@@ -317,27 +317,27 @@ declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
define i32 @caller42_stack() #3 {
entry:
; CHECK: caller42_stack
-; CHECK: mov fp, sp
+; CHECK: mov x29, sp
; CHECK: sub sp, sp, #96
-; CHECK: stur {{x[0-9]+}}, [fp, #-16]
-; CHECK: stur {{q[0-9]+}}, [fp, #-32]
+; CHECK: stur {{x[0-9]+}}, [x29, #-16]
+; CHECK: stur {{q[0-9]+}}, [x29, #-32]
; CHECK: str {{x[0-9]+}}, [sp, #48]
; CHECK: str {{q[0-9]+}}, [sp, #32]
-; Space for s1 is allocated at fp-32 = sp+64
+; Space for s1 is allocated at x29-32 = sp+64
; Space for s2 is allocated at sp+32
; CHECK: add x[[B:[0-9]+]], sp, #32
; CHECK: str x[[B]], [sp, #16]
-; CHECK: sub x[[A:[0-9]+]], fp, #32
+; CHECK: sub x[[A:[0-9]+]], x29, #32
; Address of s1 is passed on stack at sp+8
; CHECK: str x[[A]], [sp, #8]
-; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
; FAST: caller42_stack
; Space for s1 is allocated at fp-24
; Space for s2 is allocated at fp-48
-; FAST: sub x[[A:[0-9]+]], fp, #24
-; FAST: sub x[[B:[0-9]+]], fp, #48
+; FAST: sub x[[A:[0-9]+]], x29, #24
+; FAST: sub x[[B:[0-9]+]], x29, #48
; Call memcpy with size = 24 (0x18)
; FAST: orr {{x[0-9]+}}, xzr, #0x18
; FAST: str {{w[0-9]+}}, [sp]
@@ -399,7 +399,7 @@ entry:
; Space for s2 is allocated at sp
; FAST: caller43
-; FAST: mov fp, sp
+; FAST: mov x29, sp
; Space for s1 is allocated at sp+32
; Space for s2 is allocated at sp
; FAST: add x1, sp, #32
@@ -429,32 +429,32 @@ declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
define i32 @caller43_stack() #3 {
entry:
; CHECK: caller43_stack
-; CHECK: mov fp, sp
+; CHECK: mov x29, sp
; CHECK: sub sp, sp, #96
-; CHECK: stur {{q[0-9]+}}, [fp, #-16]
-; CHECK: stur {{q[0-9]+}}, [fp, #-32]
+; CHECK: stur {{q[0-9]+}}, [x29, #-16]
+; CHECK: stur {{q[0-9]+}}, [x29, #-32]
; CHECK: str {{q[0-9]+}}, [sp, #48]
; CHECK: str {{q[0-9]+}}, [sp, #32]
-; Space for s1 is allocated at fp-32 = sp+64
+; Space for s1 is allocated at x29-32 = sp+64
; Space for s2 is allocated at sp+32
; CHECK: add x[[B:[0-9]+]], sp, #32
; CHECK: str x[[B]], [sp, #16]
-; CHECK: sub x[[A:[0-9]+]], fp, #32
+; CHECK: sub x[[A:[0-9]+]], x29, #32
; Address of s1 is passed on stack at sp+8
; CHECK: str x[[A]], [sp, #8]
-; CHECK: movz w[[C:[0-9]+]], #9
+; CHECK: movz w[[C:[0-9]+]], #0x9
; CHECK: str w[[C]], [sp]
; FAST: caller43_stack
; FAST: sub sp, sp, #96
; Space for s1 is allocated at fp-32 = sp+64
; Space for s2 is allocated at sp+32
-; FAST: sub x[[A:[0-9]+]], fp, #32
+; FAST: sub x[[A:[0-9]+]], x29, #32
; FAST: add x[[B:[0-9]+]], sp, #32
-; FAST: stur {{x[0-9]+}}, [fp, #-32]
-; FAST: stur {{x[0-9]+}}, [fp, #-24]
-; FAST: stur {{x[0-9]+}}, [fp, #-16]
-; FAST: stur {{x[0-9]+}}, [fp, #-8]
+; FAST: stur {{x[0-9]+}}, [x29, #-32]
+; FAST: stur {{x[0-9]+}}, [x29, #-24]
+; FAST: stur {{x[0-9]+}}, [x29, #-16]
+; FAST: stur {{x[0-9]+}}, [x29, #-8]
; FAST: str {{x[0-9]+}}, [sp, #32]
; FAST: str {{x[0-9]+}}, [sp, #40]
; FAST: str {{x[0-9]+}}, [sp, #48]
@@ -487,9 +487,12 @@ entry:
; CHECK: str {{w[0-9]+}}, [sp, #16]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
; FAST: i128_split
+; FAST: sub sp, sp, #48
; FAST: mov x[[ADDR:[0-9]+]], sp
; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16]
-; FAST: stp {{x[0-9]+}}, {{x[0-9]+}}, [x[[ADDR]]]
+; Load/Store opt is disabled with -O0, so the i128 is split.
+; FAST: str {{x[0-9]+}}, [x[[ADDR]], #8]
+; FAST: str {{x[0-9]+}}, [x[[ADDR]]]
%0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
%call = tail call i32 @callee_i128_split(i32 1, i32 2, i32 3, i32 4, i32 5,
i32 6, i32 7, i128 %0, i32 8) #5
diff --git a/test/CodeGen/ARM64/addp.ll b/test/CodeGen/AArch64/arm64-addp.ll
index 8283a00..3f1e5c5 100644
--- a/test/CodeGen/ARM64/addp.ll
+++ b/test/CodeGen/AArch64/arm64-addp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
define double @foo(<2 x double> %a) nounwind {
; CHECK-LABEL: foo:
diff --git a/test/CodeGen/ARM64/addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
index dff2331..08fb8c9 100644
--- a/test/CodeGen/ARM64/addr-mode-folding.ll
+++ b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -6,7 +6,7 @@
define i32 @fct(i32 %i1, i32 %i2) {
; CHECK: @fct
; Sign extension is used more than once, thus it should not be folded.
-; CodeGenPrepare is not sharing sext accross uses, thus this is folded because
+; CodeGenPrepare is not sharing sext across uses, thus this is folded because
; of that.
; _CHECK-NOT_: , sxtw]
entry:
diff --git a/test/CodeGen/ARM64/addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
index 0677603..1a3ca8b 100644
--- a/test/CodeGen/ARM64/addr-type-promotion.ll
+++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
@@ -11,8 +11,8 @@ define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE
; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF]
; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]]
-; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], x0, sxtw]
-; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], x1, sxtw]
+; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], w0, sxtw]
+; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw]
; CHECK-NEXT cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
; CHECK-NEXT b.ne
; Next BB
diff --git a/test/CodeGen/ARM64/addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll
index e131237..700fba8 100644
--- a/test/CodeGen/ARM64/addrmode.ll
+++ b/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -37,7 +37,7 @@ define void @t3() {
; base + unsigned offset (> imm12 * size of type in bytes)
; CHECK: @t4
-; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #32768
+; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #8, lsl #12
; CHECK: ldr xzr, [
; CHECK: [[ADDREG]]]
; CHECK: ret
@@ -60,7 +60,7 @@ define void @t5(i64 %a) {
; base + reg + imm
; CHECK: @t6
; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3
-; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #32768
+; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #8, lsl #12
; CHECK: ldr xzr, [
; CHECK: [[ADDREG]]]
; CHECK: ret
diff --git a/test/CodeGen/ARM64/alloc-no-stack-realign.ll b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
index f396bc9..f396bc9 100644
--- a/test/CodeGen/ARM64/alloc-no-stack-realign.ll
+++ b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
diff --git a/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
index 3750f31..3750f31 100644
--- a/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll
+++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
diff --git a/test/CodeGen/ARM64/andCmpBrToTBZ.ll b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll
index 4194977..4194977 100644
--- a/test/CodeGen/ARM64/andCmpBrToTBZ.ll
+++ b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll
diff --git a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
new file mode 100644
index 0000000..34d6287
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
@@ -0,0 +1,31 @@
+; RUN: llc %s -o - | FileCheck %s
+; Check that ANDS (tst) is not merged with ADD when the immediate
+; is not 0.
+; <rdar://problem/16693089>
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+; CHECK-LABEL: tst1:
+; CHECK: add [[REG:w[0-9]+]], w{{[0-9]+}}, #1
+; CHECK: tst [[REG]], #0x1
+define void @tst1() {
+entry:
+ br i1 undef, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %result.09 = phi i32 [ %add2.result.0, %for.body ], [ 1, %entry ]
+ %i.08 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
+ %and = and i32 %i.08, 1
+ %cmp1 = icmp eq i32 %and, 0
+ %add2.result.0 = select i1 %cmp1, i32 undef, i32 %result.09
+ %inc = add nsw i32 %i.08, 1
+ %cmp = icmp slt i32 %i.08, undef
+ br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %add2.result.0.lcssa = phi i32 [ %add2.result.0, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ ret void
+}
diff --git a/test/CodeGen/ARM64/anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
index 241cf97..241cf97 100644
--- a/test/CodeGen/ARM64/anyregcc-crash.ll
+++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
diff --git a/test/CodeGen/ARM64/anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll
index e26875d..e26875d 100644
--- a/test/CodeGen/ARM64/anyregcc.ll
+++ b/test/CodeGen/AArch64/arm64-anyregcc.ll
diff --git a/test/CodeGen/ARM64/arith-saturating.ll b/test/CodeGen/AArch64/arm64-arith-saturating.ll
index 437ebb8..78cd1fc 100644
--- a/test/CodeGen/ARM64/arith-saturating.ll
+++ b/test/CodeGen/AArch64/arm64-arith-saturating.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: qadds:
; CHECK: sqadd s0, s0, s1
%vecext = extractelement <4 x i32> %b, i32 0
%vecext1 = extractelement <4 x i32> %c, i32 0
- %vqadd.i = tail call i32 @llvm.arm64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+ %vqadd.i = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind
ret i32 %vqadd.i
}
@@ -14,7 +14,7 @@ define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK: sqadd d0, d0, d1
%vecext = extractelement <2 x i64> %b, i32 0
%vecext1 = extractelement <2 x i64> %c, i32 0
- %vqadd.i = tail call i64 @llvm.arm64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+ %vqadd.i = tail call i64 @llvm.aarch64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind
ret i64 %vqadd.i
}
@@ -23,7 +23,7 @@ define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK: uqadd s0, s0, s1
%vecext = extractelement <4 x i32> %b, i32 0
%vecext1 = extractelement <4 x i32> %c, i32 0
- %vqadd.i = tail call i32 @llvm.arm64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+ %vqadd.i = tail call i32 @llvm.aarch64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind
ret i32 %vqadd.i
}
@@ -32,21 +32,21 @@ define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK: uqadd d0, d0, d1
%vecext = extractelement <2 x i64> %b, i32 0
%vecext1 = extractelement <2 x i64> %c, i32 0
- %vqadd.i = tail call i64 @llvm.arm64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+ %vqadd.i = tail call i64 @llvm.aarch64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind
ret i64 %vqadd.i
}
-declare i64 @llvm.arm64.neon.uqadd.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.uqadd.i32(i32, i32) nounwind readnone
-declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.uqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.uqadd.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) nounwind readnone
define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: qsubs:
; CHECK: sqsub s0, s0, s1
%vecext = extractelement <4 x i32> %b, i32 0
%vecext1 = extractelement <4 x i32> %c, i32 0
- %vqsub.i = tail call i32 @llvm.arm64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+ %vqsub.i = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind
ret i32 %vqsub.i
}
@@ -55,7 +55,7 @@ define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK: sqsub d0, d0, d1
%vecext = extractelement <2 x i64> %b, i32 0
%vecext1 = extractelement <2 x i64> %c, i32 0
- %vqsub.i = tail call i64 @llvm.arm64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+ %vqsub.i = tail call i64 @llvm.aarch64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind
ret i64 %vqsub.i
}
@@ -64,7 +64,7 @@ define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK: uqsub s0, s0, s1
%vecext = extractelement <4 x i32> %b, i32 0
%vecext1 = extractelement <4 x i32> %c, i32 0
- %vqsub.i = tail call i32 @llvm.arm64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+ %vqsub.i = tail call i32 @llvm.aarch64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind
ret i32 %vqsub.i
}
@@ -73,21 +73,21 @@ define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK: uqsub d0, d0, d1
%vecext = extractelement <2 x i64> %b, i32 0
%vecext1 = extractelement <2 x i64> %c, i32 0
- %vqsub.i = tail call i64 @llvm.arm64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+ %vqsub.i = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind
ret i64 %vqsub.i
}
-declare i64 @llvm.arm64.neon.uqsub.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.uqsub.i32(i32, i32) nounwind readnone
-declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.uqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.uqsub.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) nounwind readnone
define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
; CHECK-LABEL: qabss:
; CHECK: sqabs s0, s0
; CHECK: ret
%vecext = extractelement <4 x i32> %b, i32 0
- %vqabs.i = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %vecext) nounwind
+ %vqabs.i = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %vecext) nounwind
ret i32 %vqabs.i
}
@@ -96,7 +96,7 @@ define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
; CHECK: sqabs d0, d0
; CHECK: ret
%vecext = extractelement <2 x i64> %b, i32 0
- %vqabs.i = tail call i64 @llvm.arm64.neon.sqabs.i64(i64 %vecext) nounwind
+ %vqabs.i = tail call i64 @llvm.aarch64.neon.sqabs.i64(i64 %vecext) nounwind
ret i64 %vqabs.i
}
@@ -105,7 +105,7 @@ define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
; CHECK: sqneg s0, s0
; CHECK: ret
%vecext = extractelement <4 x i32> %b, i32 0
- %vqneg.i = tail call i32 @llvm.arm64.neon.sqneg.i32(i32 %vecext) nounwind
+ %vqneg.i = tail call i32 @llvm.aarch64.neon.sqneg.i32(i32 %vecext) nounwind
ret i32 %vqneg.i
}
@@ -114,21 +114,21 @@ define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
; CHECK: sqneg d0, d0
; CHECK: ret
%vecext = extractelement <2 x i64> %b, i32 0
- %vqneg.i = tail call i64 @llvm.arm64.neon.sqneg.i64(i64 %vecext) nounwind
+ %vqneg.i = tail call i64 @llvm.aarch64.neon.sqneg.i64(i64 %vecext) nounwind
ret i64 %vqneg.i
}
-declare i64 @llvm.arm64.neon.sqneg.i64(i64) nounwind readnone
-declare i32 @llvm.arm64.neon.sqneg.i32(i32) nounwind readnone
-declare i64 @llvm.arm64.neon.sqabs.i64(i64) nounwind readnone
-declare i32 @llvm.arm64.neon.sqabs.i32(i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqneg.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqneg.i32(i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqabs.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqabs.i32(i32) nounwind readnone
define i32 @vqmovund(<2 x i64> %b) nounwind readnone {
; CHECK-LABEL: vqmovund:
; CHECK: sqxtun s0, d0
%vecext = extractelement <2 x i64> %b, i32 0
- %vqmovun.i = tail call i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind
+ %vqmovun.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind
ret i32 %vqmovun.i
}
@@ -136,7 +136,7 @@ define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone {
; CHECK-LABEL: vqmovnd_s:
; CHECK: sqxtn s0, d0
%vecext = extractelement <2 x i64> %b, i32 0
- %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind
+ %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind
ret i32 %vqmovn.i
}
@@ -144,10 +144,10 @@ define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone {
; CHECK-LABEL: vqmovnd_u:
; CHECK: uqxtn s0, d0
%vecext = extractelement <2 x i64> %b, i32 0
- %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind
+ %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind
ret i32 %vqmovn.i
}
-declare i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone
-declare i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone
-declare i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone
diff --git a/test/CodeGen/ARM64/arith.ll b/test/CodeGen/AArch64/arm64-arith.ll
index b6ff0da..ed9b569 100644
--- a/test/CodeGen/ARM64/arith.ll
+++ b/test/CodeGen/AArch64/arm64-arith.ll
@@ -48,7 +48,7 @@ entry:
define i32 @t6(i32 %a, i32 %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: t6:
-; CHECK: lslv w0, w0, w1
+; CHECK: lsl w0, w0, w1
; CHECK: ret
%shl = shl i32 %a, %b
ret i32 %shl
@@ -57,7 +57,7 @@ entry:
define i64 @t7(i64 %a, i64 %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: t7:
-; CHECK: lslv x0, x0, x1
+; CHECK: lsl x0, x0, x1
; CHECK: ret
%shl = shl i64 %a, %b
ret i64 %shl
@@ -66,7 +66,7 @@ entry:
define i32 @t8(i32 %a, i32 %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: t8:
-; CHECK: lsrv w0, w0, w1
+; CHECK: lsr w0, w0, w1
; CHECK: ret
%lshr = lshr i32 %a, %b
ret i32 %lshr
@@ -75,7 +75,7 @@ entry:
define i64 @t9(i64 %a, i64 %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: t9:
-; CHECK: lsrv x0, x0, x1
+; CHECK: lsr x0, x0, x1
; CHECK: ret
%lshr = lshr i64 %a, %b
ret i64 %lshr
@@ -84,7 +84,7 @@ entry:
define i32 @t10(i32 %a, i32 %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: t10:
-; CHECK: asrv w0, w0, w1
+; CHECK: asr w0, w0, w1
; CHECK: ret
%ashr = ashr i32 %a, %b
ret i32 %ashr
@@ -93,7 +93,7 @@ entry:
define i64 @t11(i64 %a, i64 %b) nounwind readnone ssp {
entry:
; CHECK-LABEL: t11:
-; CHECK: asrv x0, x0, x1
+; CHECK: asr x0, x0, x1
; CHECK: ret
%ashr = ashr i64 %a, %b
ret i64 %ashr
@@ -155,8 +155,8 @@ entry:
define i64 @t17(i16 %a, i64 %x) nounwind ssp {
entry:
; CHECK-LABEL: t17:
-; CHECK: sxth [[REG:x[0-9]+]], x0
-; CHECK: sub x0, xzr, [[REG]], lsl #32
+; CHECK: sxth [[REG:x[0-9]+]], w0
+; CHECK: neg x0, [[REG]], lsl #32
; CHECK: ret
%tmp16 = sext i16 %a to i64
%tmp17 = mul i64 %tmp16, -4294967296
@@ -168,7 +168,7 @@ entry:
; CHECK-LABEL: t18:
; CHECK: sdiv w0, w0, w1
; CHECK: ret
- %sdiv = call i32 @llvm.arm64.sdiv.i32(i32 %a, i32 %b)
+ %sdiv = call i32 @llvm.aarch64.sdiv.i32(i32 %a, i32 %b)
ret i32 %sdiv
}
@@ -177,7 +177,7 @@ entry:
; CHECK-LABEL: t19:
; CHECK: sdiv x0, x0, x1
; CHECK: ret
- %sdiv = call i64 @llvm.arm64.sdiv.i64(i64 %a, i64 %b)
+ %sdiv = call i64 @llvm.aarch64.sdiv.i64(i64 %a, i64 %b)
ret i64 %sdiv
}
@@ -186,7 +186,7 @@ entry:
; CHECK-LABEL: t20:
; CHECK: udiv w0, w0, w1
; CHECK: ret
- %udiv = call i32 @llvm.arm64.udiv.i32(i32 %a, i32 %b)
+ %udiv = call i32 @llvm.aarch64.udiv.i32(i32 %a, i32 %b)
ret i32 %udiv
}
@@ -195,14 +195,14 @@ entry:
; CHECK-LABEL: t21:
; CHECK: udiv x0, x0, x1
; CHECK: ret
- %udiv = call i64 @llvm.arm64.udiv.i64(i64 %a, i64 %b)
+ %udiv = call i64 @llvm.aarch64.udiv.i64(i64 %a, i64 %b)
ret i64 %udiv
}
-declare i32 @llvm.arm64.sdiv.i32(i32, i32) nounwind readnone
-declare i64 @llvm.arm64.sdiv.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.udiv.i32(i32, i32) nounwind readnone
-declare i64 @llvm.arm64.udiv.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.sdiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.sdiv.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.udiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.udiv.i64(i64, i64) nounwind readnone
; 32-bit not.
define i32 @inv_32(i32 %x) nounwind ssp {
diff --git a/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll
new file mode 100644
index 0000000..0904b62
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=arm64 -aarch64-dead-def-elimination=false < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test1() #0 {
+ %tmp1 = alloca i8
+ %tmp2 = icmp eq i8* %tmp1, null
+ %tmp3 = zext i1 %tmp2 to i32
+
+ ret i32 %tmp3
+
+ ; CHECK-LABEL: test1
+ ; CHECK: adds {{x[0-9]+}}, sp, #15
+}
diff --git a/test/CodeGen/ARM64/atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll
index a0039a3..3b43aa1 100644
--- a/test/CodeGen/ARM64/atomic-128.ll
+++ b/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -1,17 +1,18 @@
-; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s
@var = global i128 0
define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LABEL: val_compare_and_swap:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x0]
-; CHECK: cmp [[RESULTLO]], x2
-; CHECK: sbc xzr, [[RESULTHI]], x3
-; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
-; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x0]
+; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x[[ADDR:[0-9]+]]]
+; CHECK-DAG: eor [[MISMATCH_LO:x[0-9]+]], [[RESULTLO]], x2
+; CHECK-DAG: eor [[MISMATCH_HI:x[0-9]+]], [[RESULTHI]], x3
+; CHECK: orr [[MISMATCH:x[0-9]+]], [[MISMATCH_LO]], [[MISMATCH_HI]]
+; CHECK: cbnz [[MISMATCH]], [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: [[LABEL2]]:
+; CHECK: [[DONE]]:
%val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
ret i128 %val
}
@@ -20,13 +21,13 @@ define void @fetch_and_nand(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_nand:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
-; CHECK: bic [[SCRATCH_REGLO:x[0-9]+]], x2, [[DEST_REGLO]]
-; CHECK: bic [[SCRATCH_REGHI:x[0-9]+]], x3, [[DEST_REGHI]]
+; CHECK-DAG: bic [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK-DAG: bic [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: str [[DEST_REGHI]]
-; CHECK: str [[DEST_REGLO]]
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
%val = atomicrmw nand i128* %p, i128 %bits release
store i128 %val, i128* @var, align 16
ret void
@@ -36,13 +37,13 @@ define void @fetch_and_or(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_or:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
-; CHECK: orr [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
-; CHECK: orr [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK-DAG: orr [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK-DAG: orr [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: str [[DEST_REGHI]]
-; CHECK: str [[DEST_REGLO]]
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
%val = atomicrmw or i128* %p, i128 %bits seq_cst
store i128 %val, i128* @var, align 16
ret void
@@ -53,12 +54,12 @@ define void @fetch_and_add(i128* %p, i128 %bits) {
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
; CHECK: adds [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
-; CHECK: adc [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: adcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: str [[DEST_REGHI]]
-; CHECK: str [[DEST_REGLO]]
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
%val = atomicrmw add i128* %p, i128 %bits seq_cst
store i128 %val, i128* @var, align 16
ret void
@@ -69,12 +70,12 @@ define void @fetch_and_sub(i128* %p, i128 %bits) {
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
; CHECK: subs [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
-; CHECK: sbc [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: sbcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: str [[DEST_REGHI]]
-; CHECK: str [[DEST_REGLO]]
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
%val = atomicrmw sub i128* %p, i128 %bits seq_cst
store i128 %val, i128* @var, align 16
ret void
@@ -83,16 +84,20 @@ define void @fetch_and_sub(i128* %p, i128 %bits) {
define void @fetch_and_min(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_min:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
-; CHECK: cmp [[DEST_REGLO]], x2
-; CHECK: sbc xzr, [[DEST_REGHI]], x3
-; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, lt
-; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, lt
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], ls
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], le
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: str [[DEST_REGHI]]
-; CHECK: str [[DEST_REGLO]]
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
%val = atomicrmw min i128* %p, i128 %bits seq_cst
store i128 %val, i128* @var, align 16
ret void
@@ -102,15 +107,19 @@ define void @fetch_and_max(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_max:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
-; CHECK: cmp [[DEST_REGLO]], x2
-; CHECK: sbc xzr, [[DEST_REGHI]], x3
-; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, gt
-; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, gt
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], hi
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], gt
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: str [[DEST_REGHI]]
-; CHECK: str [[DEST_REGLO]]
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
%val = atomicrmw max i128* %p, i128 %bits seq_cst
store i128 %val, i128* @var, align 16
ret void
@@ -120,15 +129,19 @@ define void @fetch_and_umin(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_umin:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
-; CHECK: cmp [[DEST_REGLO]], x2
-; CHECK: sbc xzr, [[DEST_REGHI]], x3
-; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, cc
-; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, cc
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], ls
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], ls
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: str [[DEST_REGHI]]
-; CHECK: str [[DEST_REGLO]]
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
%val = atomicrmw umin i128* %p, i128 %bits seq_cst
store i128 %val, i128* @var, align 16
ret void
@@ -138,15 +151,19 @@ define void @fetch_and_umax(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_umax:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
-; CHECK: cmp [[DEST_REGLO]], x2
-; CHECK: sbc xzr, [[DEST_REGHI]], x3
-; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, hi
-; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, hi
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], hi
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], hi
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
-; CHECK: str [[DEST_REGHI]]
-; CHECK: str [[DEST_REGLO]]
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
%val = atomicrmw umax i128* %p, i128 %bits seq_cst
store i128 %val, i128* @var, align 16
ret void
@@ -164,12 +181,7 @@ define i128 @atomic_load_seq_cst(i128* %p) {
define i128 @atomic_load_relaxed(i128* %p) {
; CHECK-LABEL: atomic_load_relaxed:
; CHECK-NOT: dmb
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
-; CHECK: orr [[SAMELO:x[0-9]+]], [[LO]], xzr
-; CHECK: orr [[SAMEHI:x[0-9]+]], [[HI]], xzr
-; CHECK: stxp [[SUCCESS:w[0-9]+]], [[SAMELO]], [[SAMEHI]], [x0]
-; CHECK: cbnz [[SUCCESS]], [[LABEL]]
; CHECK-NOT: dmb
%r = load atomic i128* %p monotonic, align 16
ret i128 %r
diff --git a/test/CodeGen/ARM64/atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll
index cf8cf7d..aa9b284 100644
--- a/test/CodeGen/ARM64/atomic.ll
+++ b/test/CodeGen/AArch64/arm64-atomic.ll
@@ -1,12 +1,11 @@
-; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -verify-machineinstrs -mcpu=cyclone | FileCheck %s
define i32 @val_compare_and_swap(i32* %p) {
; CHECK-LABEL: val_compare_and_swap:
; CHECK: orr [[NEWVAL_REG:w[0-9]+]], wzr, #0x4
-; CHECK: orr [[OLDVAL_REG:w[0-9]+]], wzr, #0x7
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxr [[RESULT:w[0-9]+]], [x0]
-; CHECK: cmp [[RESULT]], [[OLDVAL_REG]]
+; CHECK: cmp [[RESULT]], #7
; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
@@ -17,14 +16,13 @@ define i32 @val_compare_and_swap(i32* %p) {
define i64 @val_compare_and_swap_64(i64* %p) {
; CHECK-LABEL: val_compare_and_swap_64:
-; CHECK: orr [[NEWVAL_REG:x[0-9]+]], xzr, #0x4
-; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: orr w[[NEWVAL_REG:[0-9]+]], wzr, #0x4
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldxr [[RESULT:x[0-9]+]], [x0]
-; CHECK: cmp [[RESULT]], [[OLDVAL_REG]]
+; CHECK: cmp [[RESULT]], #7
; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
-; CHECK-NOT: stxr [[NEWVAL_REG]], [[NEWVAL_REG]]
-; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
+; CHECK-NOT: stxr x[[NEWVAL_REG]], x[[NEWVAL_REG]]
+; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], x[[NEWVAL_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
; CHECK: [[LABEL2]]:
%val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic
@@ -33,10 +31,9 @@ define i64 @val_compare_and_swap_64(i64* %p) {
define i32 @fetch_and_nand(i32* %p) {
; CHECK-LABEL: fetch_and_nand:
-; CHECK: orr [[OLDVAL_REG:w[0-9]+]], wzr, #0x7
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0]
-; CHECK: bic [[SCRATCH2_REG:w[0-9]+]], [[OLDVAL_REG]], w[[DEST_REG]]
+; CHECK: and [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], #0xfffffff8
; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
@@ -47,20 +44,20 @@ define i32 @fetch_and_nand(i32* %p) {
define i64 @fetch_and_nand_64(i64* %p) {
; CHECK-LABEL: fetch_and_nand_64:
-; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: mov x[[ADDR:[0-9]+]], x0
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x0]
-; CHECK: bic [[SCRATCH2_REG:x[0-9]+]], [[OLDVAL_REG]], [[DEST_REG]]
-; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]]
+; CHECK: and [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0xfffffffffffffff8
+; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
-; CHECK: mov x0, [[DEST_REG]]
+
%val = atomicrmw nand i64* %p, i64 7 acq_rel
ret i64 %val
}
define i32 @fetch_and_or(i32* %p) {
; CHECK-LABEL: fetch_and_or:
-; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #5
+; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0]
; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]]
@@ -74,13 +71,12 @@ define i32 @fetch_and_or(i32* %p) {
define i64 @fetch_and_or_64(i64* %p) {
; CHECK: fetch_and_or_64:
-; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7
+; CHECK: mov x[[ADDR:[0-9]+]], x0
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x0]
-; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], [[OLDVAL_REG]]
-; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]]
+; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7
+; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
-; CHECK: mov x0, [[DEST_REG]]
%val = atomicrmw or i64* %p, i64 7 monotonic
ret i64 %val
}
@@ -122,8 +118,7 @@ define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) {
%ptr_regoff = getelementptr i8* %p, i32 %off32
%val_regoff = load atomic i8* %ptr_regoff unordered, align 1
%tot1 = add i8 %val_unsigned, %val_regoff
- ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
-; CHECK: ldrb {{w[0-9]+}}, [x0, x1, sxtw]
+; CHECK: ldrb {{w[0-9]+}}, [x0, w1, sxtw]
%ptr_unscaled = getelementptr i8* %p, i32 -256
%val_unscaled = load atomic i8* %ptr_unscaled monotonic, align 1
@@ -133,7 +128,7 @@ define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) {
%ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
%val_random = load atomic i8* %ptr_random unordered, align 1
%tot3 = add i8 %tot2, %val_random
-; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
; CHECK: ldrb {{w[0-9]+}}, [x[[ADDR]]]
ret i8 %tot3
@@ -148,8 +143,7 @@ define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) {
%ptr_regoff = getelementptr i16* %p, i32 %off32
%val_regoff = load atomic i16* %ptr_regoff unordered, align 2
%tot1 = add i16 %val_unsigned, %val_regoff
- ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
-; CHECK: ldrh {{w[0-9]+}}, [x0, x1, sxtw #1]
+; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1]
%ptr_unscaled = getelementptr i16* %p, i32 -128
%val_unscaled = load atomic i16* %ptr_unscaled monotonic, align 2
@@ -159,7 +153,7 @@ define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) {
%ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
%val_random = load atomic i16* %ptr_random unordered, align 2
%tot3 = add i16 %tot2, %val_random
-; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
; CHECK: ldrh {{w[0-9]+}}, [x[[ADDR]]]
ret i16 %tot3
@@ -174,8 +168,7 @@ define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) {
%ptr_regoff = getelementptr i32* %p, i32 %off32
%val_regoff = load atomic i32* %ptr_regoff unordered, align 4
%tot1 = add i32 %val_unsigned, %val_regoff
- ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
-; CHECK: ldr {{w[0-9]+}}, [x0, x1, sxtw #2]
+; CHECK: ldr {{w[0-9]+}}, [x0, w1, sxtw #2]
%ptr_unscaled = getelementptr i32* %p, i32 -64
%val_unscaled = load atomic i32* %ptr_unscaled monotonic, align 4
@@ -185,7 +178,7 @@ define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) {
%ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
%val_random = load atomic i32* %ptr_random unordered, align 4
%tot3 = add i32 %tot2, %val_random
-; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
; CHECK: ldr {{w[0-9]+}}, [x[[ADDR]]]
ret i32 %tot3
@@ -200,8 +193,7 @@ define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) {
%ptr_regoff = getelementptr i64* %p, i32 %off32
%val_regoff = load atomic i64* %ptr_regoff unordered, align 8
%tot1 = add i64 %val_unsigned, %val_regoff
- ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
-; CHECK: ldr {{x[0-9]+}}, [x0, x1, sxtw #3]
+; CHECK: ldr {{x[0-9]+}}, [x0, w1, sxtw #3]
%ptr_unscaled = getelementptr i64* %p, i32 -32
%val_unscaled = load atomic i64* %ptr_unscaled monotonic, align 8
@@ -211,7 +203,7 @@ define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) {
%ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm)
%val_random = load atomic i64* %ptr_random unordered, align 8
%tot3 = add i64 %tot2, %val_random
-; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
; CHECK: ldr {{x[0-9]+}}, [x[[ADDR]]]
ret i64 %tot3
@@ -233,8 +225,7 @@ define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) {
%ptr_regoff = getelementptr i8* %p, i32 %off32
store atomic i8 %val, i8* %ptr_regoff unordered, align 1
- ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
-; CHECK: strb {{w[0-9]+}}, [x0, x1, sxtw]
+; CHECK: strb {{w[0-9]+}}, [x0, w1, sxtw]
%ptr_unscaled = getelementptr i8* %p, i32 -256
store atomic i8 %val, i8* %ptr_unscaled monotonic, align 1
@@ -242,7 +233,7 @@ define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) {
%ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
store atomic i8 %val, i8* %ptr_random unordered, align 1
-; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
; CHECK: strb {{w[0-9]+}}, [x[[ADDR]]]
ret void
@@ -256,8 +247,7 @@ define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) {
%ptr_regoff = getelementptr i16* %p, i32 %off32
store atomic i16 %val, i16* %ptr_regoff unordered, align 2
- ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
-; CHECK: strh {{w[0-9]+}}, [x0, x1, sxtw #1]
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
%ptr_unscaled = getelementptr i16* %p, i32 -128
store atomic i16 %val, i16* %ptr_unscaled monotonic, align 2
@@ -265,7 +255,7 @@ define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) {
%ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
store atomic i16 %val, i16* %ptr_random unordered, align 2
-; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
; CHECK: strh {{w[0-9]+}}, [x[[ADDR]]]
ret void
@@ -279,8 +269,7 @@ define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) {
%ptr_regoff = getelementptr i32* %p, i32 %off32
store atomic i32 %val, i32* %ptr_regoff unordered, align 4
- ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
-; CHECK: str {{w[0-9]+}}, [x0, x1, sxtw #2]
+; CHECK: str {{w[0-9]+}}, [x0, w1, sxtw #2]
%ptr_unscaled = getelementptr i32* %p, i32 -64
store atomic i32 %val, i32* %ptr_unscaled monotonic, align 4
@@ -288,7 +277,7 @@ define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) {
%ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
store atomic i32 %val, i32* %ptr_random unordered, align 4
-; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
; CHECK: str {{w[0-9]+}}, [x[[ADDR]]]
ret void
@@ -302,8 +291,7 @@ define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) {
%ptr_regoff = getelementptr i64* %p, i32 %off32
store atomic i64 %val, i64* %ptr_regoff unordered, align 8
- ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg.
-; CHECK: str {{x[0-9]+}}, [x0, x1, sxtw #3]
+; CHECK: str {{x[0-9]+}}, [x0, w1, sxtw #3]
%ptr_unscaled = getelementptr i64* %p, i32 -32
store atomic i64 %val, i64* %ptr_unscaled monotonic, align 8
@@ -311,7 +299,7 @@ define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) {
%ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm)
store atomic i64 %val, i64* %ptr_random unordered, align 8
-; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
; CHECK: str {{x[0-9]+}}, [x[[ADDR]]]
ret void
diff --git a/test/CodeGen/ARM64/basic-pic.ll b/test/CodeGen/AArch64/arm64-basic-pic.ll
index 9fdb1e9..9fdb1e9 100644
--- a/test/CodeGen/ARM64/basic-pic.ll
+++ b/test/CodeGen/AArch64/arm64-basic-pic.ll
diff --git a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
new file mode 100644
index 0000000..f0e968b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -0,0 +1,1101 @@
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
+
+; CHECK-LABEL: test_i64_f64:
+define void @test_i64_f64(double* %p, i64* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v1i64:
+define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v2f32:
+define void @test_i64_v2f32(<2 x float>* %p, i64* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v2i32:
+define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v4i16:
+define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: str
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v8i8:
+define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: str
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_i64:
+define void @test_f64_i64(i64* %p, double* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v1i64:
+define void @test_f64_v1i64(<1 x i64>* %p, double* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v2f32:
+define void @test_f64_v2f32(<2 x float>* %p, double* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v2i32:
+define void @test_f64_v2i32(<2 x i32>* %p, double* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v4i16:
+define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: str
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v8i8:
+define void @test_f64_v8i8(<8 x i8>* %p, double* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: str
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_i64:
+define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_f64:
+define void @test_v1i64_f64(double* %p, <1 x i64>* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v2f32:
+define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v2i32:
+define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v4i16:
+define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: str
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v8i8:
+define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: str
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_i64:
+define void @test_v2f32_i64(i64* %p, <2 x float>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_f64:
+define void @test_v2f32_f64(double* %p, <2 x float>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v1i64:
+define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v2i32:
+define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v4i16:
+define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev32 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v8i8:
+define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev32 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_i64:
+define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_f64:
+define void @test_v2i32_f64(double* %p, <2 x i32>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v1i64:
+define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v2f32:
+define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v4i16:
+define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev32 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v8i8:
+define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev32 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_i64:
+define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_f64:
+define void @test_v4i16_f64(double* %p, <4 x i16>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v1i64:
+define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v2f32:
+define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev32 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v2i32:
+define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev32 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v8i8:
+define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev16 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_i64:
+define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_f64:
+define void @test_v8i8_f64(double* %p, <8 x i8>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v1i64:
+define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v2f32:
+define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev32 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v2i32:
+define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev32 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v4i16:
+define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev16 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v2f64:
+define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: ext
+; CHECK: str
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v2i64:
+define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: ext
+; CHECK: str
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v4f32:
+define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: str q
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v4i32:
+define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: str
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v8i16:
+define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: str
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v16i8:
+define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: ext
+; CHECK: str q
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_f128:
+define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) {
+; CHECK: ldr
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v2i64:
+define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v4f32:
+define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v4i32:
+define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v8i16:
+define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v16i8:
+define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_f128:
+define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) {
+; CHECK: ldr
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v2f64:
+define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v4f32:
+define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v4i32:
+define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v8i16:
+define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v16i8:
+define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_f128:
+define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
+; CHECK: ldr q
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v2f64:
+define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v2i64:
+define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v4i32:
+define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v8i16:
+define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev32 v{{[0-9]+}}.8h
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v16i8:
+define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev32 v{{[0-9]+}}.16b
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_f128:
+define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v2f64:
+define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v2i64:
+define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v4f32:
+define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v8i16:
+define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev32 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v16i8:
+define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev32 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_f128:
+define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v2f64:
+define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v2i64:
+define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v4f32:
+define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev32 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v4i32:
+define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev32 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v16i8:
+define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev16 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_f128:
+define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) {
+; CHECK: ldr q
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v2f64:
+define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v2i64:
+define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v4f32:
+define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev32 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v4i32:
+define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev32 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v8i16:
+define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev16 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
new file mode 100644
index 0000000..93e7da9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
@@ -0,0 +1,73 @@
+; RUN: llc -mtriple arm64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s
+
+; ARM EHABI for big endian
+; This test case checks whether the CIE length record is laid out in big-endian format.
+;
+; This is the LLVM assembly generated from the following C++ code:
+;
+; extern void foo(int);
+; void test(int a, int b) {
+; try {
+; foo(a);
+; } catch (...) {
+; foo(b);
+; }
+; }
+
+define void @_Z4testii(i32 %a, i32 %b) #0 {
+entry:
+ invoke void @_Z3fooi(i32 %a)
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2
+ invoke void @_Z3fooi(i32 %b)
+ to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2: ; preds = %lpad
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont: ; preds = %entry, %invoke.cont2
+ ret void
+
+lpad1: ; preds = %lpad
+ %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %3
+
+terminate.lpad: ; preds = %lpad1
+ %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %5 = extractvalue { i8*, i32 } %4, 0
+ tail call void @__clang_call_terminate(i8* %5) #3
+ unreachable
+}
+
+declare void @_Z3fooi(i32) #0
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+define linkonce_odr hidden void @__clang_call_terminate(i8*) #1 {
+ %2 = tail call i8* @__cxa_begin_catch(i8* %0) #2
+ tail call void @_ZSt9terminatev() #3
+ unreachable
+}
+
+declare void @_ZSt9terminatev()
+
+; CHECK-LABEL: Contents of section .eh_frame:
+; CHECK-NEXT: 0000 0000001c
+
diff --git a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
new file mode 100644
index 0000000..d7b26b9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s | FileCheck %s
+
+; Vararg saving must save Q registers using the equivalent of STR/STP.
+
+target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64_be-arm-none-eabi"
+
+%struct.__va_list = type { i8*, i8*, i8*, i32, i32 }
+
+declare void @llvm.va_start(i8*) nounwind
+declare void @llvm.va_end(i8*) nounwind
+
+define double @callee(i32 %a, ...) {
+; CHECK: stp
+; CHECK: stp
+; CHECK: stp
+; CHECK: stp
+; CHECK: stp
+; CHECK: stp
+entry:
+ %vl = alloca %struct.__va_list, align 8
+ %vl1 = bitcast %struct.__va_list* %vl to i8*
+ call void @llvm.va_start(i8* %vl1)
+ %vr_offs_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 4
+ %vr_offs = load i32* %vr_offs_p, align 4
+ %0 = icmp sgt i32 %vr_offs, -1
+ br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg
+
+vaarg.maybe_reg: ; preds = %entry
+ %new_reg_offs = add i32 %vr_offs, 16
+ store i32 %new_reg_offs, i32* %vr_offs_p, align 4
+ %inreg = icmp slt i32 %new_reg_offs, 1
+ br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack
+
+vaarg.in_reg: ; preds = %vaarg.maybe_reg
+ %reg_top_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 2
+ %reg_top = load i8** %reg_top_p, align 8
+ %1 = sext i32 %vr_offs to i64
+ %2 = getelementptr i8* %reg_top, i64 %1
+ %3 = ptrtoint i8* %2 to i64
+ %align_be = add i64 %3, 8
+ %4 = inttoptr i64 %align_be to i8*
+ br label %vaarg.end
+
+vaarg.on_stack: ; preds = %vaarg.maybe_reg, %entry
+ %stack_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 0
+ %stack = load i8** %stack_p, align 8
+ %new_stack = getelementptr i8* %stack, i64 8
+ store i8* %new_stack, i8** %stack_p, align 8
+ br label %vaarg.end
+
+vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg
+ %.sink = phi i8* [ %4, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ]
+ %5 = bitcast i8* %.sink to double*
+ %6 = load double* %5, align 8
+ call void @llvm.va_end(i8* %vl1)
+ ret double %6
+}
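
For reference, the IR above is roughly what a front end produces for a variadic callee that pulls a single double out of its va_list; a hedged C equivalent (not part of the test) looks like this:

    #include <stdarg.h>

    double callee(int a, ...) {
        va_list vl;
        va_start(vl, a);
        double d = va_arg(vl, double);   /* read from the FP/SIMD save area or the stack */
        va_end(vl);
        return d;
    }

The stp checks follow from the comment at the top of the file: the Q-register save area has to be written with whole-register stores so that the bytes va_arg later reads back have the same layout regardless of the element size the caller happened to use.
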
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
new file mode 100644
index 0000000..1dcccf1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
@@ -0,0 +1,848 @@
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
+
+; CHECK-LABEL: test_i64_f64:
+define i64 @test_i64_f64(double %p) {
+; CHECK-NOT: rev
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v1i64:
+define i64 @test_i64_v1i64(<1 x i64> %p) {
+; CHECK-NOT: rev
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v2f32:
+define i64 @test_i64_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v2i32:
+define i64 @test_i64_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v4i16:
+define i64 @test_i64_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v8i8:
+define i64 @test_i64_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_f64_i64:
+define double @test_f64_i64(i64 %p) {
+; CHECK-NOT: rev
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v1i64:
+define double @test_f64_v1i64(<1 x i64> %p) {
+; CHECK-NOT: rev
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v2f32:
+define double @test_f64_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v2i32:
+define double @test_f64_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v4i16:
+define double @test_f64_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v8i8:
+define double @test_f64_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_v1i64_i64:
+define <1 x i64> @test_v1i64_i64(i64 %p) {
+; CHECK-NOT: rev
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_f64:
+define <1 x i64> @test_v1i64_f64(double %p) {
+; CHECK-NOT: rev
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_v2f32:
+define <1 x i64> @test_v1i64_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_v2i32:
+define <1 x i64> @test_v1i64_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_v4i16:
+define <1 x i64> @test_v1i64_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_v8i8:
+define <1 x i64> @test_v1i64_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v2f32_i64:
+define <2 x float> @test_v2f32_i64(i64 %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_f64:
+define <2 x float> @test_v2f32_f64(double %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_v1i64:
+define <2 x float> @test_v2f32_v1i64(<1 x i64> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_v2i32:
+define <2 x float> @test_v2f32_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_v4i16:
+define <2 x float> @test_v2f32_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_v8i8:
+define <2 x float> @test_v2f32_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2i32_i64:
+define <2 x i32> @test_v2i32_i64(i64 %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_f64:
+define <2 x i32> @test_v2i32_f64(double %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_v1i64:
+define <2 x i32> @test_v2i32_v1i64(<1 x i64> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_v2f32:
+define <2 x i32> @test_v2i32_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_v4i16:
+define <2 x i32> @test_v2i32_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_v8i8:
+define <2 x i32> @test_v2i32_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i16_i64:
+define <4 x i16> @test_v4i16_i64(i64 %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_f64:
+define <4 x i16> @test_v4i16_f64(double %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_v1i64:
+define <4 x i16> @test_v4i16_v1i64(<1 x i64> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_v2f32:
+define <4 x i16> @test_v4i16_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_v2i32:
+define <4 x i16> @test_v4i16_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_v8i8:
+define <4 x i16> @test_v4i16_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i8_i64:
+define <8 x i8> @test_v8i8_i64(i64 %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_f64:
+define <8 x i8> @test_v8i8_f64(double %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_v1i64:
+define <8 x i8> @test_v8i8_v1i64(<1 x i64> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_v2f32:
+define <8 x i8> @test_v8i8_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_v2i32:
+define <8 x i8> @test_v8i8_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_v4i16:
+define <8 x i8> @test_v8i8_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_f128_v2f64:
+define fp128 @test_f128_v2f64(<2 x double> %p) {
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v2i64:
+define fp128 @test_f128_v2i64(<2 x i64> %p) {
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v4f32:
+define fp128 @test_f128_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v4i32:
+define fp128 @test_f128_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v8i16:
+define fp128 @test_f128_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v16i8:
+define fp128 @test_f128_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_v2f64_f128:
+define <2 x double> @test_v2f64_f128(fp128 %p) {
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v2i64:
+define <2 x double> @test_v2f64_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v4f32:
+define <2 x double> @test_v2f64_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v4i32:
+define <2 x double> @test_v2f64_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v8i16:
+define <2 x double> @test_v2f64_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v16i8:
+define <2 x double> @test_v2f64_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2i64_f128:
+define <2 x i64> @test_v2i64_f128(fp128 %p) {
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v2f64:
+define <2 x i64> @test_v2i64_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v4f32:
+define <2 x i64> @test_v2i64_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v4i32:
+define <2 x i64> @test_v2i64_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v8i16:
+define <2 x i64> @test_v2i64_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v16i8:
+define <2 x i64> @test_v2i64_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v4f32_f128:
+define <4 x float> @test_v4f32_f128(fp128 %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v2f64:
+define <4 x float> @test_v4f32_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v2i64:
+define <4 x float> @test_v4f32_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v4i32:
+define <4 x float> @test_v4f32_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v8i16:
+define <4 x float> @test_v4f32_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v16i8:
+define <4 x float> @test_v4f32_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4i32_f128:
+define <4 x i32> @test_v4i32_f128(fp128 %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v2f64:
+define <4 x i32> @test_v4i32_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v2i64:
+define <4 x i32> @test_v4i32_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v4f32:
+define <4 x i32> @test_v4i32_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v8i16:
+define <4 x i32> @test_v4i32_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v16i8:
+define <4 x i32> @test_v4i32_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v8i16_f128:
+define <8 x i16> @test_v8i16_f128(fp128 %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v2f64:
+define <8 x i16> @test_v8i16_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v2i64:
+define <8 x i16> @test_v8i16_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v4f32:
+define <8 x i16> @test_v8i16_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v4i32:
+define <8 x i16> @test_v8i16_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v16i8:
+define <8 x i16> @test_v8i16_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v16i8_f128:
+define <16 x i8> @test_v16i8_f128(fp128 %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v2f64:
+define <16 x i8> @test_v16i8_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v2i64:
+define <16 x i8> @test_v16i8_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v4f32:
+define <16 x i8> @test_v16i8_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v4i32:
+define <16 x i8> @test_v16i8_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v8i16:
+define <16 x i8> @test_v16i8_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
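
Every CHECK pattern in this file follows one rule of the big-endian lowering: a bitcast has to behave as if the value were stored with the source element type and reloaded with the destination type, and when the two element sizes differ those two views of the same 8 or 16 bytes put lanes in different places, which the rev64 (plus ext for the 128-bit cases) instructions repair; where the element size does not change (i64, f64, v1i64 among the 64-bit cases) the tests check that no rev is emitted. A small, hypothetical C illustration of the underlying byte-layout fact, with nothing AArch64-specific in it:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Reinterpreting the same 8 bytes as 4 x u16 and as 2 x u32: on a big-endian
     * target the lane holding a given byte changes, which is why the tests above
     * expect rev64 whenever the element size changes across a bitcast. */
    int main(void) {
        uint16_t h[4] = {0x0102, 0x0304, 0x0506, 0x0708};
        uint32_t s[2];
        memcpy(s, h, sizeof s);          /* the store/load pair a bitcast must match */
        printf("%08x %08x\n", s[0], s[1]);
        /* little-endian: 03040102 07080506    big-endian: 01020304 05060708 */
        return 0;
    }
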
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
new file mode 100644
index 0000000..9a12b7a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
@@ -0,0 +1,1100 @@
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+
+; CHECK-LABEL: test_i64_f64:
+declare i64 @test_i64_f64_helper(double %p)
+define void @test_i64_f64(double* %p, i64* %q) {
+; CHECK-NOT: rev
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call i64 @test_i64_f64_helper(double %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v1i64:
+declare i64 @test_i64_v1i64_helper(<1 x i64> %p)
+define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) {
+; CHECK-NOT: rev
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v2f32:
+declare i64 @test_i64_v2f32_helper(<2 x float> %p)
+define void @test_i64_v2f32(<2 x float>* %p, i64* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call i64 @test_i64_v2f32_helper(<2 x float> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v2i32:
+declare i64 @test_i64_v2i32_helper(<2 x i32> %p)
+define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v4i16:
+declare i64 @test_i64_v4i16_helper(<4 x i16> %p)
+define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v8i8:
+declare i64 @test_i64_v8i8_helper(<8 x i8> %p)
+define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_i64:
+declare double @test_f64_i64_helper(i64 %p)
+define void @test_f64_i64(i64* %p, double* %q) {
+; CHECK-NOT: rev
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call double @test_f64_i64_helper(i64 %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v1i64:
+declare double @test_f64_v1i64_helper(<1 x i64> %p)
+define void @test_f64_v1i64(<1 x i64>* %p, double* %q) {
+; CHECK-NOT: rev
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call double @test_f64_v1i64_helper(<1 x i64> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v2f32:
+declare double @test_f64_v2f32_helper(<2 x float> %p)
+define void @test_f64_v2f32(<2 x float>* %p, double* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call double @test_f64_v2f32_helper(<2 x float> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v2i32:
+declare double @test_f64_v2i32_helper(<2 x i32> %p)
+define void @test_f64_v2i32(<2 x i32>* %p, double* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call double @test_f64_v2i32_helper(<2 x i32> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v4i16:
+declare double @test_f64_v4i16_helper(<4 x i16> %p)
+define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call double @test_f64_v4i16_helper(<4 x i16> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v8i8:
+declare double @test_f64_v8i8_helper(<8 x i8> %p)
+define void @test_f64_v8i8(<8 x i8>* %p, double* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call double @test_f64_v8i8_helper(<8 x i8> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_i64:
+declare <1 x i64> @test_v1i64_i64_helper(i64 %p)
+define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) {
+; CHECK-NOT: rev
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_f64:
+declare <1 x i64> @test_v1i64_f64_helper(double %p)
+define void @test_v1i64_f64(double* %p, <1 x i64>* %q) {
+; CHECK-NOT: rev
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <1 x i64> @test_v1i64_f64_helper(double %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v2f32:
+declare <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %p)
+define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v2i32:
+declare <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %p)
+define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v4i16:
+declare <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %p)
+define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v8i8:
+declare <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %p)
+define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_i64:
+declare <2 x float> @test_v2f32_i64_helper(i64 %p)
+define void @test_v2f32_i64(i64* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <2 x float> @test_v2f32_i64_helper(i64 %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_f64:
+declare <2 x float> @test_v2f32_f64_helper(double %p)
+define void @test_v2f32_f64(double* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <2 x float> @test_v2f32_f64_helper(double %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v1i64:
+declare <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %p)
+define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v2i32:
+declare <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %p)
+define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v4i16:
+declare <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %p)
+define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v8i8:
+declare <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %p)
+define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_i64:
+declare <2 x i32> @test_v2i32_i64_helper(i64 %p)
+define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_f64:
+declare <2 x i32> @test_v2i32_f64_helper(double %p)
+define void @test_v2i32_f64(double* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <2 x i32> @test_v2i32_f64_helper(double %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v1i64:
+declare <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %p)
+define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v2f32:
+declare <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %p)
+define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v4i16:
+declare <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %p)
+define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v8i8:
+declare <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %p)
+define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_i64:
+declare <4 x i16> @test_v4i16_i64_helper(i64 %p)
+define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_f64:
+declare <4 x i16> @test_v4i16_f64_helper(double %p)
+define void @test_v4i16_f64(double* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <4 x i16> @test_v4i16_f64_helper(double %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v1i64:
+declare <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %p)
+define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v2f32:
+declare <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %p)
+define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v2i32:
+declare <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %p)
+define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v8i8:
+declare <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %p)
+define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_i64:
+declare <8 x i8> @test_v8i8_i64_helper(i64 %p)
+define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_f64:
+declare <8 x i8> @test_v8i8_f64_helper(double %p)
+define void @test_v8i8_f64(double* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <8 x i8> @test_v8i8_f64_helper(double %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v1i64:
+declare <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %p)
+define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v2f32:
+declare <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %p)
+define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v2i32:
+declare <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %p)
+define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v4i16:
+declare <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %p)
+define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v2f64:
+declare fp128 @test_f128_v2f64_helper(<2 x double> %p)
+define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v2i64:
+declare fp128 @test_f128_v2i64_helper(<2 x i64> %p)
+define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v4f32:
+declare fp128 @test_f128_v4f32_helper(<4 x float> %p)
+define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v4i32:
+declare fp128 @test_f128_v4i32_helper(<4 x i32> %p)
+define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v8i16:
+declare fp128 @test_f128_v8i16_helper(<8 x i16> %p)
+define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v16i8:
+declare fp128 @test_f128_v16i8_helper(<16 x i8> %p)
+define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_f128:
+declare <2 x double> @test_v2f64_f128_helper(fp128 %p)
+define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) {
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v2i64:
+declare <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %p)
+define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
+; CHECK: ext
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v4f32:
+declare <2 x double> @test_v2f64_v4f32_helper(<4 x float> %p)
+define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v4i32:
+declare <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %p)
+define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v8i16:
+declare <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %p)
+define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v16i8:
+declare <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %p)
+define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_f128:
+declare <2 x i64> @test_v2i64_f128_helper(fp128 %p)
+define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) {
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v2f64:
+declare <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %p)
+define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
+; CHECK: ext
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v4f32:
+declare <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %p)
+define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v4i32:
+declare <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %p)
+define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v8i16:
+declare <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %p)
+define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v16i8:
+declare <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %p)
+define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_f128:
+declare <4 x float> @test_v4f32_f128_helper(fp128 %p)
+define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v2f64:
+declare <4 x float> @test_v4f32_v2f64_helper(<2 x double> %p)
+define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v2i64:
+declare <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %p)
+define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v4i32:
+declare <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %p)
+define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v8i16:
+declare <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %p)
+define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v16i8:
+declare <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %p)
+define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_f128:
+declare <4 x i32> @test_v4i32_f128_helper(fp128 %p)
+define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v2f64:
+declare <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %p)
+define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v2i64:
+declare <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %p)
+define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v4f32:
+declare <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %p)
+define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v8i16:
+declare <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %p)
+define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v16i8:
+declare <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %p)
+define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_f128:
+declare <8 x i16> @test_v8i16_f128_helper(fp128 %p)
+define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v2f64:
+declare <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %p)
+define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v2i64:
+declare <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %p)
+define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v4f32:
+declare <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %p)
+define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v4i32:
+declare <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %p)
+define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v16i8:
+declare <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %p)
+define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_f128:
+declare <16 x i8> @test_v16i8_f128_helper(fp128 %p)
+define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v2f64:
+declare <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %p)
+define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v2i64:
+declare <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %p)
+define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v4f32:
+declare <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %p)
+define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v4i32:
+declare <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %p)
+define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v8i16:
+declare <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %p)
+define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
diff --git a/test/CodeGen/ARM64/big-imm-offsets.ll b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll
index a56df07..a56df07 100644
--- a/test/CodeGen/ARM64/big-imm-offsets.ll
+++ b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll
diff --git a/test/CodeGen/ARM64/big-stack.ll b/test/CodeGen/AArch64/arm64-big-stack.ll
index 56ca30c..3f91bb3c2 100644
--- a/test/CodeGen/ARM64/big-stack.ll
+++ b/test/CodeGen/AArch64/arm64-big-stack.ll
@@ -7,9 +7,9 @@ target triple = "arm64-apple-macosx10"
; shift left (up to 12). I.e., 16773120 is the biggest value.
; <rdar://12513931>
; CHECK-LABEL: foo:
-; CHECK: sub sp, sp, #16773120
-; CHECK: sub sp, sp, #16773120
-; CHECK: sub sp, sp, #8192
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #2, lsl #12
define void @foo() nounwind ssp {
entry:
%buffer = alloca [33554432 x i8], align 1
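
The rewritten CHECK lines use the canonical immediate syntax for SUB: a 12-bit value optionally shifted left by 12, so #4095, lsl #12 is the old literal 16773120 and #2, lsl #12 is 8192, and the three subtractions still cover the 32 MiB alloca exactly. A quick, throwaway C check of that arithmetic:

    #include <assert.h>

    int main(void) {
        /* 12-bit immediate, optionally shifted left by 12 */
        assert((4095u << 12) == 16773120u);                   /* largest single SUB step */
        assert((2u << 12) == 8192u);
        assert(16773120u + 16773120u + 8192u == 33554432u);   /* the [33554432 x i8] alloca */
        return 0;
    }
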
diff --git a/test/CodeGen/ARM64/bitfield-extract.ll b/test/CodeGen/AArch64/arm64-bitfield-extract.ll
index 96b6967..112efdd 100644
--- a/test/CodeGen/ARM64/bitfield-extract.ll
+++ b/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -1,3 +1,4 @@
+; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s | FileCheck --check-prefix=OPT %s
; RUN: llc < %s -march=arm64 | FileCheck %s
%struct.X = type { i8, i8, [2 x i8] }
%struct.Y = type { i32, i8 }
@@ -6,7 +7,7 @@
define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind optsize ssp {
; CHECK-LABEL: foo:
-; CHECK: ubfm
+; CHECK: ubfx
; CHECK-NOT: and
; CHECK: ret
@@ -22,7 +23,7 @@ define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind opts
define i32 @baz(i64 %cav1.coerce) nounwind {
; CHECK-LABEL: baz:
-; CHECK: sbfm w0, w0, #0, #3
+; CHECK: sbfx w0, w0, #0, #4
%tmp = trunc i64 %cav1.coerce to i32
%tmp1 = shl i32 %tmp, 28
%bf.val.sext = ashr exact i32 %tmp1, 28
@@ -31,7 +32,7 @@ define i32 @baz(i64 %cav1.coerce) nounwind {
define i32 @bar(i64 %cav1.coerce) nounwind {
; CHECK-LABEL: bar:
-; CHECK: sbfm w0, w0, #4, #9
+; CHECK: sbfx w0, w0, #4, #6
%tmp = trunc i64 %cav1.coerce to i32
%cav1.sroa.0.1.insert = shl i32 %tmp, 22
%tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26
@@ -40,7 +41,7 @@ define i32 @bar(i64 %cav1.coerce) nounwind {
define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp {
; CHECK-LABEL: fct1:
-; CHECK: ubfm
+; CHECK: ubfx
; CHECK-NOT: and
; CHECK: ret
@@ -55,7 +56,7 @@ define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind opt
define i64 @fct2(i64 %cav1.coerce) nounwind {
; CHECK-LABEL: fct2:
-; CHECK: sbfm x0, x0, #0, #35
+; CHECK: sbfx x0, x0, #0, #36
%tmp = shl i64 %cav1.coerce, 28
%bf.val.sext = ashr exact i64 %tmp, 28
ret i64 %bf.val.sext
@@ -63,7 +64,7 @@ define i64 @fct2(i64 %cav1.coerce) nounwind {
define i64 @fct3(i64 %cav1.coerce) nounwind {
; CHECK-LABEL: fct3:
-; CHECK: sbfm x0, x0, #4, #41
+; CHECK: sbfx x0, x0, #4, #38
%cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22
%tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26
ret i64 %tmp1
@@ -73,7 +74,7 @@ define void @fct4(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct4:
; CHECK: ldr [[REG1:x[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], x1, #16, #39
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #24
; CHECK-NEXT: str [[REG1]],
; CHECK-NEXT: ret
%0 = load i64* %y, align 8
@@ -89,7 +90,7 @@ define void @fct5(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct5:
; CHECK: ldr [[REG1:w[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3
; CHECK-NEXT: str [[REG1]],
; CHECK-NEXT: ret
%0 = load i32* %y, align 8
@@ -106,7 +107,7 @@ define void @fct6(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct6:
; CHECK: ldr [[REG1:w[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3
; lsr is an alias of ubfm
; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #2
; CHECK-NEXT: str [[REG2]],
@@ -127,7 +128,7 @@ define void @fct7(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct7:
; CHECK: ldr [[REG1:w[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3
; lsl is an alias of ubfm
; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2
; CHECK-NEXT: str [[REG2]],
@@ -149,7 +150,7 @@ define void @fct8(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct8:
; CHECK: ldr [[REG1:x[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3
; lsr is an alias of ubfm
; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #2
; CHECK-NEXT: str [[REG2]],
@@ -171,7 +172,7 @@ define void @fct9(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct9:
; CHECK: ldr [[REG1:x[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3
; lsr is an alias of ubfm
; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2
; CHECK-NEXT: str [[REG2]],
@@ -192,7 +193,7 @@ define void @fct10(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct10:
; CHECK: ldr [[REG1:w[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], w1, #0, #2
+; CHECK-NEXT: bfxil [[REG1]], w1, #0, #3
; lsl is an alias of ubfm
; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2
; CHECK-NEXT: str [[REG2]],
@@ -212,7 +213,7 @@ define void @fct11(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct11:
; CHECK: ldr [[REG1:x[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], x1, #0, #2
+; CHECK-NEXT: bfxil [[REG1]], x1, #0, #3
; lsl is an alias of ubfm
; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2
; CHECK-NEXT: str [[REG2]],
@@ -229,7 +230,7 @@ entry:
define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 {
; CHECK-LABEL: fct12bis:
; CHECK-NOT: and
-; CHECK: ubfm w0, w0, #11, #11
+; CHECK: ubfx w0, w0, #11, #1
%and.i.i = and i32 %tmp2, 2048
%tobool.i.i = icmp ne i32 %and.i.i, 0
ret i1 %tobool.i.i
@@ -241,9 +242,9 @@ define void @fct12(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct12:
; CHECK: ldr [[REG1:w[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], w1, #16, #18
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3
; lsr is an alias of ubfm
-; CHECK-NEXT: ubfm [[REG2:w[0-9]+]], [[REG1]], #2, #29
+; CHECK-NEXT: ubfx [[REG2:w[0-9]+]], [[REG1]], #2, #28
; CHECK-NEXT: str [[REG2]],
; CHECK-NEXT: ret
%0 = load i32* %y, align 8
@@ -264,9 +265,9 @@ define void @fct13(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
entry:
; CHECK-LABEL: fct13:
; CHECK: ldr [[REG1:x[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], x1, #16, #18
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3
; lsr is an alias of ubfm
-; CHECK-NEXT: ubfm [[REG2:x[0-9]+]], [[REG1]], #2, #61
+; CHECK-NEXT: ubfx [[REG2:x[0-9]+]], [[REG1]], #2, #60
; CHECK-NEXT: str [[REG2]],
; CHECK-NEXT: ret
%0 = load i64* %y, align 8
@@ -287,10 +288,10 @@ define void @fct14(i32* nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehi
entry:
; CHECK-LABEL: fct14:
; CHECK: ldr [[REG1:w[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], w1, #16, #23
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #8
; lsr is an alias of ubfm
; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #4
-; CHECK-NEXT: bfm [[REG2]], w2, #5, #7
+; CHECK-NEXT: bfxil [[REG2]], w2, #5, #3
; lsl is an alias of ubfm
; CHECK-NEXT: lsl [[REG3:w[0-9]+]], [[REG2]], #2
; CHECK-NEXT: str [[REG3]],
@@ -317,10 +318,10 @@ define void @fct15(i64* nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehi
entry:
; CHECK-LABEL: fct15:
; CHECK: ldr [[REG1:x[0-9]+]],
-; CHECK-NEXT: bfm [[REG1]], x1, #16, #23
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #8
; lsr is an alias of ubfm
; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #4
-; CHECK-NEXT: bfm [[REG2]], x2, #5, #7
+; CHECK-NEXT: bfxil [[REG2]], x2, #5, #3
; lsl is an alias of ubfm
; CHECK-NEXT: lsl [[REG3:x[0-9]+]], [[REG2]], #2
; CHECK-NEXT: str [[REG3]],
@@ -347,13 +348,13 @@ entry:
; CHECK-LABEL: fct16:
; CHECK: ldr [[REG1:w[0-9]+]],
; Create the constant
-; CHECK: movz [[REGCST:w[0-9]+]], #26, lsl #16
-; CHECK: movk [[REGCST]], #33120
+; CHECK: movz [[REGCST:w[0-9]+]], #0x1a, lsl #16
+; CHECK: movk [[REGCST]], #0x8160
; Do the masking
; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]]
-; CHECK-NEXT: bfm [[REG2]], w1, #16, #18
+; CHECK-NEXT: bfxil [[REG2]], w1, #16, #3
; lsr is an alias of ubfm
-; CHECK-NEXT: ubfm [[REG3:w[0-9]+]], [[REG2]], #2, #29
+; CHECK-NEXT: ubfx [[REG3:w[0-9]+]], [[REG2]], #2, #28
; CHECK-NEXT: str [[REG3]],
; CHECK-NEXT: ret
%0 = load i32* %y, align 8
@@ -376,13 +377,13 @@ entry:
; CHECK-LABEL: fct17:
; CHECK: ldr [[REG1:x[0-9]+]],
; Create the constant
-; CHECK: movz [[REGCST:x[0-9]+]], #26, lsl #16
-; CHECK: movk [[REGCST]], #33120
+; CHECK: movz w[[REGCST:[0-9]+]], #0x1a, lsl #16
+; CHECK: movk w[[REGCST]], #0x8160
; Do the masking
-; CHECK: and [[REG2:x[0-9]+]], [[REG1]], [[REGCST]]
-; CHECK-NEXT: bfm [[REG2]], x1, #16, #18
+; CHECK: and [[REG2:x[0-9]+]], [[REG1]], x[[REGCST]]
+; CHECK-NEXT: bfxil [[REG2]], x1, #16, #3
; lsr is an alias of ubfm
-; CHECK-NEXT: ubfm [[REG3:x[0-9]+]], [[REG2]], #2, #61
+; CHECK-NEXT: ubfx [[REG3:x[0-9]+]], [[REG2]], #2, #60
; CHECK-NEXT: str [[REG3]],
; CHECK-NEXT: ret
%0 = load i64* %y, align 8
@@ -398,9 +399,134 @@ entry:
define i64 @fct18(i32 %xor72) nounwind ssp {
; CHECK-LABEL: fct18:
-; CHECK: ubfm x0, x0, #9, #16
+; CHECK: ubfx x0, x0, #9, #8
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%result = and i64 %conv82, 255
ret i64 %result
}
+
+; The accesses to the global array keep the instructions and control flow from being optimized away.
+@first_ones = external global [65536 x i8]
+
+; Function Attrs: nounwind readonly ssp
+define i32 @fct19(i64 %arg1) nounwind readonly ssp {
+; CHECK-LABEL: fct19:
+entry:
+ %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
+ %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
+ %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
+ %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
+ %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift
+ %0 = load i8* %arrayidx3, align 1
+ %conv = zext i8 %0 to i32
+ br label %return
+
+; OPT-LABEL: if.end
+if.end: ; preds = %entry
+; OPT: lshr
+; CHECK: ubfx [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #16
+ %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
+ %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
+; CHECK: cbz
+ br i1 %tobool6, label %if.end13, label %if.then7
+
+; OPT-LABEL: if.then7
+if.then7: ; preds = %if.end
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
+; So neither of them should be in the assemble code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+ %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
+ %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10
+ %1 = load i8* %arrayidx11, align 1
+ %conv12 = zext i8 %1 to i32
+ %add = add nsw i32 %conv12, 16
+ br label %return
+
+; OPT-LABEL: if.end13
+if.end13: ; preds = %if.end
+; OPT: lshr
+; OPT: trunc
+; CHECK: ubfx [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #16
+ %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
+; CHECK: cbz
+ br i1 %tobool16, label %return, label %if.then17
+
+; OPT-LABEL: if.then17
+if.then17: ; preds = %if.end13
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
+; So neither of them should be in the assemble code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+ %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
+ %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20
+ %2 = load i8* %arrayidx21, align 1
+ %conv22 = zext i8 %2 to i32
+ %add23 = add nsw i32 %conv22, 32
+ br label %return
+
+return: ; preds = %if.end13, %if.then17, %if.then7, %if.then
+; CHECK: ret
+ %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ]
+ ret i32 %retval.0
+}
+
+; Make sure we do not assert if the immediate in the 'and' is wider than an i64.
+; PR19503.
+; OPT-LABEL: @fct20
+; OPT: lshr
+; OPT-NOT: lshr
+; OPT: ret
+; CHECK-LABEL: fct20:
+; CHECK: ret
+define i80 @fct20(i128 %a, i128 %b) {
+entry:
+ %shr = lshr i128 %a, 18
+ %conv = trunc i128 %shr to i80
+ %tobool = icmp eq i128 %b, 0
+ br i1 %tobool, label %then, label %end
+then:
+ %and = and i128 %shr, 483673642326615442599424
+ %conv2 = trunc i128 %and to i80
+ br label %end
+end:
+ %conv3 = phi i80 [%conv, %entry], [%conv2, %then]
+ ret i80 %conv3
+}
+
+; Check if we can still catch UBFX when "AND" is used by SHL.
+; CHECK-LABEL: fct21:
+; CHECK: ubfx
+@arr = external global [8 x [64 x i64]]
+define i64 @fct21(i64 %x) {
+entry:
+ %shr = lshr i64 %x, 4
+ %and = and i64 %shr, 15
+ %arrayidx = getelementptr inbounds [8 x [64 x i64]]* @arr, i64 0, i64 0, i64 %and
+ %0 = load i64* %arrayidx, align 8
+ ret i64 %0
+}
+
+define i16 @test_ignored_rightbits(i32 %dst, i32 %in) {
+; CHECK-LABEL: test_ignored_rightbits:
+
+ %positioned_field = shl i32 %in, 3
+ %positioned_masked_field = and i32 %positioned_field, 120
+ %masked_dst = and i32 %dst, 7
+ %insertion = or i32 %masked_dst, %positioned_masked_field
+; CHECK: {{bfm|bfi|bfxil}}
+
+ %shl16 = shl i32 %insertion, 8
+ %or18 = or i32 %shl16, %insertion
+ %conv19 = trunc i32 %or18 to i16
+; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #8, #7
+
+ ret i16 %conv19
+}
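; The new mnemonics above are the preferred aliases of the bfm/ubfm/sbfm forms
; they replace; assuming the usual A64 rule that, for immS >= immR, the alias
; takes <lsb> = immR and <width> = immS - immR + 1, the rewritten CHECK lines
; line up with the old ones, e.g.:
;   sbfm w0, w0, #0, #3     <->  sbfx  w0, w0, #0, #4      (3 - 0 + 1)
;   ubfm x0, x0, #9, #16    <->  ubfx  x0, x0, #9, #8      (16 - 9 + 1)
;   bfm  xN, x1, #16, #39   <->  bfxil xN, x1, #16, #24    (39 - 16 + 1)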
diff --git a/test/CodeGen/ARM64/blockaddress.ll b/test/CodeGen/AArch64/arm64-blockaddress.ll
index ac4f19e..ac4f19e 100644
--- a/test/CodeGen/ARM64/blockaddress.ll
+++ b/test/CodeGen/AArch64/arm64-blockaddress.ll
diff --git a/test/CodeGen/ARM64/build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll
index 1d137ae..c109263 100644
--- a/test/CodeGen/ARM64/build-vector.ll
+++ b/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
; Check that building up a vector w/ only one non-zero lane initializes
; intelligently.
@@ -6,7 +6,7 @@ define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind {
; CHECK-LABEL: one_lane:
; CHECK: dup.16b v[[REG:[0-9]+]], wzr
; CHECK-NEXT: ins.b v[[REG]][0], w1
-; v and q are aliases, and str is prefered against st.16b when possible
+; v and q are aliases, and str is preferred against st.16b when possible
; rdar://11246289
; CHECK: str q[[REG]], [x0]
; CHECK: ret
diff --git a/test/CodeGen/ARM64/call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
index 487c1d9..487c1d9 100644
--- a/test/CodeGen/ARM64/call-tailcalls.ll
+++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
diff --git a/test/CodeGen/ARM64/cast-opt.ll b/test/CodeGen/AArch64/arm64-cast-opt.ll
index 3d7f257..65a871d 100644
--- a/test/CodeGen/ARM64/cast-opt.ll
+++ b/test/CodeGen/AArch64/arm64-cast-opt.ll
@@ -7,7 +7,7 @@
define zeroext i8 @foo(i32 %i1, i32 %i2) {
; CHECK-LABEL: foo:
-; CHECK: csinc
+; CHECK: cset
; CHECK-NOT: and
entry:
%idxprom = sext i32 %i1 to i64
diff --git a/test/CodeGen/ARM64/ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
index 5575997..664a26c 100644
--- a/test/CodeGen/ARM64/ccmp-heuristics.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp | FileCheck %s
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp | FileCheck %s
target triple = "arm64-apple-ios7.0.0"
@channelColumns = external global i64
diff --git a/test/CodeGen/ARM64/ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll
index 79e6f94..63965f9 100644
--- a/test/CodeGen/ARM64/ccmp.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp -arm64-stress-ccmp | FileCheck %s
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp -aarch64-stress-ccmp | FileCheck %s
target triple = "arm64-apple-ios"
; CHECK: single_same
diff --git a/test/CodeGen/AArch64/arm64-clrsb.ll b/test/CodeGen/AArch64/arm64-clrsb.ll
new file mode 100644
index 0000000..042e52e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-clrsb.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ctlz.i32(i32, i1) #0
+declare i64 @llvm.ctlz.i64(i64, i1) #1
+
+; Function Attrs: nounwind ssp
+define i32 @clrsb32(i32 %x) #2 {
+entry:
+ %shr = ashr i32 %x, 31
+ %xor = xor i32 %shr, %x
+ %mul = shl i32 %xor, 1
+ %add = or i32 %mul, 1
+ %0 = tail call i32 @llvm.ctlz.i32(i32 %add, i1 false)
+
+ ret i32 %0
+; CHECK-LABEL: clrsb32
+; CHECK: cls [[TEMP:w[0-9]+]], [[TEMP]]
+}
+
+; Function Attrs: nounwind ssp
+define i64 @clrsb64(i64 %x) #3 {
+entry:
+ %shr = ashr i64 %x, 63
+ %xor = xor i64 %shr, %x
+ %mul = shl nsw i64 %xor, 1
+ %add = or i64 %mul, 1
+ %0 = tail call i64 @llvm.ctlz.i64(i64 %add, i1 false)
+
+ ret i64 %0
+; CHECK-LABEL: clrsb64
+; CHECK: cls [[TEMP:x[0-9]+]], [[TEMP]]
+}
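; A sketch of why a single cls suffices here, assuming the usual definition
; clrsb(x) = clz(x ^ (x >> 31)) - 1 (count of leading redundant sign bits):
;   %xor = x ^ (x >> 31)             ; its top bit is always clear
;   clz((%xor << 1) | 1) = clz(%xor) - 1 when %xor != 0, and 31 when %xor == 0
; so the ctlz of %add computes clrsb(x), which is exactly what cls returns
; (the i64 variant is the same with 63 in place of 31).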
diff --git a/test/CodeGen/ARM64/coalesce-ext.ll b/test/CodeGen/AArch64/arm64-coalesce-ext.ll
index 9e8d08e..9420bf3 100644
--- a/test/CodeGen/ARM64/coalesce-ext.ll
+++ b/test/CodeGen/AArch64/arm64-coalesce-ext.ll
@@ -7,7 +7,7 @@ define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
%D = trunc i64 %C to i32
%E = shl i64 %C, 32
%F = ashr i64 %E, 32
- ; CHECK: sxtw x[[EXT:[0-9]+]], x[[SUM]]
+ ; CHECK: sxtw x[[EXT:[0-9]+]], w[[SUM]]
store volatile i64 %F, i64 *%P2
; CHECK: str x[[EXT]]
store volatile i32 %D, i32* %P
diff --git a/test/CodeGen/ARM64/code-model-large-abs.ll b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll
index 264da2d..264da2d 100644
--- a/test/CodeGen/ARM64/code-model-large-abs.ll
+++ b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll
diff --git a/test/CodeGen/ARM64/collect-loh-garbage-crash.ll b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll
index 98cb625..81cee38 100644
--- a/test/CodeGen/ARM64/collect-loh-garbage-crash.ll
+++ b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios -O3 -arm64-collect-loh -arm64-collect-loh-bb-only=true -arm64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios -O3 -aarch64-collect-loh -aarch64-collect-loh-bb-only=true -aarch64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s
; Check that the LOH analysis does not crash when the analysed chain
; contains instructions that are filtered out.
;
diff --git a/test/CodeGen/ARM64/collect-loh-str.ll b/test/CodeGen/AArch64/arm64-collect-loh-str.ll
index fc63f8b..d7bc00e 100644
--- a/test/CodeGen/ARM64/collect-loh-str.ll
+++ b/test/CodeGen/AArch64/arm64-collect-loh-str.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s
; Test case for <rdar://problem/15942912>.
; AdrpAddStr cannot be used when the store uses the same
; register as address and value. Indeed, the related
diff --git a/test/CodeGen/ARM64/collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll
index 08ab062..6d73daa 100644
--- a/test/CodeGen/ARM64/collect-loh.ll
+++ b/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -1,4 +1,10 @@
-; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF
+
+; CHECK-ELF-NOT: .loh
+; CHECK-ELF-NOT: AdrpAdrp
+; CHECK-ELF-NOT: AdrpAdd
+; CHECK-ELF-NOT: AdrpLdrGot
@a = internal unnamed_addr global i32 0, align 4
@b = external global i32
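; Context for the CHECK-ELF-NOT lines: .loh records Linker Optimization Hints
; (AdrpAdrp, AdrpAdd, AdrpLdrGot, ...), which are aimed at the Darwin linker,
; so the new arm64-linux-gnu run expects none to be emitted; on Mach-O they
; would appear roughly as
;   .loh AdrpAdd Lloh0, Lloh1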
diff --git a/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll
new file mode 100644
index 0000000..f65b116
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s
+
+; The DAG combiner decided to use a vector load/store for this struct copy
+; previously. This probably shouldn't happen without NEON, but the most
+; important thing is that it compiles.
+
+define void @store_combine() nounwind {
+ %src = alloca { double, double }, align 8
+ %dst = alloca { double, double }, align 8
+
+ %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0
+ %src.real = load double* %src.realp
+ %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1
+ %src.imag = load double* %src.imagp
+
+ %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0
+ %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1
+ store double %src.real, double* %dst.realp
+ store double %src.imag, double* %dst.imagp
+ ret void
+}
diff --git a/test/CodeGen/ARM64/complex-ret.ll b/test/CodeGen/AArch64/arm64-complex-ret.ll
index 93d50a5..93d50a5 100644
--- a/test/CodeGen/ARM64/complex-ret.ll
+++ b/test/CodeGen/AArch64/arm64-complex-ret.ll
diff --git a/test/CodeGen/AArch64/arm64-const-addr.ll b/test/CodeGen/AArch64/arm64-const-addr.ll
new file mode 100644
index 0000000..c55a922
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-const-addr.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-darwin-unknown < %s | FileCheck %s
+
+%T = type { i32, i32, i32, i32 }
+
+; Test that the constant base address is materialized only once.
+define i32 @test1() nounwind {
+; CHECK-LABEL: test1
+; CHECK: movz w8, #0x40f, lsl #16
+; CHECK-NEXT: movk w8, #0xc000
+; CHECK-NEXT: ldp w9, w10, [x8, #4]
+; CHECK: ldr w8, [x8, #12]
+ %at = inttoptr i64 68141056 to %T*
+ %o1 = getelementptr %T* %at, i32 0, i32 1
+ %t1 = load i32* %o1
+ %o2 = getelementptr %T* %at, i32 0, i32 2
+ %t2 = load i32* %o2
+ %a1 = add i32 %t1, %t2
+ %o3 = getelementptr %T* %at, i32 0, i32 3
+ %t3 = load i32* %o3
+ %a2 = add i32 %a1, %t3
+ ret i32 %a2
+}
+
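; A quick check of the constant materialization above:
;   0x040f << 16        = 0x040f0000
;   0x040f0000 | 0xc000 = 0x040fc000 = 68141056, the inttoptr constant,
; and the field loads then reuse x8: ldp at offsets 4 and 8, ldr at offset 12.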
diff --git a/test/CodeGen/ARM64/convert-v2f64-v2i32.ll b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
index 1a07c98..d862b1e 100644
--- a/test/CodeGen/ARM64/convert-v2f64-v2i32.ll
+++ b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
; CHECK: fptosi_1
; CHECK: fcvtzs.2d
diff --git a/test/CodeGen/ARM64/convert-v2i32-v2f64.ll b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
index 63129a4..daaf1e0 100644
--- a/test/CodeGen/ARM64/convert-v2i32-v2f64.ll
+++ b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <2 x double> @f1(<2 x i32> %v) nounwind readnone {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM64/copy-tuple.ll b/test/CodeGen/AArch64/arm64-copy-tuple.ll
index 6325c3f..1803787 100644
--- a/test/CodeGen/ARM64/copy-tuple.ll
+++ b/test/CodeGen/AArch64/arm64-copy-tuple.ll
@@ -9,138 +9,138 @@
define void @test_D1D2_from_D0D1(i8* %addr) #0 {
; CHECK-LABEL: test_D1D2_from_D0D1:
-; CHECK: orr.8b v2, v1
-; CHECK: orr.8b v1, v0
+; CHECK: mov.8b v2, v1
+; CHECK: mov.8b v1, v0
entry:
%addr_v8i8 = bitcast i8* %addr to <8 x i8>*
- %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
%vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
%vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
ret void
}
define void @test_D0D1_from_D1D2(i8* %addr) #0 {
; CHECK-LABEL: test_D0D1_from_D1D2:
-; CHECK: orr.8b v0, v1
-; CHECK: orr.8b v1, v2
+; CHECK: mov.8b v0, v1
+; CHECK: mov.8b v1, v2
entry:
%addr_v8i8 = bitcast i8* %addr to <8 x i8>*
- %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
%vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
%vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
ret void
}
define void @test_D0D1_from_D31D0(i8* %addr) #0 {
; CHECK-LABEL: test_D0D1_from_D31D0:
-; CHECK: orr.8b v1, v0
-; CHECK: orr.8b v0, v31
+; CHECK: mov.8b v1, v0
+; CHECK: mov.8b v0, v31
entry:
%addr_v8i8 = bitcast i8* %addr to <8 x i8>*
- %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
%vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
%vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
- tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
ret void
}
define void @test_D31D0_from_D0D1(i8* %addr) #0 {
; CHECK-LABEL: test_D31D0_from_D0D1:
-; CHECK: orr.8b v31, v0
-; CHECK: orr.8b v0, v1
+; CHECK: mov.8b v31, v0
+; CHECK: mov.8b v0, v1
entry:
%addr_v8i8 = bitcast i8* %addr to <8 x i8>*
- %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
%vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
%vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
- tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
ret void
}
define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 {
; CHECK-LABEL: test_D2D3D4_from_D0D1D2:
-; CHECK: orr.8b v4, v2
-; CHECK: orr.8b v3, v1
-; CHECK: orr.8b v2, v0
+; CHECK: mov.8b v4, v2
+; CHECK: mov.8b v3, v1
+; CHECK: mov.8b v2, v0
entry:
%addr_v8i8 = bitcast i8* %addr to <8 x i8>*
- %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
%vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0
%vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1
%vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2
tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+ tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+ tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
ret void
}
define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 {
; CHECK-LABEL: test_Q0Q1Q2_from_Q1Q2Q3:
-; CHECK: orr.16b v0, v1
-; CHECK: orr.16b v1, v2
-; CHECK: orr.16b v2, v3
+; CHECK: mov.16b v0, v1
+; CHECK: mov.16b v1, v2
+; CHECK: mov.16b v2, v3
entry:
%addr_v16i8 = bitcast i8* %addr to <16 x i8>*
- %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+ %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
%vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
%vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
%vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
tail call void asm sideeffect "", "~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+ tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+ tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
ret void
}
define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 {
; CHECK-LABEL: test_Q1Q2Q3Q4_from_Q30Q31Q0Q1:
-; CHECK: orr.16b v4, v1
-; CHECK: orr.16b v3, v0
-; CHECK: orr.16b v2, v31
-; CHECK: orr.16b v1, v30
+; CHECK: mov.16b v4, v1
+; CHECK: mov.16b v3, v0
+; CHECK: mov.16b v2, v31
+; CHECK: mov.16b v1, v30
%addr_v16i8 = bitcast i8* %addr to <16 x i8>*
- %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+ %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
%vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
%vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
%vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
%vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3
tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"()
- tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+ tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
- tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+ tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
ret void
}
-declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>*)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>*)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>*)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>*)
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>*)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>*)
-declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
-declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
-declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
-declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
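; The orr -> mov updates above appear to be purely an aliasing change in how
; the vector register copies are printed: mov Vd.T, Vn.T is the preferred
; alias of orr Vd.T, Vn.T, Vn.T (both sources equal), so in the Apple syntax
; used here, e.g.
;   orr.8b v2, v1, v1   is now written   mov.8b v2, v1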
diff --git a/test/CodeGen/ARM64/crc32.ll b/test/CodeGen/AArch64/arm64-crc32.ll
index 609eb44..d3099e6 100644
--- a/test/CodeGen/ARM64/crc32.ll
+++ b/test/CodeGen/AArch64/arm64-crc32.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=arm64 -o - %s | FileCheck %s
+; RUN: llc -march=arm64 -mattr=+crc -o - %s | FileCheck %s
define i32 @test_crc32b(i32 %cur, i8 %next) {
; CHECK-LABEL: test_crc32b:
; CHECK: crc32b w0, w0, w1
%bits = zext i8 %next to i32
- %val = call i32 @llvm.arm64.crc32b(i32 %cur, i32 %bits)
+ %val = call i32 @llvm.aarch64.crc32b(i32 %cur, i32 %bits)
ret i32 %val
}
@@ -12,21 +12,21 @@ define i32 @test_crc32h(i32 %cur, i16 %next) {
; CHECK-LABEL: test_crc32h:
; CHECK: crc32h w0, w0, w1
%bits = zext i16 %next to i32
- %val = call i32 @llvm.arm64.crc32h(i32 %cur, i32 %bits)
+ %val = call i32 @llvm.aarch64.crc32h(i32 %cur, i32 %bits)
ret i32 %val
}
define i32 @test_crc32w(i32 %cur, i32 %next) {
; CHECK-LABEL: test_crc32w:
; CHECK: crc32w w0, w0, w1
- %val = call i32 @llvm.arm64.crc32w(i32 %cur, i32 %next)
+ %val = call i32 @llvm.aarch64.crc32w(i32 %cur, i32 %next)
ret i32 %val
}
define i32 @test_crc32x(i32 %cur, i64 %next) {
; CHECK-LABEL: test_crc32x:
; CHECK: crc32x w0, w0, x1
- %val = call i32 @llvm.arm64.crc32x(i32 %cur, i64 %next)
+ %val = call i32 @llvm.aarch64.crc32x(i32 %cur, i64 %next)
ret i32 %val
}
@@ -34,7 +34,7 @@ define i32 @test_crc32cb(i32 %cur, i8 %next) {
; CHECK-LABEL: test_crc32cb:
; CHECK: crc32cb w0, w0, w1
%bits = zext i8 %next to i32
- %val = call i32 @llvm.arm64.crc32cb(i32 %cur, i32 %bits)
+ %val = call i32 @llvm.aarch64.crc32cb(i32 %cur, i32 %bits)
ret i32 %val
}
@@ -42,30 +42,30 @@ define i32 @test_crc32ch(i32 %cur, i16 %next) {
; CHECK-LABEL: test_crc32ch:
; CHECK: crc32ch w0, w0, w1
%bits = zext i16 %next to i32
- %val = call i32 @llvm.arm64.crc32ch(i32 %cur, i32 %bits)
+ %val = call i32 @llvm.aarch64.crc32ch(i32 %cur, i32 %bits)
ret i32 %val
}
define i32 @test_crc32cw(i32 %cur, i32 %next) {
; CHECK-LABEL: test_crc32cw:
; CHECK: crc32cw w0, w0, w1
- %val = call i32 @llvm.arm64.crc32cw(i32 %cur, i32 %next)
+ %val = call i32 @llvm.aarch64.crc32cw(i32 %cur, i32 %next)
ret i32 %val
}
define i32 @test_crc32cx(i32 %cur, i64 %next) {
; CHECK-LABEL: test_crc32cx:
; CHECK: crc32cx w0, w0, x1
- %val = call i32 @llvm.arm64.crc32cx(i32 %cur, i64 %next)
+ %val = call i32 @llvm.aarch64.crc32cx(i32 %cur, i64 %next)
ret i32 %val
}
-declare i32 @llvm.arm64.crc32b(i32, i32)
-declare i32 @llvm.arm64.crc32h(i32, i32)
-declare i32 @llvm.arm64.crc32w(i32, i32)
-declare i32 @llvm.arm64.crc32x(i32, i64)
+declare i32 @llvm.aarch64.crc32b(i32, i32)
+declare i32 @llvm.aarch64.crc32h(i32, i32)
+declare i32 @llvm.aarch64.crc32w(i32, i32)
+declare i32 @llvm.aarch64.crc32x(i32, i64)
-declare i32 @llvm.arm64.crc32cb(i32, i32)
-declare i32 @llvm.arm64.crc32ch(i32, i32)
-declare i32 @llvm.arm64.crc32cw(i32, i32)
-declare i32 @llvm.arm64.crc32cx(i32, i64)
+declare i32 @llvm.aarch64.crc32cb(i32, i32)
+declare i32 @llvm.aarch64.crc32ch(i32, i32)
+declare i32 @llvm.aarch64.crc32cw(i32, i32)
+declare i32 @llvm.aarch64.crc32cx(i32, i64)
diff --git a/test/CodeGen/ARM64/crypto.ll b/test/CodeGen/AArch64/arm64-crypto.ll
index 3804310..2908b33 100644
--- a/test/CodeGen/ARM64/crypto.ll
+++ b/test/CodeGen/AArch64/arm64-crypto.ll
@@ -1,50 +1,50 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s | FileCheck %s
+; RUN: llc -march=arm64 -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s
-declare <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
-declare <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
-declare <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data)
-declare <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data)
+declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data)
+declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data)
define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) {
; CHECK-LABEL: test_aese:
; CHECK: aese.16b v0, v1
- %res = call <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+ %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
ret <16 x i8> %res
}
define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) {
; CHECK-LABEL: test_aesd:
; CHECK: aesd.16b v0, v1
- %res = call <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+ %res = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
ret <16 x i8> %res
}
define <16 x i8> @test_aesmc(<16 x i8> %data) {
; CHECK-LABEL: test_aesmc:
; CHECK: aesmc.16b v0, v0
- %res = call <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data)
+ %res = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data)
ret <16 x i8> %res
}
define <16 x i8> @test_aesimc(<16 x i8> %data) {
; CHECK-LABEL: test_aesimc:
; CHECK: aesimc.16b v0, v0
- %res = call <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data)
+ %res = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data)
ret <16 x i8> %res
}
-declare <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-declare <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-declare <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-declare i32 @llvm.arm64.crypto.sha1h(i32 %hash_e)
-declare <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
-declare <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+declare <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e)
+declare <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+declare <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
; CHECK-LABEL: test_sha1c:
; CHECK: fmov [[HASH_E:s[0-9]+]], w0
; CHECK: sha1c.4s q0, [[HASH_E]], v1
- %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
ret <4 x i32> %res
}
@@ -55,9 +55,9 @@ define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i3
; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1
; CHECK-NOT: fmov
; CHECK: sha1c.4s q0, s[[SHA1RES]], v1
- %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
%extract = extractelement <4 x i32> %res, i32 0
- %res2 = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk)
+ %res2 = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk)
ret <4 x i32> %res2
}
@@ -65,7 +65,7 @@ define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
; CHECK-LABEL: test_sha1p:
; CHECK: fmov [[HASH_E:s[0-9]+]], w0
; CHECK: sha1p.4s q0, [[HASH_E]], v1
- %res = call <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
ret <4 x i32> %res
}
@@ -73,7 +73,7 @@ define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
; CHECK-LABEL: test_sha1m:
; CHECK: fmov [[HASH_E:s[0-9]+]], w0
; CHECK: sha1m.4s q0, [[HASH_E]], v1
- %res = call <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
ret <4 x i32> %res
}
@@ -82,33 +82,33 @@ define i32 @test_sha1h(i32 %hash_e) {
; CHECK: fmov [[HASH_E:s[0-9]+]], w0
; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]]
; CHECK: fmov w0, [[RES]]
- %res = call i32 @llvm.arm64.crypto.sha1h(i32 %hash_e)
+ %res = call i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e)
ret i32 %res
}
define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) {
; CHECK-LABEL: test_sha1su0:
; CHECK: sha1su0.4s v0, v1, v2
- %res = call <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
ret <4 x i32> %res
}
define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) {
; CHECK-LABEL: test_sha1su1:
; CHECK: sha1su1.4s v0, v1
- %res = call <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
ret <4 x i32> %res
}
-declare <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
-declare <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
-declare <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
-declare <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+declare <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+declare <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
; CHECK-LABEL: test_sha256h:
; CHECK: sha256h.4s q0, q1, v2
- %res = call <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
ret <4 x i32> %res
}
@@ -116,20 +116,20 @@ define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x
; CHECK-LABEL: test_sha256h2:
; CHECK: sha256h2.4s q0, q1, v2
- %res = call <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
ret <4 x i32> %res
}
define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) {
; CHECK-LABEL: test_sha256su0:
; CHECK: sha256su0.4s v0, v1
- %res = call <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
ret <4 x i32> %res
}
define <4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
; CHECK-LABEL: test_sha256su1:
; CHECK: sha256su1.4s v0, v1, v2
- %res = call <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
ret <4 x i32> %res
}
diff --git a/test/CodeGen/ARM64/cse.ll b/test/CodeGen/AArch64/arm64-cse.ll
index d98bfd6..bb14c89 100644
--- a/test/CodeGen/ARM64/cse.ll
+++ b/test/CodeGen/AArch64/arm64-cse.ll
@@ -13,7 +13,7 @@ entry:
; CHECK: b.ge
; CHECK: sub
; CHECK: sub
-; CHECK_NOT: sub
+; CHECK-NOT: sub
; CHECK: ret
%0 = load i32* %offset, align 4
%cmp = icmp slt i32 %0, %size
diff --git a/test/CodeGen/ARM64/csel.ll b/test/CodeGen/AArch64/arm64-csel.ll
index cbf1769..98eba30 100644
--- a/test/CodeGen/ARM64/csel.ll
+++ b/test/CodeGen/AArch64/arm64-csel.ll
@@ -2,9 +2,8 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
target triple = "arm64-unknown-unknown"
-; CHECK: foo1
-; CHECK: csinc w{{[0-9]+}}, w[[REG:[0-9]+]],
-; CHECK: w[[REG]], eq
+; CHECK-LABEL: foo1
+; CHECK: cinc w{{[0-9]+}}, w{{[0-9]+}}, ne
define i32 @foo1(i32 %b, i32 %c) nounwind readnone ssp {
entry:
%not.tobool = icmp ne i32 %c, 0
@@ -14,9 +13,8 @@ entry:
ret i32 %add1
}
-; CHECK: foo2
-; CHECK: csneg w{{[0-9]+}}, w[[REG:[0-9]+]],
-; CHECK: w[[REG]], eq
+; CHECK-LABEL: foo2
+; CHECK: cneg w{{[0-9]+}}, w{{[0-9]+}}, ne
define i32 @foo2(i32 %b, i32 %c) nounwind readnone ssp {
entry:
%mul = sub i32 0, %b
@@ -26,9 +24,8 @@ entry:
ret i32 %add
}
-; CHECK: foo3
-; CHECK: csinv w{{[0-9]+}}, w[[REG:[0-9]+]],
-; CHECK: w[[REG]], eq
+; CHECK-LABEL: foo3
+; CHECK: cinv w{{[0-9]+}}, w{{[0-9]+}}, ne
define i32 @foo3(i32 %b, i32 %c) nounwind readnone ssp {
entry:
%not.tobool = icmp ne i32 %c, 0
@@ -40,8 +37,8 @@ entry:
; rdar://11632325
define i32@foo4(i32 %a) nounwind ssp {
-; CHECK: foo4
-; CHECK: csneg
+; CHECK-LABEL: foo4
+; CHECK: cneg
; CHECK-NEXT: ret
%cmp = icmp sgt i32 %a, -1
%neg = sub nsw i32 0, %a
@@ -51,9 +48,9 @@ define i32@foo4(i32 %a) nounwind ssp {
define i32@foo5(i32 %a, i32 %b) nounwind ssp {
entry:
-; CHECK: foo5
+; CHECK-LABEL: foo5
; CHECK: subs
-; CHECK-NEXT: csneg
+; CHECK-NEXT: cneg
; CHECK-NEXT: ret
%sub = sub nsw i32 %a, %b
%cmp = icmp sgt i32 %sub, -1
@@ -64,7 +61,7 @@ entry:
; make sure we can handle a branch instruction in optimizeCompare.
define i32@foo6(i32 %a, i32 %b) nounwind ssp {
-; CHECK: foo6
+; CHECK-LABEL: foo6
; CHECK: b
%sub = sub nsw i32 %a, %b
%cmp = icmp sgt i32 %sub, 0
@@ -116,7 +113,7 @@ entry:
; CHECK-LABEL: foo9:
; CHECK: cmp w0, #0
; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
-; CHECK: csinv w0, w[[REG]], w[[REG]], ne
+; CHECK: cinv w0, w[[REG]], eq
%tobool = icmp ne i32 %v, 0
%cond = select i1 %tobool, i32 4, i32 -5
ret i32 %cond
@@ -126,8 +123,8 @@ define i64 @foo10(i64 %v) nounwind readnone optsize ssp {
entry:
; CHECK-LABEL: foo10:
; CHECK: cmp x0, #0
-; CHECK: orr x[[REG:[0-9]+]], xzr, #0x4
-; CHECK: csinv x0, x[[REG]], x[[REG]], ne
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cinv x0, x[[REG]], eq
%tobool = icmp ne i64 %v, 0
%cond = select i1 %tobool, i64 4, i64 -5
ret i64 %cond
@@ -138,7 +135,7 @@ entry:
; CHECK-LABEL: foo11:
; CHECK: cmp w0, #0
; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
-; CHECK: csneg w0, w[[REG]], w[[REG]], ne
+; CHECK: cneg w0, w[[REG]], eq
%tobool = icmp ne i32 %v, 0
%cond = select i1 %tobool, i32 4, i32 -4
ret i32 %cond
@@ -148,8 +145,8 @@ define i64 @foo12(i64 %v) nounwind readnone optsize ssp {
entry:
; CHECK-LABEL: foo12:
; CHECK: cmp x0, #0
-; CHECK: orr x[[REG:[0-9]+]], xzr, #0x4
-; CHECK: csneg x0, x[[REG]], x[[REG]], ne
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cneg x0, x[[REG]], eq
%tobool = icmp ne i64 %v, 0
%cond = select i1 %tobool, i64 4, i64 -4
ret i64 %cond
@@ -182,7 +179,7 @@ entry:
; CHECK-LABEL: foo15:
; CHECK: cmp w0, w1
; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
-; CHECK: csinc w0, w[[REG]], w[[REG]], le
+; CHECK: cinc w0, w[[REG]], gt
%cmp = icmp sgt i32 %a, %b
%. = select i1 %cmp, i32 2, i32 1
ret i32 %.
@@ -193,7 +190,7 @@ entry:
; CHECK-LABEL: foo16:
; CHECK: cmp w0, w1
; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
-; CHECK: csinc w0, w[[REG]], w[[REG]], gt
+; CHECK: cinc w0, w[[REG]], le
%cmp = icmp sgt i32 %a, %b
%. = select i1 %cmp, i32 1, i32 2
ret i32 %.
@@ -203,8 +200,8 @@ define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp {
entry:
; CHECK-LABEL: foo17:
; CHECK: cmp x0, x1
-; CHECK: orr x[[REG:[0-9]+]], xzr, #0x1
-; CHECK: csinc x0, x[[REG]], x[[REG]], le
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: cinc x0, x[[REG]], gt
%cmp = icmp sgt i64 %a, %b
%. = select i1 %cmp, i64 2, i64 1
ret i64 %.
@@ -214,9 +211,20 @@ define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp {
entry:
; CHECK-LABEL: foo18:
; CHECK: cmp x0, x1
-; CHECK: orr x[[REG:[0-9]+]], xzr, #0x1
-; CHECK: csinc x0, x[[REG]], x[[REG]], gt
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: cinc x0, x[[REG]], le
%cmp = icmp sgt i64 %a, %b
%. = select i1 %cmp, i64 1, i64 2
ret i64 %.
}
+
+define i64 @foo19(i64 %a, i64 %b, i64 %c) {
+entry:
+; CHECK-LABEL: foo19:
+; CHECK: cinc x0, x2
+; CHECK-NOT: add
+ %cmp = icmp ult i64 %a, %b
+ %inc = zext i1 %cmp to i64
+ %inc.c = add i64 %inc, %c
+ ret i64 %inc.c
+}
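; The cinc/cinv/cneg forms above (and the cset in arm64-cast-opt.ll earlier)
; are aliases of csinc/csinv/csneg with both source registers equal and the
; condition inverted, which is why each rewritten CHECK also flips the
; condition, e.g.:
;   csinc w0, wN, wN, le  <->  cinc w0, wN, gt
;   csinv w0, wN, wN, ne  <->  cinv w0, wN, eq
; (cset is similarly csinc with wzr for both sources.)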
diff --git a/test/CodeGen/ARM64/cvt.ll b/test/CodeGen/AArch64/arm64-cvt.ll
index b55a42f..420a8bc 100644
--- a/test/CodeGen/ARM64/cvt.ll
+++ b/test/CodeGen/AArch64/arm64-cvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
;
; Floating-point scalar convert to signed integer (to nearest with ties to away)
@@ -7,7 +7,7 @@ define i32 @fcvtas_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtas_1w1s:
;CHECK: fcvtas w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A)
ret i32 %tmp3
}
@@ -15,7 +15,7 @@ define i64 @fcvtas_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtas_1x1s:
;CHECK: fcvtas x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A)
ret i64 %tmp3
}
@@ -23,7 +23,7 @@ define i32 @fcvtas_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtas_1w1d:
;CHECK: fcvtas w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A)
ret i32 %tmp3
}
@@ -31,14 +31,14 @@ define i64 @fcvtas_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtas_1x1d:
;CHECK: fcvtas x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtas.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtas.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtas.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtas.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtas.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtas.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtas.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtas.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to unsigned integer
@@ -47,7 +47,7 @@ define i32 @fcvtau_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtau_1w1s:
;CHECK: fcvtau w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %A)
ret i32 %tmp3
}
@@ -55,7 +55,7 @@ define i64 @fcvtau_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtau_1x1s:
;CHECK: fcvtau x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A)
ret i64 %tmp3
}
@@ -63,7 +63,7 @@ define i32 @fcvtau_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtau_1w1d:
;CHECK: fcvtau w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A)
ret i32 %tmp3
}
@@ -71,14 +71,14 @@ define i64 @fcvtau_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtau_1x1d:
;CHECK: fcvtau x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtau.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtau.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtau.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtau.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtau.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtau.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtau.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtau.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to signed integer (toward -Inf)
@@ -87,7 +87,7 @@ define i32 @fcvtms_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtms_1w1s:
;CHECK: fcvtms w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %A)
ret i32 %tmp3
}
@@ -95,7 +95,7 @@ define i64 @fcvtms_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtms_1x1s:
;CHECK: fcvtms x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A)
ret i64 %tmp3
}
@@ -103,7 +103,7 @@ define i32 @fcvtms_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtms_1w1d:
;CHECK: fcvtms w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A)
ret i32 %tmp3
}
@@ -111,14 +111,14 @@ define i64 @fcvtms_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtms_1x1d:
;CHECK: fcvtms x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtms.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtms.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtms.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtms.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtms.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtms.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtms.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtms.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to unsigned integer (toward -Inf)
@@ -127,7 +127,7 @@ define i32 @fcvtmu_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtmu_1w1s:
;CHECK: fcvtmu w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %A)
ret i32 %tmp3
}
@@ -135,7 +135,7 @@ define i64 @fcvtmu_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtmu_1x1s:
;CHECK: fcvtmu x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A)
ret i64 %tmp3
}
@@ -143,7 +143,7 @@ define i32 @fcvtmu_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtmu_1w1d:
;CHECK: fcvtmu w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A)
ret i32 %tmp3
}
@@ -151,14 +151,14 @@ define i64 @fcvtmu_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtmu_1x1d:
;CHECK: fcvtmu x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtmu.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtmu.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtmu.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtmu.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to signed integer (to nearest with ties to even)
@@ -167,7 +167,7 @@ define i32 @fcvtns_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtns_1w1s:
;CHECK: fcvtns w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %A)
ret i32 %tmp3
}
@@ -175,7 +175,7 @@ define i64 @fcvtns_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtns_1x1s:
;CHECK: fcvtns x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A)
ret i64 %tmp3
}
@@ -183,7 +183,7 @@ define i32 @fcvtns_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtns_1w1d:
;CHECK: fcvtns w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A)
ret i32 %tmp3
}
@@ -191,14 +191,14 @@ define i64 @fcvtns_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtns_1x1d:
;CHECK: fcvtns x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtns.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtns.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtns.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtns.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtns.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtns.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtns.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtns.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to unsigned integer (to nearest with ties to even)
@@ -207,7 +207,7 @@ define i32 @fcvtnu_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtnu_1w1s:
;CHECK: fcvtnu w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %A)
ret i32 %tmp3
}
@@ -215,7 +215,7 @@ define i64 @fcvtnu_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtnu_1x1s:
;CHECK: fcvtnu x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A)
ret i64 %tmp3
}
@@ -223,7 +223,7 @@ define i32 @fcvtnu_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtnu_1w1d:
;CHECK: fcvtnu w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A)
ret i32 %tmp3
}
@@ -231,14 +231,14 @@ define i64 @fcvtnu_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtnu_1x1d:
;CHECK: fcvtnu x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtnu.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtnu.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtnu.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtnu.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to signed integer (toward +Inf)
@@ -247,7 +247,7 @@ define i32 @fcvtps_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtps_1w1s:
;CHECK: fcvtps w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %A)
ret i32 %tmp3
}
@@ -255,7 +255,7 @@ define i64 @fcvtps_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtps_1x1s:
;CHECK: fcvtps x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A)
ret i64 %tmp3
}
@@ -263,7 +263,7 @@ define i32 @fcvtps_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtps_1w1d:
;CHECK: fcvtps w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A)
ret i32 %tmp3
}
@@ -271,14 +271,14 @@ define i64 @fcvtps_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtps_1x1d:
;CHECK: fcvtps x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtps.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtps.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtps.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtps.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtps.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtps.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtps.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtps.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to unsigned integer (toward +Inf)
@@ -287,7 +287,7 @@ define i32 @fcvtpu_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtpu_1w1s:
;CHECK: fcvtpu w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %A)
ret i32 %tmp3
}
@@ -295,7 +295,7 @@ define i64 @fcvtpu_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtpu_1x1s:
;CHECK: fcvtpu x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A)
ret i64 %tmp3
}
@@ -303,7 +303,7 @@ define i32 @fcvtpu_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtpu_1w1d:
;CHECK: fcvtpu w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A)
ret i32 %tmp3
}
@@ -311,14 +311,14 @@ define i64 @fcvtpu_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtpu_1x1d:
;CHECK: fcvtpu x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtpu.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtpu.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtpu.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtpu.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to signed integer (toward zero)
@@ -327,7 +327,7 @@ define i32 @fcvtzs_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtzs_1w1s:
;CHECK: fcvtzs w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A)
ret i32 %tmp3
}
@@ -335,7 +335,7 @@ define i64 @fcvtzs_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtzs_1x1s:
;CHECK: fcvtzs x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A)
ret i64 %tmp3
}
@@ -343,7 +343,7 @@ define i32 @fcvtzs_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtzs_1w1d:
;CHECK: fcvtzs w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A)
ret i32 %tmp3
}
@@ -351,14 +351,14 @@ define i64 @fcvtzs_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtzs_1x1d:
;CHECK: fcvtzs x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtzs.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtzs.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtzs.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtzs.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double) nounwind readnone
;
; Floating-point scalar convert to unsigned integer (toward zero)
@@ -367,7 +367,7 @@ define i32 @fcvtzu_1w1s(float %A) nounwind {
;CHECK-LABEL: fcvtzu_1w1s:
;CHECK: fcvtzu w0, s0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f32(float %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %A)
ret i32 %tmp3
}
@@ -375,7 +375,7 @@ define i64 @fcvtzu_1x1s(float %A) nounwind {
;CHECK-LABEL: fcvtzu_1x1s:
;CHECK: fcvtzu x0, s0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f32(float %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A)
ret i64 %tmp3
}
@@ -383,7 +383,7 @@ define i32 @fcvtzu_1w1d(double %A) nounwind {
;CHECK-LABEL: fcvtzu_1w1d:
;CHECK: fcvtzu w0, d0
;CHECK-NEXT: ret
- %tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f64(double %A)
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A)
ret i32 %tmp3
}
@@ -391,11 +391,11 @@ define i64 @fcvtzu_1x1d(double %A) nounwind {
;CHECK-LABEL: fcvtzu_1x1d:
;CHECK: fcvtzu x0, d0
;CHECK-NEXT: ret
- %tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f64(double %A)
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %A)
ret i64 %tmp3
}
-declare i32 @llvm.arm64.neon.fcvtzu.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtzu.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtzu.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtzu.i64.f64(double) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double) nounwind readnone
diff --git a/test/CodeGen/ARM64/dagcombiner-convergence.ll b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll
index a45e313..a45e313 100644
--- a/test/CodeGen/ARM64/dagcombiner-convergence.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
new file mode 100644
index 0000000..2cf0135
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mcpu=cyclone < %s | FileCheck %s
+
+target datalayout = "e-i64:64-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+%"struct.SU" = type { i32, %"struct.SU"*, i32*, i32, i32, %"struct.BO", i32, [5 x i8] }
+%"struct.BO" = type { %"struct.RE" }
+
+%"struct.RE" = type { i32, i32, i32, i32 }
+
+; This is a read-modify-write of some bitfields combined into an i48. It gets
+; legalized into i32 and i16 accesses. Only a single store of zero to the low
+; i32 part should be live.
+
+; CHECK-LABEL: test:
+; CHECK-NOT: ldr
+; CHECK: str wzr
+; CHECK-NOT: str
+define void @test(%"struct.SU"* nocapture %su) {
+entry:
+ %r1 = getelementptr inbounds %"struct.SU"* %su, i64 1, i32 5
+ %r2 = bitcast %"struct.BO"* %r1 to i48*
+ %r3 = load i48* %r2, align 8
+ %r4 = and i48 %r3, -4294967296
+ %r5 = or i48 0, %r4
+ store i48 %r5, i48* %r2, align 8
+
+ ret void
+}
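
For reference, a hypothetical C-level sketch of the bitfield read-modify-write that the new test above models (struct layout and field names are illustrative, not taken from the original source):

  /* Illustrative only: a 48-bit packed bitfield where clearing the 32-bit
     low field should leave a single "str wzr" of the low word live, with
     the high 16 bits preserved via the load/and/or seen in the IR. */
  struct su_bits {
      unsigned int lo : 32;   /* overwritten with zero            */
      unsigned int hi : 16;   /* must be preserved across the RMW */
  };

  static void clear_lo(struct su_bits *p) {
      p->lo = 0;              /* may be widened to an i48 load/mask/store */
  }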
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
new file mode 100644
index 0000000..2e4b658
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
@@ -0,0 +1,46 @@
+; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s
+; Test case for a DAG combiner bug where we combined an indexed load
+; with an extension (sext, zext, or any) into a regular extended load,
+; i.e., dropping the indexed value.
+; <rdar://problem/16389332>
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+%class.A = type { i64, i64 }
+%class.C = type { i64 }
+
+; CHECK-LABEL: XX:
+; CHECK: ldr
+define void @XX(%class.A* %K) {
+entry:
+ br i1 undef, label %if.then, label %lor.rhs.i
+
+lor.rhs.i: ; preds = %entry
+ %tmp = load i32* undef, align 4
+ %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
+ %tmp1 = load i64* %y.i.i.i, align 8
+ %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
+ %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17
+ %add12.i = add nsw i32 0, %div11.i
+ %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32
+ %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32
+ %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13
+ %add16.i = add nsw i32 %add12.i, %div15.i
+ %rem.i.i = srem i32 %add16.i, %tmp
+ %idxprom = sext i32 %rem.i.i to i64
+ %arrayidx = getelementptr inbounds %class.C** undef, i64 %idxprom
+ %tobool533 = icmp eq %class.C* undef, null
+ br i1 %tobool533, label %while.end, label %while.body
+
+if.then: ; preds = %entry
+ unreachable
+
+while.body: ; preds = %lor.rhs.i
+ unreachable
+
+while.end: ; preds = %lor.rhs.i
+ %tmp3 = load %class.C** %arrayidx, align 8
+ unreachable
+}
diff --git a/test/CodeGen/ARM64/dagcombiner-load-slicing.ll b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll
index 0679014..0679014 100644
--- a/test/CodeGen/ARM64/dagcombiner-load-slicing.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll
diff --git a/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll
new file mode 100644
index 0000000..9bb4b71
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test1() #0 {
+ %tmp1 = alloca i8
+ %tmp2 = alloca i32, i32 4096
+ %tmp3 = icmp eq i8* %tmp1, null
+ %tmp4 = zext i1 %tmp3 to i32
+
+ ret i32 %tmp4
+
+ ; CHECK-LABEL: test1
+ ; CHECK: adds [[TEMP:[a-z0-9]+]], sp, #4, lsl #12
+ ; CHECK: adds [[TEMP]], [[TEMP]], #15
+}
diff --git a/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll
new file mode 100644
index 0000000..1bbcf50
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple="arm64-apple-ios" < %s | FileCheck %s
+;
+; Check that the dead register definition pass takes implicit defs into account.
+; When rematerializing through truncates, the coalescer may produce instructions
+; with dead defs, but live implicit-defs of subregs:
+; E.g. %X1<def, dead> = MOVi64imm 2, %W1<imp-def>; %X1:GPR64, %W1:GPR32
+; These instructions are live, and their definitions should not be rewritten.
+;
+; <rdar://problem/16492408>
+
+define void @testcase() {
+; CHECK: testcase:
+; CHECK-NOT: orr xzr, xzr, #0x2
+
+bb1:
+ %tmp1 = tail call float @ceilf(float 2.000000e+00)
+ %tmp2 = fptoui float %tmp1 to i64
+ br i1 undef, label %bb2, label %bb3
+
+bb2:
+ tail call void @foo()
+ br label %bb3
+
+bb3:
+ %tmp3 = trunc i64 %tmp2 to i32
+ tail call void @bar(i32 %tmp3)
+ ret void
+}
+
+declare void @foo()
+declare void @bar(i32)
+declare float @ceilf(float) nounwind readnone
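
As a rough C-level analogue of the IR above (purely illustrative; the symbol names are hypothetical): the 64-bit conversion result is only consumed through a 32-bit truncation, which is the shape that lets the coalescer leave a dead 64-bit def paired with a live 32-bit implicit-def.

  #include <math.h>
  void foo(void);
  void bar(int);

  void testcase(int cond) {
      /* Illustrative: only the low 32 bits of t are ever used. */
      unsigned long long t = (unsigned long long)ceilf(2.0f);
      if (cond)
          foo();
      bar((int)t);
  }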
diff --git a/test/CodeGen/ARM64/dup.ll b/test/CodeGen/AArch64/arm64-dup.ll
index e659575..0c56b46 100644
--- a/test/CodeGen/ARM64/dup.ll
+++ b/test/CodeGen/AArch64/arm64-dup.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
define <8 x i8> @v_dup8(i8 %A) nounwind {
;CHECK-LABEL: v_dup8:
@@ -297,10 +297,11 @@ define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
+;
+; *However*, it is a dup vD.4h, vN.h[2*idx].
define <4 x i16> @test_build_illegal(<4 x i32> %in) {
; CHECK-LABEL: test_build_illegal:
-; CHECK: umov.s [[WTMP:w[0-9]+]], v0[3]
-; CHECK: dup.4h v0, [[WTMP]]
+; CHECK: dup.4h v0, v0[6]
%val = extractelement <4 x i32> %in, i32 3
%smallval = trunc i32 %val to i16
%vec = insertelement <4x i16> undef, i16 %smallval, i32 3
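
The new CHECK line relies on little-endian lane aliasing: the low 16 bits of 32-bit lane i are 16-bit lane 2*i of the same 128-bit register, so extract lane 3, truncate, and splat becomes "dup.4h v0, v0[6]". A hypothetical C check of that aliasing (illustrative only, assumes a little-endian target):

  #include <stdint.h>
  #include <string.h>

  /* Illustrative: halfword lane 2*i overlays the low half of word lane i. */
  static int lane_alias_holds(void) {
      uint32_t w[4] = {0x11112222u, 0x33334444u, 0x55556666u, 0x77778888u};
      uint16_t h[8];
      memcpy(h, w, sizeof h);
      return (uint16_t)w[3] == h[6];   /* 0x8888 == 0x8888 */
  }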
diff --git a/test/CodeGen/ARM64/early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
index a5c1e26..17d783a 100644
--- a/test/CodeGen/ARM64/early-ifcvt.ll
+++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -322,7 +322,7 @@ done:
}
; CHECK: tbnz_32
-; CHECK: {{ands.*xzr,|tst}} x2, #0x80
+; CHECK: {{ands.*xzr,|tst}} w2, #0x80
; CHECK-NEXT: csel w0, w1, w0, ne
; CHECK-NEXT: ret
define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
@@ -358,7 +358,7 @@ done:
}
; CHECK: tbz_32
-; CHECK: {{ands.*xzr,|tst}} x2, #0x80
+; CHECK: {{ands.*xzr,|tst}} w2, #0x80
; CHECK-NEXT: csel w0, w1, w0, eq
; CHECK-NEXT: ret
define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
diff --git a/test/CodeGen/ARM64/elf-calls.ll b/test/CodeGen/AArch64/arm64-elf-calls.ll
index 8c40203..8c40203 100644
--- a/test/CodeGen/ARM64/elf-calls.ll
+++ b/test/CodeGen/AArch64/arm64-elf-calls.ll
diff --git a/test/CodeGen/ARM64/elf-constpool.ll b/test/CodeGen/AArch64/arm64-elf-constpool.ll
index 95d3343..95d3343 100644
--- a/test/CodeGen/ARM64/elf-constpool.ll
+++ b/test/CodeGen/AArch64/arm64-elf-constpool.ll
diff --git a/test/CodeGen/ARM64/elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll
index 598c96a..4ed44e7 100644
--- a/test/CodeGen/ARM64/elf-globals.ll
+++ b/test/CodeGen/AArch64/arm64-elf-globals.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
-; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 | FileCheck %s --check-prefix=CHECK-FAST
-; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-PIC
-; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-FAST-PIC
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s -mcpu=cyclone | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
+; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC
@var8 = external global i8, align 1
@var16 = external global i16, align 2
diff --git a/test/CodeGen/ARM64/ext.ll b/test/CodeGen/AArch64/arm64-ext.ll
index 57d6e0c..67860de 100644
--- a/test/CodeGen/ARM64/ext.ll
+++ b/test/CodeGen/AArch64/arm64-ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: test_vextd:
@@ -65,6 +65,15 @@ define <8 x i8> @test_vextd_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
ret <8 x i8> %tmp3
}
+define <8 x i8> @test_vextd_undef2(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd_undef2:
+;CHECK: {{ext.8b.*#6}}
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5>
+ ret <8 x i8> %tmp3
+}
+
define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: test_vextRq_undef:
;CHECK: {{ext.16b.*#7}}
@@ -74,6 +83,14 @@ define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
ret <16 x i8> %tmp3
}
+define <8 x i16> @test_vextRq_undef2(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vextRq_undef2:
+;CHECK: {{ext.16b.*#10}}
+ %tmp1 = load <8 x i16>* %A
+ %vext = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i16> %vext;
+}
+
; Tests for ReconstructShuffle function. Indices have to be carefully
; chosen to reach lowering phase as a BUILD_VECTOR.
diff --git a/test/CodeGen/ARM64/extend-int-to-fp.ll b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
index 599a697..048fdb0 100644
--- a/test/CodeGen/ARM64/extend-int-to-fp.ll
+++ b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <4 x float> @foo(<4 x i16> %a) nounwind {
; CHECK-LABEL: foo:
diff --git a/test/CodeGen/ARM64/extend.ll b/test/CodeGen/AArch64/arm64-extend.ll
index 4d20543..afcaca2 100644
--- a/test/CodeGen/ARM64/extend.ll
+++ b/test/CodeGen/AArch64/arm64-extend.ll
@@ -5,7 +5,7 @@ define i64 @foo(i32 %i) {
; CHECK: foo
; CHECK: adrp x[[REG:[0-9]+]], _array@GOTPAGE
; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _array@GOTPAGEOFF]
-; CHECK: ldrsw x0, [x[[REG1]], x0, sxtw #2]
+; CHECK: ldrsw x0, [x[[REG1]], w0, sxtw #2]
; CHECK: ret
%idxprom = sext i32 %i to i64
%arrayidx = getelementptr inbounds [0 x i32]* @array, i64 0, i64 %idxprom
diff --git a/test/CodeGen/ARM64/extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
index a239403..a239403 100644
--- a/test/CodeGen/ARM64/extern-weak.ll
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
diff --git a/test/CodeGen/ARM64/extload-knownzero.ll b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
index 14e5fd3..14e5fd3 100644
--- a/test/CodeGen/ARM64/extload-knownzero.ll
+++ b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
diff --git a/test/CodeGen/ARM64/extract.ll b/test/CodeGen/AArch64/arm64-extract.ll
index 119751c..0198466 100644
--- a/test/CodeGen/ARM64/extract.ll
+++ b/test/CodeGen/AArch64/arm64-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc -arm64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: llc -aarch64-extr-generation=true -verify-machineinstrs < %s \
; RUN: -march=arm64 | FileCheck %s
define i64 @ror_i64(i64 %in) {
@@ -6,7 +6,7 @@ define i64 @ror_i64(i64 %in) {
%left = shl i64 %in, 19
%right = lshr i64 %in, 45
%val5 = or i64 %left, %right
-; CHECK: extr {{x[0-9]+}}, x0, x0, #45
+; CHECK: ror {{x[0-9]+}}, x0, #45
ret i64 %val5
}
@@ -15,7 +15,7 @@ define i32 @ror_i32(i32 %in) {
%left = shl i32 %in, 9
%right = lshr i32 %in, 23
%val5 = or i32 %left, %right
-; CHECK: extr {{w[0-9]+}}, w0, w0, #23
+; CHECK: ror {{w[0-9]+}}, w0, #23
ret i32 %val5
}
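
A hypothetical C equivalent of the two rotates checked above (illustrative only): when both EXTR source operands are the same register the operation is a plain rotate, which the updated CHECK lines now match as the "ror" alias rather than "extr".

  /* Illustrative rotate-right helpers matching the IR patterns above. */
  static unsigned long long ror64_by45(unsigned long long x) {
      return (x << 19) | (x >> 45);
  }

  static unsigned int ror32_by23(unsigned int x) {
      return (x << 9) | (x >> 23);
  }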
diff --git a/test/CodeGen/ARM64/extract_subvector.ll b/test/CodeGen/AArch64/arm64-extract_subvector.ll
index 20c05fb..8b15a64 100644
--- a/test/CodeGen/ARM64/extract_subvector.ll
+++ b/test/CodeGen/AArch64/arm64-extract_subvector.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
diff --git a/test/CodeGen/ARM64/fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
index a4326dc..ebd847e 100644
--- a/test/CodeGen/ARM64/fast-isel-addr-offset.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
@@ -9,7 +9,7 @@ entry:
; CHECK: @foo
; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
-; CHECK: movz x[[REG2:[0-9]+]], #20000
+; CHECK: movz x[[REG2:[0-9]+]], #0x4e20
; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
; CHECK: ldr w0, [x[[REG3]]]
; CHECK: ret
@@ -22,7 +22,7 @@ entry:
; CHECK: @foo2
; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE
; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF]
-; CHECK: movz x[[REG2:[0-9]+]], #40000
+; CHECK: movz x[[REG2:[0-9]+]], #0x9c40
; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
; CHECK: ldr x0, [x[[REG3]]]
; CHECK: ret
@@ -37,9 +37,9 @@ entry:
define signext i8 @foo3() nounwind ssp {
entry:
; CHECK: @foo3
-; CHECK: movz x[[REG:[0-9]+]], #2874, lsl #32
-; CHECK: movk x[[REG]], #29646, lsl #16
-; CHECK: movk x[[REG]], #12274
+; CHECK: movz x[[REG:[0-9]+]], #0xb3a, lsl #32
+; CHECK: movk x[[REG]], #0x73ce, lsl #16
+; CHECK: movk x[[REG]], #0x2ff2
%0 = load i8** @pd2, align 8
%arrayidx = getelementptr inbounds i8* %0, i64 12345678901234
%1 = load i8* %arrayidx, align 1
diff --git a/test/CodeGen/ARM64/fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
index 8bbee16..1706e9e 100644
--- a/test/CodeGen/ARM64/fast-isel-alloca.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -14,6 +14,7 @@ entry:
define void @main() nounwind {
entry:
; CHECK: main
+; CHECK: mov x29, sp
; CHECK: mov x[[REG:[0-9]+]], sp
; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
diff --git a/test/CodeGen/ARM64/fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
index 8fd32fd..37a8295 100644
--- a/test/CodeGen/ARM64/fast-isel-br.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
define void @branch1() nounwind uwtable ssp {
%x = alloca i32, align 4
diff --git a/test/CodeGen/ARM64/fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
index be0ca68..8d756ae 100644
--- a/test/CodeGen/ARM64/fast-isel-call.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64_be-linux-gnu | FileCheck %s --check-prefix=CHECK-BE
define void @call0() nounwind {
entry:
@@ -24,8 +25,8 @@ entry:
define i32 @foo1(i32 %a) nounwind {
entry:
; CHECK: foo1
-; CHECK: stur w0, [fp, #-4]
-; CHECK-NEXT: ldur w0, [fp, #-4]
+; CHECK: stur w0, [x29, #-4]
+; CHECK-NEXT: ldur w0, [x29, #-4]
; CHECK-NEXT: bl _call1
%a.addr = alloca i32, align 4
store i32 %a, i32* %a.addr, align 4
@@ -89,3 +90,11 @@ entry:
}
declare i32 @func2(i64 zeroext, i32 signext, i16 zeroext, i8 signext, i1 zeroext, i1 zeroext)
+
+declare void @callee_b0f(i8 %bp10, i8 %bp11, i8 %bp12, i8 %bp13, i8 %bp14, i8 %bp15, i8 %bp17, i8 %bp18, i8 %bp19)
+define void @caller_b1f() {
+entry:
+ ; CHECK-BE: strb w{{.*}}, [sp, #7]
+ call void @callee_b0f(i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 42)
+ ret void
+}
diff --git a/test/CodeGen/ARM64/fast-isel-conversion.ll b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
index 4e62e33..c5417de 100644
--- a/test/CodeGen/ARM64/fast-isel-conversion.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
;; Test various conversions.
define zeroext i32 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
@@ -57,9 +57,10 @@ entry:
; CHECK: uxth w0, w0
; CHECK: str w0, [sp, #8]
; CHECK: ldr w0, [sp, #8]
-; CHECK: uxtw x3, w0
+; CHECK: mov x3, x0
+; CHECK: ubfx x3, x3, #0, #32
; CHECK: str x3, [sp]
-; CHECK: ldr x0, [sp], #16
+; CHECK: ldr x0, [sp]
; CHECK: ret
%a.addr = alloca i8, align 1
%b.addr = alloca i16, align 2
@@ -113,9 +114,10 @@ entry:
; CHECK: sxth w0, w0
; CHECK: str w0, [sp, #8]
; CHECK: ldr w0, [sp, #8]
-; CHECK: sxtw x3, w0
+; CHECK: mov x3, x0
+; CHECK: sxtw x3, w3
; CHECK: str x3, [sp]
-; CHECK: ldr x0, [sp], #16
+; CHECK: ldr x0, [sp]
; CHECK: ret
%a.addr = alloca i8, align 1
%b.addr = alloca i16, align 2
@@ -139,19 +141,28 @@ entry:
}
; Test sext i8 to i64
-define i64 @sext_2(i8 signext %a) nounwind ssp {
-entry:
-; CHECK: sext_2
-; CHECK: sxtb x0, w0
- %conv = sext i8 %a to i64
- ret i64 %conv
+
+define zeroext i64 @sext_i8_i64(i8 zeroext %in) {
+; CHECK-LABEL: sext_i8_i64:
+; CHECK: mov x[[TMP:[0-9]+]], x0
+; CHECK: sxtb x0, w[[TMP]]
+ %big = sext i8 %in to i64
+ ret i64 %big
+}
+
+define zeroext i64 @sext_i16_i64(i16 zeroext %in) {
+; CHECK-LABEL: sext_i16_i64:
+; CHECK: mov x[[TMP:[0-9]+]], x0
+; CHECK: sxth x0, w[[TMP]]
+ %big = sext i16 %in to i64
+ ret i64 %big
}
; Test sext i1 to i32
define i32 @sext_i1_i32(i1 signext %a) nounwind ssp {
entry:
; CHECK: sext_i1_i32
-; CHECK: sbfm w0, w0, #0, #0
+; CHECK: sbfx w0, w0, #0, #1
%conv = sext i1 %a to i32
ret i32 %conv
}
@@ -160,7 +171,7 @@ entry:
define signext i16 @sext_i1_i16(i1 %a) nounwind ssp {
entry:
; CHECK: sext_i1_i16
-; CHECK: sbfm w0, w0, #0, #0
+; CHECK: sbfx w0, w0, #0, #1
%conv = sext i1 %a to i16
ret i16 %conv
}
@@ -169,7 +180,7 @@ entry:
define signext i8 @sext_i1_i8(i1 %a) nounwind ssp {
entry:
; CHECK: sext_i1_i8
-; CHECK: sbfm w0, w0, #0, #0
+; CHECK: sbfx w0, w0, #0, #1
%conv = sext i1 %a to i8
ret i8 %conv
}
@@ -232,7 +243,7 @@ entry:
define float @sitofp_sw_i1(i1 %a) nounwind ssp {
entry:
; CHECK: sitofp_sw_i1
-; CHECK: sbfm w0, w0, #0, #0
+; CHECK: sbfx w0, w0, #0, #1
; CHECK: scvtf s0, w0
%conv = sitofp i1 %a to float
ret float %conv
@@ -414,3 +425,18 @@ define void @stack_trunc() nounwind {
store i8 %d, i8* %a, align 1
ret void
}
+
+define zeroext i64 @zext_i8_i64(i8 zeroext %in) {
+; CHECK-LABEL: zext_i8_i64:
+; CHECK: mov x[[TMP:[0-9]+]], x0
+; CHECK: ubfx x0, x[[TMP]], #0, #8
+ %big = zext i8 %in to i64
+ ret i64 %big
+}
+define zeroext i64 @zext_i16_i64(i16 zeroext %in) {
+; CHECK-LABEL: zext_i16_i64:
+; CHECK: mov x[[TMP:[0-9]+]], x0
+; CHECK: ubfx x0, x[[TMP]], #0, #16
+ %big = zext i16 %in to i64
+ ret i64 %big
+}
diff --git a/test/CodeGen/ARM64/fast-isel-fcmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
index cf71fab..f030596 100644
--- a/test/CodeGen/ARM64/fast-isel-fcmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
@@ -1,145 +1,145 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin | FileCheck %s
define zeroext i1 @fcmp_float1(float %a) nounwind ssp {
entry:
-; CHECK: @fcmp_float1
+; CHECK-LABEL: @fcmp_float1
; CHECK: fcmp s0, #0.0
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+; CHECK: cset w{{[0-9]+}}, ne
%cmp = fcmp une float %a, 0.000000e+00
ret i1 %cmp
}
define zeroext i1 @fcmp_float2(float %a, float %b) nounwind ssp {
entry:
-; CHECK: @fcmp_float2
+; CHECK-LABEL: @fcmp_float2
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+; CHECK: cset w{{[0-9]+}}, ne
%cmp = fcmp une float %a, %b
ret i1 %cmp
}
define zeroext i1 @fcmp_double1(double %a) nounwind ssp {
entry:
-; CHECK: @fcmp_double1
+; CHECK-LABEL: @fcmp_double1
; CHECK: fcmp d0, #0.0
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+; CHECK: cset w{{[0-9]+}}, ne
%cmp = fcmp une double %a, 0.000000e+00
ret i1 %cmp
}
define zeroext i1 @fcmp_double2(double %a, double %b) nounwind ssp {
entry:
-; CHECK: @fcmp_double2
+; CHECK-LABEL: @fcmp_double2
; CHECK: fcmp d0, d1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+; CHECK: cset w{{[0-9]+}}, ne
%cmp = fcmp une double %a, %b
ret i1 %cmp
}
; Check each fcmp condition
define float @fcmp_oeq(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_oeq
+; CHECK-LABEL: @fcmp_oeq
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ne
+; CHECK: cset w{{[0-9]+}}, eq
%cmp = fcmp oeq float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ogt(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ogt
+; CHECK-LABEL: @fcmp_ogt
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, le
+; CHECK: cset w{{[0-9]+}}, gt
%cmp = fcmp ogt float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_oge(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_oge
+; CHECK-LABEL: @fcmp_oge
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, lt
+; CHECK: cset w{{[0-9]+}}, ge
%cmp = fcmp oge float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_olt(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_olt
+; CHECK-LABEL: @fcmp_olt
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, pl
+; CHECK: cset w{{[0-9]+}}, mi
%cmp = fcmp olt float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ole(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ole
+; CHECK-LABEL: @fcmp_ole
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, hi
+; CHECK: cset w{{[0-9]+}}, ls
%cmp = fcmp ole float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ord(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ord
+; CHECK-LABEL: @fcmp_ord
; CHECK: fcmp s0, s1
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, vs
+; CHECK: cset {{w[0-9]+}}, vc
%cmp = fcmp ord float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_uno(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_uno
+; CHECK-LABEL: @fcmp_uno
; CHECK: fcmp s0, s1
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, vc
+; CHECK: cset {{w[0-9]+}}, vs
%cmp = fcmp uno float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ugt(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ugt
+; CHECK-LABEL: @fcmp_ugt
; CHECK: fcmp s0, s1
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ls
+; CHECK: cset {{w[0-9]+}}, hi
%cmp = fcmp ugt float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_uge(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_uge
+; CHECK-LABEL: @fcmp_uge
; CHECK: fcmp s0, s1
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, mi
+; CHECK: cset {{w[0-9]+}}, pl
%cmp = fcmp uge float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ult(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ult
+; CHECK-LABEL: @fcmp_ult
; CHECK: fcmp s0, s1
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ge
+; CHECK: cset {{w[0-9]+}}, lt
%cmp = fcmp ult float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ule(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ule
+; CHECK-LABEL: @fcmp_ule
; CHECK: fcmp s0, s1
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, gt
+; CHECK: cset {{w[0-9]+}}, le
%cmp = fcmp ule float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_une(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_une
+; CHECK-LABEL: @fcmp_une
; CHECK: fcmp s0, s1
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, eq
+; CHECK: cset {{w[0-9]+}}, ne
%cmp = fcmp une float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
diff --git a/test/CodeGen/ARM64/fast-isel-gv.ll b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
index cb3df14..dc4d895 100644
--- a/test/CodeGen/ARM64/fast-isel-gv.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
@@ -18,10 +18,10 @@ entry:
; CHECK: @Rand
; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
-; CHECK: movz x[[REG3:[0-9]+]], #1309
+; CHECK: movz x[[REG3:[0-9]+]], #0x51d
; CHECK: ldr x[[REG4:[0-9]+]], [x[[REG2]]]
; CHECK: mul x[[REG5:[0-9]+]], x[[REG4]], x[[REG3]]
-; CHECK: movz x[[REG6:[0-9]+]], #13849
+; CHECK: movz x[[REG6:[0-9]+]], #0x3619
; CHECK: add x[[REG7:[0-9]+]], x[[REG5]], x[[REG6]]
; CHECK: orr x[[REG8:[0-9]+]], xzr, #0xffff
; CHECK: and x[[REG9:[0-9]+]], x[[REG7]], x[[REG8]]
diff --git a/test/CodeGen/ARM64/fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
index 22af542..971be5c 100644
--- a/test/CodeGen/ARM64/fast-isel-icmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
@@ -4,7 +4,7 @@ define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
entry:
; CHECK: icmp_eq_imm
; CHECK: cmp w0, #31
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
%cmp = icmp eq i32 %a, 31
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -14,7 +14,7 @@ define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
entry:
; CHECK: icmp_eq_neg_imm
; CHECK: cmn w0, #7
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
%cmp = icmp eq i32 %a, -7
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -24,7 +24,7 @@ define i32 @icmp_eq(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_eq
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
%cmp = icmp eq i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -34,7 +34,7 @@ define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_ne
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, eq
+; CHECK: cset w0, ne
%cmp = icmp ne i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -44,7 +44,7 @@ define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_ugt
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, ls
+; CHECK: cset w0, hi
%cmp = icmp ugt i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -54,7 +54,7 @@ define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_uge
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, cc
+; CHECK: cset w0, hs
%cmp = icmp uge i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -64,7 +64,7 @@ define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_ult
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, cs
+; CHECK: cset w0, lo
%cmp = icmp ult i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -74,7 +74,7 @@ define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_ule
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, hi
+; CHECK: cset w0, ls
%cmp = icmp ule i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -84,7 +84,7 @@ define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_sgt
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, le
+; CHECK: cset w0, gt
%cmp = icmp sgt i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -94,7 +94,7 @@ define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_sge
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, lt
+; CHECK: cset w0, ge
%cmp = icmp sge i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -104,7 +104,7 @@ define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_slt
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, ge
+; CHECK: cset w0, lt
%cmp = icmp slt i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -114,7 +114,7 @@ define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
entry:
; CHECK: icmp_sle
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, gt
+; CHECK: cset w0, le
%cmp = icmp sle i32 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -124,7 +124,7 @@ define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
entry:
; CHECK: icmp_i64
; CHECK: cmp x0, x1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, gt
+; CHECK: cset w{{[0-9]+}}, le
%cmp = icmp sle i64 %a, %b
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -136,7 +136,7 @@ entry:
; CHECK: sxth w0, w0
; CHECK: sxth w1, w1
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
%cmp = icmp eq i16 %a, %b
ret i1 %cmp
}
@@ -147,7 +147,7 @@ entry:
; CHECK: sxtb w0, w0
; CHECK: sxtb w1, w1
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
%cmp = icmp eq i8 %a, %b
ret i1 %cmp
}
@@ -158,7 +158,7 @@ entry:
; CHECK: uxth w0, w0
; CHECK: uxth w1, w1
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, cs
+; CHECK: cset w0, lo
%cmp = icmp ult i16 %a, %b
%conv2 = zext i1 %cmp to i32
ret i32 %conv2
@@ -170,7 +170,7 @@ entry:
; CHECK: sxtb w0, w0
; CHECK: sxtb w1, w1
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, le
+; CHECK: cset w0, gt
%cmp = icmp sgt i8 %a, %b
%conv2 = zext i1 %cmp to i32
ret i32 %conv2
@@ -182,7 +182,7 @@ entry:
; CHECK: icmp_i16_signed_const
; CHECK: sxth w0, w0
; CHECK: cmn w0, #233
-; CHECK: csinc w0, wzr, wzr, ge
+; CHECK: cset w0, lt
; CHECK: and w0, w0, #0x1
%cmp = icmp slt i16 %a, -233
%conv2 = zext i1 %cmp to i32
@@ -194,7 +194,7 @@ entry:
; CHECK: icmp_i8_signed_const
; CHECK: sxtb w0, w0
; CHECK: cmp w0, #124
-; CHECK: csinc w0, wzr, wzr, le
+; CHECK: cset w0, gt
; CHECK: and w0, w0, #0x1
%cmp = icmp sgt i8 %a, 124
%conv2 = zext i1 %cmp to i32
@@ -206,7 +206,7 @@ entry:
; CHECK: icmp_i1_unsigned_const
; CHECK: and w0, w0, #0x1
; CHECK: cmp w0, #0
-; CHECK: csinc w0, wzr, wzr, cs
+; CHECK: cset w0, lo
; CHECK: and w0, w0, #0x1
%cmp = icmp ult i1 %a, 0
%conv2 = zext i1 %cmp to i32
diff --git a/test/CodeGen/ARM64/fast-isel-indirectbr.ll b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
index 70335ac..70335ac 100644
--- a/test/CodeGen/ARM64/fast-isel-indirectbr.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
diff --git a/test/CodeGen/ARM64/fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index 6443d82..a3d5f6c 100644
--- a/test/CodeGen/ARM64/fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -4,11 +4,11 @@
@temp = common global [80 x i8] zeroinitializer, align 16
define void @t1() {
-; ARM64: t1
+; ARM64-LABEL: t1
; ARM64: adrp x8, _message@PAGE
; ARM64: add x0, x8, _message@PAGEOFF
; ARM64: movz w9, #0
-; ARM64: movz x2, #80
+; ARM64: movz x2, #0x50
; ARM64: uxtb w1, w9
; ARM64: bl _memset
call void @llvm.memset.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i8 0, i64 80, i32 16, i1 false)
@@ -18,12 +18,12 @@ define void @t1() {
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
define void @t2() {
-; ARM64: t2
+; ARM64-LABEL: t2
; ARM64: adrp x8, _temp@GOTPAGE
; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x8, _message@PAGE
; ARM64: add x1, x8, _message@PAGEOFF
-; ARM64: movz x2, #80
+; ARM64: movz x2, #0x50
; ARM64: bl _memcpy
call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 80, i32 16, i1 false)
ret void
@@ -32,12 +32,12 @@ define void @t2() {
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
define void @t3() {
-; ARM64: t3
+; ARM64-LABEL: t3
; ARM64: adrp x8, _temp@GOTPAGE
; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x8, _message@PAGE
; ARM64: add x1, x8, _message@PAGEOFF
-; ARM64: movz x2, #20
+; ARM64: movz x2, #0x14
; ARM64: bl _memmove
call void @llvm.memmove.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 20, i32 16, i1 false)
ret void
@@ -46,7 +46,7 @@ define void @t3() {
declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
define void @t4() {
-; ARM64: t4
+; ARM64-LABEL: t4
; ARM64: adrp x8, _temp@GOTPAGE
; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x9, _message@PAGE
@@ -63,7 +63,7 @@ define void @t4() {
}
define void @t5() {
-; ARM64: t5
+; ARM64-LABEL: t5
; ARM64: adrp x8, _temp@GOTPAGE
; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x9, _message@PAGE
@@ -80,7 +80,7 @@ define void @t5() {
}
define void @t6() {
-; ARM64: t6
+; ARM64-LABEL: t6
; ARM64: adrp x8, _temp@GOTPAGE
; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x9, _message@PAGE
@@ -97,7 +97,7 @@ define void @t6() {
}
define void @t7() {
-; ARM64: t7
+; ARM64-LABEL: t7
; ARM64: adrp x8, _temp@GOTPAGE
; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x9, _message@PAGE
@@ -116,7 +116,7 @@ define void @t7() {
}
define void @t8() {
-; ARM64: t8
+; ARM64-LABEL: t8
; ARM64: adrp x8, _temp@GOTPAGE
; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x9, _message@PAGE
diff --git a/test/CodeGen/ARM64/fast-isel-materialize.ll b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
index fa2daf7..ffac131 100644
--- a/test/CodeGen/ARM64/fast-isel-materialize.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
@@ -3,14 +3,14 @@
; Materialize using fmov
define void @float_(float* %value) {
; CHECK: @float_
-; CHECK: fmov s0, #1.250000e+00
+; CHECK: fmov s0, #1.25000000
store float 1.250000e+00, float* %value, align 4
ret void
}
define void @double_(double* %value) {
; CHECK: @double_
-; CHECK: fmov d0, #1.250000e+00
+; CHECK: fmov d0, #1.25000000
store double 1.250000e+00, double* %value, align 8
ret void
}
diff --git a/test/CodeGen/ARM64/fast-isel-noconvert.ll b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
index 3517970..483d179 100644
--- a/test/CodeGen/ARM64/fast-isel-noconvert.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
@@ -33,4 +33,36 @@ define <2 x i64> @test_fptosi(<2 x double> %in) {
%res = fptosi <2 x double> %in to <2 x i64>
ret <2 x i64> %res
-} \ No newline at end of file
+}
+
+define fp128 @uitofp_i32_fp128(i32 %a) {
+entry:
+; CHECK-LABEL: uitofp_i32_fp128
+; CHECK: bl ___floatunsitf
+ %conv = uitofp i32 %a to fp128
+ ret fp128 %conv
+}
+
+define fp128 @uitofp_i64_fp128(i64 %a) {
+entry:
+; CHECK-LABEL: uitofp_i64_fp128
+; CHECK: bl ___floatunditf
+ %conv = uitofp i64 %a to fp128
+ ret fp128 %conv
+}
+
+define i32 @uitofp_fp128_i32(fp128 %a) {
+entry:
+; CHECK-LABEL: uitofp_fp128_i32
+; CHECK: ___fixunstfsi
+ %conv = fptoui fp128 %a to i32
+ ret i32 %conv
+}
+
+define i64 @uitofp_fp128_i64(fp128 %a) {
+entry:
+; CHECK-LABEL: uitofp_fp128_i64
+; CHECK: ___fixunstfdi
+ %conv = fptoui fp128 %a to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
new file mode 100644
index 0000000..d5bdbaa
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2> %t
+; RUN: FileCheck %s < %t --check-prefix=CHECK-SSA
+; REQUIRES: asserts
+
+; CHECK-SSA-LABEL: Machine code for function t1
+
+; CHECK-SSA: [[QUOTREG:%vreg[0-9]+]]<def> = SDIVWr
+; CHECK-SSA-NOT: [[QUOTREG]]<def> =
+; CHECK-SSA: {{%vreg[0-9]+}}<def> = MSUBWrrr [[QUOTREG]]
+
+; CHECK-SSA-LABEL: Machine code for function t2
+
+define i32 @t1(i32 %a, i32 %b) {
+; CHECK: @t1
+; CHECK: sdiv [[TMP:w[0-9]+]], w0, w1
+; CHECK: msub w0, [[TMP]], w1, w0
+ %1 = srem i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @t2(i64 %a, i64 %b) {
+; CHECK: @t2
+; CHECK: sdiv [[TMP:x[0-9]+]], x0, x1
+; CHECK: msub x0, [[TMP]], x1, x0
+ %1 = srem i64 %a, %b
+ ret i64 %1
+}
+
+define i32 @t3(i32 %a, i32 %b) {
+; CHECK: @t3
+; CHECK: udiv [[TMP:w[0-9]+]], w0, w1
+; CHECK: msub w0, [[TMP]], w1, w0
+ %1 = urem i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @t4(i64 %a, i64 %b) {
+; CHECK: @t4
+; CHECK: udiv [[TMP:x[0-9]+]], x0, x1
+; CHECK: msub x0, [[TMP]], x1, x0
+ %1 = urem i64 %a, %b
+ ret i64 %1
+}
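
For context, a hypothetical C version of the operations exercised above (illustrative only): AArch64 has no integer remainder instruction, so srem/urem are expanded to a divide followed by a multiply-subtract, which is what the CHECK lines (sdiv/udiv plus msub) and the CHECK-SSA virtual-register constraints pin down.

  /* Illustrative: a % b compiles to sdiv/udiv followed by msub. */
  int srem32(int a, int b) { return a % b; }
  long long srem64(long long a, long long b) { return a % b; }
  unsigned urem32(unsigned a, unsigned b) { return a % b; }
  unsigned long long urem64(unsigned long long a, unsigned long long b) { return a % b; }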
diff --git a/test/CodeGen/ARM64/fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
index d91fd28..d91fd28 100644
--- a/test/CodeGen/ARM64/fast-isel-ret.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
diff --git a/test/CodeGen/ARM64/fast-isel-select.ll b/test/CodeGen/AArch64/arm64-fast-isel-select.ll
index 1cc207f..1cc207f 100644
--- a/test/CodeGen/ARM64/fast-isel-select.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-select.ll
diff --git a/test/CodeGen/ARM64/fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll
index ba718d3..0194b3a 100644
--- a/test/CodeGen/ARM64/fast-isel.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel.ll
@@ -87,7 +87,7 @@ entry:
define void @t6() nounwind {
; CHECK: t6
-; CHECK: brk #1
+; CHECK: brk #0x1
tail call void @llvm.trap()
ret void
}
diff --git a/test/CodeGen/ARM64/fastcc-tailcall.ll b/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
index 8a744c5..8a744c5 100644
--- a/test/CodeGen/ARM64/fastcc-tailcall.ll
+++ b/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
diff --git a/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll b/test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll
index af9fe05..af9fe05 100644
--- a/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll
+++ b/test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll
diff --git a/test/CodeGen/ARM64/fcmp-opt.ll b/test/CodeGen/AArch64/arm64-fcmp-opt.ll
index 17412dd..41027d4 100644
--- a/test/CodeGen/ARM64/fcmp-opt.ll
+++ b/test/CodeGen/AArch64/arm64-fcmp-opt.ll
@@ -1,146 +1,175 @@
-; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -aarch64-neon-syntax=apple | FileCheck %s
; rdar://10263824
define i1 @fcmp_float1(float %a) nounwind ssp {
entry:
-; CHECK: @fcmp_float1
+; CHECK-LABEL: @fcmp_float1
; CHECK: fcmp s0, #0.0
-; CHECK: csinc w0, wzr, wzr, eq
+; CHECK: cset w0, ne
%cmp = fcmp une float %a, 0.000000e+00
ret i1 %cmp
}
define i1 @fcmp_float2(float %a, float %b) nounwind ssp {
entry:
-; CHECK: @fcmp_float2
+; CHECK-LABEL: @fcmp_float2
; CHECK: fcmp s0, s1
-; CHECK: csinc w0, wzr, wzr, eq
+; CHECK: cset w0, ne
%cmp = fcmp une float %a, %b
ret i1 %cmp
}
define i1 @fcmp_double1(double %a) nounwind ssp {
entry:
-; CHECK: @fcmp_double1
+; CHECK-LABEL: @fcmp_double1
; CHECK: fcmp d0, #0.0
-; CHECK: csinc w0, wzr, wzr, eq
+; CHECK: cset w0, ne
%cmp = fcmp une double %a, 0.000000e+00
ret i1 %cmp
}
define i1 @fcmp_double2(double %a, double %b) nounwind ssp {
entry:
-; CHECK: @fcmp_double2
+; CHECK-LABEL: @fcmp_double2
; CHECK: fcmp d0, d1
-; CHECK: csinc w0, wzr, wzr, eq
+; CHECK: cset w0, ne
%cmp = fcmp une double %a, %b
ret i1 %cmp
}
; Check each fcmp condition
define float @fcmp_oeq(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_oeq
+; CHECK-LABEL: @fcmp_oeq
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ne
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], eq
+
%cmp = fcmp oeq float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ogt(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ogt
+; CHECK-LABEL: @fcmp_ogt
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, le
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], gt
+
%cmp = fcmp ogt float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_oge(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_oge
+; CHECK-LABEL: @fcmp_oge
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, lt
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], ge
+
%cmp = fcmp oge float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_olt(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_olt
+; CHECK-LABEL: @fcmp_olt
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, pl
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], mi
+
%cmp = fcmp olt float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ole(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ole
+; CHECK-LABEL: @fcmp_ole
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, hi
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], ls
+
%cmp = fcmp ole float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ord(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ord
+; CHECK-LABEL: @fcmp_ord
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, vs
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], vc
%cmp = fcmp ord float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_uno(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_uno
+; CHECK-LABEL: @fcmp_uno
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, vc
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], vs
%cmp = fcmp uno float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ugt(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ugt
+; CHECK-LABEL: @fcmp_ugt
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ls
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], hi
%cmp = fcmp ugt float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_uge(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_uge
+; CHECK-LABEL: @fcmp_uge
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, mi
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], pl
%cmp = fcmp uge float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ult(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ult
+; CHECK-LABEL: @fcmp_ult
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, ge
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], lt
%cmp = fcmp ult float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_ule(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ule
+; CHECK-LABEL: @fcmp_ule
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, gt
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], le
%cmp = fcmp ule float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
}
define float @fcmp_une(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_une
+; CHECK-LABEL: @fcmp_une
; CHECK: fcmp s0, s1
-; CHECK: csinc w{{[0-9]+}}, wzr, wzr, eq
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], ne
%cmp = fcmp une float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
@@ -149,11 +178,12 @@ define float @fcmp_une(float %a, float %b) nounwind ssp {
; Possible opportunity for improvement. See comment in
; ARM64TargetLowering::LowerSETCC()
define float @fcmp_one(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_one
+; CHECK-LABEL: @fcmp_one
; fcmp s0, s1
-; orr w0, wzr, #0x1
-; csel w1, w0, wzr, mi
-; csel w0, w0, wzr, gt
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel [[TMP:s[0-9]+]], s[[ONE]], s[[ZERO]], mi
+; CHECK: fcsel s0, s[[ONE]], [[TMP]], gt
%cmp = fcmp one float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
@@ -162,11 +192,12 @@ define float @fcmp_one(float %a, float %b) nounwind ssp {
; Possible opportunity for improvement. See comment in
; ARM64TargetLowering::LowerSETCC()
define float @fcmp_ueq(float %a, float %b) nounwind ssp {
-; CHECK: @fcmp_ueq
+; CHECK-LABEL: @fcmp_ueq
; CHECK: fcmp s0, s1
-; orr w0, wzr, #0x1
-; CHECK: csel [[REG1:w[0-9]]], [[REG2:w[0-9]+]], wzr, eq
-; CHECK: csel {{w[0-9]+}}, [[REG2]], [[REG1]], vs
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel [[TMP:s[0-9]+]], s[[ONE]], s[[ZERO]], eq
+; CHECK: fcsel s0, s[[ONE]], [[TMP]], vs
%cmp = fcmp ueq float %a, %b
%conv = uitofp i1 %cmp to float
ret float %conv
diff --git a/test/CodeGen/ARM64/fcopysign.ll b/test/CodeGen/AArch64/arm64-fcopysign.ll
index 094ce7a..66241df 100644
--- a/test/CodeGen/ARM64/fcopysign.ll
+++ b/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -5,7 +5,7 @@
define float @test1(float %x, float %y) nounwind {
entry:
; CHECK-LABEL: test1:
-; CHECK: movi.4s v2, #128, lsl #24
+; CHECK: movi.4s v2, #0x80, lsl #24
; CHECK: bit.16b v0, v1, v2
%0 = tail call float @copysignf(float %x, float %y) nounwind readnone
ret float %0
@@ -37,7 +37,7 @@ define float @test4() nounwind {
entry:
; CHECK-LABEL: test4:
; CHECK: fcvt s0, d0
-; CHECK: movi.4s v[[CONST:[0-9]+]], #128, lsl #24
+; CHECK: movi.4s v[[CONST:[0-9]+]], #0x80, lsl #24
; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]]
%0 = tail call double (...)* @bar() nounwind
%1 = fptrunc double %0 to float
diff --git a/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll b/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
index 77981f2..e51c38b 100644
--- a/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll
+++ b/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
; DAGCombine to transform a conversion of an extract_vector_elt to an
; extract_vector_elt of a conversion, which saves a round trip of copies
@@ -8,8 +8,8 @@ define double @foo0(<2 x i64> %a) nounwind {
; CHECK: scvtf.2d [[REG:v[0-9]+]], v0, #9
; CHECK-NEXT: ins.d v0[0], [[REG]][1]
%vecext = extractelement <2 x i64> %a, i32 1
- %fcvt_n = tail call double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
+ %fcvt_n = tail call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
ret double %fcvt_n
}
-declare double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone
+declare double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/fmadd.ll b/test/CodeGen/AArch64/arm64-fmadd.ll
index d00aaef..c791900 100644
--- a/test/CodeGen/ARM64/fmadd.ll
+++ b/test/CodeGen/AArch64/arm64-fmadd.ll
@@ -3,7 +3,7 @@
define float @fma32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fma32:
-; CHECK: fmadd
+; CHECK: fmadd s0, s0, s1, s2
%0 = tail call float @llvm.fma.f32(float %a, float %b, float %c)
ret float %0
}
@@ -11,7 +11,7 @@ entry:
define float @fnma32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fnma32:
-; CHECK: fnmadd
+; CHECK: fnmadd s0, s0, s1, s2
%0 = tail call float @llvm.fma.f32(float %a, float %b, float %c)
%mul = fmul float %0, -1.000000e+00
ret float %mul
@@ -20,7 +20,7 @@ entry:
define float @fms32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fms32:
-; CHECK: fmsub
+; CHECK: fmsub s0, s0, s1, s2
%mul = fmul float %b, -1.000000e+00
%0 = tail call float @llvm.fma.f32(float %a, float %mul, float %c)
ret float %0
@@ -29,7 +29,7 @@ entry:
define float @fms32_com(float %a, float %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fms32_com:
-; CHECK: fmsub
+; CHECK: fmsub s0, s1, s0, s2
%mul = fmul float %b, -1.000000e+00
%0 = tail call float @llvm.fma.f32(float %mul, float %a, float %c)
ret float %0
@@ -38,7 +38,7 @@ entry:
define float @fnms32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fnms32:
-; CHECK: fnmsub
+; CHECK: fnmsub s0, s0, s1, s2
%mul = fmul float %c, -1.000000e+00
%0 = tail call float @llvm.fma.f32(float %a, float %b, float %mul)
ret float %0
@@ -46,7 +46,7 @@ entry:
define double @fma64(double %a, double %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fma64:
-; CHECK: fmadd
+; CHECK: fmadd d0, d0, d1, d2
entry:
%0 = tail call double @llvm.fma.f64(double %a, double %b, double %c)
ret double %0
@@ -54,7 +54,7 @@ entry:
define double @fnma64(double %a, double %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fnma64:
-; CHECK: fnmadd
+; CHECK: fnmadd d0, d0, d1, d2
entry:
%0 = tail call double @llvm.fma.f64(double %a, double %b, double %c)
%mul = fmul double %0, -1.000000e+00
@@ -63,7 +63,7 @@ entry:
define double @fms64(double %a, double %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fms64:
-; CHECK: fmsub
+; CHECK: fmsub d0, d0, d1, d2
entry:
%mul = fmul double %b, -1.000000e+00
%0 = tail call double @llvm.fma.f64(double %a, double %mul, double %c)
@@ -72,7 +72,7 @@ entry:
define double @fms64_com(double %a, double %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fms64_com:
-; CHECK: fmsub
+; CHECK: fmsub d0, d1, d0, d2
entry:
%mul = fmul double %b, -1.000000e+00
%0 = tail call double @llvm.fma.f64(double %mul, double %a, double %c)
@@ -81,7 +81,7 @@ entry:
define double @fnms64(double %a, double %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fnms64:
-; CHECK: fnmsub
+; CHECK: fnmsub d0, d0, d1, d2
entry:
%mul = fmul double %c, -1.000000e+00
%0 = tail call double @llvm.fma.f64(double %a, double %b, double %mul)
diff --git a/test/CodeGen/AArch64/arm64-fmax.ll b/test/CodeGen/AArch64/arm64-fmax.ll
new file mode 100644
index 0000000..94b7454
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fmax.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=arm64 -enable-no-nans-fp-math < %s | FileCheck %s
+
+define double @test_direct(float %in) #1 {
+; CHECK-LABEL: test_direct:
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %longer = fpext float %in to double
+ %val = select i1 %cmp, double 0.000000e+00, double %longer
+ ret double %val
+
+; CHECK: fmax
+}
+
+define double @test_cross(float %in) #1 {
+; CHECK-LABEL: test_cross:
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %longer = fpext float %in to double
+ %val = select i1 %cmp, double %longer, double 0.000000e+00
+ ret double %val
+
+; CHECK: fmin
+}
+
+; This isn't a min or a max, but passes the first condition for swapping the
+; results. Make sure they're put back before we resort to the normal fcsel.
+define float @test_cross_fail(float %lhs, float %rhs) {
+; CHECK-LABEL: test_cross_fail:
+ %tst = fcmp une float %lhs, %rhs
+ %res = select i1 %tst, float %rhs, float %lhs
+ ret float %res
+
+ ; The register allocator would have to decide to be deliberately obtuse before
+ ; other registers were used.
+; CHECK: fcsel s0, s1, s0, ne
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-fminv.ll b/test/CodeGen/AArch64/arm64-fminv.ll
new file mode 100644
index 0000000..f4c9735
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fminv.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+define float @test_fminv_v2f32(<2 x float> %in) {
+; CHECK: test_fminv_v2f32:
+; CHECK: fminp s0, v0.2s
+ %min = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %in)
+ ret float %min
+}
+
+define float @test_fminv_v4f32(<4 x float> %in) {
+; CHECK: test_fminv_v4f32:
+; CHECK: fminv s0, v0.4s
+ %min = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %in)
+ ret float %min
+}
+
+define double @test_fminv_v2f64(<2 x double> %in) {
+; CHECK: test_fminv_v2f64:
+; CHECK: fminp d0, v0.2d
+ %min = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %in)
+ ret double %min
+}
+
+declare float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxv_v2f32:
+; CHECK: fmaxp s0, v0.2s
+ %max = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %in)
+ ret float %max
+}
+
+define float @test_fmaxv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxv_v4f32:
+; CHECK: fmaxv s0, v0.4s
+ %max = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %in)
+ ret float %max
+}
+
+define double @test_fmaxv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxv_v2f64:
+; CHECK: fmaxp d0, v0.2d
+ %max = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %in)
+ ret double %max
+}
+
+declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double>)
+
+define float @test_fminnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fminnmv_v2f32:
+; CHECK: fminnmp s0, v0.2s
+ %minnm = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %in)
+ ret float %minnm
+}
+
+define float @test_fminnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fminnmv_v4f32:
+; CHECK: fminnmv s0, v0.4s
+ %minnm = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %in)
+ ret float %minnm
+}
+
+define double @test_fminnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fminnmv_v2f64:
+; CHECK: fminnmp d0, v0.2d
+ %minnm = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+ ret double %minnm
+}
+
+declare float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxnmv_v2f32:
+; CHECK: fmaxnmp s0, v0.2s
+ %maxnm = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %in)
+ ret float %maxnm
+}
+
+define float @test_fmaxnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxnmv_v4f32:
+; CHECK: fmaxnmv s0, v0.4s
+ %maxnm = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %in)
+ ret float %maxnm
+}
+
+define double @test_fmaxnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxnmv_v2f64:
+; CHECK: fmaxnmp d0, v0.2d
+ %maxnm = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+ ret double %maxnm
+}
+
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/ARM64/fmuladd.ll b/test/CodeGen/AArch64/arm64-fmuladd.ll
index 174d830..6c5eeca 100644
--- a/test/CodeGen/ARM64/fmuladd.ll
+++ b/test/CodeGen/AArch64/arm64-fmuladd.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define float @test_f32(float* %A, float* %B, float* %C) nounwind {
;CHECK-LABEL: test_f32:
diff --git a/test/CodeGen/ARM64/fold-address.ll b/test/CodeGen/AArch64/arm64-fold-address.ll
index 96cc3e9..96cc3e9 100644
--- a/test/CodeGen/ARM64/fold-address.ll
+++ b/test/CodeGen/AArch64/arm64-fold-address.ll
diff --git a/test/CodeGen/ARM64/fold-lsl.ll b/test/CodeGen/AArch64/arm64-fold-lsl.ll
index a856c96..ec65e46 100644
--- a/test/CodeGen/ARM64/fold-lsl.ll
+++ b/test/CodeGen/AArch64/arm64-fold-lsl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
;
; <rdar://problem/14486451>
@@ -8,7 +8,7 @@
define i16 @load_halfword(%struct.a* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: load_halfword:
-; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldrh w0, [x0, [[REG]], lsl #1]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
@@ -20,7 +20,7 @@ define i16 @load_halfword(%struct.a* %ctx, i32 %xor72) nounwind {
define i32 @load_word(%struct.b* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: load_word:
-; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldr w0, [x0, [[REG]], lsl #2]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
@@ -32,7 +32,7 @@ define i32 @load_word(%struct.b* %ctx, i32 %xor72) nounwind {
define i64 @load_doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
; CHECK-LABEL: load_doubleword:
-; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: ldr x0, [x0, [[REG]], lsl #3]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
@@ -44,7 +44,7 @@ define i64 @load_doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
define void @store_halfword(%struct.a* %ctx, i32 %xor72, i16 %val) nounwind {
; CHECK-LABEL: store_halfword:
-; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: strh w2, [x0, [[REG]], lsl #1]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
@@ -56,7 +56,7 @@ define void @store_halfword(%struct.a* %ctx, i32 %xor72, i16 %val) nounwind {
define void @store_word(%struct.b* %ctx, i32 %xor72, i32 %val) nounwind {
; CHECK-LABEL: store_word:
-; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: str w2, [x0, [[REG]], lsl #2]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
@@ -68,7 +68,7 @@ define void @store_word(%struct.b* %ctx, i32 %xor72, i32 %val) nounwind {
define void @store_doubleword(%struct.c* %ctx, i32 %xor72, i64 %val) nounwind {
; CHECK-LABEL: store_doubleword:
-; CHECK: ubfm [[REG:x[0-9]+]], x1, #9, #16
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
; CHECK: str x2, [x0, [[REG]], lsl #3]
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
diff --git a/test/CodeGen/AArch64/arm64-fp-contract-zero.ll b/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
new file mode 100644
index 0000000..f982cbb
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=arm64 -fp-contract=fast -o - %s | FileCheck %s
+
+
+; Make sure we don't try to fold an fneg into +0.0, creating an illegal constant
+; -0.0. It's also good, though not essential, that we don't resort to a litpool.
+define double @test_fms_fold(double %a, double %b) {
+; CHECK-LABEL: test_fms_fold:
+; CHECK: fmov {{d[0-9]+}}, xzr
+; CHECK: ret
+ %mul = fmul double %a, 0.000000e+00
+ %mul1 = fmul double %b, 0.000000e+00
+ %sub = fsub double %mul, %mul1
+ ret double %sub
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM64/fp-imm.ll b/test/CodeGen/AArch64/arm64-fp-imm.ll
index 6e271e0..6e271e0 100644
--- a/test/CodeGen/ARM64/fp-imm.ll
+++ b/test/CodeGen/AArch64/arm64-fp-imm.ll
diff --git a/test/CodeGen/ARM64/fp.ll b/test/CodeGen/AArch64/arm64-fp.ll
index 08b1b67..08b1b67 100644
--- a/test/CodeGen/ARM64/fp.ll
+++ b/test/CodeGen/AArch64/arm64-fp.ll
diff --git a/test/CodeGen/ARM64/fp128-folding.ll b/test/CodeGen/AArch64/arm64-fp128-folding.ll
index 6a7d203..6a7d203 100644
--- a/test/CodeGen/ARM64/fp128-folding.ll
+++ b/test/CodeGen/AArch64/arm64-fp128-folding.ll
diff --git a/test/CodeGen/ARM64/fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll
index 21eb893..57bbb93 100644
--- a/test/CodeGen/ARM64/fp128.ll
+++ b/test/CodeGen/AArch64/arm64-fp128.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone < %s | FileCheck %s
@lhs = global fp128 zeroinitializer, align 16
@rhs = global fp128 zeroinitializer, align 16
@@ -133,7 +133,7 @@ define i1 @test_setcc1() {
%val = fcmp ole fp128 %lhs, %rhs
; CHECK: bl __letf2
; CHECK: cmp w0, #0
-; CHECK: csinc w0, wzr, wzr, gt
+; CHECK: cset w0, le
ret i1 %val
; CHECK: ret
@@ -150,11 +150,11 @@ define i1 @test_setcc2() {
%val = fcmp ugt fp128 %lhs, %rhs
; CHECK: bl __gttf2
; CHECK: cmp w0, #0
-; CHECK: csinc [[GT:w[0-9]+]], wzr, wzr, le
+; CHECK: cset [[GT:w[0-9]+]], gt
; CHECK: bl __unordtf2
; CHECK: cmp w0, #0
-; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+; CHECK: cset [[UNORDERED:w[0-9]+]], ne
; CHECK: orr w0, [[UNORDERED]], [[GT]]
ret i1 %val
@@ -173,11 +173,11 @@ define i32 @test_br_cc() {
%cond = fcmp olt fp128 %lhs, %rhs
; CHECK: bl __getf2
; CHECK: cmp w0, #0
-; CHECK: csinc [[OGE:w[0-9]+]], wzr, wzr, lt
+; CHECK: cset [[OGE:w[0-9]+]], ge
; CHECK: bl __unordtf2
; CHECK: cmp w0, #0
-; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+; CHECK: cset [[UNORDERED:w[0-9]+]], ne
; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]]
; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]]
@@ -186,13 +186,13 @@ define i32 @test_br_cc() {
iftrue:
ret i32 42
; CHECK-NEXT: BB#
-; CHECK-NEXT: movz w0, #42
+; CHECK-NEXT: movz w0, #0x2a
; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]]
iffalse:
ret i32 29
; CHECK: [[RET29]]:
-; CHECK-NEXT: movz w0, #29
+; CHECK-NEXT: movz w0, #0x1d
; CHECK-NEXT: [[REALRET]]:
; CHECK: ret
}
@@ -202,11 +202,10 @@ define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) {
%val = select i1 %cond, fp128 %lhs, fp128 %rhs
store fp128 %val, fp128* @lhs, align 16
-; CHECK: and [[BIT:w[0-9]+]], w0, #0x1
-; CHECK: cmp [[BIT]], #0
+; CHECK: tst w0, #0x1
; CHECK-NEXT: b.eq [[IFFALSE:.LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: BB#
-; CHECK-NEXT: orr v[[VAL:[0-9]+]].16b, v0.16b, v0.16b
+; CHECK-NEXT: mov v[[VAL:[0-9]+]].16b, v0.16b
; CHECK-NEXT: [[IFFALSE]]:
; CHECK: str q[[VAL]], [{{x[0-9]+}}, :lo12:lhs]
ret void
@@ -265,7 +264,7 @@ define fp128 @test_neg(fp128 %in) {
; Could in principle be optimized to fneg which we can't select, this makes
; sure that doesn't happen.
%ret = fsub fp128 0xL00000000000000008000000000000000, %in
-; CHECK: orr v1.16b, v0.16b, v0.16b
+; CHECK: mov v1.16b, v0.16b
; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:[[MINUS0]]]
; CHECK: bl __subtf3
diff --git a/test/CodeGen/ARM64/frame-index.ll b/test/CodeGen/AArch64/arm64-frame-index.ll
index 4a91ff3..4a91ff3 100644
--- a/test/CodeGen/ARM64/frame-index.ll
+++ b/test/CodeGen/AArch64/arm64-frame-index.ll
diff --git a/test/CodeGen/ARM64/frameaddr.ll b/test/CodeGen/AArch64/arm64-frameaddr.ll
index d0635ad..469078c 100644
--- a/test/CodeGen/ARM64/frameaddr.ll
+++ b/test/CodeGen/AArch64/arm64-frameaddr.ll
@@ -3,10 +3,10 @@
define i8* @t() nounwind {
entry:
; CHECK-LABEL: t:
-; CHECK: stp fp, lr, [sp, #-16]!
-; CHECK: mov fp, sp
-; CHECK: mov x0, fp
-; CHECK: ldp fp, lr, [sp], #16
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: mov x0, x29
+; CHECK: ldp x29, x30, [sp], #16
; CHECK: ret
%0 = call i8* @llvm.frameaddress(i32 0)
ret i8* %0
diff --git a/test/CodeGen/ARM64/global-address.ll b/test/CodeGen/AArch64/arm64-global-address.ll
index 005f414..005f414 100644
--- a/test/CodeGen/ARM64/global-address.ll
+++ b/test/CodeGen/AArch64/arm64-global-address.ll
diff --git a/test/CodeGen/ARM64/hello.ll b/test/CodeGen/AArch64/arm64-hello.ll
index f870fff..a6346fb 100644
--- a/test/CodeGen/ARM64/hello.ll
+++ b/test/CodeGen/AArch64/arm64-hello.ll
@@ -2,27 +2,27 @@
; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX
; CHECK-LABEL: main:
-; CHECK: stp fp, lr, [sp, #-16]!
-; CHECK-NEXT: mov fp, sp
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: stur wzr, [fp, #-4]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK: adrp x0, L_.str@PAGE
; CHECK: add x0, x0, L_.str@PAGEOFF
; CHECK-NEXT: bl _puts
-; CHECK-NEXT: mov sp, fp
-; CHECK-NEXT: ldp fp, lr, [sp], #16
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
; CHECK-LINUX-LABEL: main:
-; CHECK-LINUX: stp fp, lr, [sp, #-16]!
-; CHECK-LINUX-NEXT: mov fp, sp
+; CHECK-LINUX: stp x29, x30, [sp, #-16]!
+; CHECK-LINUX-NEXT: mov x29, sp
; CHECK-LINUX-NEXT: sub sp, sp, #16
-; CHECK-LINUX-NEXT: stur wzr, [fp, #-4]
+; CHECK-LINUX-NEXT: stur wzr, [x29, #-4]
; CHECK-LINUX: adrp x0, .L.str
; CHECK-LINUX: add x0, x0, :lo12:.L.str
; CHECK-LINUX-NEXT: bl puts
-; CHECK-LINUX-NEXT: mov sp, fp
-; CHECK-LINUX-NEXT: ldp fp, lr, [sp], #16
+; CHECK-LINUX-NEXT: mov sp, x29
+; CHECK-LINUX-NEXT: ldp x29, x30, [sp], #16
; CHECK-LINUX-NEXT: ret
@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
diff --git a/test/CodeGen/ARM64/i16-subreg-extract.ll b/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
index fc2e8b5..ba759e3 100644
--- a/test/CodeGen/ARM64/i16-subreg-extract.ll
+++ b/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define i32 @foo(<4 x i16>* %__a) nounwind {
; CHECK-LABEL: foo:
diff --git a/test/CodeGen/ARM64/icmp-opt.ll b/test/CodeGen/AArch64/arm64-icmp-opt.ll
index f88399b..7b12ed7 100644
--- a/test/CodeGen/ARM64/icmp-opt.ll
+++ b/test/CodeGen/AArch64/arm64-icmp-opt.ll
@@ -10,7 +10,7 @@ entry:
; CHECK-LABEL: t1:
; CHECK-NOT: movn
; CHECK: cmp x0, #0
-; CHECK: csinc w0, wzr, wzr, lt
+; CHECK: cset w0, ge
%cmp = icmp sgt i64 %a, -1
%conv = zext i1 %cmp to i32
ret i32 %conv
diff --git a/test/CodeGen/ARM64/illegal-float-ops.ll b/test/CodeGen/AArch64/arm64-illegal-float-ops.ll
index 9a35fe5..9a35fe5 100644
--- a/test/CodeGen/ARM64/illegal-float-ops.ll
+++ b/test/CodeGen/AArch64/arm64-illegal-float-ops.ll
diff --git a/test/CodeGen/ARM64/indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll
index e390ed7..e501c6e 100644
--- a/test/CodeGen/ARM64/indexed-memory.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-redzone | FileCheck %s
define void @store64(i64** nocapture %out, i64 %index, i64 %spacing) nounwind noinline ssp {
; CHECK-LABEL: store64:
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
new file mode 100644
index 0000000..c118f10
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s
+
+; This used to assert with "Overran sorted position" in AssignTopologicalOrder
+; due to a cycle created in performPostLD1Combine.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp
+define void @f(double* %P1) #0 {
+entry:
+ %arrayidx4 = getelementptr inbounds double* %P1, i64 1
+ %0 = load double* %arrayidx4, align 8, !tbaa !1
+ %1 = load double* %P1, align 8, !tbaa !1
+ %2 = insertelement <2 x double> undef, double %0, i32 0
+ %3 = insertelement <2 x double> %2, double %1, i32 1
+ %4 = fsub <2 x double> zeroinitializer, %3
+ %5 = fmul <2 x double> undef, %4
+ %6 = extractelement <2 x double> %5, i32 0
+ %cmp168 = fcmp olt double %6, undef
+ br i1 %cmp168, label %if.then172, label %return
+
+if.then172: ; preds = %cond.end90
+ %7 = tail call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false)
+ br label %return
+
+return: ; preds = %if.then172, %cond.end90, %entry
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) #1
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"double", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
new file mode 100644
index 0000000..9ee4063
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -0,0 +1,6174 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
+
+@ptr = global i8* null
+
+define <8 x i8> @test_v8i8_pre_load(<8 x i8>* %addr) {
+; CHECK-LABEL: test_v8i8_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <8 x i8>* %addr, i32 5
+ %val = load <8 x i8>* %newaddr, align 8
+ store <8 x i8>* %newaddr, <8 x i8>** bitcast(i8** @ptr to <8 x i8>**)
+ ret <8 x i8> %val
+}
+
+define <8 x i8> @test_v8i8_post_load(<8 x i8>* %addr) {
+; CHECK-LABEL: test_v8i8_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <8 x i8>* %addr, i32 5
+ %val = load <8 x i8>* %addr, align 8
+ store <8 x i8>* %newaddr, <8 x i8>** bitcast(i8** @ptr to <8 x i8>**)
+ ret <8 x i8> %val
+}
+
+define void @test_v8i8_pre_store(<8 x i8> %in, <8 x i8>* %addr) {
+; CHECK-LABEL: test_v8i8_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <8 x i8>* %addr, i32 5
+ store <8 x i8> %in, <8 x i8>* %newaddr, align 8
+ store <8 x i8>* %newaddr, <8 x i8>** bitcast(i8** @ptr to <8 x i8>**)
+ ret void
+}
+
+define void @test_v8i8_post_store(<8 x i8> %in, <8 x i8>* %addr) {
+; CHECK-LABEL: test_v8i8_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <8 x i8>* %addr, i32 5
+ store <8 x i8> %in, <8 x i8>* %addr, align 8
+ store <8 x i8>* %newaddr, <8 x i8>** bitcast(i8** @ptr to <8 x i8>**)
+ ret void
+}
+
+define <4 x i16> @test_v4i16_pre_load(<4 x i16>* %addr) {
+; CHECK-LABEL: test_v4i16_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <4 x i16>* %addr, i32 5
+ %val = load <4 x i16>* %newaddr, align 8
+ store <4 x i16>* %newaddr, <4 x i16>** bitcast(i8** @ptr to <4 x i16>**)
+ ret <4 x i16> %val
+}
+
+define <4 x i16> @test_v4i16_post_load(<4 x i16>* %addr) {
+; CHECK-LABEL: test_v4i16_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <4 x i16>* %addr, i32 5
+ %val = load <4 x i16>* %addr, align 8
+ store <4 x i16>* %newaddr, <4 x i16>** bitcast(i8** @ptr to <4 x i16>**)
+ ret <4 x i16> %val
+}
+
+define void @test_v4i16_pre_store(<4 x i16> %in, <4 x i16>* %addr) {
+; CHECK-LABEL: test_v4i16_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <4 x i16>* %addr, i32 5
+ store <4 x i16> %in, <4 x i16>* %newaddr, align 8
+ store <4 x i16>* %newaddr, <4 x i16>** bitcast(i8** @ptr to <4 x i16>**)
+ ret void
+}
+
+define void @test_v4i16_post_store(<4 x i16> %in, <4 x i16>* %addr) {
+; CHECK-LABEL: test_v4i16_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <4 x i16>* %addr, i32 5
+ store <4 x i16> %in, <4 x i16>* %addr, align 8
+ store <4 x i16>* %newaddr, <4 x i16>** bitcast(i8** @ptr to <4 x i16>**)
+ ret void
+}
+
+define <2 x i32> @test_v2i32_pre_load(<2 x i32>* %addr) {
+; CHECK-LABEL: test_v2i32_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <2 x i32>* %addr, i32 5
+ %val = load <2 x i32>* %newaddr, align 8
+ store <2 x i32>* %newaddr, <2 x i32>** bitcast(i8** @ptr to <2 x i32>**)
+ ret <2 x i32> %val
+}
+
+define <2 x i32> @test_v2i32_post_load(<2 x i32>* %addr) {
+; CHECK-LABEL: test_v2i32_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <2 x i32>* %addr, i32 5
+ %val = load <2 x i32>* %addr, align 8
+ store <2 x i32>* %newaddr, <2 x i32>** bitcast(i8** @ptr to <2 x i32>**)
+ ret <2 x i32> %val
+}
+
+define void @test_v2i32_pre_store(<2 x i32> %in, <2 x i32>* %addr) {
+; CHECK-LABEL: test_v2i32_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <2 x i32>* %addr, i32 5
+ store <2 x i32> %in, <2 x i32>* %newaddr, align 8
+ store <2 x i32>* %newaddr, <2 x i32>** bitcast(i8** @ptr to <2 x i32>**)
+ ret void
+}
+
+define void @test_v2i32_post_store(<2 x i32> %in, <2 x i32>* %addr) {
+; CHECK-LABEL: test_v2i32_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <2 x i32>* %addr, i32 5
+ store <2 x i32> %in, <2 x i32>* %addr, align 8
+ store <2 x i32>* %newaddr, <2 x i32>** bitcast(i8** @ptr to <2 x i32>**)
+ ret void
+}
+
+define <2 x float> @test_v2f32_pre_load(<2 x float>* %addr) {
+; CHECK-LABEL: test_v2f32_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <2 x float>* %addr, i32 5
+ %val = load <2 x float>* %newaddr, align 8
+ store <2 x float>* %newaddr, <2 x float>** bitcast(i8** @ptr to <2 x float>**)
+ ret <2 x float> %val
+}
+
+define <2 x float> @test_v2f32_post_load(<2 x float>* %addr) {
+; CHECK-LABEL: test_v2f32_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <2 x float>* %addr, i32 5
+ %val = load <2 x float>* %addr, align 8
+ store <2 x float>* %newaddr, <2 x float>** bitcast(i8** @ptr to <2 x float>**)
+ ret <2 x float> %val
+}
+
+define void @test_v2f32_pre_store(<2 x float> %in, <2 x float>* %addr) {
+; CHECK-LABEL: test_v2f32_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <2 x float>* %addr, i32 5
+ store <2 x float> %in, <2 x float>* %newaddr, align 8
+ store <2 x float>* %newaddr, <2 x float>** bitcast(i8** @ptr to <2 x float>**)
+ ret void
+}
+
+define void @test_v2f32_post_store(<2 x float> %in, <2 x float>* %addr) {
+; CHECK-LABEL: test_v2f32_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <2 x float>* %addr, i32 5
+ store <2 x float> %in, <2 x float>* %addr, align 8
+ store <2 x float>* %newaddr, <2 x float>** bitcast(i8** @ptr to <2 x float>**)
+ ret void
+}
+
+define <1 x i64> @test_v1i64_pre_load(<1 x i64>* %addr) {
+; CHECK-LABEL: test_v1i64_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <1 x i64>* %addr, i32 5
+ %val = load <1 x i64>* %newaddr, align 8
+ store <1 x i64>* %newaddr, <1 x i64>** bitcast(i8** @ptr to <1 x i64>**)
+ ret <1 x i64> %val
+}
+
+define <1 x i64> @test_v1i64_post_load(<1 x i64>* %addr) {
+; CHECK-LABEL: test_v1i64_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <1 x i64>* %addr, i32 5
+ %val = load <1 x i64>* %addr, align 8
+ store <1 x i64>* %newaddr, <1 x i64>** bitcast(i8** @ptr to <1 x i64>**)
+ ret <1 x i64> %val
+}
+
+define void @test_v1i64_pre_store(<1 x i64> %in, <1 x i64>* %addr) {
+; CHECK-LABEL: test_v1i64_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <1 x i64>* %addr, i32 5
+ store <1 x i64> %in, <1 x i64>* %newaddr, align 8
+ store <1 x i64>* %newaddr, <1 x i64>** bitcast(i8** @ptr to <1 x i64>**)
+ ret void
+}
+
+define void @test_v1i64_post_store(<1 x i64> %in, <1 x i64>* %addr) {
+; CHECK-LABEL: test_v1i64_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <1 x i64>* %addr, i32 5
+ store <1 x i64> %in, <1 x i64>* %addr, align 8
+ store <1 x i64>* %newaddr, <1 x i64>** bitcast(i8** @ptr to <1 x i64>**)
+ ret void
+}
+
+define <16 x i8> @test_v16i8_pre_load(<16 x i8>* %addr) {
+; CHECK-LABEL: test_v16i8_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <16 x i8>* %addr, i32 5
+ %val = load <16 x i8>* %newaddr, align 8
+ store <16 x i8>* %newaddr, <16 x i8>** bitcast(i8** @ptr to <16 x i8>**)
+ ret <16 x i8> %val
+}
+
+define <16 x i8> @test_v16i8_post_load(<16 x i8>* %addr) {
+; CHECK-LABEL: test_v16i8_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <16 x i8>* %addr, i32 5
+ %val = load <16 x i8>* %addr, align 8
+ store <16 x i8>* %newaddr, <16 x i8>** bitcast(i8** @ptr to <16 x i8>**)
+ ret <16 x i8> %val
+}
+
+define void @test_v16i8_pre_store(<16 x i8> %in, <16 x i8>* %addr) {
+; CHECK-LABEL: test_v16i8_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <16 x i8>* %addr, i32 5
+ store <16 x i8> %in, <16 x i8>* %newaddr, align 8
+ store <16 x i8>* %newaddr, <16 x i8>** bitcast(i8** @ptr to <16 x i8>**)
+ ret void
+}
+
+define void @test_v16i8_post_store(<16 x i8> %in, <16 x i8>* %addr) {
+; CHECK-LABEL: test_v16i8_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <16 x i8>* %addr, i32 5
+ store <16 x i8> %in, <16 x i8>* %addr, align 8
+ store <16 x i8>* %newaddr, <16 x i8>** bitcast(i8** @ptr to <16 x i8>**)
+ ret void
+}
+
+define <8 x i16> @test_v8i16_pre_load(<8 x i16>* %addr) {
+; CHECK-LABEL: test_v8i16_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <8 x i16>* %addr, i32 5
+ %val = load <8 x i16>* %newaddr, align 8
+ store <8 x i16>* %newaddr, <8 x i16>** bitcast(i8** @ptr to <8 x i16>**)
+ ret <8 x i16> %val
+}
+
+define <8 x i16> @test_v8i16_post_load(<8 x i16>* %addr) {
+; CHECK-LABEL: test_v8i16_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <8 x i16>* %addr, i32 5
+ %val = load <8 x i16>* %addr, align 8
+ store <8 x i16>* %newaddr, <8 x i16>** bitcast(i8** @ptr to <8 x i16>**)
+ ret <8 x i16> %val
+}
+
+define void @test_v8i16_pre_store(<8 x i16> %in, <8 x i16>* %addr) {
+; CHECK-LABEL: test_v8i16_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <8 x i16>* %addr, i32 5
+ store <8 x i16> %in, <8 x i16>* %newaddr, align 8
+ store <8 x i16>* %newaddr, <8 x i16>** bitcast(i8** @ptr to <8 x i16>**)
+ ret void
+}
+
+define void @test_v8i16_post_store(<8 x i16> %in, <8 x i16>* %addr) {
+; CHECK-LABEL: test_v8i16_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <8 x i16>* %addr, i32 5
+ store <8 x i16> %in, <8 x i16>* %addr, align 8
+ store <8 x i16>* %newaddr, <8 x i16>** bitcast(i8** @ptr to <8 x i16>**)
+ ret void
+}
+
+define <4 x i32> @test_v4i32_pre_load(<4 x i32>* %addr) {
+; CHECK-LABEL: test_v4i32_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <4 x i32>* %addr, i32 5
+ %val = load <4 x i32>* %newaddr, align 8
+ store <4 x i32>* %newaddr, <4 x i32>** bitcast(i8** @ptr to <4 x i32>**)
+ ret <4 x i32> %val
+}
+
+define <4 x i32> @test_v4i32_post_load(<4 x i32>* %addr) {
+; CHECK-LABEL: test_v4i32_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <4 x i32>* %addr, i32 5
+ %val = load <4 x i32>* %addr, align 8
+ store <4 x i32>* %newaddr, <4 x i32>** bitcast(i8** @ptr to <4 x i32>**)
+ ret <4 x i32> %val
+}
+
+define void @test_v4i32_pre_store(<4 x i32> %in, <4 x i32>* %addr) {
+; CHECK-LABEL: test_v4i32_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <4 x i32>* %addr, i32 5
+ store <4 x i32> %in, <4 x i32>* %newaddr, align 8
+ store <4 x i32>* %newaddr, <4 x i32>** bitcast(i8** @ptr to <4 x i32>**)
+ ret void
+}
+
+define void @test_v4i32_post_store(<4 x i32> %in, <4 x i32>* %addr) {
+; CHECK-LABEL: test_v4i32_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <4 x i32>* %addr, i32 5
+ store <4 x i32> %in, <4 x i32>* %addr, align 8
+ store <4 x i32>* %newaddr, <4 x i32>** bitcast(i8** @ptr to <4 x i32>**)
+ ret void
+}
+
+
+define <4 x float> @test_v4f32_pre_load(<4 x float>* %addr) {
+; CHECK-LABEL: test_v4f32_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <4 x float>* %addr, i32 5
+ %val = load <4 x float>* %newaddr, align 8
+ store <4 x float>* %newaddr, <4 x float>** bitcast(i8** @ptr to <4 x float>**)
+ ret <4 x float> %val
+}
+
+define <4 x float> @test_v4f32_post_load(<4 x float>* %addr) {
+; CHECK-LABEL: test_v4f32_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <4 x float>* %addr, i32 5
+ %val = load <4 x float>* %addr, align 8
+ store <4 x float>* %newaddr, <4 x float>** bitcast(i8** @ptr to <4 x float>**)
+ ret <4 x float> %val
+}
+
+define void @test_v4f32_pre_store(<4 x float> %in, <4 x float>* %addr) {
+; CHECK-LABEL: test_v4f32_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <4 x float>* %addr, i32 5
+ store <4 x float> %in, <4 x float>* %newaddr, align 8
+ store <4 x float>* %newaddr, <4 x float>** bitcast(i8** @ptr to <4 x float>**)
+ ret void
+}
+
+define void @test_v4f32_post_store(<4 x float> %in, <4 x float>* %addr) {
+; CHECK-LABEL: test_v4f32_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <4 x float>* %addr, i32 5
+ store <4 x float> %in, <4 x float>* %addr, align 8
+ store <4 x float>* %newaddr, <4 x float>** bitcast(i8** @ptr to <4 x float>**)
+ ret void
+}
+
+
+define <2 x i64> @test_v2i64_pre_load(<2 x i64>* %addr) {
+; CHECK-LABEL: test_v2i64_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <2 x i64>* %addr, i32 5
+ %val = load <2 x i64>* %newaddr, align 8
+ store <2 x i64>* %newaddr, <2 x i64>** bitcast(i8** @ptr to <2 x i64>**)
+ ret <2 x i64> %val
+}
+
+define <2 x i64> @test_v2i64_post_load(<2 x i64>* %addr) {
+; CHECK-LABEL: test_v2i64_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <2 x i64>* %addr, i32 5
+ %val = load <2 x i64>* %addr, align 8
+ store <2 x i64>* %newaddr, <2 x i64>** bitcast(i8** @ptr to <2 x i64>**)
+ ret <2 x i64> %val
+}
+
+define void @test_v2i64_pre_store(<2 x i64> %in, <2 x i64>* %addr) {
+; CHECK-LABEL: test_v2i64_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <2 x i64>* %addr, i32 5
+ store <2 x i64> %in, <2 x i64>* %newaddr, align 8
+ store <2 x i64>* %newaddr, <2 x i64>** bitcast(i8** @ptr to <2 x i64>**)
+ ret void
+}
+
+define void @test_v2i64_post_store(<2 x i64> %in, <2 x i64>* %addr) {
+; CHECK-LABEL: test_v2i64_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <2 x i64>* %addr, i32 5
+ store <2 x i64> %in, <2 x i64>* %addr, align 8
+ store <2 x i64>* %newaddr, <2 x i64>** bitcast(i8** @ptr to <2 x i64>**)
+ ret void
+}
+
+
+define <2 x double> @test_v2f64_pre_load(<2 x double>* %addr) {
+; CHECK-LABEL: test_v2f64_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <2 x double>* %addr, i32 5
+ %val = load <2 x double>* %newaddr, align 8
+ store <2 x double>* %newaddr, <2 x double>** bitcast(i8** @ptr to <2 x double>**)
+ ret <2 x double> %val
+}
+
+define <2 x double> @test_v2f64_post_load(<2 x double>* %addr) {
+; CHECK-LABEL: test_v2f64_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <2 x double>* %addr, i32 5
+ %val = load <2 x double>* %addr, align 8
+ store <2 x double>* %newaddr, <2 x double>** bitcast(i8** @ptr to <2 x double>**)
+ ret <2 x double> %val
+}
+
+define void @test_v2f64_pre_store(<2 x double> %in, <2 x double>* %addr) {
+; CHECK-LABEL: test_v2f64_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <2 x double>* %addr, i32 5
+ store <2 x double> %in, <2 x double>* %newaddr, align 8
+ store <2 x double>* %newaddr, <2 x double>** bitcast(i8** @ptr to <2 x double>**)
+ ret void
+}
+
+define void @test_v2f64_post_store(<2 x double> %in, <2 x double>* %addr) {
+; CHECK-LABEL: test_v2f64_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <2 x double>* %addr, i32 5
+ store <2 x double> %in, <2 x double>* %addr, align 8
+ store <2 x double>* %newaddr, <2 x double>** bitcast(i8** @ptr to <2 x double>**)
+ ret void
+}
+
+define i8* @test_v16i8_post_imm_st1_lane(<16 x i8> %in, i8* %addr) {
+; CHECK-LABEL: test_v16i8_post_imm_st1_lane:
+; CHECK: st1.b { v0 }[3], [x0], #1
+ %elt = extractelement <16 x i8> %in, i32 3
+ store i8 %elt, i8* %addr
+
+ %newaddr = getelementptr i8* %addr, i32 1
+ ret i8* %newaddr
+}
+
+define i8* @test_v16i8_post_reg_st1_lane(<16 x i8> %in, i8* %addr) {
+; CHECK-LABEL: test_v16i8_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x2
+; CHECK: st1.b { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <16 x i8> %in, i32 3
+ store i8 %elt, i8* %addr
+
+ %newaddr = getelementptr i8* %addr, i32 2
+ ret i8* %newaddr
+}
+
+
+define i16* @test_v8i16_post_imm_st1_lane(<8 x i16> %in, i16* %addr) {
+; CHECK-LABEL: test_v8i16_post_imm_st1_lane:
+; CHECK: st1.h { v0 }[3], [x0], #2
+ %elt = extractelement <8 x i16> %in, i32 3
+ store i16 %elt, i16* %addr
+
+ %newaddr = getelementptr i16* %addr, i32 1
+ ret i16* %newaddr
+}
+
+define i16* @test_v8i16_post_reg_st1_lane(<8 x i16> %in, i16* %addr) {
+; CHECK-LABEL: test_v8i16_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x4
+; CHECK: st1.h { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <8 x i16> %in, i32 3
+ store i16 %elt, i16* %addr
+
+ %newaddr = getelementptr i16* %addr, i32 2
+ ret i16* %newaddr
+}
+
+define i32* @test_v4i32_post_imm_st1_lane(<4 x i32> %in, i32* %addr) {
+; CHECK-LABEL: test_v4i32_post_imm_st1_lane:
+; CHECK: st1.s { v0 }[3], [x0], #4
+ %elt = extractelement <4 x i32> %in, i32 3
+ store i32 %elt, i32* %addr
+
+ %newaddr = getelementptr i32* %addr, i32 1
+ ret i32* %newaddr
+}
+
+define i32* @test_v4i32_post_reg_st1_lane(<4 x i32> %in, i32* %addr) {
+; CHECK-LABEL: test_v4i32_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8
+; CHECK: st1.s { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <4 x i32> %in, i32 3
+ store i32 %elt, i32* %addr
+
+ %newaddr = getelementptr i32* %addr, i32 2
+ ret i32* %newaddr
+}
+
+define float* @test_v4f32_post_imm_st1_lane(<4 x float> %in, float* %addr) {
+; CHECK-LABEL: test_v4f32_post_imm_st1_lane:
+; CHECK: st1.s { v0 }[3], [x0], #4
+ %elt = extractelement <4 x float> %in, i32 3
+ store float %elt, float* %addr
+
+ %newaddr = getelementptr float* %addr, i32 1
+ ret float* %newaddr
+}
+
+define float* @test_v4f32_post_reg_st1_lane(<4 x float> %in, float* %addr) {
+; CHECK-LABEL: test_v4f32_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8
+; CHECK: st1.s { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <4 x float> %in, i32 3
+ store float %elt, float* %addr
+
+ %newaddr = getelementptr float* %addr, i32 2
+ ret float* %newaddr
+}
+
+define i64* @test_v2i64_post_imm_st1_lane(<2 x i64> %in, i64* %addr) {
+; CHECK-LABEL: test_v2i64_post_imm_st1_lane:
+; CHECK: st1.d { v0 }[1], [x0], #8
+ %elt = extractelement <2 x i64> %in, i64 1
+ store i64 %elt, i64* %addr
+
+ %newaddr = getelementptr i64* %addr, i64 1
+ ret i64* %newaddr
+}
+
+define i64* @test_v2i64_post_reg_st1_lane(<2 x i64> %in, i64* %addr) {
+; CHECK-LABEL: test_v2i64_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x10
+; CHECK: st1.d { v0 }[1], [x0], x[[OFFSET]]
+ %elt = extractelement <2 x i64> %in, i64 1
+ store i64 %elt, i64* %addr
+
+ %newaddr = getelementptr i64* %addr, i64 2
+ ret i64* %newaddr
+}
+
+define double* @test_v2f64_post_imm_st1_lane(<2 x double> %in, double* %addr) {
+; CHECK-LABEL: test_v2f64_post_imm_st1_lane:
+; CHECK: st1.d { v0 }[1], [x0], #8
+ %elt = extractelement <2 x double> %in, i32 1
+ store double %elt, double* %addr
+
+ %newaddr = getelementptr double* %addr, i32 1
+ ret double* %newaddr
+}
+
+define double* @test_v2f64_post_reg_st1_lane(<2 x double> %in, double* %addr) {
+; CHECK-LABEL: test_v2f64_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x10
+; CHECK: st1.d { v0 }[1], [x0], x[[OFFSET]]
+ %elt = extractelement <2 x double> %in, i32 1
+ store double %elt, double* %addr
+
+ %newaddr = getelementptr double* %addr, i32 2
+ ret double* %newaddr
+}
+
+define i8* @test_v8i8_post_imm_st1_lane(<8 x i8> %in, i8* %addr) {
+; CHECK-LABEL: test_v8i8_post_imm_st1_lane:
+; CHECK: st1.b { v0 }[3], [x0], #1
+ %elt = extractelement <8 x i8> %in, i32 3
+ store i8 %elt, i8* %addr
+
+ %newaddr = getelementptr i8* %addr, i32 1
+ ret i8* %newaddr
+}
+
+define i8* @test_v8i8_post_reg_st1_lane(<8 x i8> %in, i8* %addr) {
+; CHECK-LABEL: test_v8i8_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x2
+; CHECK: st1.b { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <8 x i8> %in, i32 3
+ store i8 %elt, i8* %addr
+
+ %newaddr = getelementptr i8* %addr, i32 2
+ ret i8* %newaddr
+}
+
+define i16* @test_v4i16_post_imm_st1_lane(<4 x i16> %in, i16* %addr) {
+; CHECK-LABEL: test_v4i16_post_imm_st1_lane:
+; CHECK: st1.h { v0 }[3], [x0], #2
+ %elt = extractelement <4 x i16> %in, i32 3
+ store i16 %elt, i16* %addr
+
+ %newaddr = getelementptr i16* %addr, i32 1
+ ret i16* %newaddr
+}
+
+define i16* @test_v4i16_post_reg_st1_lane(<4 x i16> %in, i16* %addr) {
+; CHECK-LABEL: test_v4i16_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x4
+; CHECK: st1.h { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <4 x i16> %in, i32 3
+ store i16 %elt, i16* %addr
+
+ %newaddr = getelementptr i16* %addr, i32 2
+ ret i16* %newaddr
+}
+
+define i32* @test_v2i32_post_imm_st1_lane(<2 x i32> %in, i32* %addr) {
+; CHECK-LABEL: test_v2i32_post_imm_st1_lane:
+; CHECK: st1.s { v0 }[1], [x0], #4
+ %elt = extractelement <2 x i32> %in, i32 1
+ store i32 %elt, i32* %addr
+
+ %newaddr = getelementptr i32* %addr, i32 1
+ ret i32* %newaddr
+}
+
+define i32* @test_v2i32_post_reg_st1_lane(<2 x i32> %in, i32* %addr) {
+; CHECK-LABEL: test_v2i32_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8
+; CHECK: st1.s { v0 }[1], [x0], x[[OFFSET]]
+ %elt = extractelement <2 x i32> %in, i32 1
+ store i32 %elt, i32* %addr
+
+ %newaddr = getelementptr i32* %addr, i32 2
+ ret i32* %newaddr
+}
+
+define float* @test_v2f32_post_imm_st1_lane(<2 x float> %in, float* %addr) {
+; CHECK-LABEL: test_v2f32_post_imm_st1_lane:
+; CHECK: st1.s { v0 }[1], [x0], #4
+ %elt = extractelement <2 x float> %in, i32 1
+ store float %elt, float* %addr
+
+ %newaddr = getelementptr float* %addr, i32 1
+ ret float* %newaddr
+}
+
+define float* @test_v2f32_post_reg_st1_lane(<2 x float> %in, float* %addr) {
+; CHECK-LABEL: test_v2f32_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8
+; CHECK: st1.s { v0 }[1], [x0], x[[OFFSET]]
+ %elt = extractelement <2 x float> %in, i32 1
+ store float %elt, float* %addr
+
+ %newaddr = getelementptr float* %addr, i32 2
+ ret float* %newaddr
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld2:
+;CHECK: ld2.16b { v0, v1 }, [x0], #32
+ %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld2:
+;CHECK: ld2.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld2:
+;CHECK: ld2.8b { v0, v1 }, [x0], #16
+ %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 16
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld2:
+;CHECK: ld2.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld2:
+;CHECK: ld2.8h { v0, v1 }, [x0], #32
+ %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld2:
+;CHECK: ld2.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld2:
+;CHECK: ld2.4h { v0, v1 }, [x0], #16
+ %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 8
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld2:
+;CHECK: ld2.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld2:
+;CHECK: ld2.4s { v0, v1 }, [x0], #32
+ %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld2:
+;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld2:
+;CHECK: ld2.2s { v0, v1 }, [x0], #16
+ %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld2:
+;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld2:
+;CHECK: ld2.2d { v0, v1 }, [x0], #32
+ %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld2:
+;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld2:
+;CHECK: ld1.1d { v0, v1 }, [x0], #16
+ %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld2:
+;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld2:
+;CHECK: ld2.4s { v0, v1 }, [x0], #32
+ %ld2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld2:
+;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld2:
+;CHECK: ld2.2s { v0, v1 }, [x0], #16
+ %ld2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld2:
+;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld2:
+;CHECK: ld2.2d { v0, v1 }, [x0], #32
+ %ld2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld2:
+;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld2:
+;CHECK: ld1.1d { v0, v1 }, [x0], #16
+ %ld2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld2:
+;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double*)
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld3:
+;CHECK: ld3.16b { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 48
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld3:
+;CHECK: ld3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld3:
+;CHECK: ld3.8b { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 24
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld3:
+;CHECK: ld3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld3:
+;CHECK: ld3.8h { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 24
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld3:
+;CHECK: ld3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld3:
+;CHECK: ld3.4h { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 12
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld3:
+;CHECK: ld3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld3:
+;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 12
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld3:
+;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld3:
+;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 6
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld3:
+;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld3:
+;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 6
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld3:
+;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld3:
+;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 12
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld3:
+;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld3:
+;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 6
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld3:
+;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld3:
+;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 6
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld3:
+;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double*)
+
+
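+; Post-indexed ld4: same folding pattern as the ld3 tests above, with the
+; immediate write-back covering four registers' worth of data (e.g. #64 for
+; ld4.16b) and ld1.1d used for the <1 x i64>/<1 x double> cases.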
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld4:
+;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 64
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld4:
+;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld4:
+;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld4:
+;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld4:
+;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 32
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld4:
+;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld4:
+;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld4:
+;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld4:
+;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 16
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld4:
+;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld4:
+;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld4:
+;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld4:
+;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 8
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld4:
+;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld4:
+;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 16
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld4:
+;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld4:
+;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld4:
+;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld4:
+;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 8
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld4:
+;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double*)
+
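+; Post-indexed ld1 with two registers (ld1x2): immediate write-back of #32 for
+; the 128-bit forms and #16 for the 64-bit forms, or a register write-back for
+; a run-time increment.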
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x2(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld1x2:
+;CHECK: ld1.16b { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld1x2
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld1x2:
+;CHECK: ld1.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld1x2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x2(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld1x2:
+;CHECK: ld1.8b { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 16
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld1x2
+}
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld1x2:
+;CHECK: ld1.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld1x2
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x2(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld1x2:
+;CHECK: ld1.8h { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld1x2
+}
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld1x2:
+;CHECK: ld1.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld1x2
+}
+
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x2(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld1x2:
+;CHECK: ld1.4h { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 8
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld1x2
+}
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld1x2:
+;CHECK: ld1.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld1x2
+}
+
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x2(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld1x2:
+;CHECK: ld1.4s { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld1x2
+}
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld1x2:
+;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld1x2
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x2(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld1x2:
+;CHECK: ld1.2s { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld1x2
+}
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld1x2:
+;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld1x2
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x2(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld1x2:
+;CHECK: ld1.2d { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld1x2
+}
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld1x2:
+;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld1x2
+}
+
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x2(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld1x2:
+;CHECK: ld1.1d { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld1x2
+}
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld1x2:
+;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld1x2
+}
+
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x2(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld1x2:
+;CHECK: ld1.4s { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld1x2
+}
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld1x2:
+;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld1x2
+}
+
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x2(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld1x2:
+;CHECK: ld1.2s { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld1x2
+}
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld1x2:
+;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld1x2
+}
+
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x2(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld1x2:
+;CHECK: ld1.2d { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld1x2
+}
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld1x2:
+;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld1x2
+}
+
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x2(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld1x2:
+;CHECK: ld1.1d { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld1x2
+}
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld1x2:
+;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld1x2
+}
+
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double*)
+
+
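+; Post-indexed ld1 with three registers (ld1x3): immediate write-back of #48
+; for the 128-bit forms and #24 for the 64-bit forms, or a register write-back
+; for a run-time increment.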
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x3(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld1x3:
+;CHECK: ld1.16b { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 48
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld1x3:
+;CHECK: ld1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x3(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld1x3:
+;CHECK: ld1.8b { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 24
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld1x3:
+;CHECK: ld1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x3(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld1x3:
+;CHECK: ld1.8h { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 24
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld1x3:
+;CHECK: ld1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x3(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld1x3:
+;CHECK: ld1.4h { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 12
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld1x3:
+;CHECK: ld1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x3(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld1x3:
+;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 12
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld1x3:
+;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x3(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld1x3:
+;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 6
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld1x3:
+;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x3(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld1x3:
+;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 6
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld1x3:
+;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x3(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld1x3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld1x3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x3(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld1x3:
+;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 12
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3
+}
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld1x3:
+;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3
+}
+
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x3(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld1x3:
+;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 6
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3
+}
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld1x3:
+;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3
+}
+
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x3(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld1x3:
+;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 6
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3
+}
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld1x3:
+;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3
+}
+
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x3(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld1x3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3
+}
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld1x3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3
+}
+
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double*)
+
+
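+; Post-indexed ld1 with four registers (ld1x4): immediate write-back of #64 for
+; the 128-bit forms and #32 for the 64-bit forms, or a register write-back for
+; a run-time increment.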
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x4(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld1x4:
+;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 64
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld1x4:
+;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x4(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld1x4:
+;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld1x4:
+;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x4(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld1x4:
+;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 32
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld1x4:
+;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x4(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld1x4:
+;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld1x4:
+;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x4(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld1x4:
+;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 16
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld1x4:
+;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x4(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld1x4:
+;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld1x4:
+;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x4(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld1x4:
+;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 8
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld1x4:
+;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x4(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld1x4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld1x4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x4(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld1x4:
+;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 16
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4
+}
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld1x4:
+;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4
+}
+
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x4(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld1x4:
+;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4
+}
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld1x4:
+;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4
+}
+
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x4(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld1x4:
+;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 8
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4
+}
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld1x4:
+;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4
+}
+
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x4(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld1x4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4
+}
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld1x4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4
+}
+
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double*)
+
+
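+; Post-indexed ld2r (load one element and replicate it into two registers): the
+; immediate write-back is two elements' worth of bytes (#2 for i8, #4 for i16,
+; #8 for i32, #16 for i64), or a register for a run-time increment.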
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld2r:
+;CHECK: ld2r.16b { v0, v1 }, [x0], #2
+ %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld2r:
+;CHECK: ld2r.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld2r:
+;CHECK: ld2r.8b { v0, v1 }, [x0], #2
+ %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld2r:
+;CHECK: ld2r.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld2r:
+;CHECK: ld2r.8h { v0, v1 }, [x0], #4
+ %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld2r:
+;CHECK: ld2r.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld2r:
+;CHECK: ld2r.4h { v0, v1 }, [x0], #4
+ %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld2r:
+;CHECK: ld2r.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld2r:
+;CHECK: ld2r.4s { v0, v1 }, [x0], #8
+ %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld2r:
+;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld2r:
+;CHECK: ld2r.2s { v0, v1 }, [x0], #8
+ %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld2r:
+;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld2r:
+;CHECK: ld2r.2d { v0, v1 }, [x0], #16
+ %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld2r:
+;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld2r:
+;CHECK: ld2r.1d { v0, v1 }, [x0], #16
+ %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld2r:
+;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld2r:
+;CHECK: ld2r.4s { v0, v1 }, [x0], #8
+ %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld2r:
+;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float*) nounwind readonly
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld2r:
+;CHECK: ld2r.2s { v0, v1 }, [x0], #8
+ %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld2r:
+;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld2r:
+;CHECK: ld2r.2d { v0, v1 }, [x0], #16
+ %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld2r:
+;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double*) nounwind readonly
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld2r:
+;CHECK: ld2r.1d { v0, v1 }, [x0], #16
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld2r:
+;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double*) nounwind readonly
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld3r:
+;CHECK: ld3r.16b { v0, v1, v2 }, [x0], #3
+ %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld3r:
+;CHECK: ld3r.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld3r:
+;CHECK: ld3r.8b { v0, v1, v2 }, [x0], #3
+ %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld3r:
+;CHECK: ld3r.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld3r:
+;CHECK: ld3r.8h { v0, v1, v2 }, [x0], #6
+ %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld3r:
+;CHECK: ld3r.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld3r:
+;CHECK: ld3r.4h { v0, v1, v2 }, [x0], #6
+ %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld3r:
+;CHECK: ld3r.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld3r:
+;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12
+ %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld3r:
+;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld3r:
+;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12
+ %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld3r:
+;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld3r:
+;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24
+ %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld3r:
+;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld3r:
+;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24
+ %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld3r:
+;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld3r:
+;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12
+ %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld3r:
+;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float*) nounwind readonly
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld3r:
+;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12
+ %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld3r:
+;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld3r:
+;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld3r:
+;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double*) nounwind readonly
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld3r:
+;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24
+ %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld3r:
+;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double*) nounwind readonly
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld4r:
+;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], #4
+ %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld4r:
+;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld4r:
+;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], #4
+ %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld4r:
+;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld4r:
+;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], #8
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld4r:
+;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld4r:
+;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], #8
+ %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld4r:
+;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld4r:
+;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16
+ %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld4r:
+;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld4r:
+;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16
+ %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld4r:
+;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld4r:
+;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld4r:
+;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld4r:
+;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld4r:
+;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld4r:
+;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16
+ %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld4r:
+;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float*) nounwind readonly
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld4r:
+;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16
+ %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld4r:
+;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld4r:
+;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld4r:
+;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double*) nounwind readonly
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld4r:
+;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld4r:
+;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double*) nounwind readonly
+
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld2lane:
+;CHECK: ld2.b { v0, v1 }[0], [x0], #2
+ %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld2lane:
+;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld2lane:
+;CHECK: ld2.b { v0, v1 }[0], [x0], #2
+ %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld2lane:
+;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld2lane:
+;CHECK: ld2.h { v0, v1 }[0], [x0], #4
+ %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld2lane:
+;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld2lane:
+;CHECK: ld2.h { v0, v1 }[0], [x0], #4
+ %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld2lane:
+;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], #8
+ %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], #8
+ %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], #16
+ %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], #16
+ %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], #8
+ %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], #8
+ %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], #16
+ %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) nounwind readonly
+
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], #16
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) nounwind readonly
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld3lane:
+;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3
+ %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld3lane:
+;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld3lane:
+;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3
+ %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld3lane:
+;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld3lane:
+;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6
+ %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld3lane:
+;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld3lane:
+;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6
+ %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld3lane:
+;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
+ %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
+ %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
+ %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
+ %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
+ %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
+ %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly
+
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
+ %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly
+
+
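+; Post-indexed single-lane LD4 loads. Each pair of tests checks write-back with
+; an immediate offset (the total bytes loaded, i.e. 4 x element size) and with a
+; register offset; the updated base pointer is stored back through %ptr.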
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld4lane:
+;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4
+ %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld4lane:
+;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld4lane:
+;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4
+ %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld4lane:
+;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld4lane:
+;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld4lane:
+;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld4lane:
+;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8
+ %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld4lane:
+;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
+ %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
+ %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
+ %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
+ %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
+ %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
+ %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
+ %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly
+
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
+ %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly
+
+
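+; Post-indexed ST2 stores with immediate (size of the two vectors) and register
+; write-back. The <1 x i64>/<1 x double> cases are emitted as two-register ST1
+; (st1.1d), since ST2 has no .1d form.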
+define i8* @test_v16i8_post_imm_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st2:
+;CHECK: st2.16b { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st2:
+;CHECK: st2.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st2:
+;CHECK: st2.8b { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i32 16
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st2:
+;CHECK: st2.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st2:
+;CHECK: st2.8h { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st2:
+;CHECK: st2.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st2:
+;CHECK: st2.4h { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i32 8
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st2:
+;CHECK: st2.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st2:
+;CHECK: st2.4s { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st2:
+;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st2:
+;CHECK: st2.2s { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st2:
+;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st2:
+;CHECK: st2.2d { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st2:
+;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st2:
+;CHECK: st1.1d { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 2
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st2:
+;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st2:
+;CHECK: st2.4s { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st2:
+;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st2:
+;CHECK: st2.2s { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st2:
+;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st2:
+;CHECK: st2.2d { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st2:
+;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st2:
+;CHECK: st1.1d { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 2
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st2:
+;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double>, <1 x double>, double*)
+
+
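+; Post-indexed ST3 stores; the <1 x i64>/<1 x double> cases lower to
+; three-register ST1 (st1.1d).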
+define i8* @test_v16i8_post_imm_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st3:
+;CHECK: st3.16b { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i32 48
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st3:
+;CHECK: st3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st3:
+;CHECK: st3.8b { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i32 24
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st3:
+;CHECK: st3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st3:
+;CHECK: st3.8h { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i32 24
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st3:
+;CHECK: st3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st3:
+;CHECK: st3.4h { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i32 12
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st3:
+;CHECK: st3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st3:
+;CHECK: st3.4s { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i32 12
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st3:
+;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st3:
+;CHECK: st3.2s { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i32 6
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st3:
+;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st3:
+;CHECK: st3.2d { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 6
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st3:
+;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 3
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st3:
+;CHECK: st3.4s { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i32 12
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st3:
+;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st3:
+;CHECK: st3.2s { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i32 6
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st3:
+;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st3:
+;CHECK: st3.2d { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 6
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st3:
+;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 3
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*)
+
+
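+; Post-indexed ST4 stores; the <1 x i64>/<1 x double> cases lower to
+; four-register ST1 (st1.1d).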
+define i8* @test_v16i8_post_imm_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st4:
+;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i32 64
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st4:
+;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st4:
+;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st4:
+;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st4:
+;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i32 32
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st4:
+;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st4:
+;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st4:
+;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st4:
+;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i32 16
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st4:
+;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st4:
+;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st4:
+;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st4:
+;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 8
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st4:
+;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st4:
+;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i32 16
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st4:
+;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st4:
+;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st4:
+;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st4:
+;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 8
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st4:
+;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*)
+
+
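+; Post-indexed two-register ST1 stores (st1x2 intrinsic) with immediate and
+; register write-back.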
+define i8* @test_v16i8_post_imm_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st1x2:
+;CHECK: st1.16b { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st1x2:
+;CHECK: st1.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st1x2:
+;CHECK: st1.8b { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i32 16
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st1x2:
+;CHECK: st1.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st1x2:
+;CHECK: st1.8h { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st1x2:
+;CHECK: st1.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st1x2:
+;CHECK: st1.4h { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i32 8
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st1x2:
+;CHECK: st1.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st1x2:
+;CHECK: st1.4s { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st1x2:
+;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st1x2:
+;CHECK: st1.2s { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st1x2:
+;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st1x2:
+;CHECK: st1.2d { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st1x2:
+;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st1x2:
+;CHECK: st1.1d { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 2
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st1x2:
+;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st1x2:
+;CHECK: st1.4s { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st1x2:
+;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st1x2:
+;CHECK: st1.2s { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st1x2:
+;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st1x2:
+;CHECK: st1.2d { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st1x2:
+;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st1x2:
+;CHECK: st1.1d { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 2
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st1x2:
+;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*)
+
+
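+; Post-indexed st1x3 stores (immediate and register increment variants).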
+define i8* @test_v16i8_post_imm_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st1x3:
+;CHECK: st1.16b { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i32 48
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st1x3:
+;CHECK: st1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st1x3:
+;CHECK: st1.8b { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i32 24
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st1x3:
+;CHECK: st1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st1x3:
+;CHECK: st1.8h { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i32 24
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st1x3:
+;CHECK: st1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st1x3:
+;CHECK: st1.4h { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i32 12
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st1x3:
+;CHECK: st1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st1x3:
+;CHECK: st1.4s { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i32 12
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st1x3:
+;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st1x3:
+;CHECK: st1.2s { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i32 6
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st1x3:
+;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st1x3:
+;CHECK: st1.2d { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 6
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st1x3:
+;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st1x3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 3
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st1x3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st1x3:
+;CHECK: st1.4s { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i32 12
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st1x3:
+;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st1x3:
+;CHECK: st1.2s { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i32 6
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st1x3:
+;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st1x3:
+;CHECK: st1.2d { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 6
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st1x3:
+;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st1x3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 3
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st1x3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*)
+
+
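+; Post-indexed st1x4 stores (immediate and register increment variants).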
+define i8* @test_v16i8_post_imm_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st1x4:
+;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i32 64
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st1x4:
+;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st1x4:
+;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st1x4:
+;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st1x4:
+;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i32 32
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st1x4:
+;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st1x4:
+;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st1x4:
+;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st1x4:
+;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i32 16
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st1x4:
+;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st1x4:
+;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st1x4:
+;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st1x4:
+;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 8
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st1x4:
+;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st1x4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st1x4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st1x4:
+;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i32 16
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st1x4:
+;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st1x4:
+;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st1x4:
+;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st1x4:
+;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 8
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st1x4:
+;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st1x4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st1x4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*)
+
+
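+; st2lanelane intrinsic tests: these only exercise instruction selection and do not check the emitted assembly.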
+define i8* @test_v16i8_post_imm_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) {
+ call void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) {
+ call void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i64, i8*) nounwind readnone
+
+
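+; Post-indexed st2 single-lane stores (immediate and register increment variants).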
+define i8* @test_v16i8_post_imm_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st2lane:
+;CHECK: st2.b { v0, v1 }[0], [x0], #2
+ call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st2lane:
+;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*)
+
+
+define i8* @test_v8i8_post_imm_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st2lane:
+;CHECK: st2.b { v0, v1 }[0], [x0], #2
+ call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st2lane:
+;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*)
+
+
+define i16* @test_v8i16_post_imm_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st2lane:
+;CHECK: st2.h { v0, v1 }[0], [x0], #4
+ call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st2lane:
+;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*)
+
+
+define i16* @test_v4i16_post_imm_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st2lane:
+;CHECK: st2.h { v0, v1 }[0], [x0], #4
+ call void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st2lane:
+;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*)
+
+
+define i32* @test_v4i32_post_imm_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*)
+
+
+define i32* @test_v2i32_post_imm_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*)
+
+
+define i64* @test_v2i64_post_imm_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 2
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*)
+
+
+define i64* @test_v1i64_post_imm_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 2
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
+
+
+define float* @test_v4f32_post_imm_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
+
+
+define float* @test_v2f32_post_imm_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*)
+
+
+define double* @test_v2f64_post_imm_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 2
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*)
+
+
+define double* @test_v1f64_post_imm_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 2
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*)
+
+
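+; Post-indexed st3 single-lane stores (immediate and register increment variants).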
+define i8* @test_v16i8_post_imm_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st3lane:
+;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3
+ call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st3lane:
+;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*)
+
+
+define i8* @test_v8i8_post_imm_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st3lane:
+;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3
+ call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st3lane:
+;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*)
+
+
+define i16* @test_v8i16_post_imm_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st3lane:
+;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6
+ call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st3lane:
+;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*)
+
+
+define i16* @test_v4i16_post_imm_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st3lane:
+;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6
+ call void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st3lane:
+;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*)
+
+
+define i32* @test_v4i32_post_imm_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
+ call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+
+
+define i32* @test_v2i32_post_imm_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
+ call void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
+
+
+define i64* @test_v2i64_post_imm_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
+ call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 3
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
+
+
+define i64* @test_v1i64_post_imm_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
+ call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 3
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+
+
+define float* @test_v4f32_post_imm_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
+ call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
+
+
+define float* @test_v2f32_post_imm_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
+ call void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*)
+
+
+define double* @test_v2f64_post_imm_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
+ call void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 3
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*)
+
+
+define double* @test_v1f64_post_imm_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
+ call void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 3
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*)
+
+
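+; Post-indexed st4 single-lane stores (immediate and register increment variants).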
+define i8* @test_v16i8_post_imm_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st4lane:
+;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4
+ call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st4lane:
+;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*)
+
+
+define i8* @test_v8i8_post_imm_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st4lane:
+;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4
+ call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st4lane:
+;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*)
+
+
+define i16* @test_v8i16_post_imm_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st4lane:
+;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st4lane:
+;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*)
+
+
+define i16* @test_v4i16_post_imm_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st4lane:
+;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st4lane:
+;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*)
+
+
+define i32* @test_v4i32_post_imm_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+
+
+define i32* @test_v2i32_post_imm_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
+
+
+define i64* @test_v2i64_post_imm_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
+
+
+define i64* @test_v1i64_post_imm_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
+ call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+
+
+define float* @test_v4f32_post_imm_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
+
+
+define float* @test_v2f32_post_imm_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
+
+
+define double* @test_v2f64_post_imm_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
+ call void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*)
+
+
+define double* @test_v1f64_post_imm_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
+ call void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
+
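+; Post-indexed ld1r load-and-replicate tests (immediate and register increment variants).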
+define <16 x i8> @test_v16i8_post_imm_ld1r(i8* %bar, i8** %ptr) {
+; CHECK-LABEL: test_v16i8_post_imm_ld1r:
+; CHECK: ld1r.16b { v0 }, [x0], #1
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+ %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+ %tmp18 = getelementptr i8* %bar, i64 1
+ store i8* %tmp18, i8** %ptr
+ ret <16 x i8> %tmp17
+}
+
+define <16 x i8> @test_v16i8_post_reg_ld1r(i8* %bar, i8** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v16i8_post_reg_ld1r:
+; CHECK: ld1r.16b { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+ %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+ %tmp18 = getelementptr i8* %bar, i64 %inc
+ store i8* %tmp18, i8** %ptr
+ ret <16 x i8> %tmp17
+}
+
+define <8 x i8> @test_v8i8_post_imm_ld1r(i8* %bar, i8** %ptr) {
+; CHECK-LABEL: test_v8i8_post_imm_ld1r:
+; CHECK: ld1r.8b { v0 }, [x0], #1
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = getelementptr i8* %bar, i64 1
+ store i8* %tmp10, i8** %ptr
+ ret <8 x i8> %tmp9
+}
+
+define <8 x i8> @test_v8i8_post_reg_ld1r(i8* %bar, i8** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v8i8_post_reg_ld1r:
+; CHECK: ld1r.8b { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = getelementptr i8* %bar, i64 %inc
+ store i8* %tmp10, i8** %ptr
+ ret <8 x i8> %tmp9
+}
+
+define <8 x i16> @test_v8i16_post_imm_ld1r(i16* %bar, i16** %ptr) {
+; CHECK-LABEL: test_v8i16_post_imm_ld1r:
+; CHECK: ld1r.8h { v0 }, [x0], #2
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+ %tmp10 = getelementptr i16* %bar, i64 1
+ store i16* %tmp10, i16** %ptr
+ ret <8 x i16> %tmp9
+}
+
+define <8 x i16> @test_v8i16_post_reg_ld1r(i16* %bar, i16** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v8i16_post_reg_ld1r:
+; CHECK: ld1r.8h { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+ %tmp10 = getelementptr i16* %bar, i64 %inc
+ store i16* %tmp10, i16** %ptr
+ ret <8 x i16> %tmp9
+}
+
+define <4 x i16> @test_v4i16_post_imm_ld1r(i16* %bar, i16** %ptr) {
+; CHECK-LABEL: test_v4i16_post_imm_ld1r:
+; CHECK: ld1r.4h { v0 }, [x0], #2
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = getelementptr i16* %bar, i64 1
+ store i16* %tmp6, i16** %ptr
+ ret <4 x i16> %tmp5
+}
+
+define <4 x i16> @test_v4i16_post_reg_ld1r(i16* %bar, i16** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4i16_post_reg_ld1r:
+; CHECK: ld1r.4h { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = getelementptr i16* %bar, i64 %inc
+ store i16* %tmp6, i16** %ptr
+ ret <4 x i16> %tmp5
+}
+
+define <4 x i32> @test_v4i32_post_imm_ld1r(i32* %bar, i32** %ptr) {
+; CHECK-LABEL: test_v4i32_post_imm_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], #4
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+ %tmp6 = getelementptr i32* %bar, i64 1
+ store i32* %tmp6, i32** %ptr
+ ret <4 x i32> %tmp5
+}
+
+define <4 x i32> @test_v4i32_post_reg_ld1r(i32* %bar, i32** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4i32_post_reg_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+ %tmp6 = getelementptr i32* %bar, i64 %inc
+ store i32* %tmp6, i32** %ptr
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i32> @test_v2i32_post_imm_ld1r(i32* %bar, i32** %ptr) {
+; CHECK-LABEL: test_v2i32_post_imm_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], #4
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = getelementptr i32* %bar, i64 1
+ store i32* %tmp4, i32** %ptr
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test_v2i32_post_reg_ld1r(i32* %bar, i32** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2i32_post_reg_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = getelementptr i32* %bar, i64 %inc
+ store i32* %tmp4, i32** %ptr
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i64> @test_v2i64_post_imm_ld1r(i64* %bar, i64** %ptr) {
+; CHECK-LABEL: test_v2i64_post_imm_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], #8
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+ %tmp4 = getelementptr i64* %bar, i64 1
+ store i64* %tmp4, i64** %ptr
+ ret <2 x i64> %tmp3
+}
+
+define <2 x i64> @test_v2i64_post_reg_ld1r(i64* %bar, i64** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2i64_post_reg_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+ %tmp4 = getelementptr i64* %bar, i64 %inc
+ store i64* %tmp4, i64** %ptr
+ ret <2 x i64> %tmp3
+}
+
+define <4 x float> @test_v4f32_post_imm_ld1r(float* %bar, float** %ptr) {
+; CHECK-LABEL: test_v4f32_post_imm_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], #4
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> <float undef, float undef, float undef, float undef>, float %tmp1, i32 0
+ %tmp3 = insertelement <4 x float> %tmp2, float %tmp1, i32 1
+ %tmp4 = insertelement <4 x float> %tmp3, float %tmp1, i32 2
+ %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 3
+ %tmp6 = getelementptr float* %bar, i64 1
+ store float* %tmp6, float** %ptr
+ ret <4 x float> %tmp5
+}
+
+define <4 x float> @test_v4f32_post_reg_ld1r(float* %bar, float** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4f32_post_reg_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> <float undef, float undef, float undef, float undef>, float %tmp1, i32 0
+ %tmp3 = insertelement <4 x float> %tmp2, float %tmp1, i32 1
+ %tmp4 = insertelement <4 x float> %tmp3, float %tmp1, i32 2
+ %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 3
+ %tmp6 = getelementptr float* %bar, i64 %inc
+ store float* %tmp6, float** %ptr
+ ret <4 x float> %tmp5
+}
+
+define <2 x float> @test_v2f32_post_imm_ld1r(float* %bar, float** %ptr) {
+; CHECK-LABEL: test_v2f32_post_imm_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], #4
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> <float undef, float undef>, float %tmp1, i32 0
+ %tmp3 = insertelement <2 x float> %tmp2, float %tmp1, i32 1
+ %tmp4 = getelementptr float* %bar, i64 1
+ store float* %tmp4, float** %ptr
+ ret <2 x float> %tmp3
+}
+
+define <2 x float> @test_v2f32_post_reg_ld1r(float* %bar, float** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2f32_post_reg_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> <float undef, float undef>, float %tmp1, i32 0
+ %tmp3 = insertelement <2 x float> %tmp2, float %tmp1, i32 1
+ %tmp4 = getelementptr float* %bar, i64 %inc
+ store float* %tmp4, float** %ptr
+ ret <2 x float> %tmp3
+}
+
+define <2 x double> @test_v2f64_post_imm_ld1r(double* %bar, double** %ptr) {
+; CHECK-LABEL: test_v2f64_post_imm_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], #8
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> <double undef, double undef>, double %tmp1, i32 0
+ %tmp3 = insertelement <2 x double> %tmp2, double %tmp1, i32 1
+ %tmp4 = getelementptr double* %bar, i64 1
+ store double* %tmp4, double** %ptr
+ ret <2 x double> %tmp3
+}
+
+define <2 x double> @test_v2f64_post_reg_ld1r(double* %bar, double** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2f64_post_reg_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> <double undef, double undef>, double %tmp1, i32 0
+ %tmp3 = insertelement <2 x double> %tmp2, double %tmp1, i32 1
+ %tmp4 = getelementptr double* %bar, i64 %inc
+ store double* %tmp4, double** %ptr
+ ret <2 x double> %tmp3
+}
+
+define <16 x i8> @test_v16i8_post_imm_ld1lane(i8* %bar, i8** %ptr, <16 x i8> %A) {
+; CHECK-LABEL: test_v16i8_post_imm_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], #1
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
+ %tmp3 = getelementptr i8* %bar, i64 1
+ store i8* %tmp3, i8** %ptr
+ ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @test_v16i8_post_reg_ld1lane(i8* %bar, i8** %ptr, i64 %inc, <16 x i8> %A) {
+; CHECK-LABEL: test_v16i8_post_reg_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
+ %tmp3 = getelementptr i8* %bar, i64 %inc
+ store i8* %tmp3, i8** %ptr
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i8> @test_v8i8_post_imm_ld1lane(i8* %bar, i8** %ptr, <8 x i8> %A) {
+; CHECK-LABEL: test_v8i8_post_imm_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], #1
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> %A, i8 %tmp1, i32 1
+ %tmp3 = getelementptr i8* %bar, i64 1
+ store i8* %tmp3, i8** %ptr
+ ret <8 x i8> %tmp2
+}
+
+define <8 x i8> @test_v8i8_post_reg_ld1lane(i8* %bar, i8** %ptr, i64 %inc, <8 x i8> %A) {
+; CHECK-LABEL: test_v8i8_post_reg_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> %A, i8 %tmp1, i32 1
+ %tmp3 = getelementptr i8* %bar, i64 %inc
+ store i8* %tmp3, i8** %ptr
+ ret <8 x i8> %tmp2
+}
+
+define <8 x i16> @test_v8i16_post_imm_ld1lane(i16* %bar, i16** %ptr, <8 x i16> %A) {
+; CHECK-LABEL: test_v8i16_post_imm_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], #2
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1
+ %tmp3 = getelementptr i16* %bar, i64 1
+ store i16* %tmp3, i16** %ptr
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @test_v8i16_post_reg_ld1lane(i16* %bar, i16** %ptr, i64 %inc, <8 x i16> %A) {
+; CHECK-LABEL: test_v8i16_post_reg_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1
+ %tmp3 = getelementptr i16* %bar, i64 %inc
+ store i16* %tmp3, i16** %ptr
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i16> @test_v4i16_post_imm_ld1lane(i16* %bar, i16** %ptr, <4 x i16> %A) {
+; CHECK-LABEL: test_v4i16_post_imm_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], #2
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1
+ %tmp3 = getelementptr i16* %bar, i64 1
+ store i16* %tmp3, i16** %ptr
+ ret <4 x i16> %tmp2
+}
+
+define <4 x i16> @test_v4i16_post_reg_ld1lane(i16* %bar, i16** %ptr, i64 %inc, <4 x i16> %A) {
+; CHECK-LABEL: test_v4i16_post_reg_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1
+ %tmp3 = getelementptr i16* %bar, i64 %inc
+ store i16* %tmp3, i16** %ptr
+ ret <4 x i16> %tmp2
+}
+
+define <4 x i32> @test_v4i32_post_imm_ld1lane(i32* %bar, i32** %ptr, <4 x i32> %A) {
+; CHECK-LABEL: test_v4i32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1
+ %tmp3 = getelementptr i32* %bar, i64 1
+ store i32* %tmp3, i32** %ptr
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @test_v4i32_post_reg_ld1lane(i32* %bar, i32** %ptr, i64 %inc, <4 x i32> %A) {
+; CHECK-LABEL: test_v4i32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1
+ %tmp3 = getelementptr i32* %bar, i64 %inc
+ store i32* %tmp3, i32** %ptr
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i32> @test_v2i32_post_imm_ld1lane(i32* %bar, i32** %ptr, <2 x i32> %A) {
+; CHECK-LABEL: test_v2i32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1
+ %tmp3 = getelementptr i32* %bar, i64 1
+ store i32* %tmp3, i32** %ptr
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @test_v2i32_post_reg_ld1lane(i32* %bar, i32** %ptr, i64 %inc, <2 x i32> %A) {
+; CHECK-LABEL: test_v2i32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1
+ %tmp3 = getelementptr i32* %bar, i64 %inc
+ store i32* %tmp3, i32** %ptr
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i64> @test_v2i64_post_imm_ld1lane(i64* %bar, i64** %ptr, <2 x i64> %A) {
+; CHECK-LABEL: test_v2i64_post_imm_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], #8
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1
+ %tmp3 = getelementptr i64* %bar, i64 1
+ store i64* %tmp3, i64** %ptr
+ ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @test_v2i64_post_reg_ld1lane(i64* %bar, i64** %ptr, i64 %inc, <2 x i64> %A) {
+; CHECK-LABEL: test_v2i64_post_reg_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1
+ %tmp3 = getelementptr i64* %bar, i64 %inc
+ store i64* %tmp3, i64** %ptr
+ ret <2 x i64> %tmp2
+}
+
+define <4 x float> @test_v4f32_post_imm_ld1lane(float* %bar, float** %ptr, <4 x float> %A) {
+; CHECK-LABEL: test_v4f32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
+ %tmp3 = getelementptr float* %bar, i64 1
+ store float* %tmp3, float** %ptr
+ ret <4 x float> %tmp2
+}
+
+define <4 x float> @test_v4f32_post_reg_ld1lane(float* %bar, float** %ptr, i64 %inc, <4 x float> %A) {
+; CHECK-LABEL: test_v4f32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
+ %tmp3 = getelementptr float* %bar, i64 %inc
+ store float* %tmp3, float** %ptr
+ ret <4 x float> %tmp2
+}
+
+define <2 x float> @test_v2f32_post_imm_ld1lane(float* %bar, float** %ptr, <2 x float> %A) {
+; CHECK-LABEL: test_v2f32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1
+ %tmp3 = getelementptr float* %bar, i64 1
+ store float* %tmp3, float** %ptr
+ ret <2 x float> %tmp2
+}
+
+define <2 x float> @test_v2f32_post_reg_ld1lane(float* %bar, float** %ptr, i64 %inc, <2 x float> %A) {
+; CHECK-LABEL: test_v2f32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1
+ %tmp3 = getelementptr float* %bar, i64 %inc
+ store float* %tmp3, float** %ptr
+ ret <2 x float> %tmp2
+}
+
+define <2 x double> @test_v2f64_post_imm_ld1lane(double* %bar, double** %ptr, <2 x double> %A) {
+; CHECK-LABEL: test_v2f64_post_imm_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], #8
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> %A, double %tmp1, i32 1
+ %tmp3 = getelementptr double* %bar, i64 1
+ store double* %tmp3, double** %ptr
+ ret <2 x double> %tmp2
+}
+
+define <2 x double> @test_v2f64_post_reg_ld1lane(double* %bar, double** %ptr, i64 %inc, <2 x double> %A) {
+; CHECK-LABEL: test_v2f64_post_reg_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> %A, double %tmp1, i32 1
+ %tmp3 = getelementptr double* %bar, i64 %inc
+ store double* %tmp3, double** %ptr
+ ret <2 x double> %tmp2
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM64/inline-asm-error-I.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-I.ll
index a7aaf9e..a7aaf9e 100644
--- a/test/CodeGen/ARM64/inline-asm-error-I.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-I.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-J.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-J.ll
index 077e1b8..077e1b8 100644
--- a/test/CodeGen/ARM64/inline-asm-error-J.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-J.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-K.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-K.ll
index 2a7f961..2a7f961 100644
--- a/test/CodeGen/ARM64/inline-asm-error-K.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-K.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-L.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-L.ll
index 1701943..1701943 100644
--- a/test/CodeGen/ARM64/inline-asm-error-L.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-L.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-M.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-M.ll
index 952bf60..952bf60 100644
--- a/test/CodeGen/ARM64/inline-asm-error-M.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-M.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-N.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-N.ll
index b4a199f..b4a199f 100644
--- a/test/CodeGen/ARM64/inline-asm-error-N.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-N.ll
diff --git a/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll b/test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll
index 6bfce8f..6bfce8f 100644
--- a/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll
diff --git a/test/CodeGen/ARM64/inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
index e645078..d76cca3 100644
--- a/test/CodeGen/ARM64/inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -no-integrated-as | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as | FileCheck %s
; rdar://9167275
diff --git a/test/CodeGen/ARM64/join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll
index e99168b..e99168b 100644
--- a/test/CodeGen/ARM64/join-reserved.ll
+++ b/test/CodeGen/AArch64/arm64-join-reserved.ll
diff --git a/test/CodeGen/ARM64/jumptable.ll b/test/CodeGen/AArch64/arm64-jumptable.ll
index 4635cfe..4635cfe 100644
--- a/test/CodeGen/ARM64/jumptable.ll
+++ b/test/CodeGen/AArch64/arm64-jumptable.ll
diff --git a/test/CodeGen/AArch64/arm64-large-frame.ll b/test/CodeGen/AArch64/arm64-large-frame.ll
new file mode 100644
index 0000000..5a53da6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-large-frame.ll
@@ -0,0 +1,69 @@
+; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s
+declare void @use_addr(i8*)
+
+@addr = global i8* null
+
+define void @test_bigframe() {
+; CHECK-LABEL: test_bigframe:
+; CHECK: .cfi_startproc
+
+ %var1 = alloca i8, i32 20000000
+ %var2 = alloca i8, i32 16
+ %var3 = alloca i8, i32 20000000
+
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #1575, lsl #12
+; CHECK: sub sp, sp, #2576
+; CHECK: .cfi_def_cfa_offset 40000032
+
+
+; CHECK: add [[TMP:x[0-9]+]], sp, #4095, lsl #12
+; CHECK: add [[TMP1:x[0-9]+]], [[TMP]], #787, lsl #12
+; CHECK: add {{x[0-9]+}}, [[TMP1]], #3344
+ store volatile i8* %var1, i8** @addr
+
+ %var1plus2 = getelementptr i8* %var1, i32 2
+ store volatile i8* %var1plus2, i8** @addr
+
+; CHECK: add [[TMP:x[0-9]+]], sp, #4095, lsl #12
+; CHECK: add [[TMP1:x[0-9]+]], [[TMP]], #787, lsl #12
+; CHECK: add {{x[0-9]+}}, [[TMP1]], #3328
+ store volatile i8* %var2, i8** @addr
+
+ %var2plus2 = getelementptr i8* %var2, i32 2
+ store volatile i8* %var2plus2, i8** @addr
+
+ store volatile i8* %var3, i8** @addr
+
+ %var3plus2 = getelementptr i8* %var3, i32 2
+ store volatile i8* %var3plus2, i8** @addr
+
+; CHECK: add sp, sp, #4095, lsl #12
+; CHECK: add sp, sp, #4095, lsl #12
+; CHECK: add sp, sp, #1575, lsl #12
+; CHECK: add sp, sp, #2576
+; CHECK: .cfi_endproc
+ ret void
+}
+
+define void @test_mediumframe() {
+; CHECK-LABEL: test_mediumframe:
+ %var1 = alloca i8, i32 1000000
+ %var2 = alloca i8, i32 16
+ %var3 = alloca i8, i32 1000000
+; CHECK: sub sp, sp, #488, lsl #12
+; CHECK-NEXT: sub sp, sp, #1168
+
+ store volatile i8* %var1, i8** @addr
+; CHECK: add [[VAR1ADDR:x[0-9]+]], sp, #244, lsl #12
+; CHECK: add [[VAR1ADDR]], [[VAR1ADDR]], #592
+
+; CHECK: add [[VAR2ADDR:x[0-9]+]], sp, #244, lsl #12
+; CHECK: add [[VAR2ADDR]], [[VAR2ADDR]], #576
+
+ store volatile i8* %var2, i8** @addr
+; CHECK: add sp, sp, #488, lsl #12
+; CHECK: add sp, sp, #1168
+ ret void
+}
diff --git a/test/CodeGen/ARM64/ld1.ll b/test/CodeGen/AArch64/arm64-ld1.ll
index 61836a1..72d808c 100644
--- a/test/CodeGen/ARM64/ld1.ll
+++ b/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
@@ -10,7 +10,7 @@ define %struct.__neon_int8x8x2_t @ld2_8b(i8* %A) nounwind {
; and from the argument of the function also defined by ABI (i.e., x0)
; CHECK ld2.8b { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
ret %struct.__neon_int8x8x2_t %tmp2
}
@@ -19,7 +19,7 @@ define %struct.__neon_int8x8x3_t @ld3_8b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3.8b { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
ret %struct.__neon_int8x8x3_t %tmp2
}
@@ -28,13 +28,13 @@ define %struct.__neon_int8x8x4_t @ld4_8b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4.8b { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
ret %struct.__neon_int8x8x4_t %tmp2
}
-declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly
%struct.__neon_int8x16x2_t = type { <16 x i8>, <16 x i8> }
%struct.__neon_int8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> }
@@ -45,7 +45,7 @@ define %struct.__neon_int8x16x2_t @ld2_16b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld2.16b { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
ret %struct.__neon_int8x16x2_t %tmp2
}
@@ -54,7 +54,7 @@ define %struct.__neon_int8x16x3_t @ld3_16b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3.16b { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
ret %struct.__neon_int8x16x3_t %tmp2
}
@@ -63,13 +63,13 @@ define %struct.__neon_int8x16x4_t @ld4_16b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4.16b { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
ret %struct.__neon_int8x16x4_t %tmp2
}
-declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
@@ -80,7 +80,7 @@ define %struct.__neon_int16x4x2_t @ld2_4h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld2.4h { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
ret %struct.__neon_int16x4x2_t %tmp2
}
@@ -89,7 +89,7 @@ define %struct.__neon_int16x4x3_t @ld3_4h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3.4h { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
ret %struct.__neon_int16x4x3_t %tmp2
}
@@ -98,13 +98,13 @@ define %struct.__neon_int16x4x4_t @ld4_4h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4.4h { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
ret %struct.__neon_int16x4x4_t %tmp2
}
-declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly
%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
@@ -115,7 +115,7 @@ define %struct.__neon_int16x8x2_t @ld2_8h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld2.8h { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
ret %struct.__neon_int16x8x2_t %tmp2
}
@@ -124,7 +124,7 @@ define %struct.__neon_int16x8x3_t @ld3_8h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3.8h { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
ret %struct.__neon_int16x8x3_t %tmp2
}
@@ -133,13 +133,13 @@ define %struct.__neon_int16x8x4_t @ld4_8h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4.8h { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
ret %struct.__neon_int16x8x4_t %tmp2
}
-declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
@@ -150,7 +150,7 @@ define %struct.__neon_int32x2x2_t @ld2_2s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld2.2s { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
ret %struct.__neon_int32x2x2_t %tmp2
}
@@ -159,7 +159,7 @@ define %struct.__neon_int32x2x3_t @ld3_2s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3.2s { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
ret %struct.__neon_int32x2x3_t %tmp2
}
@@ -168,13 +168,13 @@ define %struct.__neon_int32x2x4_t @ld4_2s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4.2s { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
ret %struct.__neon_int32x2x4_t %tmp2
}
-declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
@@ -185,7 +185,7 @@ define %struct.__neon_int32x4x2_t @ld2_4s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld2.4s { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
ret %struct.__neon_int32x4x2_t %tmp2
}
@@ -194,7 +194,7 @@ define %struct.__neon_int32x4x3_t @ld3_4s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3.4s { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
ret %struct.__neon_int32x4x3_t %tmp2
}
@@ -203,13 +203,13 @@ define %struct.__neon_int32x4x4_t @ld4_4s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4.4s { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
ret %struct.__neon_int32x4x4_t %tmp2
}
-declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly
%struct.__neon_int64x2x2_t = type { <2 x i64>, <2 x i64> }
%struct.__neon_int64x2x3_t = type { <2 x i64>, <2 x i64>, <2 x i64> }
@@ -220,7 +220,7 @@ define %struct.__neon_int64x2x2_t @ld2_2d(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld2.2d { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
ret %struct.__neon_int64x2x2_t %tmp2
}
@@ -229,7 +229,7 @@ define %struct.__neon_int64x2x3_t @ld3_2d(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3.2d { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
ret %struct.__neon_int64x2x3_t %tmp2
}
@@ -238,13 +238,13 @@ define %struct.__neon_int64x2x4_t @ld4_2d(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4.2d { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
ret %struct.__neon_int64x2x4_t %tmp2
}
-declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly
%struct.__neon_int64x1x2_t = type { <1 x i64>, <1 x i64> }
%struct.__neon_int64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> }
@@ -256,7 +256,7 @@ define %struct.__neon_int64x1x2_t @ld2_1di64(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld1.1d { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
ret %struct.__neon_int64x1x2_t %tmp2
}
@@ -265,7 +265,7 @@ define %struct.__neon_int64x1x3_t @ld3_1di64(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld1.1d { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
ret %struct.__neon_int64x1x3_t %tmp2
}
@@ -274,14 +274,14 @@ define %struct.__neon_int64x1x4_t @ld4_1di64(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
ret %struct.__neon_int64x1x4_t %tmp2
}
-declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly
%struct.__neon_float64x1x2_t = type { <1 x double>, <1 x double> }
%struct.__neon_float64x1x3_t = type { <1 x double>, <1 x double>, <1 x double> }
@@ -293,7 +293,7 @@ define %struct.__neon_float64x1x2_t @ld2_1df64(double* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld1.1d { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A)
+ %tmp2 = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
ret %struct.__neon_float64x1x2_t %tmp2
}
@@ -302,7 +302,7 @@ define %struct.__neon_float64x1x3_t @ld3_1df64(double* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld1.1d { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A)
+ %tmp2 = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
ret %struct.__neon_float64x1x3_t %tmp2
}
@@ -311,13 +311,13 @@ define %struct.__neon_float64x1x4_t @ld4_1df64(double* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A)
+ %tmp2 = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
ret %struct.__neon_float64x1x4_t %tmp2
}
-declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double*) nounwind readonly
-declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double*) nounwind readonly
-declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0f64(double*) nounwind readonly
define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, i8* %A) nounwind {
@@ -325,7 +325,7 @@ define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, i8*
; CHECK: ld2lane_16b
; CHECK ld2.b { v0, v1 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A)
ret %struct.__neon_int8x16x2_t %tmp2
}
@@ -334,7 +334,7 @@ define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16
; CHECK: ld3lane_16b
; CHECK ld3.b { v0, v1, v2 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A)
ret %struct.__neon_int8x16x3_t %tmp2
}
@@ -343,20 +343,20 @@ define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16
; CHECK: ld4lane_16b
; CHECK ld4.b { v0, v1, v2, v3 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A)
ret %struct.__neon_int8x16x4_t %tmp2
}
-declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
-declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
-declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK: ld2lane_8h
; CHECK ld2.h { v0, v1 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A)
ret %struct.__neon_int16x8x2_t %tmp2
}
@@ -365,7 +365,7 @@ define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x
; CHECK: ld3lane_8h
; CHECK ld3.h { v0, v1, v3 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A)
ret %struct.__neon_int16x8x3_t %tmp2
}
@@ -374,20 +374,20 @@ define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x
; CHECK: ld4lane_8h
; CHECK ld4.h { v0, v1, v2, v3 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A)
ret %struct.__neon_int16x8x4_t %tmp2
}
-declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK: ld2lane_4s
; CHECK ld2.s { v0, v1 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A)
ret %struct.__neon_int32x4x2_t %tmp2
}
@@ -396,7 +396,7 @@ define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x
; CHECK: ld3lane_4s
; CHECK ld3.s { v0, v1, v2 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A)
ret %struct.__neon_int32x4x3_t %tmp2
}
@@ -405,20 +405,20 @@ define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x
; CHECK: ld4lane_4s
; CHECK ld4.s { v0, v1, v2, v3 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A)
ret %struct.__neon_int32x4x4_t %tmp2
}
-declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK: ld2lane_2d
; CHECK ld2.d { v0, v1 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A)
ret %struct.__neon_int64x2x2_t %tmp2
}
@@ -427,7 +427,7 @@ define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x
; CHECK: ld3lane_2d
; CHECK ld3.d { v0, v1, v3 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A)
ret %struct.__neon_int64x2x3_t %tmp2
}
@@ -436,13 +436,13 @@ define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x
; CHECK: ld4lane_2d
; CHECK ld4.d { v0, v1, v2, v3 }[1], [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A)
ret %struct.__neon_int64x2x4_t %tmp2
}
-declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
-declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
-declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
define <8 x i8> @ld1r_8b(i8* %bar) {
; CHECK: ld1r_8b
@@ -556,7 +556,7 @@ define %struct.__neon_int8x8x2_t @ld2r_8b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld2r.8b { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
ret %struct.__neon_int8x8x2_t %tmp2
}
@@ -565,7 +565,7 @@ define %struct.__neon_int8x8x3_t @ld3r_8b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3r.8b { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
ret %struct.__neon_int8x8x3_t %tmp2
}
@@ -574,20 +574,20 @@ define %struct.__neon_int8x8x4_t @ld4r_8b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4r.8b { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
ret %struct.__neon_int8x8x4_t %tmp2
}
-declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
define %struct.__neon_int8x16x2_t @ld2r_16b(i8* %A) nounwind {
; CHECK: ld2r_16b
; Make sure we are using the operands defined by the ABI
; CHECK ld2r.16b { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
ret %struct.__neon_int8x16x2_t %tmp2
}
@@ -596,7 +596,7 @@ define %struct.__neon_int8x16x3_t @ld3r_16b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3r.16b { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
ret %struct.__neon_int8x16x3_t %tmp2
}
@@ -605,20 +605,20 @@ define %struct.__neon_int8x16x4_t @ld4r_16b(i8* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4r.16b { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
ret %struct.__neon_int8x16x4_t %tmp2
}
-declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
define %struct.__neon_int16x4x2_t @ld2r_4h(i16* %A) nounwind {
; CHECK: ld2r_4h
; Make sure we are using the operands defined by the ABI
; CHECK ld2r.4h { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
ret %struct.__neon_int16x4x2_t %tmp2
}
@@ -627,7 +627,7 @@ define %struct.__neon_int16x4x3_t @ld3r_4h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3r.4h { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
ret %struct.__neon_int16x4x3_t %tmp2
}
@@ -636,20 +636,20 @@ define %struct.__neon_int16x4x4_t @ld4r_4h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4r.4h { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
ret %struct.__neon_int16x4x4_t %tmp2
}
-declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
define %struct.__neon_int16x8x2_t @ld2r_8h(i16* %A) nounwind {
; CHECK: ld2r_8h
; Make sure we are using the operands defined by the ABI
; CHECK ld2r.8h { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
ret %struct.__neon_int16x8x2_t %tmp2
}
@@ -658,7 +658,7 @@ define %struct.__neon_int16x8x3_t @ld3r_8h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3r.8h { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
ret %struct.__neon_int16x8x3_t %tmp2
}
@@ -667,20 +667,20 @@ define %struct.__neon_int16x8x4_t @ld4r_8h(i16* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4r.8h { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A)
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
ret %struct.__neon_int16x8x4_t %tmp2
}
-declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
define %struct.__neon_int32x2x2_t @ld2r_2s(i32* %A) nounwind {
; CHECK: ld2r_2s
; Make sure we are using the operands defined by the ABI
; CHECK ld2r.2s { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
ret %struct.__neon_int32x2x2_t %tmp2
}
@@ -689,7 +689,7 @@ define %struct.__neon_int32x2x3_t @ld3r_2s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3r.2s { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
ret %struct.__neon_int32x2x3_t %tmp2
}
@@ -698,20 +698,20 @@ define %struct.__neon_int32x2x4_t @ld4r_2s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4r.2s { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
ret %struct.__neon_int32x2x4_t %tmp2
}
-declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
define %struct.__neon_int32x4x2_t @ld2r_4s(i32* %A) nounwind {
; CHECK: ld2r_4s
; Make sure we are using the operands defined by the ABI
; CHECK ld2r.4s { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
ret %struct.__neon_int32x4x2_t %tmp2
}
@@ -720,7 +720,7 @@ define %struct.__neon_int32x4x3_t @ld3r_4s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3r.4s { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
ret %struct.__neon_int32x4x3_t %tmp2
}
@@ -729,20 +729,20 @@ define %struct.__neon_int32x4x4_t @ld4r_4s(i32* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4r.4s { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A)
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
ret %struct.__neon_int32x4x4_t %tmp2
}
-declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
define %struct.__neon_int64x1x2_t @ld2r_1d(i64* %A) nounwind {
; CHECK: ld2r_1d
; Make sure we are using the operands defined by the ABI
; CHECK ld2r.1d { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
ret %struct.__neon_int64x1x2_t %tmp2
}
@@ -751,7 +751,7 @@ define %struct.__neon_int64x1x3_t @ld3r_1d(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3r.1d { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
ret %struct.__neon_int64x1x3_t %tmp2
}
@@ -760,20 +760,20 @@ define %struct.__neon_int64x1x4_t @ld4r_1d(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4r.1d { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
ret %struct.__neon_int64x1x4_t %tmp2
}
-declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
define %struct.__neon_int64x2x2_t @ld2r_2d(i64* %A) nounwind {
; CHECK: ld2r_2d
; Make sure we are using the operands defined by the ABI
; CHECK ld2r.2d { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
ret %struct.__neon_int64x2x2_t %tmp2
}
@@ -782,7 +782,7 @@ define %struct.__neon_int64x2x3_t @ld3r_2d(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld3r.2d { v0, v1, v2 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
ret %struct.__neon_int64x2x3_t %tmp2
}
@@ -791,13 +791,13 @@ define %struct.__neon_int64x2x4_t @ld4r_2d(i64* %A) nounwind {
; Make sure we are using the operands defined by the ABI
; CHECK ld4r.2d { v0, v1, v2, v3 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A)
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
ret %struct.__neon_int64x2x4_t %tmp2
}
-declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) {
; CHECK-LABEL: ld1_16b
@@ -1041,52 +1041,52 @@ entry:
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
-declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly
define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(i8* %addr) {
; CHECK-LABEL: ld1_x2_v8i8:
; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %addr)
+ %val = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %addr)
ret %struct.__neon_int8x8x2_t %val
}
define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(i16* %addr) {
; CHECK-LABEL: ld1_x2_v4i16:
; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %addr)
+ %val = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %addr)
ret %struct.__neon_int16x4x2_t %val
}
define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(i32* %addr) {
; CHECK-LABEL: ld1_x2_v2i32:
; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %addr)
+ %val = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %addr)
ret %struct.__neon_int32x2x2_t %val
}
define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(float* %addr) {
; CHECK-LABEL: ld1_x2_v2f32:
; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %addr)
+ %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr)
ret %struct.__neon_float32x2x2_t %val
}
define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(i64* %addr) {
; CHECK-LABEL: ld1_x2_v1i64:
; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %addr)
+ %val = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %addr)
ret %struct.__neon_int64x1x2_t %val
}
define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(double* %addr) {
; CHECK-LABEL: ld1_x2_v1f64:
; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %addr)
+ %val = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %addr)
ret %struct.__neon_float64x1x2_t %val
}
@@ -1099,247 +1099,247 @@ define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(double* %addr) {
%struct.__neon_float64x2x3_t = type { <2 x double>, <2 x double>, <2 x double> }
%struct.__neon_float64x2x4_t = type { <2 x double>, <2 x double>, <2 x double>, <2 x double> }
-declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly
define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(i8* %addr) {
; CHECK-LABEL: ld1_x2_v16i8:
; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %addr)
+ %val = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %addr)
ret %struct.__neon_int8x16x2_t %val
}
define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(i16* %addr) {
; CHECK-LABEL: ld1_x2_v8i16:
; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %addr)
+ %val = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %addr)
ret %struct.__neon_int16x8x2_t %val
}
define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(i32* %addr) {
; CHECK-LABEL: ld1_x2_v4i32:
; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %addr)
+ %val = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %addr)
ret %struct.__neon_int32x4x2_t %val
}
define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(float* %addr) {
; CHECK-LABEL: ld1_x2_v4f32:
; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %addr)
+ %val = call %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %addr)
ret %struct.__neon_float32x4x2_t %val
}
define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(i64* %addr) {
; CHECK-LABEL: ld1_x2_v2i64:
; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %addr)
+ %val = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %addr)
ret %struct.__neon_int64x2x2_t %val
}
define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(double* %addr) {
; CHECK-LABEL: ld1_x2_v2f64:
; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %addr)
+ %val = call %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %addr)
ret %struct.__neon_float64x2x2_t %val
}
-declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly
define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(i8* %addr) {
; CHECK-LABEL: ld1_x3_v8i8:
; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %addr)
+ %val = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %addr)
ret %struct.__neon_int8x8x3_t %val
}
define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(i16* %addr) {
; CHECK-LABEL: ld1_x3_v4i16:
; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %addr)
+ %val = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %addr)
ret %struct.__neon_int16x4x3_t %val
}
define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(i32* %addr) {
; CHECK-LABEL: ld1_x3_v2i32:
; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %addr)
+ %val = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %addr)
ret %struct.__neon_int32x2x3_t %val
}
define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(float* %addr) {
; CHECK-LABEL: ld1_x3_v2f32:
; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %addr)
+ %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr)
ret %struct.__neon_float32x2x3_t %val
}
define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(i64* %addr) {
; CHECK-LABEL: ld1_x3_v1i64:
; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %addr)
+ %val = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %addr)
ret %struct.__neon_int64x1x3_t %val
}
define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(double* %addr) {
; CHECK-LABEL: ld1_x3_v1f64:
; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %addr)
+ %val = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %addr)
ret %struct.__neon_float64x1x3_t %val
}
-declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly
define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(i8* %addr) {
; CHECK-LABEL: ld1_x3_v16i8:
; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %addr)
+ %val = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %addr)
ret %struct.__neon_int8x16x3_t %val
}
define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(i16* %addr) {
; CHECK-LABEL: ld1_x3_v8i16:
; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %addr)
+ %val = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %addr)
ret %struct.__neon_int16x8x3_t %val
}
define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(i32* %addr) {
; CHECK-LABEL: ld1_x3_v4i32:
; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %addr)
+ %val = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %addr)
ret %struct.__neon_int32x4x3_t %val
}
define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(float* %addr) {
; CHECK-LABEL: ld1_x3_v4f32:
; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %addr)
+ %val = call %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %addr)
ret %struct.__neon_float32x4x3_t %val
}
define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(i64* %addr) {
; CHECK-LABEL: ld1_x3_v2i64:
; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %addr)
+ %val = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %addr)
ret %struct.__neon_int64x2x3_t %val
}
define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(double* %addr) {
; CHECK-LABEL: ld1_x3_v2f64:
; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %addr)
+ %val = call %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %addr)
ret %struct.__neon_float64x2x3_t %val
}
-declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly
define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(i8* %addr) {
; CHECK-LABEL: ld1_x4_v8i8:
; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %addr)
+ %val = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %addr)
ret %struct.__neon_int8x8x4_t %val
}
define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(i16* %addr) {
; CHECK-LABEL: ld1_x4_v4i16:
; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %addr)
+ %val = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %addr)
ret %struct.__neon_int16x4x4_t %val
}
define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(i32* %addr) {
; CHECK-LABEL: ld1_x4_v2i32:
; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %addr)
+ %val = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %addr)
ret %struct.__neon_int32x2x4_t %val
}
define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(float* %addr) {
; CHECK-LABEL: ld1_x4_v2f32:
; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %addr)
+ %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr)
ret %struct.__neon_float32x2x4_t %val
}
define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(i64* %addr) {
; CHECK-LABEL: ld1_x4_v1i64:
; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %addr)
+ %val = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %addr)
ret %struct.__neon_int64x1x4_t %val
}
define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(double* %addr) {
; CHECK-LABEL: ld1_x4_v1f64:
; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %addr)
+ %val = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %addr)
ret %struct.__neon_float64x1x4_t %val
}
-declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly
define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(i8* %addr) {
; CHECK-LABEL: ld1_x4_v16i8:
; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %addr)
+ %val = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %addr)
ret %struct.__neon_int8x16x4_t %val
}
define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(i16* %addr) {
; CHECK-LABEL: ld1_x4_v8i16:
; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %addr)
+ %val = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %addr)
ret %struct.__neon_int16x8x4_t %val
}
define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(i32* %addr) {
; CHECK-LABEL: ld1_x4_v4i32:
; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %addr)
+ %val = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %addr)
ret %struct.__neon_int32x4x4_t %val
}
define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(float* %addr) {
; CHECK-LABEL: ld1_x4_v4f32:
; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %addr)
+ %val = call %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %addr)
ret %struct.__neon_float32x4x4_t %val
}
define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(i64* %addr) {
; CHECK-LABEL: ld1_x4_v2i64:
; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %addr)
+ %val = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %addr)
ret %struct.__neon_int64x2x4_t %val
}
define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(double* %addr) {
; CHECK-LABEL: ld1_x4_v2f64:
; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- %val = call %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %addr)
+ %val = call %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %addr)
ret %struct.__neon_float64x2x4_t %val
}
diff --git a/test/CodeGen/ARM64/ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll
index 9444385..5a98626 100644
--- a/test/CodeGen/ARM64/ldp.ll
+++ b/test/CodeGen/AArch64/arm64-ldp.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\
+; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\
; RUN: -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s
; CHECK: ldp_int
diff --git a/test/CodeGen/ARM64/ldur.ll b/test/CodeGen/AArch64/arm64-ldur.ll
index 2848c06..2848c06 100644
--- a/test/CodeGen/ARM64/ldur.ll
+++ b/test/CodeGen/AArch64/arm64-ldur.ll
diff --git a/test/CodeGen/AArch64/arm64-ldxr-stxr.ll b/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
new file mode 100644
index 0000000..9093df2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
@@ -0,0 +1,270 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s
+
+%0 = type { i64, i64 }
+
+define i128 @f0(i8* %p) nounwind readonly {
+; CHECK-LABEL: f0:
+; CHECK: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %ldrexd = tail call %0 @llvm.aarch64.ldxp(i8* %p)
+ %0 = extractvalue %0 %ldrexd, 1
+ %1 = extractvalue %0 %ldrexd, 0
+ %2 = zext i64 %0 to i128
+ %3 = zext i64 %1 to i128
+ %shl = shl nuw i128 %2, 64
+ %4 = or i128 %shl, %3
+ ret i128 %4
+}
+
+define i32 @f1(i8* %ptr, i128 %val) nounwind {
+; CHECK-LABEL: f1:
+; CHECK: stxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %tmp4 = trunc i128 %val to i64
+ %tmp6 = lshr i128 %val, 64
+ %tmp7 = trunc i128 %tmp6 to i64
+ %strexd = tail call i32 @llvm.aarch64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
+ ret i32 %strexd
+}
+
+declare %0 @llvm.aarch64.ldxp(i8*) nounwind
+declare i32 @llvm.aarch64.stxp(i64, i64, i8*) nounwind
+
+@var = global i64 0, align 8
+
+define void @test_load_i8(i8* %addr) {
+; CHECK-LABEL: test_load_i8:
+; CHECK: ldxrb w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
+ %shortval = trunc i64 %val to i8
+ %extval = zext i8 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_i16(i16* %addr) {
+; CHECK-LABEL: test_load_i16:
+; CHECK: ldxrh w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldxr.p0i16(i16* %addr)
+ %shortval = trunc i64 %val to i16
+ %extval = zext i16 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_i32(i32* %addr) {
+; CHECK-LABEL: test_load_i32:
+; CHECK: ldxr w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr)
+ %shortval = trunc i64 %val to i32
+ %extval = zext i32 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_i64(i64* %addr) {
+; CHECK-LABEL: test_load_i64:
+; CHECK: ldxr x[[LOADVAL:[0-9]+]], [x0]
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldxr.p0i64(i64* %addr)
+ store i64 %val, i64* @var, align 8
+ ret void
+}
+
+
+declare i64 @llvm.aarch64.ldxr.p0i8(i8*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i16(i16*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i32(i32*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i64(i64*) nounwind
+
+define i32 @test_store_i8(i32, i8 %val, i8* %addr) {
+; CHECK-LABEL: test_store_i8:
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: stxrb w0, w1, [x2]
+ %extval = zext i8 %val to i64
+ %res = call i32 @llvm.aarch64.stxr.p0i8(i64 %extval, i8* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i16(i32, i16 %val, i16* %addr) {
+; CHECK-LABEL: test_store_i16:
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: stxrh w0, w1, [x2]
+ %extval = zext i16 %val to i64
+ %res = call i32 @llvm.aarch64.stxr.p0i16(i64 %extval, i16* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i32(i32, i32 %val, i32* %addr) {
+; CHECK-LABEL: test_store_i32:
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: stxr w0, w1, [x2]
+ %extval = zext i32 %val to i64
+ %res = call i32 @llvm.aarch64.stxr.p0i32(i64 %extval, i32* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i64(i32, i64 %val, i64* %addr) {
+; CHECK-LABEL: test_store_i64:
+; CHECK: stxr w0, x1, [x2]
+ %res = call i32 @llvm.aarch64.stxr.p0i64(i64 %val, i64* %addr)
+ ret i32 %res
+}
+
+declare i32 @llvm.aarch64.stxr.p0i8(i64, i8*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i16(i64, i16*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i32(i64, i32*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i64(i64, i64*) nounwind
+
+; CHECK: test_clear:
+; CHECK: clrex
+define void @test_clear() {
+ call void @llvm.aarch64.clrex()
+ ret void
+}
+
+declare void @llvm.aarch64.clrex() nounwind
+
+define i128 @test_load_acquire_i128(i8* %p) nounwind readonly {
+; CHECK-LABEL: test_load_acquire_i128:
+; CHECK: ldaxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %ldrexd = tail call %0 @llvm.aarch64.ldaxp(i8* %p)
+ %0 = extractvalue %0 %ldrexd, 1
+ %1 = extractvalue %0 %ldrexd, 0
+ %2 = zext i64 %0 to i128
+ %3 = zext i64 %1 to i128
+ %shl = shl nuw i128 %2, 64
+ %4 = or i128 %shl, %3
+ ret i128 %4
+}
+
+define i32 @test_store_release_i128(i8* %ptr, i128 %val) nounwind {
+; CHECK-LABEL: test_store_release_i128:
+; CHECK: stlxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %tmp4 = trunc i128 %val to i64
+ %tmp6 = lshr i128 %val, 64
+ %tmp7 = trunc i128 %tmp6 to i64
+ %strexd = tail call i32 @llvm.aarch64.stlxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
+ ret i32 %strexd
+}
+
+declare %0 @llvm.aarch64.ldaxp(i8*) nounwind
+declare i32 @llvm.aarch64.stlxp(i64, i64, i8*) nounwind
+
+define void @test_load_acquire_i8(i8* %addr) {
+; CHECK-LABEL: test_load_acquire_i8:
+; CHECK: ldaxrb w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
+ %shortval = trunc i64 %val to i8
+ %extval = zext i8 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_acquire_i16(i16* %addr) {
+; CHECK-LABEL: test_load_acquire_i16:
+; CHECK: ldaxrh w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldaxr.p0i16(i16* %addr)
+ %shortval = trunc i64 %val to i16
+ %extval = zext i16 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_acquire_i32(i32* %addr) {
+; CHECK-LABEL: test_load_acquire_i32:
+; CHECK: ldaxr w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldaxr.p0i32(i32* %addr)
+ %shortval = trunc i64 %val to i32
+ %extval = zext i32 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_acquire_i64(i64* %addr) {
+; CHECK-LABEL: test_load_acquire_i64:
+; CHECK: ldaxr x[[LOADVAL:[0-9]+]], [x0]
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldaxr.p0i64(i64* %addr)
+ store i64 %val, i64* @var, align 8
+ ret void
+}
+
+
+declare i64 @llvm.aarch64.ldaxr.p0i8(i8*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i16(i16*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i32(i32*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i64(i64*) nounwind
+
+define i32 @test_store_release_i8(i32, i8 %val, i8* %addr) {
+; CHECK-LABEL: test_store_release_i8:
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: stlxrb w0, w1, [x2]
+ %extval = zext i8 %val to i64
+ %res = call i32 @llvm.aarch64.stlxr.p0i8(i64 %extval, i8* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_release_i16(i32, i16 %val, i16* %addr) {
+; CHECK-LABEL: test_store_release_i16:
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: stlxrh w0, w1, [x2]
+ %extval = zext i16 %val to i64
+ %res = call i32 @llvm.aarch64.stlxr.p0i16(i64 %extval, i16* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_release_i32(i32, i32 %val, i32* %addr) {
+; CHECK-LABEL: test_store_release_i32:
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: stlxr w0, w1, [x2]
+ %extval = zext i32 %val to i64
+ %res = call i32 @llvm.aarch64.stlxr.p0i32(i64 %extval, i32* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_release_i64(i32, i64 %val, i64* %addr) {
+; CHECK-LABEL: test_store_release_i64:
+; CHECK: stlxr w0, x1, [x2]
+ %res = call i32 @llvm.aarch64.stlxr.p0i64(i64 %val, i64* %addr)
+ ret i32 %res
+}
+
+declare i32 @llvm.aarch64.stlxr.p0i8(i64, i8*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i16(i64, i16*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i32(i64, i32*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i64(i64, i64*) nounwind
diff --git a/test/CodeGen/ARM64/leaf.ll b/test/CodeGen/AArch64/arm64-leaf.ll
index d3b2031..d3b2031 100644
--- a/test/CodeGen/ARM64/leaf.ll
+++ b/test/CodeGen/AArch64/arm64-leaf.ll
diff --git a/test/CodeGen/ARM64/long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll
index 6f37044..d5baf16 100644
--- a/test/CodeGen/ARM64/long-shift.ll
+++ b/test/CodeGen/AArch64/arm64-long-shift.ll
@@ -2,16 +2,16 @@
define i128 @shl(i128 %r, i128 %s) nounwind readnone {
; CHECK-LABEL: shl:
-; CHECK: lslv [[XREG_0:x[0-9]+]], x1, x2
-; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
-; CHECK-NEXT: lsrv [[XREG_3:x[0-9]+]], x0, [[XREG_2]]
+; CHECK: lsl [[XREG_0:x[0-9]+]], x1, x2
+; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
+; CHECK-NEXT: lsr [[XREG_3:x[0-9]+]], x0, [[XREG_2]]
; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]]
; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64
-; CHECK-NEXT: lslv [[XREG_5:x[0-9]+]], x0, [[XREG_4]]
+; CHECK-NEXT: lsl [[XREG_5:x[0-9]+]], x0, [[XREG_4]]
; CHECK-NEXT: cmp [[XREG_4]], #0
; CHECK-NEXT: csel x1, [[XREG_5]], [[XREG_6]], ge
-; CHECK-NEXT: lslv [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
+; CHECK-NEXT: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
; CHECK-NEXT: csel x0, xzr, [[SMALLSHIFT_LO]], ge
; CHECK-NEXT: ret
@@ -20,17 +20,17 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone {
}
define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
-; CHECK: ashr:
-; CHECK: lsrv [[XREG_0:x[0-9]+]], x0, x2
-; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
-; CHECK-NEXT: lslv [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
+; CHECK-LABEL: ashr:
+; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2
+; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
+; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
-; CHECK-NEXT: asrv [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
+; CHECK-NEXT: asr [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
; CHECK-NEXT: cmp [[XREG_5]], #0
; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge
-; CHECK-NEXT: asrv [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK-NEXT: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63
; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge
; CHECK-NEXT: ret
@@ -40,17 +40,17 @@ define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
}
define i128 @lshr(i128 %r, i128 %s) nounwind readnone {
-; CHECK: lshr:
-; CHECK: lsrv [[XREG_0:x[0-9]+]], x0, x2
-; CHECK-NEXT: orr [[XREG_1:x[0-9]+]], xzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], [[XREG_1]], x2
-; CHECK-NEXT: lslv [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
+; CHECK-LABEL: lshr:
+; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2
+; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
+; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
-; CHECK-NEXT: lsrv [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
+; CHECK-NEXT: lsr [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
; CHECK-NEXT: cmp [[XREG_5]], #0
; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge
-; CHECK-NEXT: lsrv [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK-NEXT: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge
; CHECK-NEXT: ret
diff --git a/test/CodeGen/ARM64/memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll
index 26f5166..f921a59 100644
--- a/test/CodeGen/ARM64/memcpy-inline.ll
+++ b/test/CodeGen/AArch64/arm64-memcpy-inline.ll
@@ -75,7 +75,7 @@ define void @t5(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t5:
; CHECK: strb wzr, [x0, #6]
-; CHECK: movz [[REG7:w[0-9]+]], #21587
+; CHECK: movz [[REG7:w[0-9]+]], #0x5453
; CHECK: strh [[REG7]], [x0, #4]
; CHECK: movz [[REG8:w[0-9]+]],
; CHECK: movk [[REG8]],
diff --git a/test/CodeGen/ARM64/memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll
index 2e237f4..2e237f4 100644
--- a/test/CodeGen/ARM64/memset-inline.ll
+++ b/test/CodeGen/AArch64/arm64-memset-inline.ll
diff --git a/test/CodeGen/ARM64/memset-to-bzero.ll b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
index b28122c..29036ca 100644
--- a/test/CodeGen/ARM64/memset-to-bzero.ll
+++ b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
@@ -1,4 +1,7 @@
-; RUN: llc %s -march arm64 -o - | FileCheck %s
+; RUN: llc %s -mtriple=arm64-apple-darwin -o - | \
+; RUN: FileCheck --check-prefix=CHECK-DARWIN --check-prefix=CHECK %s
+; RUN: llc %s -mtriple=arm64-linux-gnu -o - | \
+; RUN: FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s
; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
; CHECK: @fct1
@@ -14,7 +17,8 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
; CHECK: @fct2
; When the size is bigger than 256, change into bzero.
-; CHECK: bzero
+; CHECK-DARWIN: bzero
+; CHECK-LINUX: memset
define void @fct2(i8* nocapture %ptr) {
entry:
tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false)
@@ -23,7 +27,8 @@ entry:
; CHECK: @fct3
; For unknown size, change to bzero.
-; CHECK: bzero
+; CHECK-DARWIN: bzero
+; CHECK-LINUX: memset
define void @fct3(i8* nocapture %ptr, i32 %unknown) {
entry:
%conv = sext i32 %unknown to i64
@@ -47,7 +52,8 @@ declare i64 @llvm.objectsize.i64(i8*, i1)
; CHECK: @fct5
; Size > 256, change.
-; CHECK: bzero
+; CHECK-DARWIN: bzero
+; CHECK-LINUX: memset
define void @fct5(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -57,7 +63,8 @@ entry:
; CHECK: @fct6
; Size = unknown, change.
-; CHECK: bzero
+; CHECK-DARWIN: bzero
+; CHECK-LINUX: memset
define void @fct6(i8* %ptr, i32 %unknown) {
entry:
%conv = sext i32 %unknown to i64
diff --git a/test/CodeGen/AArch64/misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index 1555c48..f88bd6a 100644
--- a/test/CodeGen/AArch64/misched-basic-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
;
; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
; much higher than the ADD instructions in order to hide latency. When not
@@ -8,10 +8,8 @@
; CHECK: ********** MI Scheduling **********
; CHECK: main
; CHECK: *** Final schedule for BB#2 ***
-; CHECK: SU(13)
-; CHECK: MADDwwww
-; CHECK: SU(4)
-; CHECK: ADDwwi_lsl0_s
+; CHECK: MADDWrrr
+; CHECK: ADDWri
; CHECK: ********** INTERVALS **********
@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4
@@ -86,9 +84,9 @@ for.end: ; preds = %for.cond
; CHECK: ********** MI Scheduling **********
; CHECK: neon4xfloat:BB#0
; CHECK: *** Final schedule for BB#0 ***
-; CHECK: FDIVvvv_4S
-; CHECK: FADDvvv_4S
-; CHECK: FADDvvv_4S
+; CHECK: FDIVv4f32
+; CHECK: FADDv4f32
+; CHECK: FADDv4f32
; CHECK: ********** INTERVALS **********
define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
%tmp1 = fadd <4 x float> %A, %B;
@@ -110,3 +108,17 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
+
+
+; Regression Test for PR19761
+; [ARM64] Cortex-a53 schedule mode can't handle NEON post-increment load
+;
+; Nothing explicit to check other than llc not crashing.
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
+ %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
new file mode 100644
index 0000000..97bfb5c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
@@ -0,0 +1,21 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+;
+; For Cortex-A53, shiftable operands that are not actually shifted
+; are not needed for an additional two cycles.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: shiftable
+; CHECK: *** Final schedule for BB#0 ***
+; CHECK: ADDXrr %vreg0, %vreg2
+; CHECK: ADDXrs %vreg0, %vreg2, 5
+; CHECK: ********** INTERVALS **********
+define i64 @shiftable(i64 %A, i64 %B) {
+ %tmp0 = sub i64 %B, 20
+ %tmp1 = shl i64 %tmp0, 5;
+ %tmp2 = add i64 %A, %tmp1;
+ %tmp3 = add i64 %A, %tmp0
+ %tmp4 = mul i64 %tmp2, %tmp3
+
+ ret i64 %tmp4
+}
diff --git a/test/CodeGen/ARM64/movi.ll b/test/CodeGen/AArch64/arm64-movi.ll
index 8fceccc..2cd368d 100644
--- a/test/CodeGen/ARM64/movi.ll
+++ b/test/CodeGen/AArch64/arm64-movi.ll
@@ -6,35 +6,35 @@
; 64-bit immed with 32-bit pattern size, rotated by 0.
define i64 @test64_32_rot0() nounwind {
-; CHECK: test64_32_rot0
+; CHECK-LABEL: test64_32_rot0:
; CHECK: orr x0, xzr, #0x700000007
ret i64 30064771079
}
; 64-bit immed with 32-bit pattern size, rotated by 2.
define i64 @test64_32_rot2() nounwind {
-; CHECK: test64_32_rot2
+; CHECK-LABEL: test64_32_rot2:
; CHECK: orr x0, xzr, #0xc0000003c0000003
ret i64 13835058071388291075
}
; 64-bit immed with 4-bit pattern size, rotated by 3.
define i64 @test64_4_rot3() nounwind {
-; CHECK: test64_4_rot3
+; CHECK-LABEL: test64_4_rot3:
; CHECK: orr x0, xzr, #0xeeeeeeeeeeeeeeee
ret i64 17216961135462248174
}
; 32-bit immed with 32-bit pattern size, rotated by 16.
define i32 @test32_32_rot16() nounwind {
-; CHECK: test32_32_rot16
+; CHECK-LABEL: test32_32_rot16:
; CHECK: orr w0, wzr, #0xff0000
ret i32 16711680
}
; 32-bit immed with 2-bit pattern size, rotated by 1.
define i32 @test32_2_rot1() nounwind {
-; CHECK: test32_2_rot1
+; CHECK-LABEL: test32_2_rot1:
; CHECK: orr w0, wzr, #0xaaaaaaaa
ret i32 2863311530
}
@@ -44,31 +44,31 @@ define i32 @test32_2_rot1() nounwind {
;==--------------------------------------------------------------------------==
define i32 @movz() nounwind {
-; CHECK: movz
-; CHECK: movz w0, #5
+; CHECK-LABEL: movz:
+; CHECK: movz w0, #0x5
ret i32 5
}
define i64 @movz_3movk() nounwind {
-; CHECK: movz_3movk
-; CHECK: movz x0, #5, lsl #48
-; CHECK-NEXT: movk x0, #4660, lsl #32
-; CHECK-NEXT: movk x0, #43981, lsl #16
-; CHECK-NEXT: movk x0, #22136
+; CHECK-LABEL: movz_3movk:
+; CHECK: movz x0, #0x5, lsl #48
+; CHECK-NEXT: movk x0, #0x1234, lsl #32
+; CHECK-NEXT: movk x0, #0xabcd, lsl #16
+; CHECK-NEXT: movk x0, #0x5678
ret i64 1427392313513592
}
define i64 @movz_movk_skip1() nounwind {
-; CHECK: movz_movk_skip1
-; CHECK: movz x0, #5, lsl #32
-; CHECK-NEXT: movk x0, #17185, lsl #16
+; CHECK-LABEL: movz_movk_skip1:
+; CHECK: movz x0, #0x5, lsl #32
+; CHECK-NEXT: movk x0, #0x4321, lsl #16
ret i64 22601072640
}
define i64 @movz_skip1_movk() nounwind {
-; CHECK: movz_skip1_movk
-; CHECK: movz x0, #34388, lsl #32
-; CHECK-NEXT: movk x0, #4660
+; CHECK-LABEL: movz_skip1_movk:
+; CHECK: movz x0, #0x8654, lsl #32
+; CHECK-NEXT: movk x0, #0x1234
ret i64 147695335379508
}
@@ -77,15 +77,15 @@ define i64 @movz_skip1_movk() nounwind {
;==--------------------------------------------------------------------------==
define i64 @movn() nounwind {
-; CHECK: movn
-; CHECK: movn x0, #41
+; CHECK-LABEL: movn:
+; CHECK: movn x0, #0x29
ret i64 -42
}
define i64 @movn_skip1_movk() nounwind {
-; CHECK: movn_skip1_movk
-; CHECK: movn x0, #41, lsl #32
-; CHECK-NEXT: movk x0, #4660
+; CHECK-LABEL: movn_skip1_movk:
+; CHECK: movn x0, #0x29, lsl #32
+; CHECK-NEXT: movk x0, #0x1234
ret i64 -176093720012
}
@@ -95,108 +95,108 @@ define i64 @movn_skip1_movk() nounwind {
; rdar://14987673
define i64 @orr_movk1() nounwind {
-; CHECK: orr_movk1
+; CHECK-LABEL: orr_movk1:
; CHECK: orr x0, xzr, #0xffff0000ffff0
-; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #0xdead, lsl #16
ret i64 72056498262245120
}
define i64 @orr_movk2() nounwind {
-; CHECK: orr_movk2
+; CHECK-LABEL: orr_movk2:
; CHECK: orr x0, xzr, #0xffff0000ffff0
-; CHECK: movk x0, #57005, lsl #48
+; CHECK: movk x0, #0xdead, lsl #48
ret i64 -2400982650836746496
}
define i64 @orr_movk3() nounwind {
-; CHECK: orr_movk3
+; CHECK-LABEL: orr_movk3:
; CHECK: orr x0, xzr, #0xffff0000ffff0
-; CHECK: movk x0, #57005, lsl #32
+; CHECK: movk x0, #0xdead, lsl #32
ret i64 72020953688702720
}
define i64 @orr_movk4() nounwind {
-; CHECK: orr_movk4
+; CHECK-LABEL: orr_movk4:
; CHECK: orr x0, xzr, #0xffff0000ffff0
-; CHECK: movk x0, #57005
+; CHECK: movk x0, #0xdead
ret i64 72056494543068845
}
; rdar://14987618
define i64 @orr_movk5() nounwind {
-; CHECK: orr_movk5
+; CHECK-LABEL: orr_movk5:
; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
-; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #0xdead, lsl #16
ret i64 -71777214836900096
}
define i64 @orr_movk6() nounwind {
-; CHECK: orr_movk6
+; CHECK-LABEL: orr_movk6:
; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
-; CHECK: movk x0, #57005, lsl #16
-; CHECK: movk x0, #57005, lsl #48
+; CHECK: movk x0, #0xdead, lsl #16
+; CHECK: movk x0, #0xdead, lsl #48
ret i64 -2400982647117578496
}
define i64 @orr_movk7() nounwind {
-; CHECK: orr_movk7
+; CHECK-LABEL: orr_movk7:
; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
-; CHECK: movk x0, #57005, lsl #48
+; CHECK: movk x0, #0xdead, lsl #48
ret i64 -2400982646575268096
}
define i64 @orr_movk8() nounwind {
-; CHECK: orr_movk8
+; CHECK-LABEL: orr_movk8:
; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
-; CHECK: movk x0, #57005
-; CHECK: movk x0, #57005, lsl #48
+; CHECK: movk x0, #0xdead
+; CHECK: movk x0, #0xdead, lsl #48
ret i64 -2400982646575276371
}
; rdar://14987715
define i64 @orr_movk9() nounwind {
-; CHECK: orr_movk9
+; CHECK-LABEL: orr_movk9:
; CHECK: orr x0, xzr, #0xffffff000000000
-; CHECK: movk x0, #65280
-; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #0xff00
+; CHECK: movk x0, #0xdead, lsl #16
ret i64 1152921439623315200
}
define i64 @orr_movk10() nounwind {
-; CHECK: orr_movk10
+; CHECK-LABEL: orr_movk10:
; CHECK: orr x0, xzr, #0xfffffffffffff00
-; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #0xdead, lsl #16
ret i64 1152921504047824640
}
define i64 @orr_movk11() nounwind {
-; CHECK: orr_movk11
+; CHECK-LABEL: orr_movk11:
; CHECK: orr x0, xzr, #0xfff00000000000ff
-; CHECK: movk x0, #57005, lsl #16
-; CHECK: movk x0, #65535, lsl #32
+; CHECK: movk x0, #0xdead, lsl #16
+; CHECK: movk x0, #0xffff, lsl #32
ret i64 -4222125209747201
}
define i64 @orr_movk12() nounwind {
-; CHECK: orr_movk12
+; CHECK-LABEL: orr_movk12:
; CHECK: orr x0, xzr, #0xfff00000000000ff
-; CHECK: movk x0, #57005, lsl #32
+; CHECK: movk x0, #0xdead, lsl #32
ret i64 -4258765016661761
}
define i64 @orr_movk13() nounwind {
-; CHECK: orr_movk13
+; CHECK-LABEL: orr_movk13:
; CHECK: orr x0, xzr, #0xfffff000000
-; CHECK: movk x0, #57005
-; CHECK: movk x0, #57005, lsl #48
+; CHECK: movk x0, #0xdead
+; CHECK: movk x0, #0xdead, lsl #48
ret i64 -2401245434149282131
}
; rdar://13944082
define i64 @g() nounwind {
-; CHECK: g
-; CHECK: movz x0, #65535, lsl #48
-; CHECK: movk x0, #2
+; CHECK-LABEL: g:
+; CHECK: movz x0, #0xffff, lsl #48
+; CHECK: movk x0, #0x2
entry:
ret i64 -281474976710654
}
diff --git a/test/CodeGen/ARM64/mul.ll b/test/CodeGen/AArch64/arm64-mul.ll
index 2e7986d..2e7986d 100644
--- a/test/CodeGen/ARM64/mul.ll
+++ b/test/CodeGen/AArch64/arm64-mul.ll
diff --git a/test/CodeGen/AArch64/arm64-named-reg-alloc.ll b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
new file mode 100644
index 0000000..d86d2e6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
@@ -0,0 +1,14 @@
+; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"x5\00"}
diff --git a/test/CodeGen/AArch64/arm64-named-reg-notareg.ll b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
new file mode 100644
index 0000000..3ca14c4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"notareg\00"}
diff --git a/test/CodeGen/ARM64/neg.ll b/test/CodeGen/AArch64/arm64-neg.ll
index 659ce98..659ce98 100644
--- a/test/CodeGen/ARM64/neg.ll
+++ b/test/CodeGen/AArch64/arm64-neg.ll
diff --git a/test/CodeGen/AArch64/neon-2velem-high.ll b/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
index 97031d9..58df094 100644
--- a/test/CodeGen/AArch64/neon-2velem-high.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
@@ -1,259 +1,269 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
-declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) {
-; CHECK: test_vmull_high_n_s16:
-; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-LABEL: test_vmull_high_n_s16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
- %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
ret <4 x i32> %vmull15.i.i
}
define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) {
-; CHECK: test_vmull_high_n_s32:
-; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vmull_high_n_s32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
- %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
ret <2 x i64> %vmull9.i.i
}
define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) {
-; CHECK: test_vmull_high_n_u16:
-; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-LABEL: test_vmull_high_n_u16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
- %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
ret <4 x i32> %vmull15.i.i
}
define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) {
-; CHECK: test_vmull_high_n_u32:
-; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vmull_high_n_u32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
- %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
ret <2 x i64> %vmull9.i.i
}
define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) {
-; CHECK: test_vqdmull_high_n_s16:
-; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-LABEL: test_vqdmull_high_n_s16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
- %vqdmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vqdmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
ret <4 x i32> %vqdmull15.i.i
}
define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) {
-; CHECK: test_vqdmull_high_n_s32:
-; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vqdmull_high_n_s32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
- %vqdmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vqdmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
ret <2 x i64> %vqdmull9.i.i
}
define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vmlal_high_n_s16:
-; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlal_high_n_s16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
%add.i.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vmlal_high_n_s32:
-; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlal_high_n_s32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
%add.i.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vmlal_high_n_u16:
-; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlal_high_n_u16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
%add.i.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vmlal_high_n_u32:
-; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlal_high_n_u32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
%add.i.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vqdmlal_high_n_s16:
-; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vqdmlal_high_n_s16:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
- %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
+ %vqdmlal15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vqdmlal17.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
ret <4 x i32> %vqdmlal17.i.i
}
define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vqdmlal_high_n_s32:
-; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vqdmlal_high_n_s32:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
- %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
+ %vqdmlal9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vqdmlal11.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
ret <2 x i64> %vqdmlal11.i.i
}
define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vmlsl_high_n_s16:
-; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlsl_high_n_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
%sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
ret <4 x i32> %sub.i.i
}
define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vmlsl_high_n_s32:
-; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlsl_high_n_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
%sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
ret <2 x i64> %sub.i.i
}
define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vmlsl_high_n_u16:
-; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlsl_high_n_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
%sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
ret <4 x i32> %sub.i.i
}
define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vmlsl_high_n_u32:
-; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlsl_high_n_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
%sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
ret <2 x i64> %sub.i.i
}
define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vqdmlsl_high_n_s16:
-; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vqdmlsl_high_n_s16:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
- %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
+ %vqdmlsl15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vqdmlsl17.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
ret <4 x i32> %vqdmlsl17.i.i
}
define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vqdmlsl_high_n_s32:
-; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vqdmlsl_high_n_s32:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
- %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
+ %vqdmlsl9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vqdmlsl11.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
ret <2 x i64> %vqdmlsl11.i.i
}
define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) {
-; CHECK: test_vmul_n_f32:
+; CHECK-LABEL: test_vmul_n_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
entry:
%vecinit.i = insertelement <2 x float> undef, float %b, i32 0
@@ -263,7 +273,7 @@ entry:
}
define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
-; CHECK: test_vmulq_n_f32:
+; CHECK-LABEL: test_vmulq_n_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
entry:
%vecinit.i = insertelement <4 x float> undef, float %b, i32 0
@@ -275,7 +285,7 @@ entry:
}
define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) {
-; CHECK: test_vmulq_n_f64:
+; CHECK-LABEL: test_vmulq_n_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
entry:
%vecinit.i = insertelement <2 x double> undef, double %b, i32 0
@@ -285,7 +295,7 @@ entry:
}
define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
-; CHECK: test_vfma_n_f32:
+; CHECK-LABEL: test_vfma_n_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
entry:
%vecinit.i = insertelement <2 x float> undef, float %n, i32 0
@@ -295,7 +305,7 @@ entry:
}
define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
-; CHECK: test_vfmaq_n_f32:
+; CHECK-LABEL: test_vfmaq_n_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
entry:
%vecinit.i = insertelement <4 x float> undef, float %n, i32 0
@@ -307,7 +317,7 @@ entry:
}
define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
-; CHECK: test_vfms_n_f32:
+; CHECK-LABEL: test_vfms_n_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
entry:
%vecinit.i = insertelement <2 x float> undef, float %n, i32 0
@@ -318,7 +328,7 @@ entry:
}
define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
-; CHECK: test_vfmsq_n_f32:
+; CHECK-LABEL: test_vfmsq_n_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
entry:
%vecinit.i = insertelement <4 x float> undef, float %n, i32 0
diff --git a/test/CodeGen/AArch64/neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll
index acffb14..869966c 100644
--- a/test/CodeGen/AArch64/neon-2velem.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,49 +1,49 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
-declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
-declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
-declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
-declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmla_lane_s16:
+; CHECK-LABEL: test_vmla_lane_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -54,7 +54,7 @@ entry:
}
define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlaq_lane_s16:
+; CHECK-LABEL: test_vmlaq_lane_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -65,7 +65,7 @@ entry:
}
define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmla_lane_s32:
+; CHECK-LABEL: test_vmla_lane_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -76,7 +76,7 @@ entry:
}
define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlaq_lane_s32:
+; CHECK-LABEL: test_vmlaq_lane_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -87,7 +87,7 @@ entry:
}
define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmla_laneq_s16:
+; CHECK-LABEL: test_vmla_laneq_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -98,7 +98,7 @@ entry:
}
define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlaq_laneq_s16:
+; CHECK-LABEL: test_vmlaq_laneq_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -109,7 +109,7 @@ entry:
}
define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmla_laneq_s32:
+; CHECK-LABEL: test_vmla_laneq_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -120,7 +120,7 @@ entry:
}
define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlaq_laneq_s32:
+; CHECK-LABEL: test_vmlaq_laneq_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -131,7 +131,7 @@ entry:
}
define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmls_lane_s16:
+; CHECK-LABEL: test_vmls_lane_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -142,7 +142,7 @@ entry:
}
define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsq_lane_s16:
+; CHECK-LABEL: test_vmlsq_lane_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -153,7 +153,7 @@ entry:
}
define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmls_lane_s32:
+; CHECK-LABEL: test_vmls_lane_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -164,7 +164,7 @@ entry:
}
define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsq_lane_s32:
+; CHECK-LABEL: test_vmlsq_lane_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -175,7 +175,7 @@ entry:
}
define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmls_laneq_s16:
+; CHECK-LABEL: test_vmls_laneq_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -186,7 +186,7 @@ entry:
}
define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsq_laneq_s16:
+; CHECK-LABEL: test_vmlsq_laneq_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -197,7 +197,7 @@ entry:
}
define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmls_laneq_s32:
+; CHECK-LABEL: test_vmls_laneq_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -208,7 +208,7 @@ entry:
}
define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsq_laneq_s32:
+; CHECK-LABEL: test_vmlsq_laneq_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -219,7 +219,7 @@ entry:
}
define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmul_lane_s16:
+; CHECK-LABEL: test_vmul_lane_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -229,7 +229,7 @@ entry:
}
define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmulq_lane_s16:
+; CHECK-LABEL: test_vmulq_lane_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -239,7 +239,7 @@ entry:
}
define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmul_lane_s32:
+; CHECK-LABEL: test_vmul_lane_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -249,7 +249,7 @@ entry:
}
define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmulq_lane_s32:
+; CHECK-LABEL: test_vmulq_lane_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -259,7 +259,7 @@ entry:
}
define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmul_lane_u16:
+; CHECK-LABEL: test_vmul_lane_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -269,7 +269,7 @@ entry:
}
define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmulq_lane_u16:
+; CHECK-LABEL: test_vmulq_lane_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -279,7 +279,7 @@ entry:
}
define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmul_lane_u32:
+; CHECK-LABEL: test_vmul_lane_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -289,7 +289,7 @@ entry:
}
define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmulq_lane_u32:
+; CHECK-LABEL: test_vmulq_lane_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -299,7 +299,7 @@ entry:
}
define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmul_laneq_s16:
+; CHECK-LABEL: test_vmul_laneq_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -309,7 +309,7 @@ entry:
}
define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmulq_laneq_s16:
+; CHECK-LABEL: test_vmulq_laneq_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -319,7 +319,7 @@ entry:
}
define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmul_laneq_s32:
+; CHECK-LABEL: test_vmul_laneq_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -329,7 +329,7 @@ entry:
}
define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmulq_laneq_s32:
+; CHECK-LABEL: test_vmulq_laneq_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -339,7 +339,7 @@ entry:
}
define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmul_laneq_u16:
+; CHECK-LABEL: test_vmul_laneq_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -349,7 +349,7 @@ entry:
}
define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmulq_laneq_u16:
+; CHECK-LABEL: test_vmulq_laneq_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -359,7 +359,7 @@ entry:
}
define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmul_laneq_u32:
+; CHECK-LABEL: test_vmul_laneq_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -369,7 +369,7 @@ entry:
}
define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmulq_laneq_u32:
+; CHECK-LABEL: test_vmulq_laneq_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -379,7 +379,7 @@ entry:
}
define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK: test_vfma_lane_f32:
+; CHECK-LABEL: test_vfma_lane_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -391,7 +391,7 @@ entry:
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK: test_vfmaq_lane_f32:
+; CHECK-LABEL: test_vfmaq_lane_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -403,7 +403,7 @@ entry:
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK: test_vfma_laneq_f32:
+; CHECK-LABEL: test_vfma_laneq_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -413,7 +413,7 @@ entry:
}
define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK: test_vfmaq_laneq_f32:
+; CHECK-LABEL: test_vfmaq_laneq_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -423,7 +423,7 @@ entry:
}
define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK: test_vfms_lane_f32:
+; CHECK-LABEL: test_vfms_lane_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -434,7 +434,7 @@ entry:
}
define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK: test_vfmsq_lane_f32:
+; CHECK-LABEL: test_vfmsq_lane_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -445,7 +445,7 @@ entry:
}
define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK: test_vfms_laneq_f32:
+; CHECK-LABEL: test_vfms_laneq_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -456,7 +456,7 @@ entry:
}
define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK: test_vfmsq_laneq_f32:
+; CHECK-LABEL: test_vfmsq_laneq_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -467,7 +467,7 @@ entry:
}
define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
-; CHECK: test_vfmaq_lane_f64:
+; CHECK-LABEL: test_vfmaq_lane_f64:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -479,7 +479,7 @@ entry:
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK: test_vfmaq_laneq_f64:
+; CHECK-LABEL: test_vfmaq_laneq_f64:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
@@ -489,7 +489,7 @@ entry:
}
define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
-; CHECK: test_vfmsq_lane_f64:
+; CHECK-LABEL: test_vfmsq_lane_f64:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -500,7 +500,7 @@ entry:
}
define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK: test_vfmsq_laneq_f64:
+; CHECK-LABEL: test_vfmsq_laneq_f64:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
@@ -524,7 +524,7 @@ declare float @llvm.fma.f32(float, float, float)
define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
; CHECK-LABEL: test_vfmsd_lane_f64
-; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK-NEXT: ret
entry:
%extract.rhs = extractelement <1 x double> %v, i32 0
@@ -536,7 +536,7 @@ entry:
declare double @llvm.fma.f64(double, double, double)
define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
-; CHECK: test_vfmss_laneq_f32
+; CHECK-LABEL: test_vfmss_laneq_f32
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -558,799 +558,799 @@ entry:
}
define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_lane_s16:
+; CHECK-LABEL: test_vmlal_lane_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_lane_s32:
+; CHECK-LABEL: test_vmlal_lane_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_laneq_s16:
+; CHECK-LABEL: test_vmlal_laneq_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_laneq_s32:
+; CHECK-LABEL: test_vmlal_laneq_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_high_lane_s16:
+; CHECK-LABEL: test_vmlal_high_lane_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_high_lane_s32:
+; CHECK-LABEL: test_vmlal_high_lane_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_high_laneq_s16:
+; CHECK-LABEL: test_vmlal_high_laneq_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_high_laneq_s32:
+; CHECK-LABEL: test_vmlal_high_laneq_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_lane_s16:
+; CHECK-LABEL: test_vmlsl_lane_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_lane_s32:
+; CHECK-LABEL: test_vmlsl_lane_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_laneq_s16:
+; CHECK-LABEL: test_vmlsl_laneq_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_laneq_s32:
+; CHECK-LABEL: test_vmlsl_laneq_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_high_lane_s16:
+; CHECK-LABEL: test_vmlsl_high_lane_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_high_lane_s32:
+; CHECK-LABEL: test_vmlsl_high_lane_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_high_laneq_s16:
+; CHECK-LABEL: test_vmlsl_high_laneq_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_high_laneq_s32:
+; CHECK-LABEL: test_vmlsl_high_laneq_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_lane_u16:
+; CHECK-LABEL: test_vmlal_lane_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_lane_u32:
+; CHECK-LABEL: test_vmlal_lane_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_laneq_u16:
+; CHECK-LABEL: test_vmlal_laneq_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_laneq_u32:
+; CHECK-LABEL: test_vmlal_laneq_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_high_lane_u16:
+; CHECK-LABEL: test_vmlal_high_lane_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_high_lane_u32:
+; CHECK-LABEL: test_vmlal_high_lane_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_high_laneq_u16:
+; CHECK-LABEL: test_vmlal_high_laneq_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_high_laneq_u32:
+; CHECK-LABEL: test_vmlal_high_laneq_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_lane_u16:
+; CHECK-LABEL: test_vmlsl_lane_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_lane_u32:
+; CHECK-LABEL: test_vmlsl_lane_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_laneq_u16:
+; CHECK-LABEL: test_vmlsl_laneq_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_laneq_u32:
+; CHECK-LABEL: test_vmlsl_laneq_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_high_lane_u16:
+; CHECK-LABEL: test_vmlsl_high_lane_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_high_lane_u32:
+; CHECK-LABEL: test_vmlsl_high_lane_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_high_laneq_u16:
+; CHECK-LABEL: test_vmlsl_high_laneq_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_high_laneq_u32:
+; CHECK-LABEL: test_vmlsl_high_laneq_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_lane_s16:
+; CHECK-LABEL: test_vmull_lane_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_lane_s32:
+; CHECK-LABEL: test_vmull_lane_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_lane_u16:
+; CHECK-LABEL: test_vmull_lane_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_lane_u32:
+; CHECK-LABEL: test_vmull_lane_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_high_lane_s16:
+; CHECK-LABEL: test_vmull_high_lane_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_high_lane_s32:
+; CHECK-LABEL: test_vmull_high_lane_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_high_lane_u16:
+; CHECK-LABEL: test_vmull_high_lane_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_high_lane_u32:
+; CHECK-LABEL: test_vmull_high_lane_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_laneq_s16:
+; CHECK-LABEL: test_vmull_laneq_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_laneq_s32:
+; CHECK-LABEL: test_vmull_laneq_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_laneq_u16:
+; CHECK-LABEL: test_vmull_laneq_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_laneq_u32:
+; CHECK-LABEL: test_vmull_laneq_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_high_laneq_s16:
+; CHECK-LABEL: test_vmull_high_laneq_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_high_laneq_s32:
+; CHECK-LABEL: test_vmull_high_laneq_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_high_laneq_u16:
+; CHECK-LABEL: test_vmull_high_laneq_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_high_laneq_u32:
+; CHECK-LABEL: test_vmull_high_laneq_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlal_lane_s16:
+; CHECK-LABEL: test_vqdmlal_lane_s16:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlal_lane_s32:
+; CHECK-LABEL: test_vqdmlal_lane_s32:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlal_high_lane_s16:
+; CHECK-LABEL: test_vqdmlal_high_lane_s16:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlal_high_lane_s32:
+; CHECK-LABEL: test_vqdmlal_high_lane_s32:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlsl_lane_s16:
+; CHECK-LABEL: test_vqdmlsl_lane_s16:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlsl_lane_s32:
+; CHECK-LABEL: test_vqdmlsl_lane_s32:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlsl_high_lane_s16:
+; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlsl_high_lane_s32:
+; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmull_lane_s16:
+; CHECK-LABEL: test_vqdmull_lane_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmull_lane_s32:
+; CHECK-LABEL: test_vqdmull_lane_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vqdmull_laneq_s16:
+; CHECK-LABEL: test_vqdmull_laneq_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vqdmull_laneq_s32:
+; CHECK-LABEL: test_vqdmull_laneq_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmull_high_lane_s16:
+; CHECK-LABEL: test_vqdmull_high_lane_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmull_high_lane_s32:
+; CHECK-LABEL: test_vqdmull_high_lane_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vqdmull_high_laneq_s16:
+; CHECK-LABEL: test_vqdmull_high_laneq_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vqdmull_high_laneq_s32:
+; CHECK-LABEL: test_vqdmull_high_laneq_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmulh_lane_s16:
+; CHECK-LABEL: test_vqdmulh_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmulhq_lane_s16:
+; CHECK-LABEL: test_vqdmulhq_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
- %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
ret <8 x i16> %vqdmulh2.i
}
define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmulh_lane_s32:
+; CHECK-LABEL: test_vqdmulh_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i32> %vqdmulh2.i
}
define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmulhq_lane_s32:
+; CHECK-LABEL: test_vqdmulhq_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
ret <4 x i32> %vqdmulh2.i
}
define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqrdmulh_lane_s16:
+; CHECK-LABEL: test_vqrdmulh_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i16> %vqrdmulh2.i
}
define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqrdmulhq_lane_s16:
+; CHECK-LABEL: test_vqrdmulhq_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
- %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
ret <8 x i16> %vqrdmulh2.i
}
define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqrdmulh_lane_s32:
+; CHECK-LABEL: test_vqrdmulh_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i32> %vqrdmulh2.i
}
define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqrdmulhq_lane_s32:
+; CHECK-LABEL: test_vqrdmulhq_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
ret <4 x i32> %vqrdmulh2.i
}
define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
-; CHECK: test_vmul_lane_f32:
+; CHECK-LABEL: test_vmul_lane_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -1360,8 +1360,8 @@ entry:
}
define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
-; CHECK: test_vmul_lane_f64:
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vmul_lane_f64:
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
@@ -1373,7 +1373,7 @@ entry:
}
define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulq_lane_f32:
+; CHECK-LABEL: test_vmulq_lane_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -1383,7 +1383,7 @@ entry:
}
define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
-; CHECK: test_vmulq_lane_f64:
+; CHECK-LABEL: test_vmulq_lane_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -1393,7 +1393,7 @@ entry:
}
define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK: test_vmul_laneq_f32:
+; CHECK-LABEL: test_vmul_laneq_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -1403,7 +1403,7 @@ entry:
}
define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
-; CHECK: test_vmul_laneq_f64:
+; CHECK-LABEL: test_vmul_laneq_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
@@ -1416,7 +1416,7 @@ entry:
}
define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulq_laneq_f32:
+; CHECK-LABEL: test_vmulq_laneq_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -1426,7 +1426,7 @@ entry:
}
define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
-; CHECK: test_vmulq_laneq_f64:
+; CHECK-LABEL: test_vmulq_laneq_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
@@ -1436,67 +1436,67 @@ entry:
}
define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulx_lane_f32:
+; CHECK-LABEL: test_vmulx_lane_f32:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
- %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
ret <2 x float> %vmulx2.i
}
define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulxq_lane_f32:
+; CHECK-LABEL: test_vmulxq_lane_f32:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
ret <4 x float> %vmulx2.i
}
define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
-; CHECK: test_vmulxq_lane_f64:
+; CHECK-LABEL: test_vmulxq_lane_f64:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulx_laneq_f32:
+; CHECK-LABEL: test_vmulx_laneq_f32:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
- %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
ret <2 x float> %vmulx2.i
}
define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulxq_laneq_f32:
+; CHECK-LABEL: test_vmulxq_laneq_f32:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
ret <4 x float> %vmulx2.i
}
define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
-; CHECK: test_vmulxq_laneq_f64:
+; CHECK-LABEL: test_vmulxq_laneq_f64:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
- %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmla_lane_s16_0:
+; CHECK-LABEL: test_vmla_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1507,7 +1507,7 @@ entry:
}
define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlaq_lane_s16_0:
+; CHECK-LABEL: test_vmlaq_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1518,7 +1518,7 @@ entry:
}
define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmla_lane_s32_0:
+; CHECK-LABEL: test_vmla_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1529,7 +1529,7 @@ entry:
}
define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlaq_lane_s32_0:
+; CHECK-LABEL: test_vmlaq_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1540,7 +1540,7 @@ entry:
}
define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmla_laneq_s16_0:
+; CHECK-LABEL: test_vmla_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1551,7 +1551,7 @@ entry:
}
define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlaq_laneq_s16_0:
+; CHECK-LABEL: test_vmlaq_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1562,7 +1562,7 @@ entry:
}
define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmla_laneq_s32_0:
+; CHECK-LABEL: test_vmla_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1573,7 +1573,7 @@ entry:
}
define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlaq_laneq_s32_0:
+; CHECK-LABEL: test_vmlaq_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1584,7 +1584,7 @@ entry:
}
define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmls_lane_s16_0:
+; CHECK-LABEL: test_vmls_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1595,7 +1595,7 @@ entry:
}
define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsq_lane_s16_0:
+; CHECK-LABEL: test_vmlsq_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1606,7 +1606,7 @@ entry:
}
define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmls_lane_s32_0:
+; CHECK-LABEL: test_vmls_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1617,7 +1617,7 @@ entry:
}
define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsq_lane_s32_0:
+; CHECK-LABEL: test_vmlsq_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1628,7 +1628,7 @@ entry:
}
define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmls_laneq_s16_0:
+; CHECK-LABEL: test_vmls_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1639,7 +1639,7 @@ entry:
}
define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsq_laneq_s16_0:
+; CHECK-LABEL: test_vmlsq_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1650,7 +1650,7 @@ entry:
}
define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmls_laneq_s32_0:
+; CHECK-LABEL: test_vmls_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1661,7 +1661,7 @@ entry:
}
define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsq_laneq_s32_0:
+; CHECK-LABEL: test_vmlsq_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1672,7 +1672,7 @@ entry:
}
define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmul_lane_s16_0:
+; CHECK-LABEL: test_vmul_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1682,7 +1682,7 @@ entry:
}
define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmulq_lane_s16_0:
+; CHECK-LABEL: test_vmulq_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1692,7 +1692,7 @@ entry:
}
define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmul_lane_s32_0:
+; CHECK-LABEL: test_vmul_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1702,7 +1702,7 @@ entry:
}
define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmulq_lane_s32_0:
+; CHECK-LABEL: test_vmulq_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1712,7 +1712,7 @@ entry:
}
define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmul_lane_u16_0:
+; CHECK-LABEL: test_vmul_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1722,7 +1722,7 @@ entry:
}
define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmulq_lane_u16_0:
+; CHECK-LABEL: test_vmulq_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1732,7 +1732,7 @@ entry:
}
define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmul_lane_u32_0:
+; CHECK-LABEL: test_vmul_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1742,7 +1742,7 @@ entry:
}
define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmulq_lane_u32_0:
+; CHECK-LABEL: test_vmulq_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1752,7 +1752,7 @@ entry:
}
define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmul_laneq_s16_0:
+; CHECK-LABEL: test_vmul_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1762,7 +1762,7 @@ entry:
}
define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmulq_laneq_s16_0:
+; CHECK-LABEL: test_vmulq_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1772,7 +1772,7 @@ entry:
}
define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmul_laneq_s32_0:
+; CHECK-LABEL: test_vmul_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1782,7 +1782,7 @@ entry:
}
define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmulq_laneq_s32_0:
+; CHECK-LABEL: test_vmulq_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1792,7 +1792,7 @@ entry:
}
define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmul_laneq_u16_0:
+; CHECK-LABEL: test_vmul_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1802,7 +1802,7 @@ entry:
}
define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmulq_laneq_u16_0:
+; CHECK-LABEL: test_vmulq_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1812,7 +1812,7 @@ entry:
}
define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmul_laneq_u32_0:
+; CHECK-LABEL: test_vmul_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1822,7 +1822,7 @@ entry:
}
define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmulq_laneq_u32_0:
+; CHECK-LABEL: test_vmulq_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1832,7 +1832,7 @@ entry:
}
define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK: test_vfma_lane_f32_0:
+; CHECK-LABEL: test_vfma_lane_f32_0:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1842,7 +1842,7 @@ entry:
}
define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK: test_vfmaq_lane_f32_0:
+; CHECK-LABEL: test_vfmaq_lane_f32_0:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1852,7 +1852,7 @@ entry:
}
define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK: test_vfma_laneq_f32_0:
+; CHECK-LABEL: test_vfma_laneq_f32_0:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1862,7 +1862,7 @@ entry:
}
define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK: test_vfmaq_laneq_f32_0:
+; CHECK-LABEL: test_vfmaq_laneq_f32_0:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1872,7 +1872,7 @@ entry:
}
define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK: test_vfms_lane_f32_0:
+; CHECK-LABEL: test_vfms_lane_f32_0:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1883,7 +1883,7 @@ entry:
}
define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK: test_vfmsq_lane_f32_0:
+; CHECK-LABEL: test_vfmsq_lane_f32_0:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1894,7 +1894,7 @@ entry:
}
define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK: test_vfms_laneq_f32_0:
+; CHECK-LABEL: test_vfms_laneq_f32_0:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1905,7 +1905,7 @@ entry:
}
define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK: test_vfmsq_laneq_f32_0:
+; CHECK-LABEL: test_vfmsq_laneq_f32_0:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1916,7 +1916,7 @@ entry:
}
define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK: test_vfmaq_laneq_f64_0:
+; CHECK-LABEL: test_vfmaq_laneq_f64_0:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -1926,7 +1926,7 @@ entry:
}
define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK: test_vfmsq_laneq_f64_0:
+; CHECK-LABEL: test_vfmsq_laneq_f64_0:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -1937,799 +1937,799 @@ entry:
}
define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_lane_s16_0:
+; CHECK-LABEL: test_vmlal_lane_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_lane_s32_0:
+; CHECK-LABEL: test_vmlal_lane_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_laneq_s16_0:
+; CHECK-LABEL: test_vmlal_laneq_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_laneq_s32_0:
+; CHECK-LABEL: test_vmlal_laneq_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_high_lane_s16_0:
+; CHECK-LABEL: test_vmlal_high_lane_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_high_lane_s32_0:
+; CHECK-LABEL: test_vmlal_high_lane_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_high_laneq_s16_0:
+; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_high_laneq_s32_0:
+; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_lane_s16_0:
+; CHECK-LABEL: test_vmlsl_lane_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_lane_s32_0:
+; CHECK-LABEL: test_vmlsl_lane_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_laneq_s16_0:
+; CHECK-LABEL: test_vmlsl_laneq_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_laneq_s32_0:
+; CHECK-LABEL: test_vmlsl_laneq_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_high_lane_s16_0:
+; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_high_lane_s32_0:
+; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_high_laneq_s16_0:
+; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_high_laneq_s32_0:
+; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_lane_u16_0:
+; CHECK-LABEL: test_vmlal_lane_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_lane_u32_0:
+; CHECK-LABEL: test_vmlal_lane_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_laneq_u16_0:
+; CHECK-LABEL: test_vmlal_laneq_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_laneq_u32_0:
+; CHECK-LABEL: test_vmlal_laneq_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_high_lane_u16_0:
+; CHECK-LABEL: test_vmlal_high_lane_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_high_lane_u32_0:
+; CHECK-LABEL: test_vmlal_high_lane_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_high_laneq_u16_0:
+; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_high_laneq_u32_0:
+; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_lane_u16_0:
+; CHECK-LABEL: test_vmlsl_lane_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_lane_u32_0:
+; CHECK-LABEL: test_vmlsl_lane_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_laneq_u16_0:
+; CHECK-LABEL: test_vmlsl_laneq_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_laneq_u32_0:
+; CHECK-LABEL: test_vmlsl_laneq_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_high_lane_u16_0:
+; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_high_lane_u32_0:
+; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_high_laneq_u16_0:
+; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_high_laneq_u32_0:
+; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_lane_s16_0:
+; CHECK-LABEL: test_vmull_lane_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_lane_s32_0:
+; CHECK-LABEL: test_vmull_lane_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_lane_u16_0:
+; CHECK-LABEL: test_vmull_lane_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_lane_u32_0:
+; CHECK-LABEL: test_vmull_lane_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_high_lane_s16_0:
+; CHECK-LABEL: test_vmull_high_lane_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_high_lane_s32_0:
+; CHECK-LABEL: test_vmull_high_lane_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_high_lane_u16_0:
+; CHECK-LABEL: test_vmull_high_lane_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_high_lane_u32_0:
+; CHECK-LABEL: test_vmull_high_lane_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_laneq_s16_0:
+; CHECK-LABEL: test_vmull_laneq_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_laneq_s32_0:
+; CHECK-LABEL: test_vmull_laneq_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_laneq_u16_0:
+; CHECK-LABEL: test_vmull_laneq_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_laneq_u32_0:
+; CHECK-LABEL: test_vmull_laneq_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_high_laneq_s16_0:
+; CHECK-LABEL: test_vmull_high_laneq_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_high_laneq_s32_0:
+; CHECK-LABEL: test_vmull_high_laneq_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_high_laneq_u16_0:
+; CHECK-LABEL: test_vmull_high_laneq_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_high_laneq_u32_0:
+; CHECK-LABEL: test_vmull_high_laneq_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlal_lane_s16_0:
+; CHECK-LABEL: test_vqdmlal_lane_s16_0:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlal_lane_s32_0:
+; CHECK-LABEL: test_vqdmlal_lane_s32_0:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlal_high_lane_s16_0:
+; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlal_high_lane_s32_0:
+; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlsl_lane_s16_0:
+; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlsl_lane_s32_0:
+; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlsl_high_lane_s16_0:
+; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlsl_high_lane_s32_0:
+; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmull_lane_s16_0:
+; CHECK-LABEL: test_vqdmull_lane_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmull_lane_s32_0:
+; CHECK-LABEL: test_vqdmull_lane_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vqdmull_laneq_s16_0:
+; CHECK-LABEL: test_vqdmull_laneq_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vqdmull_laneq_s32_0:
+; CHECK-LABEL: test_vqdmull_laneq_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmull_high_lane_s16_0:
+; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmull_high_lane_s32_0:
+; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vqdmull_high_laneq_s16_0:
+; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vqdmull_high_laneq_s32_0:
+; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmulh_lane_s16_0:
+; CHECK-LABEL: test_vqdmulh_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmulhq_lane_s16_0:
+; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
- %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
ret <8 x i16> %vqdmulh2.i
}
define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmulh_lane_s32_0:
+; CHECK-LABEL: test_vqdmulh_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i32> %vqdmulh2.i
}
define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmulhq_lane_s32_0:
+; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
- %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
ret <4 x i32> %vqdmulh2.i
}
define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqrdmulh_lane_s16_0:
+; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i16> %vqrdmulh2.i
}
define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqrdmulhq_lane_s16_0:
+; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
- %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
ret <8 x i16> %vqrdmulh2.i
}
define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqrdmulh_lane_s32_0:
+; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i32> %vqrdmulh2.i
}
define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqrdmulhq_lane_s32_0:
+; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
- %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
ret <4 x i32> %vqrdmulh2.i
}
define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
-; CHECK: test_vmul_lane_f32_0:
+; CHECK-LABEL: test_vmul_lane_f32_0:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -2739,7 +2739,7 @@ entry:
}
define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulq_lane_f32_0:
+; CHECK-LABEL: test_vmulq_lane_f32_0:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -2749,7 +2749,7 @@ entry:
}
define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK: test_vmul_laneq_f32_0:
+; CHECK-LABEL: test_vmul_laneq_f32_0:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -2759,7 +2759,7 @@ entry:
}
define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
-; CHECK: test_vmul_laneq_f64_0:
+; CHECK-LABEL: test_vmul_laneq_f64_0:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -2772,7 +2772,7 @@ entry:
}
define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulq_laneq_f32_0:
+; CHECK-LABEL: test_vmulq_laneq_f32_0:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -2782,7 +2782,7 @@ entry:
}
define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
-; CHECK: test_vmulq_laneq_f64_0:
+; CHECK-LABEL: test_vmulq_laneq_f64_0:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -2792,62 +2792,62 @@ entry:
}
define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulx_lane_f32_0:
+; CHECK-LABEL: test_vmulx_lane_f32_0:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
ret <2 x float> %vmulx2.i
}
define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulxq_lane_f32_0:
+; CHECK-LABEL: test_vmulxq_lane_f32_0:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
- %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
ret <4 x float> %vmulx2.i
}
define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
-; CHECK: test_vmulxq_lane_f64_0:
+; CHECK-LABEL: test_vmulxq_lane_f64_0:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulx_laneq_f32_0:
+; CHECK-LABEL: test_vmulx_laneq_f32_0:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
ret <2 x float> %vmulx2.i
}
define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulxq_laneq_f32_0:
+; CHECK-LABEL: test_vmulxq_laneq_f32_0:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
- %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
ret <4 x float> %vmulx2.i
}
define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
-; CHECK: test_vmulxq_laneq_f64_0:
+; CHECK-LABEL: test_vmulxq_laneq_f64_0:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
diff --git a/test/CodeGen/AArch64/neon-3vdiff.ll b/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 96400eb..cb9b36c 100644
--- a/test/CodeGen/AArch64/neon-3vdiff.ll
+++ b/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -1,57 +1,57 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
-declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
-declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
-declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
-declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
-declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
-declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>)
+declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>)
-declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>)
-declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>)
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>)
+declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>)
-declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>)
-declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>)
+declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>)
define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vaddl_s8:
+; CHECK-LABEL: test_vaddl_s8:
; CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = sext <8 x i8> %a to <8 x i16>
@@ -61,7 +61,7 @@ entry:
}
define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vaddl_s16:
+; CHECK-LABEL: test_vaddl_s16:
; CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = sext <4 x i16> %a to <4 x i32>
@@ -71,7 +71,7 @@ entry:
}
define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vaddl_s32:
+; CHECK-LABEL: test_vaddl_s32:
; CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = sext <2 x i32> %a to <2 x i64>
@@ -81,7 +81,7 @@ entry:
}
define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vaddl_u8:
+; CHECK-LABEL: test_vaddl_u8:
; CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = zext <8 x i8> %a to <8 x i16>
@@ -91,7 +91,7 @@ entry:
}
define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vaddl_u16:
+; CHECK-LABEL: test_vaddl_u16:
; CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = zext <4 x i16> %a to <4 x i32>
@@ -101,7 +101,7 @@ entry:
}
define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vaddl_u32:
+; CHECK-LABEL: test_vaddl_u32:
; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = zext <2 x i32> %a to <2 x i64>
@@ -111,7 +111,7 @@ entry:
}
define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vaddl_high_s8:
+; CHECK-LABEL: test_vaddl_high_s8:
; CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -123,7 +123,7 @@ entry:
}
define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddl_high_s16:
+; CHECK-LABEL: test_vaddl_high_s16:
; CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -135,7 +135,7 @@ entry:
}
define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddl_high_s32:
+; CHECK-LABEL: test_vaddl_high_s32:
; CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -147,7 +147,7 @@ entry:
}
define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vaddl_high_u8:
+; CHECK-LABEL: test_vaddl_high_u8:
; CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -159,7 +159,7 @@ entry:
}
define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddl_high_u16:
+; CHECK-LABEL: test_vaddl_high_u16:
; CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -171,7 +171,7 @@ entry:
}
define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddl_high_u32:
+; CHECK-LABEL: test_vaddl_high_u32:
; CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -183,7 +183,7 @@ entry:
}
define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) {
-; CHECK: test_vaddw_s8:
+; CHECK-LABEL: test_vaddw_s8:
; CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = sext <8 x i8> %b to <8 x i16>
@@ -192,7 +192,7 @@ entry:
}
define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) {
-; CHECK: test_vaddw_s16:
+; CHECK-LABEL: test_vaddw_s16:
; CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = sext <4 x i16> %b to <4 x i32>
@@ -201,7 +201,7 @@ entry:
}
define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) {
-; CHECK: test_vaddw_s32:
+; CHECK-LABEL: test_vaddw_s32:
; CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = sext <2 x i32> %b to <2 x i64>
@@ -210,7 +210,7 @@ entry:
}
define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) {
-; CHECK: test_vaddw_u8:
+; CHECK-LABEL: test_vaddw_u8:
; CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = zext <8 x i8> %b to <8 x i16>
@@ -219,7 +219,7 @@ entry:
}
define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) {
-; CHECK: test_vaddw_u16:
+; CHECK-LABEL: test_vaddw_u16:
; CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = zext <4 x i16> %b to <4 x i32>
@@ -228,7 +228,7 @@ entry:
}
define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) {
-; CHECK: test_vaddw_u32:
+; CHECK-LABEL: test_vaddw_u32:
; CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = zext <2 x i32> %b to <2 x i64>
@@ -237,7 +237,7 @@ entry:
}
define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) {
-; CHECK: test_vaddw_high_s8:
+; CHECK-LABEL: test_vaddw_high_s8:
; CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -247,7 +247,7 @@ entry:
}
define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK: test_vaddw_high_s16:
+; CHECK-LABEL: test_vaddw_high_s16:
; CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -257,7 +257,7 @@ entry:
}
define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) {
-; CHECK: test_vaddw_high_s32:
+; CHECK-LABEL: test_vaddw_high_s32:
; CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -267,7 +267,7 @@ entry:
}
define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) {
-; CHECK: test_vaddw_high_u8:
+; CHECK-LABEL: test_vaddw_high_u8:
; CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -277,7 +277,7 @@ entry:
}
define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK: test_vaddw_high_u16:
+; CHECK-LABEL: test_vaddw_high_u16:
; CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -287,7 +287,7 @@ entry:
}
define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) {
-; CHECK: test_vaddw_high_u32:
+; CHECK-LABEL: test_vaddw_high_u32:
; CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -297,7 +297,7 @@ entry:
}
define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsubl_s8:
+; CHECK-LABEL: test_vsubl_s8:
; CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = sext <8 x i8> %a to <8 x i16>
@@ -307,7 +307,7 @@ entry:
}
define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsubl_s16:
+; CHECK-LABEL: test_vsubl_s16:
; CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = sext <4 x i16> %a to <4 x i32>
@@ -317,7 +317,7 @@ entry:
}
define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsubl_s32:
+; CHECK-LABEL: test_vsubl_s32:
; CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = sext <2 x i32> %a to <2 x i64>
@@ -327,7 +327,7 @@ entry:
}
define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsubl_u8:
+; CHECK-LABEL: test_vsubl_u8:
; CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = zext <8 x i8> %a to <8 x i16>
@@ -337,7 +337,7 @@ entry:
}
define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsubl_u16:
+; CHECK-LABEL: test_vsubl_u16:
; CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = zext <4 x i16> %a to <4 x i32>
@@ -347,7 +347,7 @@ entry:
}
define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsubl_u32:
+; CHECK-LABEL: test_vsubl_u32:
; CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = zext <2 x i32> %a to <2 x i64>
@@ -357,7 +357,7 @@ entry:
}
define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsubl_high_s8:
+; CHECK-LABEL: test_vsubl_high_s8:
; CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -369,7 +369,7 @@ entry:
}
define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubl_high_s16:
+; CHECK-LABEL: test_vsubl_high_s16:
; CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -381,7 +381,7 @@ entry:
}
define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubl_high_s32:
+; CHECK-LABEL: test_vsubl_high_s32:
; CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -393,7 +393,7 @@ entry:
}
define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsubl_high_u8:
+; CHECK-LABEL: test_vsubl_high_u8:
; CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -405,7 +405,7 @@ entry:
}
define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubl_high_u16:
+; CHECK-LABEL: test_vsubl_high_u16:
; CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -417,7 +417,7 @@ entry:
}
define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubl_high_u32:
+; CHECK-LABEL: test_vsubl_high_u32:
; CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -429,7 +429,7 @@ entry:
}
define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) {
-; CHECK: test_vsubw_s8:
+; CHECK-LABEL: test_vsubw_s8:
; CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = sext <8 x i8> %b to <8 x i16>
@@ -438,7 +438,7 @@ entry:
}
define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) {
-; CHECK: test_vsubw_s16:
+; CHECK-LABEL: test_vsubw_s16:
; CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = sext <4 x i16> %b to <4 x i32>
@@ -447,7 +447,7 @@ entry:
}
define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) {
-; CHECK: test_vsubw_s32:
+; CHECK-LABEL: test_vsubw_s32:
; CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = sext <2 x i32> %b to <2 x i64>
@@ -456,7 +456,7 @@ entry:
}
define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) {
-; CHECK: test_vsubw_u8:
+; CHECK-LABEL: test_vsubw_u8:
; CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = zext <8 x i8> %b to <8 x i16>
@@ -465,7 +465,7 @@ entry:
}
define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) {
-; CHECK: test_vsubw_u16:
+; CHECK-LABEL: test_vsubw_u16:
; CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = zext <4 x i16> %b to <4 x i32>
@@ -474,7 +474,7 @@ entry:
}
define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) {
-; CHECK: test_vsubw_u32:
+; CHECK-LABEL: test_vsubw_u32:
; CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = zext <2 x i32> %b to <2 x i64>
@@ -483,7 +483,7 @@ entry:
}
define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) {
-; CHECK: test_vsubw_high_s8:
+; CHECK-LABEL: test_vsubw_high_s8:
; CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -493,7 +493,7 @@ entry:
}
define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK: test_vsubw_high_s16:
+; CHECK-LABEL: test_vsubw_high_s16:
; CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -503,7 +503,7 @@ entry:
}
define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) {
-; CHECK: test_vsubw_high_s32:
+; CHECK-LABEL: test_vsubw_high_s32:
; CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -513,7 +513,7 @@ entry:
}
define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) {
-; CHECK: test_vsubw_high_u8:
+; CHECK-LABEL: test_vsubw_high_u8:
; CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -523,7 +523,7 @@ entry:
}
define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK: test_vsubw_high_u16:
+; CHECK-LABEL: test_vsubw_high_u16:
; CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -533,7 +533,7 @@ entry:
}
define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) {
-; CHECK: test_vsubw_high_u32:
+; CHECK-LABEL: test_vsubw_high_u32:
; CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -543,7 +543,7 @@ entry:
}
define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddhn_s16:
+; CHECK-LABEL: test_vaddhn_s16:
; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vaddhn.i = add <8 x i16> %a, %b
@@ -553,7 +553,7 @@ entry:
}
define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddhn_s32:
+; CHECK-LABEL: test_vaddhn_s32:
; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vaddhn.i = add <4 x i32> %a, %b
@@ -563,7 +563,7 @@ entry:
}
define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vaddhn_s64:
+; CHECK-LABEL: test_vaddhn_s64:
; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vaddhn.i = add <2 x i64> %a, %b
@@ -573,7 +573,7 @@ entry:
}
define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddhn_u16:
+; CHECK-LABEL: test_vaddhn_u16:
; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vaddhn.i = add <8 x i16> %a, %b
@@ -583,7 +583,7 @@ entry:
}
define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddhn_u32:
+; CHECK-LABEL: test_vaddhn_u32:
; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vaddhn.i = add <4 x i32> %a, %b
@@ -593,7 +593,7 @@ entry:
}
define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vaddhn_u64:
+; CHECK-LABEL: test_vaddhn_u64:
; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vaddhn.i = add <2 x i64> %a, %b
@@ -603,7 +603,7 @@ entry:
}
define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddhn_high_s16:
+; CHECK-LABEL: test_vaddhn_high_s16:
; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vaddhn.i.i = add <8 x i16> %a, %b
@@ -617,7 +617,7 @@ entry:
}
define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddhn_high_s32:
+; CHECK-LABEL: test_vaddhn_high_s32:
; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vaddhn.i.i = add <4 x i32> %a, %b
@@ -631,7 +631,7 @@ entry:
}
define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vaddhn_high_s64:
+; CHECK-LABEL: test_vaddhn_high_s64:
; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vaddhn.i.i = add <2 x i64> %a, %b
@@ -645,7 +645,7 @@ entry:
}
define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddhn_high_u16:
+; CHECK-LABEL: test_vaddhn_high_u16:
; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vaddhn.i.i = add <8 x i16> %a, %b
@@ -659,7 +659,7 @@ entry:
}
define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddhn_high_u32:
+; CHECK-LABEL: test_vaddhn_high_u32:
; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vaddhn.i.i = add <4 x i32> %a, %b
@@ -673,7 +673,7 @@ entry:
}
define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vaddhn_high_u64:
+; CHECK-LABEL: test_vaddhn_high_u64:
; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vaddhn.i.i = add <2 x i64> %a, %b
@@ -687,58 +687,58 @@ entry:
}
define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vraddhn_s16:
+; CHECK-LABEL: test_vraddhn_s16:
; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
ret <8 x i8> %vraddhn2.i
}
define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vraddhn_s32:
+; CHECK-LABEL: test_vraddhn_s32:
; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
ret <4 x i16> %vraddhn2.i
}
define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vraddhn_s64:
+; CHECK-LABEL: test_vraddhn_s64:
; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
ret <2 x i32> %vraddhn2.i
}
define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vraddhn_u16:
+; CHECK-LABEL: test_vraddhn_u16:
; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
ret <8 x i8> %vraddhn2.i
}
define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vraddhn_u32:
+; CHECK-LABEL: test_vraddhn_u32:
; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
ret <4 x i16> %vraddhn2.i
}
define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vraddhn_u64:
+; CHECK-LABEL: test_vraddhn_u64:
; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
ret <2 x i32> %vraddhn2.i
}
define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vraddhn_high_s16:
+; CHECK-LABEL: test_vraddhn_high_s16:
; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
%1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -747,10 +747,10 @@ entry:
}
define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vraddhn_high_s32:
+; CHECK-LABEL: test_vraddhn_high_s32:
; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
%1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -759,10 +759,10 @@ entry:
}
define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vraddhn_high_s64:
+; CHECK-LABEL: test_vraddhn_high_s64:
; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
%1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -771,10 +771,10 @@ entry:
}
define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vraddhn_high_u16:
+; CHECK-LABEL: test_vraddhn_high_u16:
; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
%1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -783,10 +783,10 @@ entry:
}
define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vraddhn_high_u32:
+; CHECK-LABEL: test_vraddhn_high_u32:
; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
%1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -795,10 +795,10 @@ entry:
}
define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vraddhn_high_u64:
+; CHECK-LABEL: test_vraddhn_high_u64:
; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
%1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -807,7 +807,7 @@ entry:
}
define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubhn_s16:
+; CHECK-LABEL: test_vsubhn_s16:
; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vsubhn.i = sub <8 x i16> %a, %b
@@ -817,7 +817,7 @@ entry:
}
define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubhn_s32:
+; CHECK-LABEL: test_vsubhn_s32:
; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vsubhn.i = sub <4 x i32> %a, %b
@@ -827,7 +827,7 @@ entry:
}
define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsubhn_s64:
+; CHECK-LABEL: test_vsubhn_s64:
; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vsubhn.i = sub <2 x i64> %a, %b
@@ -837,7 +837,7 @@ entry:
}
define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubhn_u16:
+; CHECK-LABEL: test_vsubhn_u16:
; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vsubhn.i = sub <8 x i16> %a, %b
@@ -847,7 +847,7 @@ entry:
}
define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubhn_u32:
+; CHECK-LABEL: test_vsubhn_u32:
; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vsubhn.i = sub <4 x i32> %a, %b
@@ -857,7 +857,7 @@ entry:
}
define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsubhn_u64:
+; CHECK-LABEL: test_vsubhn_u64:
; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vsubhn.i = sub <2 x i64> %a, %b
@@ -867,7 +867,7 @@ entry:
}
define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubhn_high_s16:
+; CHECK-LABEL: test_vsubhn_high_s16:
; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vsubhn.i.i = sub <8 x i16> %a, %b
@@ -881,7 +881,7 @@ entry:
}
define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubhn_high_s32:
+; CHECK-LABEL: test_vsubhn_high_s32:
; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vsubhn.i.i = sub <4 x i32> %a, %b
@@ -895,7 +895,7 @@ entry:
}
define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsubhn_high_s64:
+; CHECK-LABEL: test_vsubhn_high_s64:
; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vsubhn.i.i = sub <2 x i64> %a, %b
@@ -909,7 +909,7 @@ entry:
}
define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubhn_high_u16:
+; CHECK-LABEL: test_vsubhn_high_u16:
; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vsubhn.i.i = sub <8 x i16> %a, %b
@@ -923,7 +923,7 @@ entry:
}
define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubhn_high_u32:
+; CHECK-LABEL: test_vsubhn_high_u32:
; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vsubhn.i.i = sub <4 x i32> %a, %b
@@ -937,7 +937,7 @@ entry:
}
define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsubhn_high_u64:
+; CHECK-LABEL: test_vsubhn_high_u64:
; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vsubhn.i.i = sub <2 x i64> %a, %b
@@ -951,58 +951,58 @@ entry:
}
define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsubhn_s16:
+; CHECK-LABEL: test_vrsubhn_s16:
; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
ret <8 x i8> %vrsubhn2.i
}
define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsubhn_s32:
+; CHECK-LABEL: test_vrsubhn_s32:
; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
ret <4 x i16> %vrsubhn2.i
}
define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsubhn_s64:
+; CHECK-LABEL: test_vrsubhn_s64:
; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
ret <2 x i32> %vrsubhn2.i
}
define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsubhn_u16:
+; CHECK-LABEL: test_vrsubhn_u16:
; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
ret <8 x i8> %vrsubhn2.i
}
define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsubhn_u32:
+; CHECK-LABEL: test_vrsubhn_u32:
; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
ret <4 x i16> %vrsubhn2.i
}
define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsubhn_u64:
+; CHECK-LABEL: test_vrsubhn_u64:
; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
ret <2 x i32> %vrsubhn2.i
}
define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsubhn_high_s16:
+; CHECK-LABEL: test_vrsubhn_high_s16:
; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
%1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1011,10 +1011,10 @@ entry:
}
define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsubhn_high_s32:
+; CHECK-LABEL: test_vrsubhn_high_s32:
; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
%1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1023,10 +1023,10 @@ entry:
}
define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsubhn_high_s64:
+; CHECK-LABEL: test_vrsubhn_high_s64:
; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
%1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1035,10 +1035,10 @@ entry:
}
define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsubhn_high_u16:
+; CHECK-LABEL: test_vrsubhn_high_u16:
; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
%1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1047,10 +1047,10 @@ entry:
}
define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsubhn_high_u32:
+; CHECK-LABEL: test_vrsubhn_high_u32:
; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
%1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1059,10 +1059,10 @@ entry:
}
define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsubhn_high_u64:
+; CHECK-LABEL: test_vrsubhn_high_u64:
; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
%1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1071,763 +1071,759 @@ entry:
}
define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vabdl_s8:
+; CHECK-LABEL: test_vabdl_s8:
; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
+ %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
%vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i
}
define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vabdl_s16:
+; CHECK-LABEL: test_vabdl_s16:
; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
+ %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
%vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
ret <4 x i32> %vmovl.i.i
}
define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vabdl_s32:
+; CHECK-LABEL: test_vabdl_s32:
; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
+ %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
%vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
ret <2 x i64> %vmovl.i.i
}
define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vabdl_u8:
+; CHECK-LABEL: test_vabdl_u8:
; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
+ %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
%vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i
}
define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vabdl_u16:
+; CHECK-LABEL: test_vabdl_u16:
; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
+ %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
%vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
ret <4 x i32> %vmovl.i.i
}
define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vabdl_u32:
+; CHECK-LABEL: test_vabdl_u32:
; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
+ %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
%vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
ret <2 x i64> %vmovl.i.i
}
define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vabal_s8:
+; CHECK-LABEL: test_vabal_s8:
; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
+ %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
%add.i = add <8 x i16> %vmovl.i.i.i, %a
ret <8 x i16> %add.i
}
define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vabal_s16:
+; CHECK-LABEL: test_vabal_s16:
; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
%add.i = add <4 x i32> %vmovl.i.i.i, %a
ret <4 x i32> %add.i
}
define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vabal_s32:
+; CHECK-LABEL: test_vabal_s32:
; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
%vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
%add.i = add <2 x i64> %vmovl.i.i.i, %a
ret <2 x i64> %add.i
}
define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vabal_u8:
+; CHECK-LABEL: test_vabal_u8:
; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
+ %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
%add.i = add <8 x i16> %vmovl.i.i.i, %a
ret <8 x i16> %add.i
}
define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vabal_u16:
+; CHECK-LABEL: test_vabal_u16:
; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
%add.i = add <4 x i32> %vmovl.i.i.i, %a
ret <4 x i32> %add.i
}
define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vabal_u32:
+; CHECK-LABEL: test_vabal_u32:
; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
%vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
%add.i = add <2 x i64> %vmovl.i.i.i, %a
ret <2 x i64> %add.i
}
define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vabdl_high_s8:
+; CHECK-LABEL: test_vabdl_high_s8:
; CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i.i
}
define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vabdl_high_s16:
+; CHECK-LABEL: test_vabdl_high_s16:
; CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
ret <4 x i32> %vmovl.i.i.i
}
define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vabdl_high_s32:
+; CHECK-LABEL: test_vabdl_high_s32:
; CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
ret <2 x i64> %vmovl.i.i.i
}
define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vabdl_high_u8:
+; CHECK-LABEL: test_vabdl_high_u8:
; CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i.i
}
define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vabdl_high_u16:
+; CHECK-LABEL: test_vabdl_high_u16:
; CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
ret <4 x i32> %vmovl.i.i.i
}
define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vabdl_high_u32:
+; CHECK-LABEL: test_vabdl_high_u32:
; CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
ret <2 x i64> %vmovl.i.i.i
}
define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vabal_high_s8:
+; CHECK-LABEL: test_vabal_high_s8:
; CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
%add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
ret <8 x i16> %add.i.i
}
define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vabal_high_s16:
+; CHECK-LABEL: test_vabal_high_s16:
; CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
%add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vabal_high_s32:
+; CHECK-LABEL: test_vabal_high_s32:
; CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
%add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vabal_high_u8:
+; CHECK-LABEL: test_vabal_high_u8:
; CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
%add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
ret <8 x i16> %add.i.i
}
define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vabal_high_u16:
+; CHECK-LABEL: test_vabal_high_u16:
; CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
%add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vabal_high_u32:
+; CHECK-LABEL: test_vabal_high_u32:
; CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
%add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vmull_s8:
+; CHECK-LABEL: test_vmull_s8:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
+ %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
ret <8 x i16> %vmull.i
}
define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vmull_s16:
+; CHECK-LABEL: test_vmull_s16:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vmull_s32:
+; CHECK-LABEL: test_vmull_s32:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
ret <2 x i64> %vmull2.i
}
define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vmull_u8:
+; CHECK-LABEL: test_vmull_u8:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
+ %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
ret <8 x i16> %vmull.i
}
define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vmull_u16:
+; CHECK-LABEL: test_vmull_u16:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vmull_u32:
+; CHECK-LABEL: test_vmull_u32:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
ret <2 x i64> %vmull2.i
}
define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vmull_high_s8:
+; CHECK-LABEL: test_vmull_high_s8:
; CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i
}
define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vmull_high_s16:
+; CHECK-LABEL: test_vmull_high_s16:
; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
ret <4 x i32> %vmull2.i.i
}
define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vmull_high_s32:
+; CHECK-LABEL: test_vmull_high_s32:
; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
ret <2 x i64> %vmull2.i.i
}
define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vmull_high_u8:
+; CHECK-LABEL: test_vmull_high_u8:
; CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i
}
define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vmull_high_u16:
+; CHECK-LABEL: test_vmull_high_u16:
; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
ret <4 x i32> %vmull2.i.i
}
define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vmull_high_u32:
+; CHECK-LABEL: test_vmull_high_u32:
; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
ret <2 x i64> %vmull2.i.i
}
define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vmlal_s8:
+; CHECK-LABEL: test_vmlal_s8:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
%add.i = add <8 x i16> %vmull.i.i, %a
ret <8 x i16> %add.i
}
define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vmlal_s16:
+; CHECK-LABEL: test_vmlal_s16:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
%add.i = add <4 x i32> %vmull2.i.i, %a
ret <4 x i32> %add.i
}
define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vmlal_s32:
+; CHECK-LABEL: test_vmlal_s32:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
%add.i = add <2 x i64> %vmull2.i.i, %a
ret <2 x i64> %add.i
}
define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vmlal_u8:
+; CHECK-LABEL: test_vmlal_u8:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
%add.i = add <8 x i16> %vmull.i.i, %a
ret <8 x i16> %add.i
}
define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vmlal_u16:
+; CHECK-LABEL: test_vmlal_u16:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
%add.i = add <4 x i32> %vmull2.i.i, %a
ret <4 x i32> %add.i
}
define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vmlal_u32:
+; CHECK-LABEL: test_vmlal_u32:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
%add.i = add <2 x i64> %vmull2.i.i, %a
ret <2 x i64> %add.i
}
define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vmlal_high_s8:
+; CHECK-LABEL: test_vmlal_high_s8:
; CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%add.i.i = add <8 x i16> %vmull.i.i.i, %a
ret <8 x i16> %add.i.i
}
define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vmlal_high_s16:
+; CHECK-LABEL: test_vmlal_high_s16:
; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%add.i.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vmlal_high_s32:
+; CHECK-LABEL: test_vmlal_high_s32:
; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%add.i.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vmlal_high_u8:
+; CHECK-LABEL: test_vmlal_high_u8:
; CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%add.i.i = add <8 x i16> %vmull.i.i.i, %a
ret <8 x i16> %add.i.i
}
define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vmlal_high_u16:
+; CHECK-LABEL: test_vmlal_high_u16:
; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%add.i.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vmlal_high_u32:
+; CHECK-LABEL: test_vmlal_high_u32:
; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%add.i.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vmlsl_s8:
+; CHECK-LABEL: test_vmlsl_s8:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
%sub.i = sub <8 x i16> %a, %vmull.i.i
ret <8 x i16> %sub.i
}
define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vmlsl_s16:
+; CHECK-LABEL: test_vmlsl_s16:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
%sub.i = sub <4 x i32> %a, %vmull2.i.i
ret <4 x i32> %sub.i
}
define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vmlsl_s32:
+; CHECK-LABEL: test_vmlsl_s32:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
%sub.i = sub <2 x i64> %a, %vmull2.i.i
ret <2 x i64> %sub.i
}
define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vmlsl_u8:
+; CHECK-LABEL: test_vmlsl_u8:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
%sub.i = sub <8 x i16> %a, %vmull.i.i
ret <8 x i16> %sub.i
}
define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vmlsl_u16:
+; CHECK-LABEL: test_vmlsl_u16:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
%sub.i = sub <4 x i32> %a, %vmull2.i.i
ret <4 x i32> %sub.i
}
define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vmlsl_u32:
+; CHECK-LABEL: test_vmlsl_u32:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
%sub.i = sub <2 x i64> %a, %vmull2.i.i
ret <2 x i64> %sub.i
}
define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vmlsl_high_s8:
+; CHECK-LABEL: test_vmlsl_high_s8:
; CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
ret <8 x i16> %sub.i.i
}
define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vmlsl_high_s16:
+; CHECK-LABEL: test_vmlsl_high_s16:
; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
ret <4 x i32> %sub.i.i
}
define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vmlsl_high_s32:
+; CHECK-LABEL: test_vmlsl_high_s32:
; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
ret <2 x i64> %sub.i.i
}
define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vmlsl_high_u8:
+; CHECK-LABEL: test_vmlsl_high_u8:
; CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
ret <8 x i16> %sub.i.i
}
define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vmlsl_high_u16:
+; CHECK-LABEL: test_vmlsl_high_u16:
; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
ret <4 x i32> %sub.i.i
}
define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vmlsl_high_u32:
+; CHECK-LABEL: test_vmlsl_high_u32:
; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
ret <2 x i64> %sub.i.i
}
define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vqdmull_s16:
+; CHECK-LABEL: test_vqdmull_s16:
; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vqdmull_s32:
+; CHECK-LABEL: test_vqdmull_s32:
; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vqdmlal_s16:
+; CHECK-LABEL: test_vqdmlal_s16:
; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vqdmlal_s32:
+; CHECK-LABEL: test_vqdmlal_s32:
; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vqdmlsl_s16:
+; CHECK-LABEL: test_vqdmlsl_s16:
; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vqdmlsl_s32:
+; CHECK-LABEL: test_vqdmlsl_s32:
; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vqdmull_high_s16:
+; CHECK-LABEL: test_vqdmull_high_s16:
; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vqdmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vqdmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
ret <4 x i32> %vqdmull2.i.i
}
define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vqdmull_high_s32:
+; CHECK-LABEL: test_vqdmull_high_s32:
; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vqdmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vqdmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
ret <2 x i64> %vqdmull2.i.i
}
define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vqdmlal_high_s16:
+; CHECK-LABEL: test_vqdmlal_high_s16:
; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
- %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
+ %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
ret <4 x i32> %vqdmlal4.i.i
}
define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vqdmlal_high_s32:
+; CHECK-LABEL: test_vqdmlal_high_s32:
; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
- %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
+ %vqdmlal2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vqdmlal4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
ret <2 x i64> %vqdmlal4.i.i
}
define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vqdmlsl_high_s16:
+; CHECK-LABEL: test_vqdmlsl_high_s16:
; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
- %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
+ %vqdmlsl2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vqdmlsl4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
ret <4 x i32> %vqdmlsl4.i.i
}
define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vqdmlsl_high_s32:
+; CHECK-LABEL: test_vqdmlsl_high_s32:
; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vqdmlsl2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
- %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
+ %vqdmlsl2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vqdmlsl4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
ret <2 x i64> %vqdmlsl4.i.i
}
define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vmull_p8:
+; CHECK-LABEL: test_vmull_p8:
; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
+ %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
ret <8 x i16> %vmull.i
}
define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vmull_high_p8:
+; CHECK-LABEL: test_vmull_high_p8:
; CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i
}
define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
-; CHECK: test_vmull_p64
+; CHECK-LABEL: test_vmull_p64
; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
entry:
- %vmull.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vmull1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i, <1 x i64> %vmull1.i) #1
+ %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
%vmull3.i = bitcast <16 x i8> %vmull2.i to i128
ret i128 %vmull3.i
}
define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
-; CHECK: test_vmull_high_p64
+; CHECK-LABEL: test_vmull_high_p64
; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%0 = extractelement <2 x i64> %a, i32 1
%1 = extractelement <2 x i64> %b, i32 1
- %vmull.i.i = insertelement <1 x i64> undef, i64 %0, i32 0
- %vmull1.i.i = insertelement <1 x i64> undef, i64 %1, i32 0
- %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i.i, <1 x i64> %vmull1.i.i) #1
+ %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %0, i64 %1) #1
%vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
ret i128 %vmull3.i.i
}
-declare <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64>, <1 x i64>) #5
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
index 5400984..6404ab7 100644
--- a/test/CodeGen/AArch64/neon-aba-abd.ll
+++ b/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
@@ -1,18 +1,18 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; CHECK: test_uabd_v8i8:
- %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
; CHECK: uabd v0.8b, v0.8b, v1.8b
ret <8 x i8> %abd
}
define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; CHECK: test_uaba_v8i8:
- %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
%aba = add <8 x i8> %lhs, %abd
; CHECK: uaba v0.8b, v0.8b, v1.8b
ret <8 x i8> %aba
@@ -20,32 +20,32 @@ define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; CHECK: test_sabd_v8i8:
- %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
; CHECK: sabd v0.8b, v0.8b, v1.8b
ret <8 x i8> %abd
}
define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; CHECK: test_saba_v8i8:
- %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
%aba = add <8 x i8> %lhs, %abd
; CHECK: saba v0.8b, v0.8b, v1.8b
ret <8 x i8> %aba
}
-declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>)
define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK: test_uabd_v16i8:
- %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %abd = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
; CHECK: uabd v0.16b, v0.16b, v1.16b
ret <16 x i8> %abd
}
define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK: test_uaba_v16i8:
- %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %abd = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
%aba = add <16 x i8> %lhs, %abd
; CHECK: uaba v0.16b, v0.16b, v1.16b
ret <16 x i8> %aba
@@ -53,32 +53,32 @@ define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK: test_sabd_v16i8:
- %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %abd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
; CHECK: sabd v0.16b, v0.16b, v1.16b
ret <16 x i8> %abd
}
define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK: test_saba_v16i8:
- %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %abd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
%aba = add <16 x i8> %lhs, %abd
; CHECK: saba v0.16b, v0.16b, v1.16b
ret <16 x i8> %aba
}
-declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK: test_uabd_v4i16:
- %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
; CHECK: uabd v0.4h, v0.4h, v1.4h
ret <4 x i16> %abd
}
define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK: test_uaba_v4i16:
- %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
%aba = add <4 x i16> %lhs, %abd
; CHECK: uaba v0.4h, v0.4h, v1.4h
ret <4 x i16> %aba
@@ -86,32 +86,32 @@ define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK: test_sabd_v4i16:
- %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
; CHECK: sabd v0.4h, v0.4h, v1.4h
ret <4 x i16> %abd
}
define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK: test_saba_v4i16:
- %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
%aba = add <4 x i16> %lhs, %abd
; CHECK: saba v0.4h, v0.4h, v1.4h
ret <4 x i16> %aba
}
-declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>)
define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK: test_uabd_v8i16:
- %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %abd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
; CHECK: uabd v0.8h, v0.8h, v1.8h
ret <8 x i16> %abd
}
define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK: test_uaba_v8i16:
- %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %abd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
%aba = add <8 x i16> %lhs, %abd
; CHECK: uaba v0.8h, v0.8h, v1.8h
ret <8 x i16> %aba
@@ -119,32 +119,32 @@ define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK: test_sabd_v8i16:
- %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %abd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
; CHECK: sabd v0.8h, v0.8h, v1.8h
ret <8 x i16> %abd
}
define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK: test_saba_v8i16:
- %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %abd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
%aba = add <8 x i16> %lhs, %abd
; CHECK: saba v0.8h, v0.8h, v1.8h
ret <8 x i16> %aba
}
-declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_uabd_v2i32:
- %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
; CHECK: uabd v0.2s, v0.2s, v1.2s
ret <2 x i32> %abd
}
define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_uaba_v2i32:
- %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
%aba = add <2 x i32> %lhs, %abd
; CHECK: uaba v0.2s, v0.2s, v1.2s
ret <2 x i32> %aba
@@ -152,16 +152,16 @@ define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_sabd_v2i32:
- %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
; CHECK: sabd v0.2s, v0.2s, v1.2s
ret <2 x i32> %abd
}
define <2 x i32> @test_sabd_v2i32_const() {
; CHECK: test_sabd_v2i32_const:
-; CHECK: movi d1, #0xffffffff0000
+; CHECK: movi d1, #0x00ffffffff0000
; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s
- %1 = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(
+ %1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(
<2 x i32> <i32 -2147483648, i32 2147450880>,
<2 x i32> <i32 -65536, i32 65535>)
ret <2 x i32> %1
@@ -169,25 +169,25 @@ define <2 x i32> @test_sabd_v2i32_const() {
define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_saba_v2i32:
- %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
%aba = add <2 x i32> %lhs, %abd
; CHECK: saba v0.2s, v0.2s, v1.2s
ret <2 x i32> %aba
}
-declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>)
define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: test_uabd_v4i32:
- %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
; CHECK: uabd v0.4s, v0.4s, v1.4s
ret <4 x i32> %abd
}
define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: test_uaba_v4i32:
- %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
%aba = add <4 x i32> %lhs, %abd
; CHECK: uaba v0.4s, v0.4s, v1.4s
ret <4 x i32> %aba
@@ -195,42 +195,42 @@ define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: test_sabd_v4i32:
- %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
; CHECK: sabd v0.4s, v0.4s, v1.4s
ret <4 x i32> %abd
}
define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: test_saba_v4i32:
- %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
%aba = add <4 x i32> %lhs, %abd
; CHECK: saba v0.4s, v0.4s, v1.4s
ret <4 x i32> %aba
}
-declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>)
define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
; CHECK: test_fabd_v2f32:
- %abd = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+ %abd = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
; CHECK: fabd v0.2s, v0.2s, v1.2s
ret <2 x float> %abd
}
-declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>)
define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
; CHECK: test_fabd_v4f32:
- %abd = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+ %abd = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
; CHECK: fabd v0.4s, v0.4s, v1.4s
ret <4 x float> %abd
}
-declare <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>)
define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
; CHECK: test_fabd_v2f64:
- %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+ %abd = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
; CHECK: fabd v0.2d, v0.2d, v1.2d
ret <2 x double> %abd
}
diff --git a/test/CodeGen/AArch64/arm64-neon-across.ll b/test/CodeGen/AArch64/arm64-neon-across.ll
new file mode 100644
index 0000000..3a63673
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -0,0 +1,460 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8>)
+
+declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>)
+
+declare i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
+
+define i16 @test_vaddlv_s8(<8 x i8> %a) {
+; CHECK: test_vaddlv_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %saddlvv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddlv_s16(<4 x i16> %a) {
+; CHECK: test_vaddlv_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %a)
+ ret i32 %saddlvv.i
+}
+
+define i16 @test_vaddlv_u8(<8 x i8> %a) {
+; CHECK: test_vaddlv_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %uaddlvv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddlv_u16(<4 x i16> %a) {
+; CHECK: test_vaddlv_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a)
+ ret i32 %uaddlvv.i
+}
+
+define i16 @test_vaddlvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %saddlvv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddlvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %a)
+ ret i32 %saddlvv.i
+}
+
+define i64 @test_vaddlvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_s32:
+; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %saddlvv.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %a)
+ ret i64 %saddlvv.i
+}
+
+define i16 @test_vaddlvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %uaddlvv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddlvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+ ret i32 %uaddlvv.i
+}
+
+define i64 @test_vaddlvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_u32:
+; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %uaddlvv.i = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %a)
+ ret i64 %uaddlvv.i
+}
+
+define i8 @test_vmaxv_s8(<8 x i8> %a) {
+; CHECK: test_vmaxv_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %smaxv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vmaxv_s16(<4 x i16> %a) {
+; CHECK: test_vmaxv_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %smaxv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vmaxv_u8(<8 x i8> %a) {
+; CHECK: test_vmaxv_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %umaxv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vmaxv_u16(<4 x i16> %a) {
+; CHECK: test_vmaxv_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %umaxv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vmaxvq_s8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %smaxv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vmaxvq_s16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %smaxv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vmaxvq_s32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_s32:
+; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a)
+ ret i32 %smaxv.i
+}
+
+define i8 @test_vmaxvq_u8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %umaxv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vmaxvq_u16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %umaxv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vmaxvq_u32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_u32:
+; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> %a)
+ ret i32 %umaxv.i
+}
+
+define i8 @test_vminv_s8(<8 x i8> %a) {
+; CHECK: test_vminv_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %sminv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vminv_s16(<4 x i16> %a) {
+; CHECK: test_vminv_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %sminv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vminv_u8(<8 x i8> %a) {
+; CHECK: test_vminv_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %uminv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vminv_u16(<4 x i16> %a) {
+; CHECK: test_vminv_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %uminv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vminvq_s8(<16 x i8> %a) {
+; CHECK: test_vminvq_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %sminv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vminvq_s16(<8 x i16> %a) {
+; CHECK: test_vminvq_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %sminv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vminvq_s32(<4 x i32> %a) {
+; CHECK: test_vminvq_s32:
+; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a)
+ ret i32 %sminv.i
+}
+
+define i8 @test_vminvq_u8(<16 x i8> %a) {
+; CHECK: test_vminvq_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %uminv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vminvq_u16(<8 x i16> %a) {
+; CHECK: test_vminvq_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %uminv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vminvq_u32(<4 x i32> %a) {
+; CHECK: test_vminvq_u32:
+; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a)
+ ret i32 %uminv.i
+}
+
+define i8 @test_vaddv_s8(<8 x i8> %a) {
+; CHECK: test_vaddv_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vaddv_s16(<4 x i16> %a) {
+; CHECK: test_vaddv_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vaddv_u8(<8 x i8> %a) {
+; CHECK: test_vaddv_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vaddv_u16(<4 x i16> %a) {
+; CHECK: test_vaddv_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vaddvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddvq_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vaddvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddvq_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddvq_s32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a)
+ ret i32 %vaddv.i
+}
+
+define i8 @test_vaddvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddvq_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vaddvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddvq_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddvq_u32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a)
+ ret i32 %vaddv.i
+}
+
+define float @test_vmaxvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxvq_f32:
+; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %0 = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %a)
+ ret float %0
+}
+
+define float @test_vminvq_f32(<4 x float> %a) {
+; CHECK: test_vminvq_f32:
+; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %0 = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %a)
+ ret float %0
+}
+
+define float @test_vmaxnmvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxnmvq_f32:
+; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %0 = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %a)
+ ret float %0
+}
+
+define float @test_vminnmvq_f32(<4 x float> %a) {
+; CHECK: test_vminnmvq_f32:
+; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %0 = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %a)
+ ret float %0
+}
+
diff --git a/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
new file mode 100644
index 0000000..d3dc1b8
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_addp_v8i8:
+ %tmp1 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: addp v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_addp_v16i8:
+ %tmp1 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: addp v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_addp_v4i16:
+ %tmp1 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: addp v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_addp_v8i16:
+ %tmp1 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: addp v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_addp_v2i32:
+ %tmp1 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: addp v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_addp_v4i32:
+ %tmp1 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: addp v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+
+declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_addp_v2i64:
+ %val = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: addp v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_faddp_v2f32:
+ %val = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: faddp v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_faddp_v4f32:
+ %val = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: faddp v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_faddp_v2f64:
+ %val = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: faddp v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
+define i32 @test_vaddv.v2i32(<2 x i32> %a) {
+; CHECK-LABEL: test_vaddv.v2i32
+; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %1 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a)
+ ret i32 %1
+}
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)
diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/arm64-neon-add-sub.ll
index 9015237..fbde606 100644
--- a/test/CodeGen/AArch64/neon-add-sub.ll
+++ b/test/CodeGen/AArch64/arm64-neon-add-sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -aarch64-simd-scalar| FileCheck %s
define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) {
;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
@@ -182,35 +182,35 @@ define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) {
define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vabd_f64
; CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vmax_f64
; CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vmin_f64
; CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vmaxnm_f64
; CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vminnm_f64
; CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
@@ -229,51 +229,9 @@ define <1 x double> @test_vneg_f64(<1 x double> %a) {
}
declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
-declare <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double>, <1 x double>)
declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
-
-define <1 x i8> @test_add_v1i8(<1 x i8> %a, <1 x i8> %b) {
-;CHECK-LABEL: test_add_v1i8:
-;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- %c = add <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @test_add_v1i16(<1 x i16> %a, <1 x i16> %b) {
-;CHECK-LABEL: test_add_v1i16:
-;CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
- %c = add <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @test_add_v1i32(<1 x i32> %a, <1 x i32> %b) {
-;CHECK-LABEL: test_add_v1i32:
-;CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %c = add <1 x i32> %a, %b
- ret <1 x i32> %c
-}
-
-define <1 x i8> @test_sub_v1i8(<1 x i8> %a, <1 x i8> %b) {
-;CHECK-LABEL: test_sub_v1i8:
-;CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- %c = sub <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @test_sub_v1i16(<1 x i16> %a, <1 x i16> %b) {
-;CHECK-LABEL: test_sub_v1i16:
-;CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
- %c = sub <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @test_sub_v1i32(<1 x i32> %a, <1 x i32> %b) {
-;CHECK-LABEL: test_sub_v1i32:
-;CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %c = sub <1 x i32> %a, %b
- ret <1 x i32> %c
-}
\ No newline at end of file
diff --git a/test/CodeGen/ARM64/neon-compare-instructions.ll b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
index 55f7b99..cba81ef 100644
--- a/test/CodeGen/ARM64/neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
@@ -51,7 +51,7 @@ define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
@@ -59,7 +59,7 @@ define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
@@ -67,7 +67,7 @@ define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
@@ -75,7 +75,7 @@ define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
@@ -83,7 +83,7 @@ define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -91,7 +91,7 @@ define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -99,7 +99,7 @@ define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -803,7 +803,7 @@ define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
@@ -811,7 +811,7 @@ define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
@@ -819,7 +819,7 @@ define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
@@ -827,7 +827,7 @@ define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
@@ -835,7 +835,7 @@ define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -843,7 +843,7 @@ define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -851,7 +851,7 @@ define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index b4d55df..cfc2ebf 100644
--- a/test/CodeGen/AArch64/neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1,368 +1,423 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[15], {{w[0-9]+}}
+; CHECK-LABEL: ins16bw:
+; CHECK: ins {{v[0-9]+}}.b[15], {{w[0-9]+}}
%tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15
ret <16 x i8> %tmp3
}
define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[6], {{w[0-9]+}}
+; CHECK-LABEL: ins8hw:
+; CHECK: ins {{v[0-9]+}}.h[6], {{w[0-9]+}}
%tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6
ret <8 x i16> %tmp3
}
define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[2], {{w[0-9]+}}
+; CHECK-LABEL: ins4sw:
+; CHECK: ins {{v[0-9]+}}.s[2], {{w[0-9]+}}
%tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2
ret <4 x i32> %tmp3
}
define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}}
+; CHECK-LABEL: ins2dw:
+; CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}}
%tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1
ret <2 x i64> %tmp3
}
define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[5], {{w[0-9]+}}
+; CHECK-LABEL: ins8bw:
+; CHECK: ins {{v[0-9]+}}.b[5], {{w[0-9]+}}
%tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5
ret <8 x i8> %tmp3
}
define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[3], {{w[0-9]+}}
+; CHECK-LABEL: ins4hw:
+; CHECK: ins {{v[0-9]+}}.h[3], {{w[0-9]+}}
%tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3
ret <4 x i16> %tmp3
}
define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}}
+; CHECK-LABEL: ins2sw:
+; CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}}
%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
ret <2 x i32> %tmp3
}
define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
+; CHECK-LABEL: ins16b16:
+; CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <16 x i8> %tmp1, i32 2
%tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
ret <16 x i8> %tmp4
}
define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
+; CHECK-LABEL: ins8h8:
+; CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
ret <8 x i16> %tmp4
}
define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK-LABEL: ins4s4:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
ret <4 x i32> %tmp4
}
define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins2d2:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
%tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
ret <2 x i64> %tmp4
}
define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK-LABEL: ins4f4:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x float> %tmp1, i32 2
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4
}
define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins2df2:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4
}
define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
+; CHECK-LABEL: ins8b16:
+; CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <8 x i8> %tmp1, i32 2
%tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
ret <16 x i8> %tmp4
}
define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
+; CHECK-LABEL: ins4h8:
+; CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
ret <8 x i16> %tmp4
}
define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: ins2s4:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
%tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
ret <4 x i32> %tmp4
}
define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins1d2:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
%tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
ret <2 x i64> %tmp4
}
define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: ins2f4:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x float> %tmp1, i32 1
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4
}
define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins1f2:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4
}
define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2]
+; CHECK-LABEL: ins16b8:
+; CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <16 x i8> %tmp1, i32 2
%tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
ret <8 x i8> %tmp4
}
define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
+; CHECK-LABEL: ins8h4:
+; CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK-LABEL: ins4s2:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
ret <2 x i32> %tmp4
}
define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins2d1:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
%tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
ret <1 x i64> %tmp4
}
define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK-LABEL: ins4f2:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x float> %tmp1, i32 2
%tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
ret <2 x float> %tmp4
}
define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
- %tmp3 = extractelement <2 x double> %tmp1, i32 0
+; CHECK-LABEL: ins2f1:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ %tmp3 = extractelement <2 x double> %tmp1, i32 1
%tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
ret <1 x double> %tmp4
}
define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2]
+; CHECK-LABEL: ins8b8:
+; CHECK: ins {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <8 x i8> %tmp1, i32 2
%tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4
ret <8 x i8> %tmp4
}
define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
+; CHECK-LABEL: ins4h4:
+; CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: ins2s2:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
%tmp3 = extractelement <2 x i32> %tmp1, i32 0
%tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
ret <2 x i32> %tmp4
}
define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins1d1:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
%tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
ret <1 x i64> %tmp4
}
define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: ins2f2:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
%tmp3 = extractelement <2 x float> %tmp1, i32 0
%tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
ret <2 x float> %tmp4
}
define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins1df1:
+; CHECK-NOT: ins {{v[0-9]+}}
%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
ret <1 x double> %tmp4
}
define i32 @umovw16b(<16 x i8> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK-LABEL: umovw16b:
+; CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = zext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw8h(<8 x i16> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: umovw8h:
+; CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = zext i16 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw4s(<4 x i32> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[2]
+; CHECK-LABEL: umovw4s:
+; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
ret i32 %tmp3
}
define i64 @umovx2d(<2 x i64> %tmp1) {
-;CHECK: umov {{x[0-9]+}}, {{v[0-9]+}}.d[0]
- %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+; CHECK-LABEL: umovx2d:
+; CHECK: mov {{x[0-9]+}}, {{v[0-9]+}}.d[1]
+ %tmp3 = extractelement <2 x i64> %tmp1, i32 1
ret i64 %tmp3
}
define i32 @umovw8b(<8 x i8> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[7]
+; CHECK-LABEL: umovw8b:
+; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.b[7]
%tmp3 = extractelement <8 x i8> %tmp1, i32 7
%tmp4 = zext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw4h(<4 x i16> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: umovw4h:
+; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = zext i16 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw2s(<2 x i32> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: umovw2s:
+; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
ret i32 %tmp3
}
define i64 @umovx1d(<1 x i64> %tmp1) {
-;CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: umovx1d:
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
ret i64 %tmp3
}
define i32 @smovw16b(<16 x i8> %tmp1) {
-;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK-LABEL: smovw16b:
+; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = sext i8 %tmp3 to i32
- %tmp5 = add i32 5, %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
ret i32 %tmp5
}
define i32 @smovw8h(<8 x i16> %tmp1) {
-;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: smovw8h:
+; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
- %tmp5 = add i32 5, %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
ret i32 %tmp5
}
define i32 @smovx16b(<16 x i8> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK-LABEL: smovx16b:
+; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = sext i8 %tmp3 to i32
- ret i32 %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
+ ret i32 %tmp5
}
define i32 @smovx8h(<8 x i16> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: smovx8h:
+; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
ret i32 %tmp4
}
define i64 @smovx4s(<4 x i32> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2]
+; CHECK-LABEL: smovx4s:
+; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = sext i32 %tmp3 to i64
ret i64 %tmp4
}
define i32 @smovw8b(<8 x i8> %tmp1) {
-;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4]
+; CHECK-LABEL: smovw8b:
+; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4]
%tmp3 = extractelement <8 x i8> %tmp1, i32 4
%tmp4 = sext i8 %tmp3 to i32
- %tmp5 = add i32 5, %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
ret i32 %tmp5
}
define i32 @smovw4h(<4 x i16> %tmp1) {
-;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: smovw4h:
+; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
- %tmp5 = add i32 5, %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
ret i32 %tmp5
}
define i32 @smovx8b(<8 x i8> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[6]
+; CHECK-LABEL: smovx8b:
+; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[6]
%tmp3 = extractelement <8 x i8> %tmp1, i32 6
%tmp4 = sext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @smovx4h(<4 x i16> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: smovx4h:
+; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
ret i32 %tmp4
}
define i64 @smovx2s(<2 x i32> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: smovx2s:
+; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
%tmp4 = sext i32 %tmp3 to i64
ret i64 %tmp4
}
define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
-;CHECK: ins {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
+; CHECK-LABEL: test_vcopy_lane_s8:
+; CHECK: ins {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
%vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
ret <8 x i8> %vset_lane
}
define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
-;CHECK: ins {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
+; CHECK-LABEL: test_vcopyq_laneq_s8:
+; CHECK: ins {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
%vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
ret <16 x i8> %vset_lane
}
define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
-;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
+; CHECK-LABEL: test_vcopy_lane_swap_s8:
+; CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
%vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
ret <8 x i8> %vset_lane
}
define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
-;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
+; CHECK-LABEL: test_vcopyq_laneq_swap_s8:
+; CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
%vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i8> %vset_lane
}
define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK-LABEL: test_vdup_n_u8:
+; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
%vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
%vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
%vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
@@ -375,7 +430,8 @@ define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
}
define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+; CHECK-LABEL: test_vdup_n_u16:
+; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
%vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
%vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
%vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
@@ -384,20 +440,23 @@ define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
}
define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
+; CHECK-LABEL: test_vdup_n_u32:
+; CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
%vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
%vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
ret <2 x i32> %vecinit1.i
}
define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
-;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+; CHECK-LABEL: test_vdup_n_u64:
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
%vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
ret <1 x i64> %vecinit.i
}
define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+; CHECK-LABEL: test_vdupq_n_u8:
+; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
%vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
%vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
%vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
@@ -418,7 +477,8 @@ define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
}
define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK-LABEL: test_vdupq_n_u16:
+; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
%vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
%vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
%vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
@@ -431,7 +491,8 @@ define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
}
define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK-LABEL: test_vdupq_n_u32:
+; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
%vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
%vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
%vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
@@ -440,92 +501,107 @@ define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
}
define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
+; CHECK-LABEL: test_vdupq_n_u64:
+; CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
%vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
%vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
ret <2 x i64> %vecinit1.i
}
define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+; CHECK-LABEL: test_vdup_lane_s8:
+; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
%shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i8> %shuffle
}
define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: test_vdup_lane_s16:
+; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
%shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i16> %shuffle
}
define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vdup_lane_s32:
+; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i32> %shuffle
}
define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
-;CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+; CHECK-LABEL: test_vdupq_lane_s8:
+; CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
%shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i8> %shuffle
}
define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
-;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: test_vdupq_lane_s16:
+; CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
%shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i16> %shuffle
}
define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
-;CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vdupq_lane_s32:
+; CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %shuffle
}
define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
-;CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vdupq_lane_s64:
+; CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
%shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %shuffle
}
define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+; CHECK-LABEL: test_vdup_laneq_s8:
+; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
%shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i8> %shuffle
}
define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: test_vdup_laneq_s16:
+; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
%shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i16> %shuffle
}
define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vdup_laneq_s32:
+; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i32> %shuffle
}
define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+; CHECK-LABEL: test_vdupq_laneq_s8:
+; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
%shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i8> %shuffle
}
define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
-;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: test_vdupq_laneq_s16:
+; CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
%shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i16> %shuffle
}
define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vdupq_laneq_s32:
+; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %shuffle
}
define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vdupq_laneq_s64:
+; CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
%shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %shuffle
}
@@ -617,7 +693,7 @@ define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
; CHECK-LABEL: test_bitcastv8i8tov1f64:
; CHECK: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
%sub.i = sub <8 x i8> zeroinitializer, %a
%1 = bitcast <8 x i8> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -627,7 +703,7 @@ define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
; CHECK-LABEL: test_bitcastv4i16tov1f64:
; CHECK: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
%sub.i = sub <4 x i16> zeroinitializer, %a
%1 = bitcast <4 x i16> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -637,7 +713,7 @@ define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
; CHECK-LABEL: test_bitcastv2i32tov1f64:
; CHECK: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
%sub.i = sub <2 x i32> zeroinitializer, %a
%1 = bitcast <2 x i32> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -647,7 +723,7 @@ define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1i64tov1f64:
; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
%sub.i = sub <1 x i64> zeroinitializer, %a
%1 = bitcast <1 x i64> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -657,7 +733,7 @@ define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 {
; CHECK-LABEL: test_bitcastv2f32tov1f64:
; CHECK: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
%sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
%1 = bitcast <2 x float> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -666,7 +742,7 @@ define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 {
define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov8i8:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <8 x i8>
@@ -676,7 +752,7 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov4i16:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <4 x i16>
@@ -686,7 +762,7 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov2i32:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <2 x i32>
@@ -696,7 +772,7 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov1i64:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: neg {{d[0-9]+}}, {{d[0-9]+}}
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <1 x i64>
@@ -706,7 +782,7 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov2f32:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <2 x float>
@@ -717,49 +793,49 @@ define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
; Test insert element into an undef vector
define <8 x i8> @scalar_to_vector.v8i8(i8 %a) {
; CHECK-LABEL: scalar_to_vector.v8i8:
-; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <8 x i8> undef, i8 %a, i32 0
ret <8 x i8> %b
}
define <16 x i8> @scalar_to_vector.v16i8(i8 %a) {
; CHECK-LABEL: scalar_to_vector.v16i8:
-; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <16 x i8> undef, i8 %a, i32 0
ret <16 x i8> %b
}
define <4 x i16> @scalar_to_vector.v4i16(i16 %a) {
; CHECK-LABEL: scalar_to_vector.v4i16:
-; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <4 x i16> undef, i16 %a, i32 0
ret <4 x i16> %b
}
define <8 x i16> @scalar_to_vector.v8i16(i16 %a) {
; CHECK-LABEL: scalar_to_vector.v8i16:
-; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <8 x i16> undef, i16 %a, i32 0
ret <8 x i16> %b
}
define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
; CHECK-LABEL: scalar_to_vector.v2i32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <2 x i32> undef, i32 %a, i32 0
ret <2 x i32> %b
}
define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
; CHECK-LABEL: scalar_to_vector.v4i32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <4 x i32> undef, i32 %a, i32 0
ret <4 x i32> %b
}
define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
; CHECK-LABEL: scalar_to_vector.v2i64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{x[0-9]+}}
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
%b = insertelement <2 x i64> undef, i64 %a, i32 0
ret <2 x i64> %b
}
@@ -954,7 +1030,7 @@ define <2 x float> @test_scalar_to_vector_f32_to_v2f32(<2 x float> %a) {
; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
; CHECK-NEXT: ret
entry:
- %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
+ %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
%1 = insertelement <1 x float> undef, float %0, i32 0
%2 = extractelement <1 x float> %1, i32 0
%vecinit1.i = insertelement <2 x float> undef, float %2, i32 0
@@ -966,58 +1042,57 @@ define <4 x float> @test_scalar_to_vector_f32_to_v4f32(<2 x float> %a) {
; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
; CHECK-NEXT: ret
entry:
- %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
+ %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
%1 = insertelement <1 x float> undef, float %0, i32 0
%2 = extractelement <1 x float> %1, i32 0
%vecinit1.i = insertelement <4 x float> undef, float %2, i32 0
ret <4 x float> %vecinit1.i
}
-declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>)
-define <2 x i32> @test_concat_undef_v1i32(<1 x i32> %a) {
+define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) {
; CHECK-LABEL: test_concat_undef_v1i32:
-; CHECK: ins v{{[0-9]+}}.s[1], v{{[0-9]+}}.s[0]
+; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
entry:
- %0 = extractelement <1 x i32> %a, i32 0
+ %0 = extractelement <2 x i32> %a, i32 0
%vecinit1.i = insertelement <2 x i32> undef, i32 %0, i32 1
ret <2 x i32> %vecinit1.i
}
-declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>) #4
+declare i32 @llvm.aarch64.neon.sqabs.i32(i32) #4
-define <2 x i32> @test_concat_v1i32_undef(<1 x i32> %a) {
+define <2 x i32> @test_concat_v1i32_undef(i32 %a) {
; CHECK-LABEL: test_concat_v1i32_undef:
; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
; CHECK-NEXT: ret
entry:
- %b = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a)
- %0 = extractelement <1 x i32> %b, i32 0
- %vecinit.i432 = insertelement <2 x i32> undef, i32 %0, i32 0
+ %b = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
+ %vecinit.i432 = insertelement <2 x i32> undef, i32 %b, i32 0
ret <2 x i32> %vecinit.i432
}
-define <2 x i32> @test_concat_same_v1i32_v1i32(<1 x i32> %a) {
+define <2 x i32> @test_concat_same_v1i32_v1i32(<2 x i32> %a) {
; CHECK-LABEL: test_concat_same_v1i32_v1i32:
; CHECK: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
entry:
- %0 = extractelement <1 x i32> %a, i32 0
+ %0 = extractelement <2 x i32> %a, i32 0
%vecinit.i = insertelement <2 x i32> undef, i32 %0, i32 0
%vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %0, i32 1
ret <2 x i32> %vecinit1.i
}
-define <2 x i32> @test_concat_diff_v1i32_v1i32(<1 x i32> %a, <1 x i32> %b) {
+define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) {
; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: ins v0.s[1], v1.s[0]
+; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %c = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a)
- %d = extractelement <1 x i32> %c, i32 0
- %e = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %b)
- %f = extractelement <1 x i32> %e, i32 0
- %h = shufflevector <1 x i32> %c, <1 x i32> %e, <2 x i32> <i32 0, i32 1>
+ %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
+ %d = insertelement <2 x i32> undef, i32 %c, i32 0
+ %e = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %b)
+ %f = insertelement <2 x i32> undef, i32 %e, i32 0
+ %h = shufflevector <2 x i32> %d, <2 x i32> %f, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %h
}
@@ -1240,20 +1315,13 @@ define <4 x i32> @test_concat_v4i32_v2i32_v2i32(<2 x i32> %x, <2 x i32> %y) #0 {
; CHECK-LABEL: test_concat_v4i32_v2i32_v2i32:
; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
entry:
- %vecext = extractelement <2 x i32> %x, i32 0
- %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
- %vecext1 = extractelement <2 x i32> %x, i32 1
- %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
- %vecext3 = extractelement <2 x i32> %y, i32 0
- %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
- %vecext5 = extractelement <2 x i32> %y, i32 1
- %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3
+ %vecinit6 = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %vecinit6
}
define <2 x i64> @test_concat_v2i64_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) #0 {
; CHECK-LABEL: test_concat_v2i64_v2i64_v2i64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vecinit2 = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %vecinit2
@@ -1261,7 +1329,7 @@ entry:
define <2 x i64> @test_concat_v2i64_v1i64_v2i64(<1 x i64> %x, <2 x i64> %y) #0 {
; CHECK-LABEL: test_concat_v2i64_v1i64_v2i64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vecext = extractelement <1 x i64> %x, i32 0
%vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
@@ -1291,112 +1359,87 @@ entry:
ret <2 x i64> %vecinit2
}
-declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>)
-
-; This case tests the copy of two FPR8 registers, which is implemented by fmov
-; of two FPR32 registers.
-define <1 x i8> @test_copy_FPR8_FPR8(<1 x i8> %a, <1 x i8> %b) {
-; CHECK-LABEL: test_copy_FPR8_FPR8:
-; CHECK: usqadd b1, b0
-; CHECK-NEXT: fmov s0, s1
-entry:
- %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %b, <1 x i8> %a)
- ret <1 x i8> %vsqadd2.i
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_copy_FPR16_FPR16(<1 x i16> %a, <1 x i16> %b) {
-; CHECK-LABEL: test_copy_FPR16_FPR16:
-; CHECK: usqadd h1, h0
-; CHECK-NEXT: fmov s0, s1
-entry:
- %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %b, <1 x i16> %a)
- ret <1 x i16> %vsqadd2.i
-}
define <4 x i16> @concat_vector_v4i16_const() {
; CHECK-LABEL: concat_vector_v4i16_const:
-; CHECK: dup {{v[0-9]+}}.4h, wzr
+; CHECK: movi {{d[0-9]+}}, #0
%r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
define <4 x i16> @concat_vector_v4i16_const_one() {
; CHECK-LABEL: concat_vector_v4i16_const_one:
-; CHECK: movz {{w[0-9]+}}, #1
-; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+; CHECK: movi {{v[0-9]+}}.4h, #0x1
%r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
define <4 x i32> @concat_vector_v4i32_const() {
; CHECK-LABEL: concat_vector_v4i32_const:
-; CHECK: dup {{v[0-9]+}}.4s, wzr
+; CHECK: movi {{v[0-9]+}}.2d, #0
%r = shufflevector <1 x i32> zeroinitializer, <1 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %r
}
define <8 x i8> @concat_vector_v8i8_const() {
; CHECK-LABEL: concat_vector_v8i8_const:
-; CHECK: dup {{v[0-9]+}}.8b, wzr
+; CHECK: movi {{d[0-9]+}}, #0
%r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %r
}
define <8 x i16> @concat_vector_v8i16_const() {
; CHECK-LABEL: concat_vector_v8i16_const:
-; CHECK: dup {{v[0-9]+}}.8h, wzr
+; CHECK: movi {{v[0-9]+}}.2d, #0
%r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
define <8 x i16> @concat_vector_v8i16_const_one() {
; CHECK-LABEL: concat_vector_v8i16_const_one:
-; CHECK: movz {{w[0-9]+}}, #1
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK: movi {{v[0-9]+}}.8h, #0x1
%r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
define <16 x i8> @concat_vector_v16i8_const() {
; CHECK-LABEL: concat_vector_v16i8_const:
-; CHECK: dup {{v[0-9]+}}.16b, wzr
+; CHECK: movi {{v[0-9]+}}.2d, #0
%r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %r
}
define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) {
; CHECK-LABEL: concat_vector_v4i16:
-; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
%r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) {
; CHECK-LABEL: concat_vector_v4i32:
-; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
%r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %r
}
define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
; CHECK-LABEL: concat_vector_v8i8:
-; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[0]
+; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
%r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %r
}
define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) {
; CHECK-LABEL: concat_vector_v8i16:
-; CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
%r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
; CHECK-LABEL: concat_vector_v16i8:
-; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[0]
+; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
%r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %r
}
diff --git a/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll
new file mode 100644
index 0000000..276ac13
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; arm64 has a separate copy due to intrinsics
+
+define <4 x i32> @copyTuple.QPair(i32* %a, i32* %b) {
+; CHECK-LABEL: copyTuple.QPair:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i64 1, i32* %a)
+ %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i64 1, i32* %b)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QTriple(i32* %a, i32* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QTriple:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a)
+ %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i64 1, i32* %b)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QQuad(i32* %a, i32* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QQuad:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a)
+ %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %b)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
diff --git a/test/CodeGen/AArch64/arm64-neon-mul-div.ll b/test/CodeGen/AArch64/arm64-neon-mul-div.ll
new file mode 100644
index 0000000..720f3eb
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-mul-div.ll
@@ -0,0 +1,797 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; arm64 has its own copy of this because of the intrinsics
+
+define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: mul8xi8:
+; CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = mul <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: mul16xi8:
+; CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = mul <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: mul4xi16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = mul <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: mul8xi16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = mul <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: mul2xi32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = mul <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: mul4x32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = mul <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: mul1xi64:
+; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+ %tmp3 = mul <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: mul2xi64:
+; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+ %tmp3 = mul <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: mul2xfloat:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fmul <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: mul4xfloat:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fmul <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: mul2xdouble:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fmul <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+
+define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: div2xfloat:
+; CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fdiv <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: div4xfloat:
+; CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fdiv <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: div2xdouble:
+; CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fdiv <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+define <1 x i8> @sdiv1x8(<1 x i8> %A, <1 x i8> %B) {
+; CHECK-LABEL: sdiv1x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: sdiv8x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: sdiv16x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @sdiv1x16(<1 x i16> %A, <1 x i16> %B) {
+; CHECK-LABEL: sdiv1x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @sdiv4x16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: sdiv4x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: sdiv8x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @sdiv1x32(<1 x i32> %A, <1 x i32> %B) {
+; CHECK-LABEL: sdiv1x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @sdiv2x32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: sdiv2x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: sdiv4x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @sdiv1x64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: sdiv1x64:
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = sdiv <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: sdiv2x64:
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = sdiv <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @udiv1x8(<1 x i8> %A, <1 x i8> %B) {
+; CHECK-LABEL: udiv1x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: udiv8x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: udiv16x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @udiv1x16(<1 x i16> %A, <1 x i16> %B) {
+; CHECK-LABEL: udiv1x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @udiv4x16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: udiv4x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: udiv8x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @udiv1x32(<1 x i32> %A, <1 x i32> %B) {
+; CHECK-LABEL: udiv1x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @udiv2x32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: udiv2x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: udiv4x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @udiv1x64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: udiv1x64:
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = udiv <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: udiv2x64:
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = udiv <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @srem1x8(<1 x i8> %A, <1 x i8> %B) {
+; CHECK-LABEL: srem1x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: srem8x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: srem16x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @srem1x16(<1 x i16> %A, <1 x i16> %B) {
+; CHECK-LABEL: srem1x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: srem4x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: srem8x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @srem1x32(<1 x i32> %A, <1 x i32> %B) {
+; CHECK-LABEL: srem1x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: srem2x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: srem4x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @srem1x64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: srem1x64:
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = srem <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: srem2x64:
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = srem <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @urem1x8(<1 x i8> %A, <1 x i8> %B) {
+; CHECK-LABEL: urem1x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: urem8x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: urem16x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @urem1x16(<1 x i16> %A, <1 x i16> %B) {
+; CHECK-LABEL: urem1x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: urem4x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: urem8x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @urem1x32(<1 x i32> %A, <1 x i32> %B) {
+; CHECK-LABEL: urem1x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: urem2x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: urem4x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @urem1x64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: urem1x64:
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = urem <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: urem2x64:
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = urem <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: frem2f32:
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+ %tmp3 = frem <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frem4f32(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: frem4f32:
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+ %tmp3 = frem <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+
+define <1 x double> @frem1d64(<1 x double> %A, <1 x double> %B) {
+; CHECK-LABEL: frem1d64:
+; CHECK: bl fmod
+ %tmp3 = frem <1 x double> %A, %B;
+ ret <1 x double> %tmp3
+}
+
+define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: frem2d64:
+; CHECK: bl fmod
+; CHECK: bl fmod
+ %tmp3 = frem <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8>, <8 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8>, <16 x i8>)
+
+define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: poly_mulv8i8:
+ %prod = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: pmul v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %prod
+}
+
+define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK-LABEL: poly_mulv16i8:
+ %prod = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: pmul v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %prod
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqdmulh_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqdmulh v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqdmulh_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqdmulh v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %prod
+}
+
+define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqdmulh_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqdmulh v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %prod
+}
+
+define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqdmulh_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqdmulh v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %prod
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmulh_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmulh_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %prod
+}
+
+define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmulh_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %prod
+}
+
+define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmulh_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %prod
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK-LABEL: fmulx_v2f32:
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.2s, v0.2s, v1.2s
+ %val = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK-LABEL: fmulx_v4f32:
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.4s, v0.4s, v1.4s
+ %val = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK-LABEL: fmulx_v2f64:
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.2d, v0.2d, v1.2d
+ %val = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+ ret <2 x double> %val
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
index c9128e7..92ed239 100644
--- a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
+++ b/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
- ; CHECK: test_fmul_lane_ss2S
+ ; CHECK-LABEL: test_fmul_lane_ss2S
; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fmul float %a, %tmp1;
@@ -9,7 +9,7 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
}
define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
- ; CHECK: test_fmul_lane_ss2S_swap
+ ; CHECK-LABEL: test_fmul_lane_ss2S_swap
; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fmul float %tmp1, %a;
@@ -18,7 +18,7 @@ define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
- ; CHECK: test_fmul_lane_ss4S
+ ; CHECK-LABEL: test_fmul_lane_ss4S
; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fmul float %a, %tmp1;
@@ -26,7 +26,7 @@ define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
}
define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
- ; CHECK: test_fmul_lane_ss4S_swap
+ ; CHECK-LABEL: test_fmul_lane_ss4S_swap
; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fmul float %tmp1, %a;
@@ -35,8 +35,8 @@ define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
- ; CHECK: test_fmul_lane_ddD
- ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ; CHECK-LABEL: test_fmul_lane_ddD
+ ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0]|d[0-9]+}}
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = fmul double %a, %tmp1;
ret double %tmp2;
@@ -45,7 +45,7 @@ define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
- ; CHECK: test_fmul_lane_dd2D
+ ; CHECK-LABEL: test_fmul_lane_dd2D
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fmul double %a, %tmp1;
@@ -54,71 +54,71 @@ define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
- ; CHECK: test_fmul_lane_dd2D_swap
+ ; CHECK-LABEL: test_fmul_lane_dd2D_swap
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fmul double %tmp1, %a;
ret double %tmp2;
}
-declare float @llvm.aarch64.neon.vmulx.f32(float, float)
+declare float @llvm.aarch64.neon.fmulx.f32(float, float)
define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
- ; CHECK: test_fmulx_lane_f32
+ ; CHECK-LABEL: test_fmulx_lane_f32
; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
- %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+ %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
ret float %tmp2;
}
define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
- ; CHECK: test_fmulx_laneq_f32
+ ; CHECK-LABEL: test_fmulx_laneq_f32
; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
- %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+ %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
ret float %tmp2;
}
define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
- ; CHECK: test_fmulx_laneq_f32_swap
+ ; CHECK-LABEL: test_fmulx_laneq_f32_swap
; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
- %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a)
+ %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %tmp1, float %a)
ret float %tmp2;
}
-declare double @llvm.aarch64.neon.vmulx.f64(double, double)
+declare double @llvm.aarch64.neon.fmulx.f64(double, double)
define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
- ; CHECK: test_fmulx_lane_f64
- ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ; CHECK-LABEL: test_fmulx_lane_f64
+ ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0]|d[0-9]+}}
%tmp1 = extractelement <1 x double> %v, i32 0
- %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
ret double %tmp2;
}
define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
- ; CHECK: test_fmulx_laneq_f64_0
+ ; CHECK-LABEL: test_fmulx_laneq_f64_0
; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
%tmp1 = extractelement <2 x double> %v, i32 0
- %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
ret double %tmp2;
}
define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) {
- ; CHECK: test_fmulx_laneq_f64_1
+ ; CHECK-LABEL: test_fmulx_laneq_f64_1
; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
- %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
ret double %tmp2;
}
define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
- ; CHECK: test_fmulx_laneq_f64_1_swap
+ ; CHECK-LABEL: test_fmulx_laneq_f64_1_swap
; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
- %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a)
+ %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %tmp1, double %a)
ret double %tmp2;
}
diff --git a/test/CodeGen/AArch64/neon-select_cc.ll b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
index f6b5d3c..255b90d 100644
--- a/test/CodeGen/AArch64/neon-select_cc.ll
+++ b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) {
; CHECK-LABEL: test_select_cc_v8i8_i8:
-; CHECK: and w0, w0, #0xff
-; CHECK-NEXT: cmp w0, w1, uxtb
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.8b, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].8b, v[[LHS]].8b, v[[RHS]].8b
+; CHECK: dup [[DUPMASK:v[0-9]+]].8b, [[MASK]].b[0]
+; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i8 %a, %b
%e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
ret <8x i8> %e
@@ -14,9 +14,9 @@ define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) {
define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8> %d ) {
; CHECK-LABEL: test_select_cc_v8i8_f32:
-; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s
+; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0]
+; CHECK-NEXT: bsl [[DUPMASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
ret <8x i8> %e
@@ -24,8 +24,8 @@ define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8>
define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8> %d ) {
; CHECK-LABEL: test_select_cc_v8i8_f64:
-; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+; CHECK: fcmeq d[[MASK:[0-9]+]], d0, d1
+; CHECK-NEXT: bsl v[[MASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq double %a, %b
%e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
ret <8x i8> %e
@@ -33,11 +33,11 @@ define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8
define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ) {
; CHECK-LABEL: test_select_cc_v16i8_i8:
-; CHECK: and w0, w0, #0xff
-; CHECK-NEXT: cmp w0, w1, uxtb
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.16b, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].16b, v[[LHS]].16b, v[[RHS]].16b
+; CHECK: dup [[DUPMASK:v[0-9]+]].16b, [[MASK]].b[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i8 %a, %b
%e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
ret <16x i8> %e
@@ -45,9 +45,9 @@ define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d
define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x i8> %d ) {
; CHECK-LABEL: test_select_cc_v16i8_f32:
-; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+; CHECK: fcmeq [[MASK:v[0-9]+]].4s, v0.4s, v1.4s
+; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0]
+; CHECK-NEXT: bsl [[DUPMASK]].16b, v2.16b, v3.16b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
ret <16x i8> %e
@@ -55,9 +55,9 @@ define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x
define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16x i8> %d ) {
; CHECK-LABEL: test_select_cc_v16i8_f64:
-; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
-; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+; CHECK: fcmeq [[MASK:v[0-9]+]].2d, v0.2d, v1.2d
+; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0]
+; CHECK-NEXT: bsl [[DUPMASK]].16b, v2.16b, v3.16b
%cmp31 = fcmp oeq double %a, %b
%e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
ret <16x i8> %e
@@ -65,11 +65,11 @@ define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16
define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) {
; CHECK-LABEL: test_select_cc_v4i16:
-; CHECK: and w0, w0, #0xffff
-; CHECK-NEXT: cmp w0, w1, uxth
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.4h, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].4h, v[[LHS]].4h, v[[RHS]].4h
+; CHECK: dup [[DUPMASK:v[0-9]+]].4h, [[MASK]].h[0]
+; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i16 %a, %b
%e = select i1 %cmp31, <4x i16> %c, <4x i16> %d
ret <4x i16> %e
@@ -77,11 +77,11 @@ define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d )
define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) {
; CHECK-LABEL: test_select_cc_v8i16:
-; CHECK: and w0, w0, #0xffff
-; CHECK-NEXT: cmp w0, w1, uxth
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.8h, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].8h, v[[LHS]].8h, v[[RHS]].8h
+; CHECK: dup [[DUPMASK:v[0-9]+]].8h, [[MASK]].h[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i16 %a, %b
%e = select i1 %cmp31, <8x i16> %c, <8x i16> %d
ret <8x i16> %e
@@ -89,10 +89,11 @@ define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d )
define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) {
; CHECK-LABEL: test_select_cc_v2i32:
-; CHECK: cmp w0, w1, uxtw
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.2s, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].2s, v[[LHS]].2s, v[[RHS]].2s
+; CHECK: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i32 %a, %b
%e = select i1 %cmp31, <2x i32> %c, <2x i32> %d
ret <2x i32> %e
@@ -100,10 +101,11 @@ define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d )
define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) {
; CHECK-LABEL: test_select_cc_v4i32:
-; CHECK: cmp w0, w1, uxtw
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].4s, v[[LHS]].4s, v[[RHS]].4s
+; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i32 %a, %b
%e = select i1 %cmp31, <4x i32> %c, <4x i32> %d
ret <4x i32> %e
@@ -111,10 +113,10 @@ define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d )
define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) {
; CHECK-LABEL: test_select_cc_v1i64:
-; CHECK: cmp x0, x1
-; CHECK-NEXT: csinv x0, xzr, xzr, ne
-; CHECK-NEXT: fmov d{{[0-9]+}}, x0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov d[[LHS:[0-9]+]], x0
+; CHECK-DAG: fmov d[[RHS:[0-9]+]], x1
+; CHECK: cmeq d[[MASK:[0-9]+]], d[[LHS]], d[[RHS]]
+; CHECK: bsl v[[MASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i64 %a, %b
%e = select i1 %cmp31, <1x i64> %c, <1x i64> %d
ret <1x i64> %e
@@ -122,10 +124,11 @@ define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d )
define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) {
; CHECK-LABEL: test_select_cc_v2i64:
-; CHECK: cmp x0, x1
-; CHECK-NEXT: csinv x0, xzr, xzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.2d, x0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov d[[LHS:[0-9]+]], x0
+; CHECK-DAG: fmov d[[RHS:[0-9]+]], x1
+; CHECK: cmeq [[MASK:v[0-9]+]].2d, v[[LHS]].2d, v[[RHS]].2d
+; CHECK: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i64 %a, %b
%e = select i1 %cmp31, <2x i64> %c, <2x i64> %d
ret <2x i64> %e
@@ -133,18 +136,18 @@ define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d )
define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) {
; CHECK-LABEL: test_select_cc_v1f32:
-; CHECK: fcmp s0, s1
-; CHECK-NEXT: fcsel s0, s2, s3, eq
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel s0, s2, s3, eq
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <1 x float> %c, <1 x float> %d
ret <1 x float> %e
}
-
+
define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2 x float> %d ) {
; CHECK-LABEL: test_select_cc_v2f32:
-; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s
+; CHECK: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <2 x float> %c, <2 x float> %d
ret <2 x float> %e
@@ -152,9 +155,9 @@ define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2
define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x float> %d ) {
; CHECK-LABEL: test_select_cc_v4f32:
-; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+; CHECK: fcmeq [[MASK:v[0-9]+]].4s, v0.4s, v1.4s
+; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].16b, v2.16b, v3.16b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <4x float> %c, <4x float> %d
ret <4x float> %e
@@ -162,10 +165,11 @@ define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x f
define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x float> %d ) {
; CHECK-LABEL: test_select_cc_v4f32_icmp:
-; CHECK: cmp w0, w1, uxtw
-; CHECK: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].4s, v[[LHS]].4s, v[[RHS]].4s
+; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i32 %a, %b
%e = select i1 %cmp31, <4x float> %c, <4x float> %d
ret <4x float> %e
@@ -173,8 +177,8 @@ define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x
define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c, <1 x double> %d ) {
; CHECK-LABEL: test_select_cc_v1f64:
-; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+; CHECK: fcmeq d[[MASK:[0-9]+]], d0, d1
+; CHECK: bsl v[[MASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq double %a, %b
%e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
ret <1 x double> %e
@@ -182,10 +186,10 @@ define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c,
define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c, <1 x double> %d ) {
; CHECK-LABEL: test_select_cc_v1f64_icmp:
-; CHECK: cmp x0, x1
-; CHECK-NEXT: csinv x0, xzr, xzr, ne
-; CHECK-NEXT: fmov d{{[0-9]+}}, x0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov [[LHS:d[0-9]+]], x0
+; CHECK-DAG: fmov [[RHS:d[0-9]+]], x1
+; CHECK: cmeq d[[MASK:[0-9]+]], [[LHS]], [[RHS]]
+; CHECK: bsl v[[MASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i64 %a, %b
%e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
ret <1 x double> %e
@@ -193,9 +197,9 @@ define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c,
define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c, <2 x double> %d ) {
; CHECK-LABEL: test_select_cc_v2f64:
-; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
-; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+; CHECK: fcmeq [[MASK:v[0-9]+]].2d, v0.2d, v1.2d
+; CHECK: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0]
+; CHECK: bsl [[DUPMASK]].16b, v2.16b, v3.16b
%cmp31 = fcmp oeq double %a, %b
%e = select i1 %cmp31, <2 x double> %c, <2 x double> %d
ret <2 x double> %e
diff --git a/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
new file mode 100644
index 0000000..cca6bfe
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -0,0 +1,482 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+%struct.uint8x16x2_t = type { [2 x <16 x i8>] }
+%struct.poly8x16x2_t = type { [2 x <16 x i8>] }
+%struct.uint8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] }
+%struct.int64x2x4_t = type { [4 x <2 x i64>] }
+%struct.float32x4x4_t = type { [4 x <4 x float>] }
+%struct.float64x2x4_t = type { [4 x <2 x double>] }
+%struct.int8x8x4_t = type { [4 x <8 x i8>] }
+%struct.int16x4x4_t = type { [4 x <4 x i16>] }
+%struct.int32x2x4_t = type { [4 x <2 x i32>] }
+%struct.int64x1x4_t = type { [4 x <1 x i64>] }
+%struct.float32x2x4_t = type { [4 x <2 x float>] }
+%struct.float64x1x4_t = type { [4 x <1 x double>] }
+
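+; The test_ld_from_poll_* functions below check that vector constant operands
+; are materialized from the constant pool, i.e. via an adrp/ldr pair rather
+; than a sequence of scalar moves.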
+define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
+; CHECK-LABEL: test_ld_from_poll_v16i8:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
+ ret <16 x i8> %b
+}
+
+define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: test_ld_from_poll_v8i16:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+ ret <8 x i16> %b
+}
+
+define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4i32:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x i32> %b
+}
+
+define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2i64:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <2 x i64> %a, <i64 1, i64 2>
+ ret <2 x i64> %b
+}
+
+define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4f32:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
+ ret <4 x float> %b
+}
+
+define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2f64:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = fadd <2 x double> %a, <double 1.0, double 2.0>
+ ret <2 x double> %b
+}
+
+define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: test_ld_from_poll_v8i8:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
+ ret <8 x i8> %b
+}
+
+define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4i16:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
+ ret <4 x i16> %b
+}
+
+define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2i32:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <2 x i32> %a, <i32 1, i32 2>
+ ret <2 x i32> %b
+}
+
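+; The vld1*_dup tests load a single scalar and splat it to every lane, which
+; should select LD1R; for 1-element vectors a plain LDR is enough, since there
+; is nothing to replicate.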
+define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1q_dup_s8:
+; CHECK: ld1r {{{ ?v[0-9]+.16b ?}}}, [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %1 = insertelement <16 x i8> undef, i8 %0, i32 0
+ %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %lane
+}
+
+define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1q_dup_s16:
+; CHECK: ld1r {{{ ?v[0-9]+.8h ?}}}, [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %1 = insertelement <8 x i16> undef, i16 %0, i32 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %lane
+}
+
+define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1q_dup_s32:
+; CHECK: ld1r {{{ ?v[0-9]+.4s ?}}}, [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %1 = insertelement <4 x i32> undef, i32 %0, i32 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %lane
+}
+
+define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1q_dup_s64:
+; CHECK: ld1r {{{ ?v[0-9]+.2d ?}}}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %1 = insertelement <2 x i64> undef, i64 %0, i32 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %lane
+}
+
+define <4 x float> @test_vld1q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1q_dup_f32:
+; CHECK: ld1r {{{ ?v[0-9]+.4s ?}}}, [x0]
+entry:
+ %0 = load float* %a, align 4
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %lane
+}
+
+define <2 x double> @test_vld1q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1q_dup_f64:
+; CHECK: ld1r {{{ ?v[0-9]+.2d ?}}}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %1 = insertelement <2 x double> undef, double %0, i32 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %lane
+}
+
+define <8 x i8> @test_vld1_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1_dup_s8:
+; CHECK: ld1r {{{ ?v[0-9]+.8b ?}}}, [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+ %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %lane
+}
+
+define <4 x i16> @test_vld1_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1_dup_s16:
+; CHECK: ld1r {{{ ?v[0-9]+.4h ?}}}, [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %1 = insertelement <4 x i16> undef, i16 %0, i32 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %lane
+}
+
+define <2 x i32> @test_vld1_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1_dup_s32:
+; CHECK: ld1r {{{ ?v[0-9]+.2s ?}}}, [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %1 = insertelement <2 x i32> undef, i32 %0, i32 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %lane
+}
+
+define <1 x i64> @test_vld1_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1_dup_s64:
+; CHECK: ldr {{d[0-9]+}}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %1 = insertelement <1 x i64> undef, i64 %0, i32 0
+ ret <1 x i64> %1
+}
+
+define <2 x float> @test_vld1_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1_dup_f32:
+; CHECK: ld1r {{{ ?v[0-9]+.2s ?}}}, [x0]
+entry:
+ %0 = load float* %a, align 4
+ %1 = insertelement <2 x float> undef, float %0, i32 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %lane
+}
+
+define <1 x double> @test_vld1_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1_dup_f64:
+; CHECK: ldr {{d[0-9]+}}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %1 = insertelement <1 x double> undef, double %0, i32 0
+ ret <1 x double> %1
+}
+
+define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
+; Because a store depends on %1, the LD1R pattern can't be selected,
+; so LDR and FMOV should be emitted instead.
+; CHECK-LABEL: testDUP.v1i64:
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-DAG: fmov {{d[0-9]+}}, {{x[0-9]+}}
+; CHECK-DAG: str {{x[0-9]+}}, [{{x[0-9]+}}]
+ %1 = load i64* %a, align 8
+ store i64 %1, i64* %b, align 8
+ %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
+ ret <1 x i64> %vecinit.i
+}
+
+define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
+; Because a store depends on %1, the LD1R pattern can't be selected,
+; so LDR and FMOV should be emitted instead.
+; CHECK-LABEL: testDUP.v1f64:
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
+ %1 = load double* %a, align 8
+ store double %1, double* %b, align 8
+ %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
+ ret <1 x double> %vecinit.i
+}
+
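+; The vld1*_lane tests insert a loaded scalar into one lane of an existing
+; vector and should select the single-lane LD1 form (or LDR for 1-element
+; vectors).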
+define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vld1q_lane_s8:
+; CHECK: ld1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
+ ret <16 x i8> %vld1_lane
+}
+
+define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vld1q_lane_s16:
+; CHECK: ld1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
+ ret <8 x i16> %vld1_lane
+}
+
+define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vld1q_lane_s32:
+; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
+ ret <4 x i32> %vld1_lane
+}
+
+define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vld1q_lane_s64:
+; CHECK: ld1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
+ ret <2 x i64> %vld1_lane
+}
+
+define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vld1q_lane_f32:
+; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load float* %a, align 4
+ %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
+ ret <4 x float> %vld1_lane
+}
+
+define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vld1q_lane_f64:
+; CHECK: ld1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load double* %a, align 8
+ %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
+ ret <2 x double> %vld1_lane
+}
+
+define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vld1_lane_s8:
+; CHECK: ld1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
+ ret <8 x i8> %vld1_lane
+}
+
+define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vld1_lane_s16:
+; CHECK: ld1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
+ ret <4 x i16> %vld1_lane
+}
+
+define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vld1_lane_s32:
+; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
+ ret <2 x i32> %vld1_lane
+}
+
+define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vld1_lane_s64:
+; CHECK: ldr {{d[0-9]+}}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
+ ret <1 x i64> %vld1_lane
+}
+
+define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vld1_lane_f32:
+; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load float* %a, align 4
+ %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
+ ret <2 x float> %vld1_lane
+}
+
+define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vld1_lane_f64:
+; CHECK: ldr {{d[0-9]+}}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
+ ret <1 x double> %vld1_lane
+}
+
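+; The vst1*_lane tests store a single extracted lane and should select the
+; single-lane ST1 form; the <1 x double> case stores with a plain STR.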
+define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vst1q_lane_s8:
+; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <16 x i8> %b, i32 15
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_lane_s16:
+; CHECK: st1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i16> %b, i32 7
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_lane_s32:
+; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i32> %b, i32 3
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_lane_s64:
+; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i64> %b, i32 1
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_lane_f32:
+; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x float> %b, i32 3
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_lane_f64:
+; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x double> %b, i32 1
+ store double %0, double* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vst1_lane_s8:
+; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i8> %b, i32 7
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_lane_s16:
+; CHECK: st1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i16> %b, i32 3
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_lane_s32:
+; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i32> %b, i32 1
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vst1_lane_s64:
+; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <1 x i64> %b, i32 0
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_lane_f32:
+; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x float> %b, i32 1
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vst1_lane_f64:
+; CHECK: str {{d[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <1 x double> %b, i32 0
+ store double %0, double* %a, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-neon-simd-shift.ll b/test/CodeGen/AArch64/arm64-neon-simd-shift.ll
new file mode 100644
index 0000000..447fb63
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-simd-shift.ll
@@ -0,0 +1,663 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
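+; These tests cover immediate vector shifts: sshr/ushr for plain right shifts
+; by a constant, and ssra/usra when the shifted value feeds an add
+; (shift-right-and-accumulate).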
+define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) {
+; CHECK: test_vshr_n_s8
+; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vshr_n = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <8 x i8> %vshr_n
+}
+
+define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) {
+; CHECK: test_vshr_n_s16
+; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vshr_n = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %vshr_n
+}
+
+define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) {
+; CHECK: test_vshr_n_s32
+; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vshr_n = ashr <2 x i32> %a, <i32 3, i32 3>
+ ret <2 x i32> %vshr_n
+}
+
+define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) {
+; CHECK: test_vshrq_n_s8
+; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vshr_n = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %vshr_n
+}
+
+define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) {
+; CHECK: test_vshrq_n_s16
+; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vshr_n = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %vshr_n
+}
+
+define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) {
+; CHECK: test_vshrq_n_s32
+; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vshr_n = ashr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %vshr_n
+}
+
+define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) {
+; CHECK: test_vshrq_n_s64
+; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vshr_n = ashr <2 x i64> %a, <i64 3, i64 3>
+ ret <2 x i64> %vshr_n
+}
+
+define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) {
+; CHECK: test_vshr_n_u8
+; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vshr_n = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <8 x i8> %vshr_n
+}
+
+define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) {
+; CHECK: test_vshr_n_u16
+; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vshr_n = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %vshr_n
+}
+
+define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) {
+; CHECK: test_vshr_n_u32
+; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vshr_n = lshr <2 x i32> %a, <i32 3, i32 3>
+ ret <2 x i32> %vshr_n
+}
+
+define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) {
+; CHECK: test_vshrq_n_u8
+; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vshr_n = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %vshr_n
+}
+
+define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) {
+; CHECK: test_vshrq_n_u16
+; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vshr_n = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %vshr_n
+}
+
+define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) {
+; CHECK: test_vshrq_n_u32
+; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vshr_n = lshr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %vshr_n
+}
+
+define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) {
+; CHECK: test_vshrq_n_u64
+; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vshr_n = lshr <2 x i64> %a, <i64 3, i64 3>
+ ret <2 x i64> %vshr_n
+}
+
+define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsra_n_s8
+; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsra_n = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <8 x i8> %vsra_n, %a
+ ret <8 x i8> %1
+}
+
+define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsra_n_s16
+; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vsra_n = ashr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
+ %1 = add <4 x i16> %vsra_n, %a
+ ret <4 x i16> %1
+}
+
+define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsra_n_s32
+; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vsra_n = ashr <2 x i32> %b, <i32 3, i32 3>
+ %1 = add <2 x i32> %vsra_n, %a
+ ret <2 x i32> %1
+}
+
+define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsraq_n_s8
+; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsra_n = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <16 x i8> %vsra_n, %a
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsraq_n_s16
+; CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vsra_n = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %1 = add <8 x i16> %vsra_n, %a
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsraq_n_s32
+; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vsra_n = ashr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
+ %1 = add <4 x i32> %vsra_n, %a
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsraq_n_s64
+; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vsra_n = ashr <2 x i64> %b, <i64 3, i64 3>
+ %1 = add <2 x i64> %vsra_n, %a
+ ret <2 x i64> %1
+}
+
+define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsra_n_u8
+; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsra_n = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <8 x i8> %vsra_n, %a
+ ret <8 x i8> %1
+}
+
+define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsra_n_u16
+; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vsra_n = lshr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
+ %1 = add <4 x i16> %vsra_n, %a
+ ret <4 x i16> %1
+}
+
+define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsra_n_u32
+; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vsra_n = lshr <2 x i32> %b, <i32 3, i32 3>
+ %1 = add <2 x i32> %vsra_n, %a
+ ret <2 x i32> %1
+}
+
+define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsraq_n_u8
+; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsra_n = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <16 x i8> %vsra_n, %a
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsraq_n_u16
+; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vsra_n = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %1 = add <8 x i16> %vsra_n, %a
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsraq_n_u32
+; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vsra_n = lshr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
+ %1 = add <4 x i32> %vsra_n, %a
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsraq_n_u64
+; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vsra_n = lshr <2 x i64> %b, <i64 3, i64 3>
+ %1 = add <2 x i64> %vsra_n, %a
+ ret <2 x i64> %1
+}
+
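+; A right shift by an immediate followed by a trunc to a narrower element
+; type should select the narrowing SHRN instruction; the *_high_* variants
+; concatenate the result onto an existing low half and should select SHRN2.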
+define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vshrn_n_s16
+; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %1 = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ ret <8 x i8> %vshrn_n
+}
+
+define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vshrn_n_s32
+; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %1 = ashr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ ret <4 x i16> %vshrn_n
+}
+
+define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vshrn_n_s64
+; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %1 = ashr <2 x i64> %a, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %vshrn_n
+}
+
+define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) {
+; CHECK: test_vshrn_n_u16
+; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %1 = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ ret <8 x i8> %vshrn_n
+}
+
+define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) {
+; CHECK: test_vshrn_n_u32
+; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %1 = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ ret <4 x i16> %vshrn_n
+}
+
+define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) {
+; CHECK: test_vshrn_n_u64
+; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %1 = lshr <2 x i64> %a, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %vshrn_n
+}
+
+define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vshrn_high_n_s16
+; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %1 = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ %2 = bitcast <8 x i8> %a to <1 x i64>
+ %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %4
+}
+
+define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vshrn_high_n_s32
+; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %1 = ashr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ %2 = bitcast <4 x i16> %a to <1 x i64>
+ %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_high_n_s64
+; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %2 = ashr <2 x i64> %b, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+ %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %4
+}
+
+define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vshrn_high_n_u16
+; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %1 = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ %2 = bitcast <8 x i8> %a to <1 x i64>
+ %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %4
+}
+
+define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vshrn_high_n_u32
+; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %1 = lshr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ %2 = bitcast <4 x i16> %a to <1 x i64>
+ %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_high_n_u64
+; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %2 = lshr <2 x i64> %b, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+ %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %4
+}
+
+define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrun_high_n_s16
+; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrun_high_n_s32
+; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrun_high_n_s64
+; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vrshrn_high_n_s16
+; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vrshrn_high_n_s32
+; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vrshrn_high_n_s64
+; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrun_high_n_s16
+; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrun_high_n_s32
+; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrun_high_n_s64
+; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrn_high_n_s16
+; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrn_high_n_s32
+; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrn_high_n_s64
+; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrn_high_n_u16
+; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrn_high_n_u32
+; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrn_high_n_u64
+; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrn_high_n_s16
+; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrn_high_n_s32
+; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrn_high_n_s64
+; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrn_high_n_u16
+; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrn_high_n_u32
+; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrn_high_n_u64
+; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32)
+
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)
+
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
+
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32)
+
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32)
+
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32)
+
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32)
+
+define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_n_s64_f64
+; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_n_u64_f64
+; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64)
+ ret <1 x i64> %1
+}
+
+define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_n_f64_s64
+; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_n_f64_u64
+; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+ ret <1 x double> %1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32)
+declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32)
+declare <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32)
+declare <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-vget.ll b/test/CodeGen/AArch64/arm64-neon-simd-vget.ll
index 6474499..87f3956 100644
--- a/test/CodeGen/AArch64/neon-simd-vget.ll
+++ b/test/CodeGen/AArch64/arm64-neon-simd-vget.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @test_vget_high_s8(<16 x i8> %a) {
; CHECK-LABEL: test_vget_high_s8:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %shuffle.i
@@ -10,7 +10,7 @@ entry:
define <4 x i16> @test_vget_high_s16(<8 x i16> %a) {
; CHECK-LABEL: test_vget_high_s16:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x i16> %shuffle.i
@@ -18,7 +18,7 @@ entry:
define <2 x i32> @test_vget_high_s32(<4 x i32> %a) {
; CHECK-LABEL: test_vget_high_s32:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
ret <2 x i32> %shuffle.i
@@ -26,7 +26,7 @@ entry:
define <1 x i64> @test_vget_high_s64(<2 x i64> %a) {
; CHECK-LABEL: test_vget_high_s64:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
@@ -34,7 +34,7 @@ entry:
define <8 x i8> @test_vget_high_u8(<16 x i8> %a) {
; CHECK-LABEL: test_vget_high_u8:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %shuffle.i
@@ -42,7 +42,7 @@ entry:
define <4 x i16> @test_vget_high_u16(<8 x i16> %a) {
; CHECK-LABEL: test_vget_high_u16:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x i16> %shuffle.i
@@ -50,7 +50,7 @@ entry:
define <2 x i32> @test_vget_high_u32(<4 x i32> %a) {
; CHECK-LABEL: test_vget_high_u32:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
ret <2 x i32> %shuffle.i
@@ -58,7 +58,7 @@ entry:
define <1 x i64> @test_vget_high_u64(<2 x i64> %a) {
; CHECK-LABEL: test_vget_high_u64:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
@@ -66,7 +66,7 @@ entry:
define <1 x i64> @test_vget_high_p64(<2 x i64> %a) {
; CHECK-LABEL: test_vget_high_p64:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
@@ -74,7 +74,7 @@ entry:
define <4 x i16> @test_vget_high_f16(<8 x i16> %a) {
; CHECK-LABEL: test_vget_high_f16:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x i16> %shuffle.i
@@ -82,7 +82,7 @@ entry:
define <2 x float> @test_vget_high_f32(<4 x float> %a) {
; CHECK-LABEL: test_vget_high_f32:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
ret <2 x float> %shuffle.i
@@ -90,7 +90,7 @@ entry:
define <8 x i8> @test_vget_high_p8(<16 x i8> %a) {
; CHECK-LABEL: test_vget_high_p8:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %shuffle.i
@@ -98,7 +98,7 @@ entry:
define <4 x i16> @test_vget_high_p16(<8 x i16> %a) {
; CHECK-LABEL: test_vget_high_p16:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x i16> %shuffle.i
@@ -106,7 +106,7 @@ entry:
define <1 x double> @test_vget_high_f64(<2 x double> %a) {
; CHECK-LABEL: test_vget_high_f64:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> <i32 1>
ret <1 x double> %shuffle.i
diff --git a/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll b/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
new file mode 100644
index 0000000..74e3af8
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
@@ -0,0 +1,74 @@
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=arm64-none-linux-gnu | FileCheck %s
+
+; This is the analogue of AArch64's file of the same name. It's mostly testing
+; that some form of correct lowering occurs; the tests are a little artificial, but
+; I strongly suspect there's room for improved CodeGen (FIXME).
+
+define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) {
+; CHECK-LABEL: test_sext_extr_cmp_0:
+; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: cset
+ %1 = icmp sge <1 x i64> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ %vget_lane = sext i1 %2 to i64
+ ret i64 %vget_lane
+}
+
+define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) {
+; CHECK-LABEL: test_sext_extr_cmp_1:
+; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}}
+ %1 = fcmp oeq <1 x double> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ %vget_lane = sext i1 %2 to i64
+ ret i64 %vget_lane
+}
+
+define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_select_v1i1_0:
+; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
+ ret <1 x i64> %res
+}
+
+define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_select_v1i1_1:
+; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = fcmp oeq <1 x double> %v1, %v2
+ %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
+ ret <1 x i64> %res
+}
+
+define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) {
+; CHECK-LABEL: test_select_v1i1_2:
+; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3
+ ret <1 x double> %res
+}
+
+define <1 x i64> @test_select_v1i1_3(i64 %lhs, i64 %rhs, <1 x i64> %v3) {
+; CHECK-LABEL: test_select_v1i1_3:
+; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}
+ %tst = icmp eq i64 %lhs, %rhs
+ %evil = insertelement <1 x i1> undef, i1 %tst, i32 0
+ %res = select <1 x i1> %evil, <1 x i64> zeroinitializer, <1 x i64> %v3
+ ret <1 x i64> %res
+}
+
+define i32 @test_br_extr_cmp(<1 x i64> %v1, <1 x i64> %v2) {
+; CHECK-LABEL: test_br_extr_cmp:
+; CHECK: cmp x{{[0-9]+}}, x{{[0-9]+}}
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ br i1 %2, label %if.end, label %if.then
+
+if.then:
+ ret i32 0;
+
+if.end:
+ ret i32 1;
+}
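; A minimal sketch, assuming the backend scalarizes single-element vector compares,
; of the form test_sext_extr_cmp_0 above reduces to (the function name is illustrative):
;
;   define i64 @sketch_scalar_cmp(i64 %a, i64 %b) {
;     %cmp = icmp sge i64 %a, %b   ; lowers to "cmp xN, xM"
;     %res = sext i1 %cmp to i64   ; mask materialized from the flags (cset/csetm)
;     ret i64 %res
;   }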
diff --git a/test/CodeGen/AArch64/neon-vector-list-spill.ll b/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll
index 3ab69c4..8262fe4 100644
--- a/test/CodeGen/AArch64/neon-vector-list-spill.ll
+++ b/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll
@@ -1,16 +1,16 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
; FIXME: We should not generate ld/st for such register spill/fill, because the
; test case seems very simple and the register pressure is not high. If the
; spill/fill algorithm is optimized, this test case may not be triggered. And
; then we can delete it.
-define i32 @spill.DPairReg(i8* %arg1, i32 %arg2) {
+define i32 @spill.DPairReg(i32* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DPairReg:
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld2 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %arg1, i32 4)
+ %vld = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -24,13 +24,13 @@ if.end:
ret i32 %res
}
-define i16 @spill.DTripleReg(i8* %arg1, i32 %arg2) {
+define i16 @spill.DTripleReg(i16* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DTripleReg:
-; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld3 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %arg1, i32 4)
+ %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -44,13 +44,13 @@ if.end:
ret i16 %res
}
-define i16 @spill.DQuadReg(i8* %arg1, i32 %arg2) {
+define i16 @spill.DQuadReg(i16* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DQuadReg:
-; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld4 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %arg1, i32 4)
+ %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -64,13 +64,13 @@ if.end:
ret i16 %res
}
-define i32 @spill.QPairReg(i8* %arg1, i32 %arg2) {
+define i32 @spill.QPairReg(i32* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QPairReg:
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld2 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %arg1, i32 4)
+ %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -84,13 +84,13 @@ if.end:
ret i32 %res
}
-define float @spill.QTripleReg(i8* %arg1, i32 %arg2) {
+define float @spill.QTripleReg(float* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QTripleReg:
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld3 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %arg1, i32 4)
+ %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -106,11 +106,11 @@ if.end:
define i8 @spill.QQuadReg(i8* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QQuadReg:
-; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld4 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %arg1, i32 4)
+ %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -124,12 +124,12 @@ if.end:
ret i8 %res
}
-declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*)
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*)
declare void @foo()
@@ -138,8 +138,8 @@ declare void @foo()
; spill/fill algorithm is optimized, this test case may not be triggered. And
; then we can delete it.
; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo
-define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
- tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+define <8 x i16> @test_2xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+ tail call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
tail call void @foo()
%sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
%1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -149,8 +149,8 @@ define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
}
; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo
-define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
- tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+define <8 x i16> @test_3xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+ tail call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
tail call void @foo()
%sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
%1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -160,8 +160,8 @@ define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
}
; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo
-define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
- tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+define <8 x i16> @test_4xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+ tail call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
tail call void @foo()
%sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
%1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -170,6 +170,6 @@ define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
ret <8 x i16> %3
}
-declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) \ No newline at end of file
+declare void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
diff --git a/test/CodeGen/ARM64/patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
index 993e3eb..039cdfc 100644
--- a/test/CodeGen/ARM64/patchpoint.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -1,17 +1,17 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone | FileCheck %s
; Trivial patchpoint codegen
;
define i64 @trivial_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
; CHECK-LABEL: trivial_patchpoint_codegen:
-; CHECK: movz x16, #57005, lsl #32
-; CHECK-NEXT: movk x16, #48879, lsl #16
-; CHECK-NEXT: movk x16, #51966
+; CHECK: movz x16, #0xdead, lsl #32
+; CHECK-NEXT: movk x16, #0xbeef, lsl #16
+; CHECK-NEXT: movk x16, #0xcafe
; CHECK-NEXT: blr x16
-; CHECK: movz x16, #57005, lsl #32
-; CHECK-NEXT: movk x16, #48879, lsl #16
-; CHECK-NEXT: movk x16, #51967
+; CHECK: movz x16, #0xdead, lsl #32
+; CHECK-NEXT: movk x16, #0xbeef, lsl #16
+; CHECK-NEXT: movk x16, #0xcaff
; CHECK-NEXT: blr x16
; CHECK: ret
%resolveCall2 = inttoptr i64 244837814094590 to i8*
@@ -25,10 +25,10 @@ entry:
; as a leaf function.
;
; CHECK-LABEL: caller_meta_leaf
-; CHECK: mov fp, sp
+; CHECK: mov x29, sp
; CHECK-NEXT: sub sp, sp, #32
; CHECK: Ltmp
-; CHECK: mov sp, fp
+; CHECK: mov sp, x29
; CHECK: ret
define void @caller_meta_leaf() {
@@ -51,9 +51,9 @@ entry:
; CHECK: str x{{.+}}, [sp]
; CHECK-NEXT: mov x0, x{{.+}}
; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #65535, lsl #32
-; CHECK-NEXT: movk x16, #57005, lsl #16
-; CHECK-NEXT: movk x16, #48879
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
; CHECK-NEXT: blr x16
%resolveCall2 = inttoptr i64 281474417671919 to i8*
%result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
@@ -67,16 +67,16 @@ define i64 @jscall_patchpoint_codegen2(i64 %callee) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen2:
; CHECK: Ltmp
-; CHECK: orr x{{.+}}, xzr, #0x6
+; CHECK: orr w{{.+}}, wzr, #0x6
; CHECK-NEXT: str x{{.+}}, [sp, #24]
; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
; CHECK-NEXT: str w{{.+}}, [sp, #16]
-; CHECK-NEXT: orr x{{.+}}, xzr, #0x2
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
; CHECK-NEXT: str x{{.+}}, [sp]
; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #65535, lsl #32
-; CHECK-NEXT: movk x16, #57005, lsl #16
-; CHECK-NEXT: movk x16, #48879
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
; CHECK-NEXT: blr x16
%call = inttoptr i64 281474417671919 to i8*
%result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
@@ -88,20 +88,20 @@ define i64 @jscall_patchpoint_codegen3(i64 %callee) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen3:
; CHECK: Ltmp
-; CHECK: movz x{{.+}}, #10
+; CHECK: movz w{{.+}}, #0xa
; CHECK-NEXT: str x{{.+}}, [sp, #48]
; CHECK-NEXT: orr w{{.+}}, wzr, #0x8
; CHECK-NEXT: str w{{.+}}, [sp, #36]
-; CHECK-NEXT: orr x{{.+}}, xzr, #0x6
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x6
; CHECK-NEXT: str x{{.+}}, [sp, #24]
; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
; CHECK-NEXT: str w{{.+}}, [sp, #16]
-; CHECK-NEXT: orr x{{.+}}, xzr, #0x2
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
; CHECK-NEXT: str x{{.+}}, [sp]
; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #65535, lsl #32
-; CHECK-NEXT: movk x16, #57005, lsl #16
-; CHECK-NEXT: movk x16, #48879
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
; CHECK-NEXT: blr x16
%call = inttoptr i64 281474417671919 to i8*
%result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
@@ -161,3 +161,11 @@ define void @clobberScratch(i32* %p) {
declare void @llvm.experimental.stackmap(i64, i32, ...)
declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+
+; CHECK-LABEL: test_i16:
+; CHECK: ldrh [[BREG:w[0-9]+]], [sp]
+; CHECK: add w0, w0, [[BREG]]
+define webkit_jscc i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
+ %sum = add i16 %a, %b
+ ret i16 %sum
+}
diff --git a/test/CodeGen/AArch64/arm64-pic-local-symbol.ll b/test/CodeGen/AArch64/arm64-pic-local-symbol.ll
new file mode 100644
index 0000000..627e741
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-pic-local-symbol.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=arm64-unknown-linux-gnu -relocation-model=pic < %s | FileCheck %s
+
+@a = internal unnamed_addr global i32 0, align 4
+@.str = private unnamed_addr constant [6 x i8] c"test\0A\00", align 1
+
+define i32 @get() {
+; CHECK: get:
+; CHECK: adrp x{{[0-9]+}}, a
+; CHECK-NEXT: ldr w{{[0-9]+}}, [x{{[0-9]}}, :lo12:a]
+ %res = load i32* @a, align 4
+ ret i32 %res
+}
+
+define void @foo() nounwind {
+; CHECK: foo:
+; CHECK: adrp x{{[0-9]}}, .L.str
+; CHECK-NEXT: add x{{[0-9]}}, x{{[0-9]}}, :lo12:.L.str
+ tail call void @bar(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0))
+ ret void
+}
+
+declare void @bar(i8*)
diff --git a/test/CodeGen/ARM64/platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll
index 651c793..651c793 100644
--- a/test/CodeGen/ARM64/platform-reg.ll
+++ b/test/CodeGen/AArch64/arm64-platform-reg.ll
diff --git a/test/CodeGen/ARM64/popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll
index 9bbba09c..2afade2 100644
--- a/test/CodeGen/ARM64/popcnt.ll
+++ b/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
diff --git a/test/CodeGen/ARM64/prefetch.ll b/test/CodeGen/AArch64/arm64-prefetch.ll
index b2e06ed..b2e06ed 100644
--- a/test/CodeGen/ARM64/prefetch.ll
+++ b/test/CodeGen/AArch64/arm64-prefetch.ll
diff --git a/test/CodeGen/ARM64/promote-const.ll b/test/CodeGen/AArch64/arm64-promote-const.ll
index 4a336db..380ff55 100644
--- a/test/CodeGen/ARM64/promote-const.ll
+++ b/test/CodeGen/AArch64/arm64-promote-const.ll
@@ -1,9 +1,9 @@
; Disable machine cse to stress the different path of the algorithm.
; Otherwise, we always fall in the simple case, i.e., only one definition.
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-stress-promote-const | FileCheck -check-prefix=PROMOTED %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -aarch64-stress-promote-const -mcpu=cyclone | FileCheck -check-prefix=PROMOTED %s
; The REGULAR run just checks that the inputs passed to promote const expose
; the appropriate patterns.
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-promote-const=false | FileCheck -check-prefix=REGULAR %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -aarch64-promote-const=false -mcpu=cyclone | FileCheck -check-prefix=REGULAR %s
%struct.uint8x16x4_t = type { [4 x <16 x i8>] }
diff --git a/test/CodeGen/ARM64/redzone.ll b/test/CodeGen/AArch64/arm64-redzone.ll
index b89d7b1..9b0c384 100644
--- a/test/CodeGen/ARM64/redzone.ll
+++ b/test/CodeGen/AArch64/arm64-redzone.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-redzone | FileCheck %s
define i32 @foo(i32 %a, i32 %b) nounwind ssp {
; CHECK-LABEL: foo:
diff --git a/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll b/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll
new file mode 100644
index 0000000..29255ef
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s | FileCheck %s
+
+define float @copy_FPR32(float %a, float %b) {
+;CHECK-LABEL: copy_FPR32:
+;CHECK: fmov s0, s1
+ ret float %b;
+}
+
+define double @copy_FPR64(double %a, double %b) {
+;CHECK-LABEL: copy_FPR64:
+;CHECK: fmov d0, d1
+ ret double %b;
+}
+
+define fp128 @copy_FPR128(fp128 %a, fp128 %b) {
+;CHECK-LABEL: copy_FPR128:
+;CHECK: str q1, [sp, #-16]!
+;CHECK-NEXT: ldr q0, [sp, #16]!
+ ret fp128 %b;
+}
diff --git a/test/CodeGen/AArch64/arm64-register-offset-addressing.ll b/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
new file mode 100644
index 0000000..045712b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
@@ -0,0 +1,145 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i8 @test_64bit_add(i16* %a, i64 %b) {
+; CHECK-LABEL: test_64bit_add:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #1
+; CHECK: ldrb w0, [x0, [[REG]]]
+; CHECK: ret
+ %tmp1 = getelementptr inbounds i16* %a, i64 %b
+ %tmp2 = load i16* %tmp1
+ %tmp3 = trunc i16 %tmp2 to i8
+ ret i8 %tmp3
+}
+
+; These tests are trying to form SEXT and ZEXT operations that never leave i64
+; space, to make sure LLVM can adapt the offset register correctly.
+define void @ldst_8bit(i8* %base, i64 %offset) minsize {
+; CHECK-LABEL: ldst_8bit:
+
+ %off32.sext.tmp = shl i64 %offset, 32
+ %off32.sext = ashr i64 %off32.sext.tmp, 32
+ %addr8_sxtw = getelementptr i8* %base, i64 %off32.sext
+ %val8_sxtw = load volatile i8* %addr8_sxtw
+ %val32_signed = sext i8 %val8_sxtw to i32
+ store volatile i32 %val32_signed, i32* @var_32bit
+; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+
+ %addrint_uxtw = ptrtoint i8* %base to i64
+ %offset_uxtw = and i64 %offset, 4294967295
+ %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw
+ %addr_uxtw = inttoptr i64 %addrint1_uxtw to i8*
+ %val8_uxtw = load volatile i8* %addr_uxtw
+ %newval8 = add i8 %val8_uxtw, 1
+ store volatile i8 %newval8, i8* @var_8bit
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+
+ ret void
+}
+
+
+define void @ldst_16bit(i16* %base, i64 %offset) minsize {
+; CHECK-LABEL: ldst_16bit:
+
+ %addrint_uxtw = ptrtoint i16* %base to i64
+ %offset_uxtw = and i64 %offset, 4294967295
+ %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw
+ %addr_uxtw = inttoptr i64 %addrint1_uxtw to i16*
+ %val8_uxtw = load volatile i16* %addr_uxtw
+ %newval8 = add i16 %val8_uxtw, 1
+ store volatile i16 %newval8, i16* @var_16bit
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+
+ %base_sxtw = ptrtoint i16* %base to i64
+ %offset_sxtw.tmp = shl i64 %offset, 32
+ %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32
+ %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw
+ %addr_sxtw = inttoptr i64 %addrint_sxtw to i16*
+ %val16_sxtw = load volatile i16* %addr_sxtw
+ %val64_signed = sext i16 %val16_sxtw to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+
+
+ %base_uxtwN = ptrtoint i16* %base to i64
+ %offset_uxtwN = and i64 %offset, 4294967295
+ %offset2_uxtwN = shl i64 %offset_uxtwN, 1
+ %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN
+ %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i16*
+ %val32 = load volatile i32* @var_32bit
+ %val16_trunc32 = trunc i32 %val32 to i16
+ store volatile i16 %val16_trunc32, i16* %addr_uxtwN
+; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #1]
+ ret void
+}
+
+define void @ldst_32bit(i32* %base, i64 %offset) minsize {
+; CHECK-LABEL: ldst_32bit:
+
+ %addrint_uxtw = ptrtoint i32* %base to i64
+ %offset_uxtw = and i64 %offset, 4294967295
+ %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw
+ %addr_uxtw = inttoptr i64 %addrint1_uxtw to i32*
+ %val32_uxtw = load volatile i32* %addr_uxtw
+ %newval32 = add i32 %val32_uxtw, 1
+ store volatile i32 %newval32, i32* @var_32bit
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+
+ %base_sxtw = ptrtoint i32* %base to i64
+ %offset_sxtw.tmp = shl i64 %offset, 32
+ %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32
+ %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw
+ %addr_sxtw = inttoptr i64 %addrint_sxtw to i32*
+ %val32_sxtw = load volatile i32* %addr_sxtw
+ %val64_signed = sext i32 %val32_sxtw to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+
+
+ %base_uxtwN = ptrtoint i32* %base to i64
+ %offset_uxtwN = and i64 %offset, 4294967295
+ %offset2_uxtwN = shl i64 %offset_uxtwN, 2
+ %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN
+ %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i32*
+ %val32 = load volatile i32* @var_32bit
+ store volatile i32 %val32, i32* %addr_uxtwN
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2]
+ ret void
+}
+
+define void @ldst_64bit(i64* %base, i64 %offset) minsize {
+; CHECK-LABEL: ldst_64bit:
+
+ %addrint_uxtw = ptrtoint i64* %base to i64
+ %offset_uxtw = and i64 %offset, 4294967295
+ %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw
+ %addr_uxtw = inttoptr i64 %addrint1_uxtw to i64*
+ %val64_uxtw = load volatile i64* %addr_uxtw
+ %newval8 = add i64 %val64_uxtw, 1
+ store volatile i64 %newval8, i64* @var_64bit
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+
+ %base_sxtw = ptrtoint i64* %base to i64
+ %offset_sxtw.tmp = shl i64 %offset, 32
+ %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32
+ %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw
+ %addr_sxtw = inttoptr i64 %addrint_sxtw to i64*
+ %val64_sxtw = load volatile i64* %addr_sxtw
+ store volatile i64 %val64_sxtw, i64* @var_64bit
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+
+
+ %base_uxtwN = ptrtoint i64* %base to i64
+ %offset_uxtwN = and i64 %offset, 4294967295
+ %offset2_uxtwN = shl i64 %offset_uxtwN, 3
+ %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN
+ %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i64*
+ %val64 = load volatile i64* @var_64bit
+ store volatile i64 %val64, i64* %addr_uxtwN
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3]
+ ret void
+}
+
+@var_8bit = global i8 0
+@var_16bit = global i16 0
+@var_32bit = global i32 0
+@var_64bit = global i64 0
diff --git a/test/CodeGen/ARM64/register-pairing.ll b/test/CodeGen/AArch64/arm64-register-pairing.ll
index 4de80d2..99defb1 100644
--- a/test/CodeGen/ARM64/register-pairing.ll
+++ b/test/CodeGen/AArch64/arm64-register-pairing.ll
@@ -13,7 +13,7 @@ define void @odd() nounwind {
; CHECK: stp x24, x23, [sp, #96]
; CHECK: stp x22, x21, [sp, #112]
; CHECK: stp x20, x19, [sp, #128]
-; CHECK: movz x0, #42
+; CHECK: movz x0, #0x2a
; CHECK: ldp x20, x19, [sp, #128]
; CHECK: ldp x22, x21, [sp, #112]
; CHECK: ldp x24, x23, [sp, #96]
@@ -38,7 +38,7 @@ define void @even() nounwind {
; CHECK: stp x24, x23, [sp, #96]
; CHECK: stp x22, x21, [sp, #112]
; CHECK: stp x20, x19, [sp, #128]
-; CHECK: movz x0, #42
+; CHECK: movz x0, #0x2a
; CHECK: ldp x20, x19, [sp, #128]
; CHECK: ldp x22, x21, [sp, #112]
; CHECK: ldp x24, x23, [sp, #96]
diff --git a/test/CodeGen/ARM64/regress-f128csel-flags.ll b/test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll
index a1daf03..a1daf03 100644
--- a/test/CodeGen/ARM64/regress-f128csel-flags.ll
+++ b/test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll
diff --git a/test/CodeGen/ARM64/regress-interphase-shift.ll b/test/CodeGen/AArch64/arm64-regress-interphase-shift.ll
index fddf591..fec8933 100644
--- a/test/CodeGen/ARM64/regress-interphase-shift.ll
+++ b/test/CodeGen/AArch64/arm64-regress-interphase-shift.ll
@@ -4,6 +4,10 @@
; on the phase of legalization, which led to the creation of an unexpected and
; unselectable "rotr" node: (i32 (rotr i32, i64)).
+; FIXME: This test is xfailed because it relies on an optimization that has
+; been reverted (see PR17975).
+; XFAIL: *
+
define void @foo(i64* nocapture %d) {
; CHECK-LABEL: foo:
; CHECK: rorv
diff --git a/test/CodeGen/ARM64/return-vector.ll b/test/CodeGen/AArch64/arm64-return-vector.ll
index 9457d8b..9457d8b 100644
--- a/test/CodeGen/ARM64/return-vector.ll
+++ b/test/CodeGen/AArch64/arm64-return-vector.ll
diff --git a/test/CodeGen/ARM64/returnaddr.ll b/test/CodeGen/AArch64/arm64-returnaddr.ll
index e06ce90..285b295 100644
--- a/test/CodeGen/ARM64/returnaddr.ll
+++ b/test/CodeGen/AArch64/arm64-returnaddr.ll
@@ -3,7 +3,7 @@
define i8* @rt0(i32 %x) nounwind readnone {
entry:
; CHECK-LABEL: rt0:
-; CHECK: mov x0, lr
+; CHECK: mov x0, x30
; CHECK: ret
%0 = tail call i8* @llvm.returnaddress(i32 0)
ret i8* %0
@@ -12,12 +12,12 @@ entry:
define i8* @rt2() nounwind readnone {
entry:
; CHECK-LABEL: rt2:
-; CHECK: stp fp, lr, [sp, #-16]!
-; CHECK: mov fp, sp
-; CHECK: ldr x[[REG:[0-9]+]], [fp]
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: ldr x[[REG:[0-9]+]], [x29]
; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]]]
; CHECK: ldr x0, [x[[REG2]], #8]
-; CHECK: ldp fp, lr, [sp], #16
+; CHECK: ldp x29, x30, [sp], #16
; CHECK: ret
%0 = tail call i8* @llvm.returnaddress(i32 2)
ret i8* %0
diff --git a/test/CodeGen/ARM64/rev.ll b/test/CodeGen/AArch64/arm64-rev.ll
index 867d5b3..30d9f4f 100644
--- a/test/CodeGen/ARM64/rev.ll
+++ b/test/CodeGen/AArch64/arm64-rev.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define i32 @test_rev_w(i32 %a) nounwind {
entry:
@@ -36,10 +36,13 @@ entry:
ret i32 %tmp14
}
+; 64-bit REV16 is *not* a swap then a 16-bit rotation:
+; 01234567 ->(bswap) 76543210 ->(rotr) 10765432
+; 01234567 ->(rev16) 10325476
define i64 @test_rev16_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev16_x:
-; CHECK: rev16 x0, x0
+; CHECK-NOT: rev16 x0, x0
%0 = tail call i64 @llvm.bswap.i64(i64 %a)
%1 = lshr i64 %0, 16
%2 = shl i64 %0, 48
@@ -219,3 +222,14 @@ entry:
ret void
}
+
+define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
+; CHECK-LABEL: test_vrev32_bswap:
+; CHECK: rev32.16b
+; CHECK-NOT: rev
+; CHECK: ret
+ %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
+ ret <4 x i32> %bswap
+}
+
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/rounding.ll b/test/CodeGen/AArch64/arm64-rounding.ll
index 7ff65c3..9311144 100644
--- a/test/CodeGen/ARM64/rounding.ll
+++ b/test/CodeGen/AArch64/arm64-rounding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -O3 < %s -mcpu=cyclone | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
target triple = "arm64-apple-ios6.0.0"
diff --git a/test/CodeGen/ARM64/scaled_iv.ll b/test/CodeGen/AArch64/arm64-scaled_iv.ll
index 987373e..987373e 100644
--- a/test/CodeGen/ARM64/scaled_iv.ll
+++ b/test/CodeGen/AArch64/arm64-scaled_iv.ll
diff --git a/test/CodeGen/ARM64/scvt.ll b/test/CodeGen/AArch64/arm64-scvt.ll
index b4d4add..2e006cf 100644
--- a/test/CodeGen/ARM64/scvt.ll
+++ b/test/CodeGen/AArch64/arm64-scvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
; rdar://13082402
define float @t1(i32* nocapture %src) nounwind ssp {
diff --git a/test/CodeGen/ARM64/shifted-sext.ll b/test/CodeGen/AArch64/arm64-shifted-sext.ll
index e553be5..b7b4e5d 100644
--- a/test/CodeGen/ARM64/shifted-sext.ll
+++ b/test/CodeGen/AArch64/arm64-shifted-sext.ll
@@ -6,7 +6,7 @@ define signext i16 @extendedLeftShiftcharToshortBy4(i8 signext %a) nounwind read
entry:
; CHECK-LABEL: extendedLeftShiftcharToshortBy4:
; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: sbfm w0, [[REG]], #28, #7
+; CHECK: sbfiz w0, [[REG]], #4, #8
%inc = add i8 %a, 1
%conv1 = sext i8 %inc to i32
%shl = shl nsw i32 %conv1, 4
@@ -18,7 +18,7 @@ define signext i16 @extendedRightShiftcharToshortBy4(i8 signext %a) nounwind rea
entry:
; CHECK-LABEL: extendedRightShiftcharToshortBy4:
; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: sbfm w0, [[REG]], #4, #7
+; CHECK: sbfx w0, [[REG]], #4, #4
%inc = add i8 %a, 1
%conv1 = sext i8 %inc to i32
%shr4 = lshr i32 %conv1, 4
@@ -30,7 +30,7 @@ define signext i16 @extendedLeftShiftcharToshortBy8(i8 signext %a) nounwind read
entry:
; CHECK-LABEL: extendedLeftShiftcharToshortBy8:
; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: sbfm w0, [[REG]], #24, #7
+; CHECK: sbfiz w0, [[REG]], #8, #8
%inc = add i8 %a, 1
%conv1 = sext i8 %inc to i32
%shl = shl nsw i32 %conv1, 8
@@ -55,7 +55,7 @@ define i32 @extendedLeftShiftcharTointBy4(i8 signext %a) nounwind readnone ssp {
entry:
; CHECK-LABEL: extendedLeftShiftcharTointBy4:
; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: sbfm w0, [[REG]], #28, #7
+; CHECK: sbfiz w0, [[REG]], #4, #8
%inc = add i8 %a, 1
%conv = sext i8 %inc to i32
%shl = shl nsw i32 %conv, 4
@@ -66,7 +66,7 @@ define i32 @extendedRightShiftcharTointBy4(i8 signext %a) nounwind readnone ssp
entry:
; CHECK-LABEL: extendedRightShiftcharTointBy4:
; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: sbfm w0, [[REG]], #4, #7
+; CHECK: sbfx w0, [[REG]], #4, #4
%inc = add i8 %a, 1
%conv = sext i8 %inc to i32
%shr = ashr i32 %conv, 4
@@ -77,7 +77,7 @@ define i32 @extendedLeftShiftcharTointBy8(i8 signext %a) nounwind readnone ssp {
entry:
; CHECK-LABEL: extendedLeftShiftcharTointBy8:
; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: sbfm w0, [[REG]], #24, #7
+; CHECK: sbfiz w0, [[REG]], #8, #8
%inc = add i8 %a, 1
%conv = sext i8 %inc to i32
%shl = shl nsw i32 %conv, 8
@@ -100,7 +100,7 @@ define i64 @extendedLeftShiftcharToint64By4(i8 signext %a) nounwind readnone ssp
entry:
; CHECK-LABEL: extendedLeftShiftcharToint64By4:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sbfm x0, x[[REG]], #60, #7
+; CHECK: sbfiz x0, x[[REG]], #4, #8
%inc = add i8 %a, 1
%conv = sext i8 %inc to i64
%shl = shl nsw i64 %conv, 4
@@ -111,7 +111,7 @@ define i64 @extendedRightShiftcharToint64By4(i8 signext %a) nounwind readnone ss
entry:
; CHECK-LABEL: extendedRightShiftcharToint64By4:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sbfm x0, x[[REG]], #4, #7
+; CHECK: sbfx x0, x[[REG]], #4, #4
%inc = add i8 %a, 1
%conv = sext i8 %inc to i64
%shr = ashr i64 %conv, 4
@@ -122,7 +122,7 @@ define i64 @extendedLeftShiftcharToint64By8(i8 signext %a) nounwind readnone ssp
entry:
; CHECK-LABEL: extendedLeftShiftcharToint64By8:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sbfm x0, x[[REG]], #56, #7
+; CHECK: sbfiz x0, x[[REG]], #8, #8
%inc = add i8 %a, 1
%conv = sext i8 %inc to i64
%shl = shl nsw i64 %conv, 8
@@ -133,7 +133,7 @@ define i64 @extendedRightShiftcharToint64By8(i8 signext %a) nounwind readnone ss
entry:
; CHECK-LABEL: extendedRightShiftcharToint64By8:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sxtb x[[REG]], x[[REG]]
+; CHECK: sxtb x[[REG]], w[[REG]]
; CHECK: asr x0, x[[REG]], #8
%inc = add i8 %a, 1
%conv = sext i8 %inc to i64
@@ -145,7 +145,7 @@ define i32 @extendedLeftShiftshortTointBy4(i16 signext %a) nounwind readnone ssp
entry:
; CHECK-LABEL: extendedLeftShiftshortTointBy4:
; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: sbfm w0, [[REG]], #28, #15
+; CHECK: sbfiz w0, [[REG]], #4, #16
%inc = add i16 %a, 1
%conv = sext i16 %inc to i32
%shl = shl nsw i32 %conv, 4
@@ -156,7 +156,7 @@ define i32 @extendedRightShiftshortTointBy4(i16 signext %a) nounwind readnone ss
entry:
; CHECK-LABEL: extendedRightShiftshortTointBy4:
; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: sbfm w0, [[REG]], #4, #15
+; CHECK: sbfx w0, [[REG]], #4, #12
%inc = add i16 %a, 1
%conv = sext i16 %inc to i32
%shr = ashr i32 %conv, 4
@@ -190,7 +190,7 @@ define i64 @extendedLeftShiftshortToint64By4(i16 signext %a) nounwind readnone s
entry:
; CHECK-LABEL: extendedLeftShiftshortToint64By4:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sbfm x0, x[[REG]], #60, #15
+; CHECK: sbfiz x0, x[[REG]], #4, #16
%inc = add i16 %a, 1
%conv = sext i16 %inc to i64
%shl = shl nsw i64 %conv, 4
@@ -201,7 +201,7 @@ define i64 @extendedRightShiftshortToint64By4(i16 signext %a) nounwind readnone
entry:
; CHECK-LABEL: extendedRightShiftshortToint64By4:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sbfm x0, x[[REG]], #4, #15
+; CHECK: sbfx x0, x[[REG]], #4, #12
%inc = add i16 %a, 1
%conv = sext i16 %inc to i64
%shr = ashr i64 %conv, 4
@@ -212,7 +212,7 @@ define i64 @extendedLeftShiftshortToint64By16(i16 signext %a) nounwind readnone
entry:
; CHECK-LABEL: extendedLeftShiftshortToint64By16:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sbfm x0, x[[REG]], #48, #15
+; CHECK: sbfiz x0, x[[REG]], #16, #16
%inc = add i16 %a, 1
%conv = sext i16 %inc to i64
%shl = shl nsw i64 %conv, 16
@@ -223,7 +223,7 @@ define i64 @extendedRightShiftshortToint64By16(i16 signext %a) nounwind readnone
entry:
; CHECK-LABEL: extendedRightShiftshortToint64By16:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sxth x[[REG]], x[[REG]]
+; CHECK: sxth x[[REG]], w[[REG]]
; CHECK: asr x0, x[[REG]], #16
%inc = add i16 %a, 1
%conv = sext i16 %inc to i64
@@ -235,7 +235,7 @@ define i64 @extendedLeftShiftintToint64By4(i32 %a) nounwind readnone ssp {
entry:
; CHECK-LABEL: extendedLeftShiftintToint64By4:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sbfm x0, x[[REG]], #60, #31
+; CHECK: sbfiz x0, x[[REG]], #4, #32
%inc = add nsw i32 %a, 1
%conv = sext i32 %inc to i64
%shl = shl nsw i64 %conv, 4
@@ -246,7 +246,7 @@ define i64 @extendedRightShiftintToint64By4(i32 %a) nounwind readnone ssp {
entry:
; CHECK-LABEL: extendedRightShiftintToint64By4:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sbfm x0, x[[REG]], #4, #31
+; CHECK: sbfx x0, x[[REG]], #4, #28
%inc = add nsw i32 %a, 1
%conv = sext i32 %inc to i64
%shr = ashr i64 %conv, 4
@@ -268,7 +268,7 @@ define i64 @extendedRightShiftintToint64By32(i32 %a) nounwind readnone ssp {
entry:
; CHECK-LABEL: extendedRightShiftintToint64By32:
; CHECK: add w[[REG:[0-9]+]], w0, #1
-; CHECK: sxtw x[[REG]], x[[REG]]
+; CHECK: sxtw x[[REG]], w[[REG]]
; CHECK: asr x0, x[[REG]], #32
%inc = add nsw i32 %a, 1
%conv = sext i32 %inc to i64
diff --git a/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll b/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll
new file mode 100644
index 0000000..aed39e7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
+
+define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp {
+; CHECK: uaddlv.16b h0, v0
+; CHECK: rshrn.8b v0, v0, #4
+; CHECK: dup.16b v0, v0[0]
+; CHECK: ret
+
+; CHECK-FAST: uaddlv.16b
+; CHECK-FAST: rshrn.8b
+; CHECK-FAST: dup.16b
+ %tmp = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind
+ %tmp1 = trunc i32 %tmp to i16
+ %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
+ %tmp3 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4)
+ %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp4
+}
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM64/simplest-elf.ll b/test/CodeGen/AArch64/arm64-simplest-elf.ll
index 1254365..1254365 100644
--- a/test/CodeGen/ARM64/simplest-elf.ll
+++ b/test/CodeGen/AArch64/arm64-simplest-elf.ll
diff --git a/test/CodeGen/ARM64/sincos.ll b/test/CodeGen/AArch64/arm64-sincos.ll
index 06157b2..06157b2 100644
--- a/test/CodeGen/ARM64/sincos.ll
+++ b/test/CodeGen/AArch64/arm64-sincos.ll
diff --git a/test/CodeGen/ARM64/sitofp-combine-chains.ll b/test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll
index 10b433b..10b433b 100644
--- a/test/CodeGen/ARM64/sitofp-combine-chains.ll
+++ b/test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll
diff --git a/test/CodeGen/ARM64/sli-sri-opt.ll b/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
index 725dcd5..7fec539 100644
--- a/test/CodeGen/ARM64/sli-sri-opt.ll
+++ b/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -arm64-shift-insert-generation=true -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -aarch64-shift-insert-generation=true -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
; CHECK-LABEL: testLeftGood:
diff --git a/test/CodeGen/ARM64/smaxv.ll b/test/CodeGen/AArch64/arm64-smaxv.ll
index 4f6e01b..183e667 100644
--- a/test/CodeGen/ARM64/smaxv.ll
+++ b/test/CodeGen/AArch64/arm64-smaxv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
define signext i8 @test_vmaxv_s8(<8 x i8> %a1) {
; CHECK: test_vmaxv_s8
@@ -6,7 +6,7 @@ define signext i8 @test_vmaxv_s8(<8 x i8> %a1) {
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8> %a1)
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a1)
%0 = trunc i32 %vmaxv.i to i8
ret i8 %0
}
@@ -17,7 +17,7 @@ define signext i16 @test_vmaxv_s16(<4 x i16> %a1) {
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16> %a1)
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a1)
%0 = trunc i32 %vmaxv.i to i16
ret i16 %0
}
@@ -29,7 +29,7 @@ define i32 @test_vmaxv_s32(<2 x i32> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32> %a1)
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a1)
ret i32 %vmaxv.i
}
@@ -39,7 +39,7 @@ define signext i8 @test_vmaxvq_s8(<16 x i8> %a1) {
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8> %a1)
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a1)
%0 = trunc i32 %vmaxv.i to i8
ret i8 %0
}
@@ -50,7 +50,7 @@ define signext i16 @test_vmaxvq_s16(<8 x i16> %a1) {
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16> %a1)
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a1)
%0 = trunc i32 %vmaxv.i to i16
ret i16 %0
}
@@ -61,14 +61,14 @@ define i32 @test_vmaxvq_s32(<4 x i32> %a1) {
; CHECK-NEXT: fmov w0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32> %a1)
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a1)
ret i32 %vmaxv.i
}
-declare i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8>)
diff --git a/test/CodeGen/ARM64/sminv.ll b/test/CodeGen/AArch64/arm64-sminv.ll
index a246868..195c4e5 100644
--- a/test/CodeGen/ARM64/sminv.ll
+++ b/test/CodeGen/AArch64/arm64-sminv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
define signext i8 @test_vminv_s8(<8 x i8> %a1) {
; CHECK: test_vminv_s8
@@ -6,7 +6,7 @@ define signext i8 @test_vminv_s8(<8 x i8> %a1) {
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8> %a1)
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a1)
%0 = trunc i32 %vminv.i to i8
ret i8 %0
}
@@ -17,7 +17,7 @@ define signext i16 @test_vminv_s16(<4 x i16> %a1) {
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16> %a1)
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a1)
%0 = trunc i32 %vminv.i to i16
ret i16 %0
}
@@ -29,7 +29,7 @@ define i32 @test_vminv_s32(<2 x i32> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32> %a1)
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a1)
ret i32 %vminv.i
}
@@ -39,7 +39,7 @@ define signext i8 @test_vminvq_s8(<16 x i8> %a1) {
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8> %a1)
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a1)
%0 = trunc i32 %vminv.i to i8
ret i8 %0
}
@@ -50,7 +50,7 @@ define signext i16 @test_vminvq_s16(<8 x i16> %a1) {
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16> %a1)
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a1)
%0 = trunc i32 %vminv.i to i16
ret i16 %0
}
@@ -61,14 +61,14 @@ define i32 @test_vminvq_s32(<4 x i32> %a1) {
; CHECK-NEXT: fmov w0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32> %a1)
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a1)
ret i32 %vminv.i
}
-declare i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32>)
-declare i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16>)
-declare i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8>)
-declare i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32>)
-declare i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16>)
-declare i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)
diff --git a/test/CodeGen/ARM64/spill-lr.ll b/test/CodeGen/AArch64/arm64-spill-lr.ll
index fb6588e..fb6588e 100644
--- a/test/CodeGen/ARM64/spill-lr.ll
+++ b/test/CodeGen/AArch64/arm64-spill-lr.ll
diff --git a/test/CodeGen/ARM64/spill.ll b/test/CodeGen/AArch64/arm64-spill.ll
index 9173c87..47cdc2b 100644
--- a/test/CodeGen/ARM64/spill.ll
+++ b/test/CodeGen/AArch64/arm64-spill.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -aarch64-neon-syntax=apple -verify-machineinstrs
; CHECK: fpr128
; CHECK: ld1.2d
diff --git a/test/CodeGen/ARM64/st1.ll b/test/CodeGen/AArch64/arm64-st1.ll
index b9aafc6..4370484 100644
--- a/test/CodeGen/ARM64/st1.ll
+++ b/test/CodeGen/AArch64/arm64-st1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
define void @st1lane_16b(<16 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane_16b
@@ -83,594 +83,594 @@ define void @st1lane_2s_float(<2 x float> %A, float* %D) {
define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
; CHECK-LABEL: st2lane_16b
; CHECK: st2.b
- call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
+ call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
ret void
}
define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) {
; CHECK-LABEL: st2lane_8h
; CHECK: st2.h
- call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
+ call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
ret void
}
define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) {
; CHECK-LABEL: st2lane_4s
; CHECK: st2.s
- call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
+ call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
ret void
}
define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) {
; CHECK-LABEL: st2lane_2d
; CHECK: st2.d
- call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
+ call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
ret void
}
-declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
-declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
-declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
-declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) {
; CHECK-LABEL: st3lane_16b
; CHECK: st3.b
- call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
+ call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
ret void
}
define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) {
; CHECK-LABEL: st3lane_8h
; CHECK: st3.h
- call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
+ call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
ret void
}
define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) {
; CHECK-LABEL: st3lane_4s
; CHECK: st3.s
- call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
+ call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
ret void
}
define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) {
; CHECK-LABEL: st3lane_2d
; CHECK: st3.d
- call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
+ call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
ret void
}
-declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
-declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
-declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
-declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) {
; CHECK-LABEL: st4lane_16b
; CHECK: st4.b
- call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
+ call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
ret void
}
define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) {
; CHECK-LABEL: st4lane_8h
; CHECK: st4.h
- call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
+ call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
ret void
}
define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) {
; CHECK-LABEL: st4lane_4s
; CHECK: st4.s
- call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
+ call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
ret void
}
define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) {
; CHECK-LABEL: st4lane_2d
; CHECK: st4.d
- call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
ret void
}
-declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
-declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
-declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
-declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
; CHECK-LABEL: st2_8b
; CHECK st2.8b
- call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
+ call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
ret void
}
define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind {
; CHECK-LABEL: st3_8b
; CHECK st3.8b
- call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
+ call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
ret void
}
define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
; CHECK-LABEL: st4_8b
; CHECK st4.8b
- call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
+ call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
ret void
}
-declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
; CHECK-LABEL: st2_16b
; CHECK st2.16b
- call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
+ call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
ret void
}
define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind {
; CHECK-LABEL: st3_16b
; CHECK st3.16b
- call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
+ call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
ret void
}
define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
; CHECK-LABEL: st4_16b
; CHECK st4.16b
- call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
+ call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
ret void
}
-declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
; CHECK-LABEL: st2_4h
; CHECK st2.4h
- call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
+ call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
ret void
}
define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind {
; CHECK-LABEL: st3_4h
; CHECK st3.4h
- call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
+ call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
ret void
}
define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
; CHECK-LABEL: st4_4h
; CHECK st4.4h
- call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
+ call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
ret void
}
-declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
; CHECK-LABEL: st2_8h
; CHECK st2.8h
- call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
+ call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
ret void
}
define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind {
; CHECK-LABEL: st3_8h
; CHECK st3.8h
- call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
+ call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
ret void
}
define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
; CHECK-LABEL: st4_8h
; CHECK st4.8h
- call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
+ call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
ret void
}
-declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
; CHECK-LABEL: st2_2s
; CHECK st2.2s
- call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
+ call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
ret void
}
define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind {
; CHECK-LABEL: st3_2s
; CHECK st3.2s
- call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
+ call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
ret void
}
define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
; CHECK-LABEL: st4_2s
; CHECK st4.2s
- call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
+ call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
ret void
}
-declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
; CHECK-LABEL: st2_4s
; CHECK st2.4s
- call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
+ call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
ret void
}
define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind {
; CHECK-LABEL: st3_4s
; CHECK st3.4s
- call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
+ call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
ret void
}
define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
; CHECK-LABEL: st4_4s
; CHECK st4.4s
- call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
+ call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
ret void
}
-declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind {
; CHECK-LABEL: st2_1d
; CHECK st1.2d
- call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
+ call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
ret void
}
define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind {
; CHECK-LABEL: st3_1d
; CHECK st1.3d
- call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
+ call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
ret void
}
define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind {
; CHECK-LABEL: st4_1d
; CHECK st1.4d
- call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
+ call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
ret void
}
-declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
; CHECK-LABEL: st2_2d
; CHECK st2.2d
- call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
+ call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
ret void
}
define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind {
; CHECK-LABEL: st3_2d
; CHECK st2.3d
- call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
+ call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
ret void
}
define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
; CHECK-LABEL: st4_2d
; CHECK st2.4d
- call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
+ call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
ret void
}
-declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly
define void @st1_x2_v8i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) {
; CHECK-LABEL: st1_x2_v8i8:
; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr)
+ call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr)
ret void
}
define void @st1_x2_v4i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) {
; CHECK-LABEL: st1_x2_v4i16:
; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr)
+ call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr)
ret void
}
define void @st1_x2_v2i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) {
; CHECK-LABEL: st1_x2_v2i32:
; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr)
+ call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr)
ret void
}
define void @st1_x2_v2f32(<2 x float> %A, <2 x float> %B, float* %addr) {
; CHECK-LABEL: st1_x2_v2f32:
; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr)
+ call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr)
ret void
}
define void @st1_x2_v1i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) {
; CHECK-LABEL: st1_x2_v1i64:
; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr)
+ call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr)
ret void
}
define void @st1_x2_v1f64(<1 x double> %A, <1 x double> %B, double* %addr) {
; CHECK-LABEL: st1_x2_v1f64:
; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr)
+ call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr)
ret void
}
-declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly
define void @st1_x2_v16i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) {
; CHECK-LABEL: st1_x2_v16i8:
; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr)
+ call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr)
ret void
}
define void @st1_x2_v8i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) {
; CHECK-LABEL: st1_x2_v8i16:
; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr)
+ call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr)
ret void
}
define void @st1_x2_v4i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) {
; CHECK-LABEL: st1_x2_v4i32:
; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr)
+ call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr)
ret void
}
define void @st1_x2_v4f32(<4 x float> %A, <4 x float> %B, float* %addr) {
; CHECK-LABEL: st1_x2_v4f32:
; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr)
+ call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr)
ret void
}
define void @st1_x2_v2i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) {
; CHECK-LABEL: st1_x2_v2i64:
; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
+ call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
ret void
}
define void @st1_x2_v2f64(<2 x double> %A, <2 x double> %B, double* %addr) {
; CHECK-LABEL: st1_x2_v2f64:
; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr)
+ call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr)
ret void
}
-declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
define void @st1_x3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) {
; CHECK-LABEL: st1_x3_v8i8:
; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr)
+ call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr)
ret void
}
define void @st1_x3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) {
; CHECK-LABEL: st1_x3_v4i16:
; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr)
+ call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr)
ret void
}
define void @st1_x3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) {
; CHECK-LABEL: st1_x3_v2i32:
; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr)
+ call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr)
ret void
}
define void @st1_x3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) {
; CHECK-LABEL: st1_x3_v2f32:
; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr)
+ call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr)
ret void
}
define void @st1_x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) {
; CHECK-LABEL: st1_x3_v1i64:
; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr)
+ call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr)
ret void
}
define void @st1_x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) {
; CHECK-LABEL: st1_x3_v1f64:
; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr)
+ call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr)
ret void
}
-declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
define void @st1_x3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) {
; CHECK-LABEL: st1_x3_v16i8:
; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr)
+ call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr)
ret void
}
define void @st1_x3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) {
; CHECK-LABEL: st1_x3_v8i16:
; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr)
+ call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr)
ret void
}
define void @st1_x3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) {
; CHECK-LABEL: st1_x3_v4i32:
; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr)
+ call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr)
ret void
}
define void @st1_x3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) {
; CHECK-LABEL: st1_x3_v4f32:
; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr)
+ call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr)
ret void
}
define void @st1_x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) {
; CHECK-LABEL: st1_x3_v2i64:
; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr)
+ call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr)
ret void
}
define void @st1_x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) {
; CHECK-LABEL: st1_x3_v2f64:
; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr)
+ call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr)
ret void
}
-declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
define void @st1_x4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) {
; CHECK-LABEL: st1_x4_v8i8:
; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr)
+ call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr)
ret void
}
define void @st1_x4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) {
; CHECK-LABEL: st1_x4_v4i16:
; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr)
+ call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr)
ret void
}
define void @st1_x4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) {
; CHECK-LABEL: st1_x4_v2i32:
; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr)
+ call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr)
ret void
}
define void @st1_x4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) {
; CHECK-LABEL: st1_x4_v2f32:
; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr)
+ call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr)
ret void
}
define void @st1_x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) {
; CHECK-LABEL: st1_x4_v1i64:
; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr)
+ call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr)
ret void
}
define void @st1_x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) {
; CHECK-LABEL: st1_x4_v1f64:
; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr)
+ call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr)
ret void
}
-declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
define void @st1_x4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) {
; CHECK-LABEL: st1_x4_v16i8:
; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr)
+ call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr)
ret void
}
define void @st1_x4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) {
; CHECK-LABEL: st1_x4_v8i16:
; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr)
+ call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr)
ret void
}
define void @st1_x4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) {
; CHECK-LABEL: st1_x4_v4i32:
; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr)
+ call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr)
ret void
}
define void @st1_x4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) {
; CHECK-LABEL: st1_x4_v4f32:
; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr)
+ call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr)
ret void
}
define void @st1_x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) {
; CHECK-LABEL: st1_x4_v2i64:
; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr)
+ call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr)
ret void
}
define void @st1_x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) {
; CHECK-LABEL: st1_x4_v2f64:
; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
- call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr)
+ call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr)
ret void
}
diff --git a/test/CodeGen/ARM64/stack-no-frame.ll b/test/CodeGen/AArch64/arm64-stack-no-frame.ll
index b5970c0..b5970c0 100644
--- a/test/CodeGen/ARM64/stack-no-frame.ll
+++ b/test/CodeGen/AArch64/arm64-stack-no-frame.ll
diff --git a/test/CodeGen/ARM64/stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
index 2c7c6ae..2c7c6ae 100644
--- a/test/CodeGen/ARM64/stackmap.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll
diff --git a/test/CodeGen/AArch64/arm64-stackpointer.ll b/test/CodeGen/AArch64/arm64-stackpointer.ll
new file mode 100644
index 0000000..581faf1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-stackpointer.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s
+
+define i64 @get_stack() nounwind {
+entry:
+; CHECK-LABEL: get_stack:
+; CHECK: mov x0, sp
+ %sp = call i64 @llvm.read_register.i64(metadata !0)
+ ret i64 %sp
+}
+
+define void @set_stack(i64 %val) nounwind {
+entry:
+; CHECK-LABEL: set_stack:
+; CHECK: mov sp, x0
+ call void @llvm.write_register.i64(metadata !0, i64 %val)
+ ret void
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+declare void @llvm.write_register.i64(metadata, i64) nounwind
+
+; register unsigned long current_stack_pointer asm("sp");
+; CHECK-NOT: .asciz "sp"
+!0 = metadata !{metadata !"sp\00"}
diff --git a/test/CodeGen/ARM64/stacksave.ll b/test/CodeGen/AArch64/arm64-stacksave.ll
index a79e99b..a79e99b 100644
--- a/test/CodeGen/ARM64/stacksave.ll
+++ b/test/CodeGen/AArch64/arm64-stacksave.ll
diff --git a/test/CodeGen/ARM64/stp.ll b/test/CodeGen/AArch64/arm64-stp.ll
index eacf093..40bdf22 100644
--- a/test/CodeGen/ARM64/stp.ll
+++ b/test/CodeGen/AArch64/arm64-stp.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=arm64 -arm64-stp-suppress=false -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\
-; RUN: -verify-machineinstrs | FileCheck -check-prefix=STUR_CHK %s
+; RUN: llc < %s -march=arm64 -aarch64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\
+; RUN: -verify-machineinstrs -mcpu=cyclone | FileCheck -check-prefix=STUR_CHK %s
; CHECK: stp_int
; CHECK: stp w0, w1, [x2]
diff --git a/test/CodeGen/ARM64/strict-align.ll b/test/CodeGen/AArch64/arm64-strict-align.ll
index e392172..5d13704 100644
--- a/test/CodeGen/ARM64/strict-align.ll
+++ b/test/CodeGen/AArch64/arm64-strict-align.ll
@@ -1,10 +1,11 @@
; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-apple-darwin -arm64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT
+; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-no-strict-align | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT
define i32 @f0(i32* nocapture %p) nounwind {
; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
-; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
+; CHECK-STRICT: bfi [[LOW]], [[HIGH]], #16, #16
; CHECK-STRICT: ret
; CHECK: ldr w0, [x0]
@@ -15,7 +16,7 @@ define i32 @f0(i32* nocapture %p) nounwind {
define i64 @f1(i64* nocapture %p) nounwind {
; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
-; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
+; CHECK-STRICT: bfi x[[LOW]], x[[HIGH]], #32, #32
; CHECK-STRICT: ret
; CHECK: ldr x0, [x0]
diff --git a/test/CodeGen/ARM64/stur.ll b/test/CodeGen/AArch64/arm64-stur.ll
index 8326bba..a2e684d 100644
--- a/test/CodeGen/ARM64/stur.ll
+++ b/test/CodeGen/AArch64/arm64-stur.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
%struct.X = type <{ i32, i64, i64 }>
define void @foo1(i32* %p, i64 %val) nounwind {
diff --git a/test/CodeGen/AArch64/arm64-subsections.ll b/test/CodeGen/AArch64/arm64-subsections.ll
new file mode 100644
index 0000000..316e7c3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-subsections.ll
@@ -0,0 +1,5 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s --check-prefix=CHECK-MACHO
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s --check-prefix=CHECK-ELF
+
+; CHECK-MACHO: .subsections_via_symbols
+; CHECK-ELF-NOT: .subsections_via_symbols
\ No newline at end of file
diff --git a/test/CodeGen/ARM64/subvector-extend.ll b/test/CodeGen/AArch64/arm64-subvector-extend.ll
index ad2f06c..d5a178a 100644
--- a/test/CodeGen/ARM64/subvector-extend.ll
+++ b/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
; Test efficient codegen of vector extends up from legal type to 128 bit
; and 256 bit vector types.
diff --git a/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll b/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
index 4ab2bee..4ab2bee 100644
--- a/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll
+++ b/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
diff --git a/test/CodeGen/AArch64/arm64-tbl.ll b/test/CodeGen/AArch64/arm64-tbl.ll
new file mode 100644
index 0000000..b1ce15a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-tbl.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+; CHECK: tbl1_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+; CHECK: tbl1_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+; CHECK: tbl2_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK: tbl2_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbl3_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbl3_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbl4_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbl4_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+; CHECK: tbx1_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+; CHECK: tbx1_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbx2_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbx2_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbx3_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbx3_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+; CHECK: tbx4_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+; CHECK: tbx4_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
diff --git a/test/CodeGen/ARM64/this-return.ll b/test/CodeGen/AArch64/arm64-this-return.ll
index 30f5b9b..30f5b9b 100644
--- a/test/CodeGen/ARM64/this-return.ll
+++ b/test/CodeGen/AArch64/arm64-this-return.ll
diff --git a/test/CodeGen/ARM64/tls-darwin.ll b/test/CodeGen/AArch64/arm64-tls-darwin.ll
index 5e8ec33..5e8ec33 100644
--- a/test/CodeGen/ARM64/tls-darwin.ll
+++ b/test/CodeGen/AArch64/arm64-tls-darwin.ll
diff --git a/test/CodeGen/ARM64/tls-dynamic-together.ll b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
index 3daae62..3daae62 100644
--- a/test/CodeGen/ARM64/tls-dynamic-together.ll
+++ b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
diff --git a/test/CodeGen/ARM64/tls-dynamics.ll b/test/CodeGen/AArch64/arm64-tls-dynamics.ll
index e8a83fd..e8a83fd 100644
--- a/test/CodeGen/ARM64/tls-dynamics.ll
+++ b/test/CodeGen/AArch64/arm64-tls-dynamics.ll
diff --git a/test/CodeGen/ARM64/tls-execs.ll b/test/CodeGen/AArch64/arm64-tls-execs.ll
index f0130d8..f0130d8 100644
--- a/test/CodeGen/ARM64/tls-execs.ll
+++ b/test/CodeGen/AArch64/arm64-tls-execs.ll
diff --git a/test/CodeGen/ARM64/trap.ll b/test/CodeGen/AArch64/arm64-trap.ll
index c9e0bea..5e99c32 100644
--- a/test/CodeGen/ARM64/trap.ll
+++ b/test/CodeGen/AArch64/arm64-trap.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=arm64 | FileCheck %s
define void @foo() nounwind {
; CHECK: foo
-; CHECK: brk #1
+; CHECK: brk #0x1
tail call void @llvm.trap()
ret void
}
diff --git a/test/CodeGen/ARM64/trn.ll b/test/CodeGen/AArch64/arm64-trn.ll
index f467984..2db7a14 100644
--- a/test/CodeGen/ARM64/trn.ll
+++ b/test/CodeGen/AArch64/arm64-trn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vtrni8:
diff --git a/test/CodeGen/ARM64/trunc-store.ll b/test/CodeGen/AArch64/arm64-trunc-store.ll
index e65f5b5..cf15247 100644
--- a/test/CodeGen/ARM64/trunc-store.ll
+++ b/test/CodeGen/AArch64/arm64-trunc-store.ll
@@ -22,7 +22,7 @@ define void @fct32(i32 %arg, i64 %var) {
; w0 is %arg
; CHECK-NEXT: sub w[[OFFSETREGNUM:[0-9]+]], w0, #1
; w1 is %var truncated
-; CHECK-NEXT: str w1, {{\[}}[[GLOBALADDR]], x[[OFFSETREGNUM]], sxtw #2]
+; CHECK-NEXT: str w1, {{\[}}[[GLOBALADDR]], w[[OFFSETREGNUM]], sxtw #2]
; CHECK-NEXT: ret
bb:
%.pre37 = load i32** @zptr32, align 8
@@ -42,7 +42,7 @@ define void @fct16(i32 %arg, i64 %var) {
; w0 is %arg
; CHECK-NEXT: sub w[[OFFSETREGNUM:[0-9]+]], w0, #1
; w1 is %var truncated
-; CHECK-NEXT: strh w1, {{\[}}[[GLOBALADDR]], x[[OFFSETREGNUM]], sxtw #1]
+; CHECK-NEXT: strh w1, {{\[}}[[GLOBALADDR]], w[[OFFSETREGNUM]], sxtw #1]
; CHECK-NEXT: ret
bb:
%.pre37 = load i16** @zptr16, align 8
diff --git a/test/CodeGen/ARM64/umaxv.ll b/test/CodeGen/AArch64/arm64-umaxv.ll
index 15277d3..d523f31 100644
--- a/test/CodeGen/ARM64/umaxv.ll
+++ b/test/CodeGen/AArch64/arm64-umaxv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp {
; CHECK-LABEL: vmax_u8x8:
@@ -7,7 +7,7 @@ define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp {
; CHECK-NOT: and
; CHECK: cbz [[REG2]],
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
%tmp = trunc i32 %vmaxv.i to i8
%tobool = icmp eq i8 %tmp, 0
br i1 %tobool, label %return, label %if.then
@@ -30,7 +30,7 @@ define i32 @vmax_u4x16(<4 x i16> %a) nounwind ssp {
; CHECK-NOT: and
; CHECK: cbz [[REG2]],
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
%tmp = trunc i32 %vmaxv.i to i16
%tobool = icmp eq i16 %tmp, 0
br i1 %tobool, label %return, label %if.then
@@ -51,7 +51,7 @@ define i32 @vmax_u8x16(<8 x i16> %a) nounwind ssp {
; CHECK-NOT: and
; CHECK: cbz [[REG2]],
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
%tmp = trunc i32 %vmaxv.i to i16
%tobool = icmp eq i16 %tmp, 0
br i1 %tobool, label %return, label %if.then
@@ -72,7 +72,7 @@ define i32 @vmax_u16x8(<16 x i8> %a) nounwind ssp {
; CHECK-NOT: and
; CHECK: cbz [[REG2]],
entry:
- %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
%tmp = trunc i32 %vmaxv.i to i8
%tobool = icmp eq i8 %tmp, 0
br i1 %tobool, label %return, label %if.then
@@ -86,7 +86,7 @@ return:
ret i32 %retval.0
}
-declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone
-declare i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone
-declare i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone
-declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM64/uminv.ll b/test/CodeGen/AArch64/arm64-uminv.ll
index 440522f..3bade4b 100644
--- a/test/CodeGen/ARM64/uminv.ll
+++ b/test/CodeGen/AArch64/arm64-uminv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
; CHECK-LABEL: vmin_u8x8:
@@ -7,7 +7,7 @@ define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
; CHECK-NOT: and
; CHECK: cbz [[REG2]],
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
%tmp = trunc i32 %vminv.i to i8
%tobool = icmp eq i8 %tmp, 0
br i1 %tobool, label %return, label %if.then
@@ -30,7 +30,7 @@ define i32 @vmin_u4x16(<4 x i16> %a) nounwind ssp {
; CHECK-NOT: and
; CHECK: cbz [[REG2]],
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
%tmp = trunc i32 %vminv.i to i16
%tobool = icmp eq i16 %tmp, 0
br i1 %tobool, label %return, label %if.then
@@ -51,7 +51,7 @@ define i32 @vmin_u8x16(<8 x i16> %a) nounwind ssp {
; CHECK-NOT: and
; CHECK: cbz [[REG2]],
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
%tmp = trunc i32 %vminv.i to i16
%tobool = icmp eq i16 %tmp, 0
br i1 %tobool, label %return, label %if.then
@@ -72,7 +72,7 @@ define i32 @vmin_u16x8(<16 x i8> %a) nounwind ssp {
; CHECK-NOT: and
; CHECK: cbz [[REG2]],
entry:
- %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
%tmp = trunc i32 %vminv.i to i8
%tobool = icmp eq i8 %tmp, 0
br i1 %tobool, label %return, label %if.then
@@ -86,7 +86,7 @@ return:
ret i32 %retval.0
}
-declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
-declare i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
-declare i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
-declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM64/umov.ll b/test/CodeGen/AArch64/arm64-umov.ll
index 7701874..a1ef990 100644
--- a/test/CodeGen/ARM64/umov.ll
+++ b/test/CodeGen/AArch64/arm64-umov.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define zeroext i8 @f1(<16 x i8> %a) {
; CHECK-LABEL: f1:
-; CHECK: umov.b w0, v0[3]
+; CHECK: mov.b w0, v0[3]
; CHECK-NEXT: ret
%vecext = extractelement <16 x i8> %a, i32 3
ret i8 %vecext
@@ -10,7 +10,7 @@ define zeroext i8 @f1(<16 x i8> %a) {
define zeroext i16 @f2(<4 x i16> %a) {
; CHECK-LABEL: f2:
-; CHECK: umov.h w0, v0[2]
+; CHECK: mov.h w0, v0[2]
; CHECK-NEXT: ret
%vecext = extractelement <4 x i16> %a, i32 2
ret i16 %vecext
@@ -18,7 +18,7 @@ define zeroext i16 @f2(<4 x i16> %a) {
define i32 @f3(<2 x i32> %a) {
; CHECK-LABEL: f3:
-; CHECK: umov.s w0, v0[1]
+; CHECK: mov.s w0, v0[1]
; CHECK-NEXT: ret
%vecext = extractelement <2 x i32> %a, i32 1
ret i32 %vecext
@@ -26,7 +26,7 @@ define i32 @f3(<2 x i32> %a) {
define i64 @f4(<2 x i64> %a) {
; CHECK-LABEL: f4:
-; CHECK: umov.d x0, v0[1]
+; CHECK: mov.d x0, v0[1]
; CHECK-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 1
ret i64 %vecext
diff --git a/test/CodeGen/ARM64/unaligned_ldst.ll b/test/CodeGen/AArch64/arm64-unaligned_ldst.ll
index 20b80c0..20b80c0 100644
--- a/test/CodeGen/ARM64/unaligned_ldst.ll
+++ b/test/CodeGen/AArch64/arm64-unaligned_ldst.ll
diff --git a/test/CodeGen/ARM64/uzp.ll b/test/CodeGen/AArch64/arm64-uzp.ll
index 60e16d0..cdd8d31 100644
--- a/test/CodeGen/ARM64/uzp.ll
+++ b/test/CodeGen/AArch64/arm64-uzp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vuzpi8:
diff --git a/test/CodeGen/ARM64/vaargs.ll b/test/CodeGen/AArch64/arm64-vaargs.ll
index ce07635..ce07635 100644
--- a/test/CodeGen/ARM64/vaargs.ll
+++ b/test/CodeGen/AArch64/arm64-vaargs.ll
diff --git a/test/CodeGen/ARM64/vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index 0d8aa24..5afc8d9 100644
--- a/test/CodeGen/ARM64/vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -6,7 +6,7 @@ define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: sabdl.8h
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
@@ -16,7 +16,7 @@ define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: sabdl.4s
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
@@ -26,7 +26,7 @@ define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sabdl.2d
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
@@ -38,7 +38,7 @@ define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%load2 = load <16 x i8>* %B
%tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
@@ -50,7 +50,7 @@ define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%load2 = load <8 x i16>* %B
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
@@ -62,7 +62,7 @@ define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%load2 = load <4 x i32>* %B
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
@@ -72,7 +72,7 @@ define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: uabdl.8h
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
@@ -82,7 +82,7 @@ define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: uabdl.4s
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
@@ -92,7 +92,7 @@ define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: uabdl.2d
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
@@ -105,7 +105,7 @@ define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
@@ -117,7 +117,7 @@ define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%load2 = load <8 x i16>* %B
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
@@ -129,7 +129,7 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%load2 = load <4 x i32>* %B
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
@@ -139,7 +139,7 @@ define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: fabd.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -148,7 +148,7 @@ define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: fabd.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -157,20 +157,20 @@ define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: fabd.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -179,7 +179,7 @@ define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: sabd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -188,7 +188,7 @@ define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: sabd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -197,7 +197,7 @@ define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: sabd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -206,7 +206,7 @@ define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sabd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -215,23 +215,23 @@ define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: sabd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -240,7 +240,7 @@ define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: uabd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -249,7 +249,7 @@ define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: uabd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -258,7 +258,7 @@ define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: uabd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -267,7 +267,7 @@ define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: uabd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -276,22 +276,22 @@ define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: uabd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8> %tmp1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp3
}
@@ -299,7 +299,7 @@ define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8> %tmp1)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp3
}
@@ -307,7 +307,7 @@ define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16> %tmp1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp3
}
@@ -315,7 +315,7 @@ define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16> %tmp1)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp3
}
@@ -323,7 +323,7 @@ define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp3
}
@@ -331,22 +331,22 @@ define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8> %tmp1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp3
}
@@ -354,7 +354,7 @@ define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8> %tmp1)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp3
}
@@ -362,7 +362,7 @@ define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16> %tmp1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp3
}
@@ -370,7 +370,7 @@ define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16> %tmp1)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp3
}
@@ -378,7 +378,7 @@ define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp3
}
@@ -386,22 +386,22 @@ define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8> %tmp1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp3
}
@@ -409,7 +409,7 @@ define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8> %tmp1)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp3
}
@@ -417,7 +417,7 @@ define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16> %tmp1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp3
}
@@ -425,7 +425,7 @@ define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16> %tmp1)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp3
}
@@ -433,7 +433,7 @@ define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp3
}
@@ -441,32 +441,32 @@ define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp3
}
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
- %abs = call <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64> %A)
+ %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
ret <1 x i64> %abs
}
define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
- %abs = call i64 @llvm.arm64.neon.abs.i64(i64 %A)
+ %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
ret i64 %abs
}
-declare <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64>) nounwind readnone
-declare i64 @llvm.arm64.neon.abs.i64(i64) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
@@ -474,7 +474,7 @@ define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i16>* %C
- %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4.1
ret <8 x i16> %tmp5
@@ -486,7 +486,7 @@ define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4.1
ret <4 x i32> %tmp5
@@ -498,7 +498,7 @@ define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
- %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4.1
@@ -513,7 +513,7 @@ define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin
%tmp3 = load <8 x i16>* %C
%tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4.1
ret <8 x i16> %tmp5
@@ -527,7 +527,7 @@ define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin
%tmp3 = load <4 x i32>* %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4.1
ret <4 x i32> %tmp5
@@ -541,7 +541,7 @@ define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwin
%tmp3 = load <2 x i64>* %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4.1
ret <2 x i64> %tmp5
@@ -553,7 +553,7 @@ define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i16>* %C
- %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4.1
ret <8 x i16> %tmp5
@@ -565,7 +565,7 @@ define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4.1
ret <4 x i32> %tmp5
@@ -577,7 +577,7 @@ define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
- %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4.1
ret <2 x i64> %tmp5
@@ -591,7 +591,7 @@ define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin
%tmp3 = load <8 x i16>* %C
%tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
%tmp5 = add <8 x i16> %tmp3, %tmp4.1
ret <8 x i16> %tmp5
@@ -605,7 +605,7 @@ define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin
%tmp3 = load <4 x i32>* %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
%tmp5 = add <4 x i32> %tmp3, %tmp4.1
ret <4 x i32> %tmp5
@@ -619,7 +619,7 @@ define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwin
%tmp3 = load <2 x i64>* %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
%tmp5 = add <2 x i64> %tmp3, %tmp4.1
ret <2 x i64> %tmp5
@@ -630,7 +630,7 @@ define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: saba.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = load <8 x i8>* %C
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
@@ -641,7 +641,7 @@ define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
;CHECK: saba.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
%tmp4 = load <16 x i8>* %C
%tmp5 = add <16 x i8> %tmp3, %tmp4
ret <16 x i8> %tmp5
@@ -652,7 +652,7 @@ define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
;CHECK: saba.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = load <4 x i16>* %C
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
@@ -663,7 +663,7 @@ define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind
;CHECK: saba.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
%tmp4 = load <8 x i16>* %C
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
@@ -674,7 +674,7 @@ define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
;CHECK: saba.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = load <2 x i32>* %C
%tmp5 = add <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
@@ -685,7 +685,7 @@ define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind
;CHECK: saba.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
%tmp4 = load <4 x i32>* %C
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
@@ -696,7 +696,7 @@ define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: uaba.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
%tmp4 = load <8 x i8>* %C
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
@@ -707,7 +707,7 @@ define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
;CHECK: uaba.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
%tmp4 = load <16 x i8>* %C
%tmp5 = add <16 x i8> %tmp3, %tmp4
ret <16 x i8> %tmp5
@@ -718,7 +718,7 @@ define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
;CHECK: uaba.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp4 = load <4 x i16>* %C
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
@@ -729,7 +729,7 @@ define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind
;CHECK: uaba.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
%tmp4 = load <8 x i16>* %C
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
@@ -740,7 +740,7 @@ define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
;CHECK: uaba.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp4 = load <2 x i32>* %C
%tmp5 = add <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
@@ -751,7 +751,7 @@ define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind
;CHECK: uaba.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
%tmp4 = load <4 x i32>* %C
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
@@ -761,19 +761,19 @@ define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
- %vabd.i = tail call float @llvm.arm64.sisd.fabd.f32(float %a, float %b) nounwind
+ %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
ret float %vabd.i
}
define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
- %vabd.i = tail call double @llvm.arm64.sisd.fabd.f64(double %a, double %b) nounwind
+ %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
ret double %vabd.i
}
-declare double @llvm.arm64.sisd.fabd.f64(double, double) nounwind readnone
-declare float @llvm.arm64.sisd.fabd.f32(float, float) nounwind readnone
+declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
@@ -784,7 +784,7 @@ define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %res = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
%res1 = zext <2 x i32> %res to <2 x i64>
ret <2 x i64> %res1
}
@@ -798,7 +798,7 @@ define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %res = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
%res1 = zext <2 x i32> %res to <2 x i64>
ret <2 x i64> %res1
}
diff --git a/test/CodeGen/ARM64/vadd.ll b/test/CodeGen/AArch64/arm64-vadd.ll
index f674c6d..9ed8aa6 100644
--- a/test/CodeGen/ARM64/vadd.ll
+++ b/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: addhn8b:
;CHECK: addhn.8b
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: addhn.4h
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
@@ -23,7 +23,7 @@ define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: addhn.2s
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
@@ -31,8 +31,8 @@ define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: addhn2_16b:
;CHECK: addhn.8b
;CHECK-NEXT: addhn2.16b
- %vaddhn2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
- %vaddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
%res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %res
}
@@ -41,8 +41,8 @@ define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: addhn2_8h:
;CHECK: addhn.4h
;CHECK-NEXT: addhn2.8h
- %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
- %vaddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
%res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
@@ -51,15 +51,15 @@ define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: addhn2_4s:
;CHECK: addhn.2s
;CHECK-NEXT: addhn2.4s
- %vaddhn2.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
- %vaddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
%res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %res
}
-declare <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
@@ -67,7 +67,7 @@ define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: raddhn.8b
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
@@ -76,7 +76,7 @@ define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: raddhn.4h
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
@@ -85,7 +85,7 @@ define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: raddhn.2s
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
@@ -93,8 +93,8 @@ define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: raddhn2_16b:
;CHECK: raddhn.8b
;CHECK-NEXT: raddhn2.16b
- %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
- %vraddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
%res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %res
}
@@ -103,8 +103,8 @@ define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: raddhn2_8h:
;CHECK: raddhn.4h
;CHECK-NEXT: raddhn2.8h
- %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
- %vraddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
%res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
@@ -113,15 +113,15 @@ define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: raddhn2_4s:
;CHECK: raddhn.2s
;CHECK-NEXT: raddhn2.4s
- %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
- %vraddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
%res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %res
}
-declare <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: saddl8h:
@@ -428,7 +428,7 @@ define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp4h:
;CHECK: saddlp.4h
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp3
}
@@ -436,7 +436,7 @@ define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp2s:
;CHECK: saddlp.2s
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp3
}
@@ -444,7 +444,7 @@ define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp1d:
;CHECK: saddlp.1d
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
+ %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp3
}
@@ -452,7 +452,7 @@ define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: saddlp8h:
;CHECK: saddlp.8h
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp3
}
@@ -460,7 +460,7 @@ define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: saddlp4s:
;CHECK: saddlp.4s
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp3
}
@@ -468,23 +468,23 @@ define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: saddlp2d:
;CHECK: saddlp.2d
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp3
}
-declare <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp4h:
;CHECK: uaddlp.4h
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp3
}
@@ -492,7 +492,7 @@ define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp2s:
;CHECK: uaddlp.2s
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp3
}
@@ -500,7 +500,7 @@ define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp1d:
;CHECK: uaddlp.1d
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
+ %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp3
}
@@ -508,7 +508,7 @@ define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: uaddlp8h:
;CHECK: uaddlp.8h
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp3
}
@@ -516,7 +516,7 @@ define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uaddlp4s:
;CHECK: uaddlp.4s
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp3
}
@@ -524,23 +524,23 @@ define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uaddlp2d:
;CHECK: uaddlp.2d
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp3
}
-declare <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp4h:
;CHECK: sadalp.4h
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
%tmp4 = load <4 x i16>* %B
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
@@ -550,7 +550,7 @@ define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp2s:
;CHECK: sadalp.2s
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp4 = load <2 x i32>* %B
%tmp5 = add <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
@@ -560,7 +560,7 @@ define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sadalp8h:
;CHECK: sadalp.8h
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
%tmp4 = load <8 x i16>* %B
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
@@ -570,7 +570,7 @@ define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sadalp4s:
;CHECK: sadalp.4s
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
%tmp4 = load <4 x i32>* %B
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
@@ -580,7 +580,7 @@ define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: sadalp2d:
;CHECK: sadalp.2d
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
%tmp4 = load <2 x i64>* %B
%tmp5 = add <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
@@ -590,7 +590,7 @@ define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp4h:
;CHECK: uadalp.4h
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
%tmp4 = load <4 x i16>* %B
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
@@ -600,7 +600,7 @@ define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp2s:
;CHECK: uadalp.2s
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
%tmp4 = load <2 x i32>* %B
%tmp5 = add <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
@@ -610,7 +610,7 @@ define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uadalp8h:
;CHECK: uadalp.8h
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
%tmp4 = load <8 x i16>* %B
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
@@ -620,7 +620,7 @@ define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uadalp4s:
;CHECK: uadalp.4s
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
%tmp4 = load <4 x i32>* %B
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
@@ -630,7 +630,7 @@ define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: uadalp2d:
;CHECK: uadalp.2d
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
%tmp4 = load <2 x i64>* %B
%tmp5 = add <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
@@ -641,7 +641,7 @@ define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: addp.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -650,7 +650,7 @@ define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: addp.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -659,7 +659,7 @@ define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: addp.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -668,7 +668,7 @@ define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: addp.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -677,7 +677,7 @@ define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: addp.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -686,7 +686,7 @@ define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: addp.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -695,24 +695,24 @@ define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: addp.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: faddp_2s:
;CHECK: faddp.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -721,7 +721,7 @@ define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: faddp.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -730,13 +730,13 @@ define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: faddp.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl2_duprhs
diff --git a/test/CodeGen/ARM64/vaddlv.ll b/test/CodeGen/AArch64/arm64-vaddlv.ll
index d4d4608..2d64138 100644
--- a/test/CodeGen/ARM64/vaddlv.ll
+++ b/test/CodeGen/AArch64/arm64-vaddlv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone {
; CHECK: test_vaddlv_s32
@@ -6,7 +6,7 @@ define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone {
; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddlv.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind
+ %vaddlv.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind
ret i64 %vaddlv.i
}
@@ -16,11 +16,11 @@ define i64 @test_vaddlv_u32(<2 x i32> %a1) nounwind readnone {
; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddlv.i = tail call i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind
+ %vaddlv.i = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind
ret i64 %vaddlv.i
}
-declare i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone
+declare i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone
-declare i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone
+declare i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vaddv.ll b/test/CodeGen/AArch64/arm64-vaddv.ll
index 44bfa84..2d92ce6 100644
--- a/test/CodeGen/ARM64/vaddv.ll
+++ b/test/CodeGen/AArch64/arm64-vaddv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_s8:
@@ -6,7 +6,7 @@ define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1)
%0 = trunc i32 %vaddv.i to i8
ret i8 %0
}
@@ -17,7 +17,7 @@ define signext i16 @test_vaddv_s16(<4 x i16> %a1) {
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1)
%0 = trunc i32 %vaddv.i to i16
ret i16 %0
}
@@ -29,7 +29,7 @@ define i32 @test_vaddv_s32(<2 x i32> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1)
ret i32 %vaddv.i
}
@@ -39,7 +39,7 @@ define i64 @test_vaddv_s64(<2 x i64> %a1) {
; CHECK-NEXT: fmov x0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64> %a1)
+ %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1)
ret i64 %vaddv.i
}
@@ -49,7 +49,7 @@ define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
%0 = trunc i32 %vaddv.i to i8
ret i8 %0
}
@@ -60,7 +60,7 @@ define i32 @test_vaddv_u8_masked(<8 x i8> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
%0 = and i32 %vaddv.i, 511 ; 0x1ff
ret i32 %0
}
@@ -71,7 +71,7 @@ define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
%0 = trunc i32 %vaddv.i to i16
ret i16 %0
}
@@ -82,7 +82,7 @@ define i32 @test_vaddv_u16_masked(<4 x i16> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
%0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
ret i32 %0
}
@@ -94,7 +94,7 @@ define i32 @test_vaddv_u32(<2 x i32> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
ret i32 %vaddv.i
}
@@ -103,7 +103,7 @@ define float @test_vaddv_f32(<2 x float> %a1) {
; CHECK: faddp.2s s0, v0
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1)
+ %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
ret float %vaddv.i
}
@@ -113,7 +113,7 @@ define float @test_vaddv_v4f32(<4 x float> %a1) {
; CHECK: faddp.2s s0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1)
+ %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
ret float %vaddv.i
}
@@ -122,7 +122,7 @@ define double @test_vaddv_f64(<2 x double> %a1) {
; CHECK: faddp.2d d0, v0
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1)
+ %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
ret double %vaddv.i
}
@@ -132,17 +132,29 @@ define i64 @test_vaddv_u64(<2 x i64> %a1) {
; CHECK-NEXT: fmov x0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+ %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
ret i64 %vaddv.i
}
+define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_u64_to_vec:
+; CHECK: addp.2d d0, v0
+; CHECK-NOT: fmov
+; CHECK-NOT: ins
+; CHECK: ret
+entry:
+ %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+ %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
+ ret <1 x i64> %vec
+}
+
define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_s8:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1)
%0 = trunc i32 %vaddv.i to i8
ret i8 %0
}
@@ -153,7 +165,7 @@ define signext i16 @test_vaddvq_s16(<8 x i16> %a1) {
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1)
%0 = trunc i32 %vaddv.i to i16
ret i16 %0
}
@@ -164,7 +176,7 @@ define i32 @test_vaddvq_s32(<4 x i32> %a1) {
; CHECK-NEXT: fmov w0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1)
ret i32 %vaddv.i
}
@@ -174,7 +186,7 @@ define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
%0 = trunc i32 %vaddv.i to i8
ret i8 %0
}
@@ -185,7 +197,7 @@ define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) {
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
%0 = trunc i32 %vaddv.i to i16
ret i16 %0
}
@@ -196,38 +208,38 @@ define i32 @test_vaddvq_u32(<4 x i32> %a1) {
; CHECK-NEXT: fmov [[FMOVRES:w[0-9]+]], [[REGNUM]]
; CHECK-NEXT: ret
entry:
- %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
ret i32 %vaddv.i
}
-declare i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>)
-declare i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>)
-declare i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)
-declare i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)
-declare i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)
-declare i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)
-declare i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64>)
+declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>)
-declare i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>)
-declare i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>)
-declare i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>)
-declare i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)
-declare i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64>)
+declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>)
-declare i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)
-declare i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)
-declare float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1)
-declare float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1)
-declare double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1)
+declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
+declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
+declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
diff --git a/test/CodeGen/ARM64/variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
index ac66902..36a7bfd 100644
--- a/test/CodeGen/ARM64/variadic-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -32,7 +32,7 @@ define void @test_simple(i32 %n, ...) {
; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #128
; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: movn [[GR_OFFS:w[0-9]+]], #55
+; CHECK: movn [[GR_OFFS:w[0-9]+]], #0x37
; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
; CHECK: orr [[VR_OFFS:w[0-9]+]], wzr, #0xffffff80
@@ -70,10 +70,10 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #112
; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: movn [[GR_OFFS:w[0-9]+]], #39
+; CHECK: movn [[GR_OFFS:w[0-9]+]], #0x27
; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #111
+; CHECK: movn [[VR_OFFS:w[0-9]+]], #0x6f
; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
%addr = bitcast %va_list* @var to i8*
diff --git a/test/CodeGen/ARM64/vbitwise.ll b/test/CodeGen/AArch64/arm64-vbitwise.ll
index 7d8378d..93de95e 100644
--- a/test/CodeGen/ARM64/vbitwise.ll
+++ b/test/CodeGen/AArch64/arm64-vbitwise.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @rbit_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: rbit_8b:
;CHECK: rbit.8b
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8> %tmp1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp3
}
@@ -12,12 +12,12 @@ define <16 x i8> @rbit_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: rbit_16b:
;CHECK: rbit.16b
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8> %tmp1)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
define <8 x i16> @sxtl8h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sxtl8h:
diff --git a/test/CodeGen/ARM64/vclz.ll b/test/CodeGen/AArch64/arm64-vclz.ll
index ddc09ed..cf5670a 100644
--- a/test/CodeGen/ARM64/vclz.ll
+++ b/test/CodeGen/AArch64/arm64-vclz.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
define <8 x i8> @test_vclz_u8(<8 x i8> %a) nounwind readnone ssp {
; CHECK-LABEL: test_vclz_u8:
diff --git a/test/CodeGen/ARM64/vcmp.ll b/test/CodeGen/AArch64/arm64-vcmp.ll
index f9275b8..982ab09 100644
--- a/test/CodeGen/ARM64/vcmp.ll
+++ b/test/CodeGen/AArch64/arm64-vcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define void @fcmltz_4s(<4 x float> %a, <4 x i16>* %p) nounwind {
@@ -18,7 +18,7 @@ define <2 x i32> @facge_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: facge.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
@@ -27,7 +27,7 @@ define <4 x i32> @facge_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: facge.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
@@ -36,20 +36,20 @@ define <2 x i64> @facge_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: facge.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x i32> @facgt_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: facgt_2s:
;CHECK: facgt.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
@@ -58,7 +58,7 @@ define <4 x i32> @facgt_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: facgt.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
@@ -67,47 +67,47 @@ define <2 x i64> @facgt_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: facgt.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
define i32 @facge_s(float %A, float %B) nounwind {
; CHECK-LABEL: facge_s:
; CHECK: facge {{s[0-9]+}}, s0, s1
- %mask = call i32 @llvm.arm64.neon.facge.i32.f32(float %A, float %B)
+ %mask = call i32 @llvm.aarch64.neon.facge.i32.f32(float %A, float %B)
ret i32 %mask
}
define i64 @facge_d(double %A, double %B) nounwind {
; CHECK-LABEL: facge_d:
; CHECK: facge {{d[0-9]+}}, d0, d1
- %mask = call i64 @llvm.arm64.neon.facge.i64.f64(double %A, double %B)
+ %mask = call i64 @llvm.aarch64.neon.facge.i64.f64(double %A, double %B)
ret i64 %mask
}
-declare i64 @llvm.arm64.neon.facge.i64.f64(double, double)
-declare i32 @llvm.arm64.neon.facge.i32.f32(float, float)
+declare i64 @llvm.aarch64.neon.facge.i64.f64(double, double)
+declare i32 @llvm.aarch64.neon.facge.i32.f32(float, float)
define i32 @facgt_s(float %A, float %B) nounwind {
; CHECK-LABEL: facgt_s:
; CHECK: facgt {{s[0-9]+}}, s0, s1
- %mask = call i32 @llvm.arm64.neon.facgt.i32.f32(float %A, float %B)
+ %mask = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %A, float %B)
ret i32 %mask
}
define i64 @facgt_d(double %A, double %B) nounwind {
; CHECK-LABEL: facgt_d:
; CHECK: facgt {{d[0-9]+}}, d0, d1
- %mask = call i64 @llvm.arm64.neon.facgt.i64.f64(double %A, double %B)
+ %mask = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %A, double %B)
ret i64 %mask
}
-declare i64 @llvm.arm64.neon.facgt.i64.f64(double, double)
-declare i32 @llvm.arm64.neon.facgt.i32.f32(float, float)
+declare i64 @llvm.aarch64.neon.facgt.i64.f64(double, double)
+declare i32 @llvm.aarch64.neon.facgt.i32.f32(float, float)
define <8 x i8> @cmtst_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: cmtst_8b:
@@ -225,3 +225,12 @@ define <1 x i64> @fcmlt_d(<1 x double> %A, <1 x double> %B) nounwind {
%mask = sext <1 x i1> %tst to <1 x i64>
ret <1 x i64> %mask
}
+
+define <1 x i64> @cmnez_d(<1 x i64> %A) nounwind {
+; CHECK-LABEL: cmnez_d:
+; CHECK: cmeq d[[EQ:[0-9]+]], d0, #0
+; CHECK: mvn.8b v0, v[[EQ]]
+ %tst = icmp ne <1 x i64> %A, zeroinitializer
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
diff --git a/test/CodeGen/AArch64/arm64-vcnt.ll b/test/CodeGen/AArch64/arm64-vcnt.ll
new file mode 100644
index 0000000..903501e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcnt.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_8b:
+;CHECK: cls.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_16b:
+;CHECK: cls.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_4h:
+;CHECK: cls.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_8h:
+;CHECK: cls.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_2s:
+;CHECK: cls.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_4s:
+;CHECK: cls.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcombine.ll b/test/CodeGen/AArch64/arm64-vcombine.ll
index 16f591e..fa12996 100644
--- a/test/CodeGen/ARM64/vcombine.ll
+++ b/test/CodeGen/AArch64/arm64-vcombine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
; LowerCONCAT_VECTORS() was reversing the order of two parts.
; rdar://11558157
diff --git a/test/CodeGen/ARM64/vcvt.ll b/test/CodeGen/AArch64/arm64-vcvt.ll
index 19bb8cb..8c9e4e9 100644
--- a/test/CodeGen/ARM64/vcvt.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtas_2s:
;CHECK-NOT: ld1
;CHECK: fcvtas.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %A)
ret <2 x i32> %tmp3
}
@@ -14,7 +14,7 @@ define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtas.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %A)
ret <4 x i32> %tmp3
}
@@ -23,20 +23,20 @@ define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtas.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %A)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtau_2s:
;CHECK-NOT: ld1
;CHECK: fcvtau.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %A)
ret <2 x i32> %tmp3
}
@@ -45,7 +45,7 @@ define <4 x i32> @fcvtau_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtau.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %A)
ret <4 x i32> %tmp3
}
@@ -54,20 +54,20 @@ define <2 x i64> @fcvtau_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtau.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %A)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtms_2s:
;CHECK-NOT: ld1
;CHECK: fcvtms.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %A)
ret <2 x i32> %tmp3
}
@@ -76,7 +76,7 @@ define <4 x i32> @fcvtms_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtms.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %A)
ret <4 x i32> %tmp3
}
@@ -85,20 +85,20 @@ define <2 x i64> @fcvtms_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtms.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %A)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtmu_2s:
;CHECK-NOT: ld1
;CHECK: fcvtmu.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A)
ret <2 x i32> %tmp3
}
@@ -107,7 +107,7 @@ define <4 x i32> @fcvtmu_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtmu.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A)
ret <4 x i32> %tmp3
}
@@ -116,20 +116,20 @@ define <2 x i64> @fcvtmu_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtmu.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtps_2s:
;CHECK-NOT: ld1
;CHECK: fcvtps.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %A)
ret <2 x i32> %tmp3
}
@@ -138,7 +138,7 @@ define <4 x i32> @fcvtps_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtps.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %A)
ret <4 x i32> %tmp3
}
@@ -147,20 +147,20 @@ define <2 x i64> @fcvtps_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtps.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %A)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtpu_2s:
;CHECK-NOT: ld1
;CHECK: fcvtpu.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A)
ret <2 x i32> %tmp3
}
@@ -169,7 +169,7 @@ define <4 x i32> @fcvtpu_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtpu.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A)
ret <4 x i32> %tmp3
}
@@ -178,20 +178,20 @@ define <2 x i64> @fcvtpu_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtpu.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtns_2s:
;CHECK-NOT: ld1
;CHECK: fcvtns.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %A)
ret <2 x i32> %tmp3
}
@@ -200,7 +200,7 @@ define <4 x i32> @fcvtns_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtns.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %A)
ret <4 x i32> %tmp3
}
@@ -209,20 +209,20 @@ define <2 x i64> @fcvtns_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtns.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %A)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtnu_2s:
;CHECK-NOT: ld1
;CHECK: fcvtnu.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A)
ret <2 x i32> %tmp3
}
@@ -231,7 +231,7 @@ define <4 x i32> @fcvtnu_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtnu.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A)
ret <4 x i32> %tmp3
}
@@ -240,13 +240,13 @@ define <2 x i64> @fcvtnu_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtnu.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtzs_2s:
@@ -401,7 +401,7 @@ define <2 x float> @frintn_2s(<2 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: frintn.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float> %A)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %A)
ret <2 x float> %tmp3
}
@@ -410,7 +410,7 @@ define <4 x float> @frintn_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: frintn.4s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float> %A)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %A)
ret <4 x float> %tmp3
}
@@ -419,13 +419,13 @@ define <2 x double> @frintn_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: frintn.2d v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %A)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) nounwind readnone
define <2 x float> @frintp_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: frintp_2s:
@@ -525,7 +525,7 @@ define <2 x float> @fcvtxn_2s(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtxn v0.2s, v0.2d
;CHECK-NEXT: ret
- %tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
ret <2 x float> %tmp3
}
@@ -534,19 +534,19 @@ define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtxn2 v0.4s, v1.2d
;CHECK-NEXT: ret
- %tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
%res = shufflevector <2 x float> %ret, <2 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %res
}
-declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
define <2 x i32> @fcvtzsc_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtzsc_2s:
;CHECK-NOT: ld1
;CHECK: fcvtzs.2s v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1)
ret <2 x i32> %tmp3
}
@@ -555,7 +555,7 @@ define <4 x i32> @fcvtzsc_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtzs.4s v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1)
ret <4 x i32> %tmp3
}
@@ -564,20 +564,20 @@ define <2 x i64> @fcvtzsc_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtzs.2d v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone
define <2 x i32> @fcvtzuc_2s(<2 x float> %A) nounwind {
;CHECK-LABEL: fcvtzuc_2s:
;CHECK-NOT: ld1
;CHECK: fcvtzu.2s v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1)
ret <2 x i32> %tmp3
}
@@ -586,7 +586,7 @@ define <4 x i32> @fcvtzuc_4s(<4 x float> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtzu.4s v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1)
ret <4 x i32> %tmp3
}
@@ -595,20 +595,20 @@ define <2 x i64> @fcvtzuc_2d(<2 x double> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: fcvtzu.2d v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1)
ret <2 x i64> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone
define <2 x float> @scvtf_2sc(<2 x i32> %A) nounwind {
;CHECK-LABEL: scvtf_2sc:
;CHECK-NOT: ld1
;CHECK: scvtf.2s v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
ret <2 x float> %tmp3
}
@@ -617,7 +617,7 @@ define <4 x float> @scvtf_4sc(<4 x i32> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: scvtf.4s v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
ret <4 x float> %tmp3
}
@@ -626,20 +626,20 @@ define <2 x double> @scvtf_2dc(<2 x i64> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: scvtf.2d v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
define <2 x float> @ucvtf_2sc(<2 x i32> %A) nounwind {
;CHECK-LABEL: ucvtf_2sc:
;CHECK-NOT: ld1
;CHECK: ucvtf.2s v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
ret <2 x float> %tmp3
}
@@ -648,7 +648,7 @@ define <4 x float> @ucvtf_4sc(<4 x i32> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: ucvtf.4s v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
ret <4 x float> %tmp3
}
@@ -657,7 +657,7 @@ define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: ucvtf.2d v0, v0, #1
;CHECK-NEXT: ret
- %tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
ret <2 x double> %tmp3
}
@@ -681,6 +681,6 @@ define void @autogen_SD19225() {
ret void
}
-declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcvt_f.ll b/test/CodeGen/AArch64/arm64-vcvt_f.ll
index d67aa3b..d244958 100644
--- a/test/CodeGen/ARM64/vcvt_f.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-; RUN: llc < %s -O0 -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp {
; CHECK-LABEL: test_vcvt_f64_f32:
@@ -38,7 +38,7 @@ define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) noun
define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp {
; CHECK-LABEL: test_vcvtx_f32_f64:
- %vcvtx1.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+ %vcvtx1.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
; CHECK: fcvtxn
ret <2 x float> %vcvtx1.i
; CHECK: ret
@@ -46,7 +46,7 @@ define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp {
define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
; CHECK-LABEL: test_vcvtx_high_f32_f64:
- %vcvtx2.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+ %vcvtx2.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
%res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK: fcvtxn2
ret <4 x float> %res
@@ -54,13 +54,13 @@ define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nou
}
-declare <2 x double> @llvm.arm64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfp2df(<2 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfp2df(<2 x float>) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
define i16 @to_half(float %in) {
; CHECK-LABEL: to_half:
diff --git a/test/CodeGen/ARM64/vcvt_f32_su32.ll b/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll
index 51e053d..1eb7b43 100644
--- a/test/CodeGen/ARM64/vcvt_f32_su32.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <2 x float> @ucvt(<2 x i32> %a) nounwind readnone ssp {
; CHECK-LABEL: ucvt:
@@ -37,7 +37,7 @@ define <4 x float> @cvtf16(<4 x i16> %a) nounwind readnone ssp {
; CHECK-LABEL: cvtf16:
; CHECK: fcvtl v0.4s, v0.4h
; CHECK-NEXT: ret
- %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %a) nounwind
+ %vcvt1.i = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> %a) nounwind
ret <4 x float> %vcvt1.i
}
@@ -46,7 +46,7 @@ define <4 x float> @cvtf16_high(<8 x i16> %a) nounwind readnone ssp {
; CHECK: fcvtl2 v0.4s, v0.8h
; CHECK-NEXT: ret
%in = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %in) nounwind
+ %vcvt1.i = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> %in) nounwind
ret <4 x float> %vcvt1.i
}
@@ -56,7 +56,7 @@ define <4 x i16> @cvtf16f32(<4 x float> %a) nounwind readnone ssp {
; CHECK-LABEL: cvtf16f32:
; CHECK: fcvtn v0.4h, v0.4s
; CHECK-NEXT: ret
- %vcvt1.i = tail call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %a) nounwind
+ %vcvt1.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a) nounwind
ret <4 x i16> %vcvt1.i
}
@@ -64,10 +64,10 @@ define <8 x i16> @cvtf16f32_high(<4 x i16> %low, <4 x float> %high_big) {
; CHECK-LABEL: cvtf16f32_high:
; CHECK: fcvtn2 v0.8h, v1.4s
; CHECK-NEXT: ret
- %high = call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %high_big)
+ %high = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %high_big)
%res = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
-declare <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vcvt_n.ll b/test/CodeGen/AArch64/arm64-vcvt_n.ll
new file mode 100644
index 0000000..7ed5be6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcvt_n.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxpu:
+; CHECK: ucvtf.2s v0, v0, #9
+; CHECK: ret
+ %vcvt_n1 = tail call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9)
+ ret <2 x float> %vcvt_n1
+}
+
+define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxps:
+; CHECK: scvtf.2s v0, v0, #12
+; CHECK: ret
+ %vcvt_n1 = tail call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12)
+ ret <2 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxpu:
+; CHECK: ucvtf.4s v0, v0, #18
+; CHECK: ret
+ %vcvt_n1 = tail call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18)
+ ret <4 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxps:
+; CHECK: scvtf.4s v0, v0, #30
+; CHECK: ret
+ %vcvt_n1 = tail call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30)
+ ret <4 x float> %vcvt_n1
+}
+define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp {
+ %vcvt_n1 = tail call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12)
+ ret <2 x double> %vcvt_n1
+}
+
+define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp {
+ %vcvt_n1 = tail call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9)
+ ret <2 x double> %vcvt_n1
+}
+
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcvt_su32_f32.ll b/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll
index 8c82fa0..985a5f7 100644
--- a/test/CodeGen/ARM64/vcvt_su32_f32.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <2 x i32> @c1(<2 x float> %a) nounwind readnone ssp {
; CHECK: c1
diff --git a/test/CodeGen/ARM64/vcvtxd_f32_f64.ll b/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll
index bbe8f0b..b29c22c 100644
--- a/test/CodeGen/ARM64/vcvtxd_f32_f64.ll
+++ b/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll
@@ -4,8 +4,8 @@ define float @fcvtxn(double %a) {
; CHECK-LABEL: fcvtxn:
; CHECK: fcvtxn s0, d0
; CHECK-NEXT: ret
- %vcvtxd.i = tail call float @llvm.arm64.sisd.fcvtxn(double %a) nounwind
+ %vcvtxd.i = tail call float @llvm.aarch64.sisd.fcvtxn(double %a) nounwind
ret float %vcvtxd.i
}
-declare float @llvm.arm64.sisd.fcvtxn(double) nounwind readnone
+declare float @llvm.aarch64.sisd.fcvtxn(double) nounwind readnone
diff --git a/test/CodeGen/ARM64/vecCmpBr.ll b/test/CodeGen/AArch64/arm64-vecCmpBr.ll
index e23ef25..c7321e4 100644
--- a/test/CodeGen/ARM64/vecCmpBr.ll
+++ b/test/CodeGen/AArch64/arm64-vecCmpBr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
; ModuleID = 'arm64_vecCmpBr.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
target triple = "arm64-apple-ios3.0.0"
@@ -13,7 +13,7 @@ define i32 @anyZero64(<4 x i16> %a) #0 {
; CHECK-NEXT: b _bar
entry:
%0 = bitcast <4 x i16> %a to <8 x i8>
- %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
%1 = trunc i32 %vminv.i to i8
%tobool = icmp eq i8 %1, 0
br i1 %tobool, label %if.then, label %return
@@ -39,7 +39,7 @@ define i32 @anyZero128(<8 x i16> %a) #0 {
entry:
%0 = bitcast <8 x i16> %a to <16 x i8>
- %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
%1 = trunc i32 %vminv.i to i8
%tobool = icmp eq i8 %1, 0
br i1 %tobool, label %if.then, label %return
@@ -63,7 +63,7 @@ define i32 @anyNonZero64(<4 x i16> %a) #0 {
entry:
%0 = bitcast <4 x i16> %a to <8 x i8>
- %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
%1 = trunc i32 %vmaxv.i to i8
%tobool = icmp eq i8 %1, 0
br i1 %tobool, label %return, label %if.then
@@ -86,7 +86,7 @@ define i32 @anyNonZero128(<8 x i16> %a) #0 {
; CHECK-NEXT: movz w0, #0
entry:
%0 = bitcast <8 x i16> %a to <16 x i8>
- %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
%1 = trunc i32 %vmaxv.i to i8
%tobool = icmp eq i8 %1, 0
br i1 %tobool, label %return, label %if.then
@@ -109,7 +109,7 @@ define i32 @allZero64(<4 x i16> %a) #0 {
; CHECK-NEXT: b _bar
entry:
%0 = bitcast <4 x i16> %a to <8 x i8>
- %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
%1 = trunc i32 %vmaxv.i to i8
%tobool = icmp eq i8 %1, 0
br i1 %tobool, label %if.then, label %return
@@ -132,7 +132,7 @@ define i32 @allZero128(<8 x i16> %a) #0 {
; CHECK-NEXT: b _bar
entry:
%0 = bitcast <8 x i16> %a to <16 x i8>
- %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
%1 = trunc i32 %vmaxv.i to i8
%tobool = icmp eq i8 %1, 0
br i1 %tobool, label %if.then, label %return
@@ -155,7 +155,7 @@ define i32 @allNonZero64(<4 x i16> %a) #0 {
; CHECK-NEXT: movz w0, #0
entry:
%0 = bitcast <4 x i16> %a to <8 x i8>
- %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
%1 = trunc i32 %vminv.i to i8
%tobool = icmp eq i8 %1, 0
br i1 %tobool, label %return, label %if.then
@@ -178,7 +178,7 @@ define i32 @allNonZero128(<8 x i16> %a) #0 {
; CHECK-NEXT: movz w0, #0
entry:
%0 = bitcast <8 x i16> %a to <16 x i8>
- %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
%1 = trunc i32 %vminv.i to i8
%tobool = icmp eq i8 %1, 0
br i1 %tobool, label %return, label %if.then
@@ -192,13 +192,13 @@ return: ; preds = %entry, %if.then
ret i32 %retval.0
}
-declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) #2
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) #2
-declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) #2
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) #2
-declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) #2
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) #2
-declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) #2
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) #2
attributes #0 = { nounwind ssp "target-cpu"="cyclone" }
attributes #1 = { "target-cpu"="cyclone" }
diff --git a/test/CodeGen/ARM64/vecFold.ll b/test/CodeGen/AArch64/arm64-vecFold.ll
index 6888932..aeacfcc 100644
--- a/test/CodeGen/ARM64/vecFold.ll
+++ b/test/CodeGen/AArch64/arm64-vecFold.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s| FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple -o - %s| FileCheck %s
define <16 x i8> @foov16i8(<8 x i16> %a0, <8 x i16> %b0) nounwind readnone ssp {
; CHECK-LABEL: foov16i8:
@@ -50,8 +50,8 @@ define <4 x i32> @foov4i32(<2 x i64> %a0, <2 x i64> %b0) nounwind readnone ssp {
define <8 x i16> @bar(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
; CHECK-LABEL: bar:
- %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
- %vaddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+ %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vaddhn2.i10 = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
; CHECK: addhn.4h v0, v0, v1
; CHECK-NEXT: addhn2.8h v0, v2, v3
; CHECK-NEXT: ret
@@ -64,7 +64,7 @@ define <8 x i16> @bar(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1
define <8 x i16> @baz(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
; CHECK-LABEL: baz:
- %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
%vshrn_high_shift = ashr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
%vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
; CHECK: addhn.4h v0, v0, v1
@@ -83,8 +83,8 @@ entry:
; CHECK: raddhn.4h v0, v0, v1
; CHECK-NEXT: raddhn2.8h v0, v2, v3
; CHECK-NEXT: ret
- %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
- %vraddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+ %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vraddhn2.i10 = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
%0 = bitcast <4 x i16> %vraddhn2.i to <1 x i64>
%1 = bitcast <4 x i16> %vraddhn2.i10 to <1 x i64>
%shuffle.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -97,8 +97,8 @@ define <8 x i16> @vrshrn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16>
; CHECK: rshrn.8b v0, v0, #5
; CHECK-NEXT: rshrn2.16b v0, v2, #6
; CHECK-NEXT: ret
- %vrshrn_n1 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5)
- %vrshrn_n4 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6)
+ %vrshrn_n1 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5)
+ %vrshrn_n4 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6)
%1 = bitcast <8 x i8> %vrshrn_n1 to <1 x i64>
%2 = bitcast <8 x i8> %vrshrn_n4 to <1 x i64>
%shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -111,8 +111,8 @@ define <8 x i16> @vrsubhn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16>
; CHECK: rsubhn.8b v0, v0, v1
; CHECK: rsubhn2.16b v0, v2, v3
; CHECK-NEXT: ret
- %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind
- %vrsubhn2.i10 = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind
+ %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind
+ %vrsubhn2.i10 = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind
%1 = bitcast <8 x i8> %vrsubhn2.i to <1 x i64>
%2 = bitcast <8 x i8> %vrsubhn2.i10 to <1 x i64>
%shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -122,8 +122,8 @@ define <8 x i16> @vrsubhn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16>
define <8 x i16> @noOpt1(<2 x i32> %a0, <2 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
; CHECK-LABEL: noOpt1:
- %vqsub2.i = tail call <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32> %a0, <2 x i32> %a1) nounwind
- %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+ %vqsub2.i = tail call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a0, <2 x i32> %a1) nounwind
+ %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
; CHECK: sqsub.2s v0, v0, v1
; CHECK-NEXT: addhn2.8h v0, v2, v3
%1 = bitcast <2 x i32> %vqsub2.i to <1 x i64>
@@ -133,13 +133,13 @@ define <8 x i16> @noOpt1(<2 x i32> %a0, <2 x i32> %a1, <4 x i32> %b0, <4 x i32>
ret <8 x i16> %3
}
-declare <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 88889fd..650ff1e 100644
--- a/test/CodeGen/ARM64/vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
;CHECK: @func30
;CHECK: ushll.4s v0, v0, #0
-;CHECK: movi.4s v1, #1
+;CHECK: movi.4s v1, #0x1
;CHECK: and.16b v0, v0, v1
;CHECK: str q0, [x0]
;CHECK: ret
diff --git a/test/CodeGen/ARM64/vector-imm.ll b/test/CodeGen/AArch64/arm64-vector-imm.ll
index f1fc3cc..9fb088b 100644
--- a/test/CodeGen/ARM64/vector-imm.ll
+++ b/test/CodeGen/AArch64/arm64-vector-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @v_orrimm(<8 x i8>* %A) nounwind {
; CHECK-LABEL: v_orrimm:
@@ -42,7 +42,7 @@ define <16 x i8> @v_bicimmQ(<16 x i8>* %A) nounwind {
define <2 x double> @foo(<2 x double> %bar) nounwind {
; CHECK: foo
-; CHECK: fmov.2d v1, #1.000000e+00
+; CHECK: fmov.2d v1, #1.0000000
%add = fadd <2 x double> %bar, <double 1.0, double 1.0>
ret <2 x double> %add
}
@@ -50,35 +50,35 @@ define <2 x double> @foo(<2 x double> %bar) nounwind {
define <4 x i32> @movi_4s_imm_t1() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t1:
-; CHECK: movi.4s v0, #75
+; CHECK: movi.4s v0, #0x4b
ret <4 x i32> <i32 75, i32 75, i32 75, i32 75>
}
define <4 x i32> @movi_4s_imm_t2() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t2:
-; CHECK: movi.4s v0, #75, lsl #8
+; CHECK: movi.4s v0, #0x4b, lsl #8
ret <4 x i32> <i32 19200, i32 19200, i32 19200, i32 19200>
}
define <4 x i32> @movi_4s_imm_t3() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t3:
-; CHECK: movi.4s v0, #75, lsl #16
+; CHECK: movi.4s v0, #0x4b, lsl #16
ret <4 x i32> <i32 4915200, i32 4915200, i32 4915200, i32 4915200>
}
define <4 x i32> @movi_4s_imm_t4() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t4:
-; CHECK: movi.4s v0, #75, lsl #24
+; CHECK: movi.4s v0, #0x4b, lsl #24
ret <4 x i32> <i32 1258291200, i32 1258291200, i32 1258291200, i32 1258291200>
}
define <8 x i16> @movi_8h_imm_t5() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_8h_imm_t5:
-; CHECK: movi.8h v0, #75
+; CHECK: movi.8h v0, #0x4b
ret <8 x i16> <i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75>
}
@@ -86,28 +86,28 @@ entry:
define <8 x i16> @movi_8h_imm_t6() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_8h_imm_t6:
-; CHECK: movi.8h v0, #75, lsl #8
+; CHECK: movi.8h v0, #0x4b, lsl #8
ret <8 x i16> <i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200>
}
define <4 x i32> @movi_4s_imm_t7() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t7:
-; CHECK: movi.4s v0, #75, msl #8
+; CHECK: movi.4s v0, #0x4b, msl #8
ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455>
}
define <4 x i32> @movi_4s_imm_t8() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t8:
-; CHECK: movi.4s v0, #75, msl #16
+; CHECK: movi.4s v0, #0x4b, msl #16
ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735>
}
define <16 x i8> @movi_16b_imm_t9() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_16b_imm_t9:
-; CHECK: movi.16b v0, #75
+; CHECK: movi.16b v0, #0x4b
ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75,
i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75>
}
@@ -122,13 +122,13 @@ ret <2 x i64> <i64 71777214294589695, i64 71777214294589695>
define <4 x i32> @movi_4s_imm_t11() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t11:
-; CHECK: fmov.4s v0, #-3.281250e-01
+; CHECK: fmov.4s v0, #-0.32812500
ret <4 x i32> <i32 3198681088, i32 3198681088, i32 3198681088, i32 3198681088>
}
define <2 x i64> @movi_2d_imm_t12() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_2d_imm_t12:
-; CHECK: fmov.2d v0, #-1.718750e-01
+; CHECK: fmov.2d v0, #-0.17187500
ret <2 x i64> <i64 13818732506632945664, i64 13818732506632945664>
}
diff --git a/test/CodeGen/AArch64/arm64-vector-insertion.ll b/test/CodeGen/AArch64/arm64-vector-insertion.ll
new file mode 100644
index 0000000..8fbff71
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=arm64 -mcpu=generic -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+define void @test0f(float* nocapture %x, float %a) #0 {
+entry:
+ %0 = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %a, i32 0
+ %1 = bitcast float* %x to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
+ ret void
+
+ ; CHECK-LABEL: test0f
+ ; CHECK: movi.2d v[[TEMP:[0-9]+]], #0000000000000000
+ ; CHECK: ins.s v[[TEMP]][0], v{{[0-9]+}}[0]
+ ; CHECK: str q[[TEMP]], [x0]
+ ; CHECK: ret
+
+
+}
+
+
+define void @test1f(float* nocapture %x, float %a) #0 {
+entry:
+ %0 = insertelement <4 x float> <float undef, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, float %a, i32 0
+ %1 = bitcast float* %x to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
+ ret void
+
+ ; CHECK-LABEL: test1f
+ ; CHECK: fmov s[[TEMP:[0-9]+]], #1.0000000
+ ; CHECK: dup.4s v[[TEMP2:[0-9]+]], v[[TEMP]][0]
+ ; CHECK: ins.s v[[TEMP2]][0], v0[0]
+ ; CHECK: str q[[TEMP2]], [x0]
+ ; CHECK: ret
+}
diff --git a/test/CodeGen/ARM64/vector-ldst.ll b/test/CodeGen/AArch64/arm64-vector-ldst.ll
index 154160e..c001915 100644
--- a/test/CodeGen/ARM64/vector-ldst.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ldst.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
; rdar://9428579
diff --git a/test/CodeGen/ARM64/vext.ll b/test/CodeGen/AArch64/arm64-vext.ll
index c820439..2240dfd 100644
--- a/test/CodeGen/ARM64/vext.ll
+++ b/test/CodeGen/AArch64/arm64-vext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
define void @test_vext_s8() nounwind ssp {
; CHECK-LABEL: test_vext_s8:
diff --git a/test/CodeGen/AArch64/arm64-vext_reverse.ll b/test/CodeGen/AArch64/arm64-vext_reverse.ll
new file mode 100644
index 0000000..c45e55e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vext_reverse.ll
@@ -0,0 +1,172 @@
+; RUN: llc -mtriple=arm64-linux-gnuabi < %s | FileCheck %s
+
+; The following tests check the correctness of reversing the input operands
+; of vext by enumerating all cases of using two undefs in the shuffle masks.
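+; For example, in vext_6701_0 below the mask <i32 6, i32 7, i32 0, i32 1> selects the
+; high half of %a2 followed by the low half of %a1, so the expected lowering is
+; ext v0.8b, v1.8b, v0.8b, #4 (%a2 is passed in v1 and %a1 in v0).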
+
+define <4 x i16> @vext_6701_0(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_0:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_12(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_12:
+; CHECK: ext v0.8b, v0.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_13(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_13:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 7, i32 undef, i32 1>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_14(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_14:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 7, i32 0, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_23(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_23:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 undef, i32 undef, i32 1>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_24(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_24:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 undef, i32 0, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_34(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_34:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 7, i32 undef, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_0(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_0:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_12(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_12:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 undef, i32 7, i32 0>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_13(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_13:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 6, i32 undef, i32 0>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_14(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_14:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 6, i32 7, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_23(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_23:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 5, i32 undef, i32 undef, i32 0>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_24(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_24:
+; CHECK: rev32 v0.4h, v1.4h
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 5, i32 undef, i32 7, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_34(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_34:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 5, i32 6, i32 undef, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_0(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_0:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_12(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_12:
+; CHECK: ext v0.8b, v0.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 undef, i32 1, i32 2>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_13(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_13:
+; CHECK: rev32 v0.4h, v0.4h
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 0, i32 undef, i32 2>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_14(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_14:
+; CHECK: ext v0.8b, v0.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_23(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_23:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 7, i32 undef, i32 undef, i32 2>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_24(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_24:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 7, i32 undef, i32 1, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_34(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_34:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 7, i32 0, i32 undef, i32 undef>
+ ret <4 x i16> %x
+}
diff --git a/test/CodeGen/ARM64/vfloatintrinsics.ll b/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll
index a8c882b..255a182 100644
--- a/test/CodeGen/ARM64/vfloatintrinsics.ll
+++ b/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
;;; Float vectors
diff --git a/test/CodeGen/ARM64/vhadd.ll b/test/CodeGen/AArch64/arm64-vhadd.ll
index aed7681..6178bf9 100644
--- a/test/CodeGen/ARM64/vhadd.ll
+++ b/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: shadd8b:
;CHECK: shadd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: shadd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -23,7 +23,7 @@ define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: shadd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -32,7 +32,7 @@ define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: shadd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -41,7 +41,7 @@ define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: shadd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -50,7 +50,7 @@ define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: shadd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -59,7 +59,7 @@ define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: uhadd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -68,7 +68,7 @@ define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: uhadd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -77,7 +77,7 @@ define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: uhadd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -86,7 +86,7 @@ define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: uhadd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -95,7 +95,7 @@ define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: uhadd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -104,32 +104,32 @@ define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: uhadd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: srhadd8b:
;CHECK: srhadd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -138,7 +138,7 @@ define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: srhadd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -147,7 +147,7 @@ define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: srhadd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -156,7 +156,7 @@ define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: srhadd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -165,7 +165,7 @@ define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: srhadd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -174,7 +174,7 @@ define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: srhadd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -183,7 +183,7 @@ define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: urhadd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -192,7 +192,7 @@ define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: urhadd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -201,7 +201,7 @@ define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: urhadd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -210,7 +210,7 @@ define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: urhadd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -219,7 +219,7 @@ define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: urhadd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -228,22 +228,22 @@ define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: urhadd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vhsub.ll b/test/CodeGen/AArch64/arm64-vhsub.ll
index 85df4d4..13bfda3 100644
--- a/test/CodeGen/ARM64/vhsub.ll
+++ b/test/CodeGen/AArch64/arm64-vhsub.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @shsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: shsub8b:
;CHECK: shsub.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.shsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <16 x i8> @shsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: shsub.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.shsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -23,7 +23,7 @@ define <4 x i16> @shsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: shsub.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.shsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -32,7 +32,7 @@ define <8 x i16> @shsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: shsub.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.shsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -41,7 +41,7 @@ define <2 x i32> @shsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: shsub.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.shsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -50,7 +50,7 @@ define <4 x i32> @shsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: shsub.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.shsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -59,7 +59,7 @@ define <8 x i8> @uhsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: uhsub.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uhsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -68,7 +68,7 @@ define <16 x i8> @uhsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: uhsub.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uhsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -77,7 +77,7 @@ define <4 x i16> @uhsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: uhsub.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uhsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -86,7 +86,7 @@ define <8 x i16> @uhsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: uhsub.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uhsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -95,7 +95,7 @@ define <2 x i32> @uhsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: uhsub.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uhsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -104,22 +104,22 @@ define <4 x i32> @uhsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: uhsub.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uhsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.shsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.shsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.shsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.uhsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uhsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uhsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.shsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.shsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.shsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uhsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uhsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uhsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/virtual_base.ll b/test/CodeGen/AArch64/arm64-virtual_base.ll
index cb95954..cb95954 100644
--- a/test/CodeGen/ARM64/virtual_base.ll
+++ b/test/CodeGen/AArch64/arm64-virtual_base.ll
diff --git a/test/CodeGen/ARM64/vmax.ll b/test/CodeGen/AArch64/arm64-vmax.ll
index b2426f3..3f2c134 100644
--- a/test/CodeGen/ARM64/vmax.ll
+++ b/test/CodeGen/AArch64/arm64-vmax.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @smax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: smax_8b:
;CHECK: smax.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.smax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <16 x i8> @smax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: smax.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.smax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -23,7 +23,7 @@ define <4 x i16> @smax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: smax.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.smax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -32,7 +32,7 @@ define <8 x i16> @smax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: smax.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.smax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -41,7 +41,7 @@ define <2 x i32> @smax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: smax.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.smax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -50,23 +50,23 @@ define <4 x i32> @smax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: smax.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.smax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.smax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.smax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.smax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @umax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: umax_8b:
;CHECK: umax.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.umax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -75,7 +75,7 @@ define <16 x i8> @umax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: umax.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.umax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -84,7 +84,7 @@ define <4 x i16> @umax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: umax.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.umax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -93,7 +93,7 @@ define <8 x i16> @umax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: umax.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.umax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -102,7 +102,7 @@ define <2 x i32> @umax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: umax.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.umax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -111,23 +111,23 @@ define <4 x i32> @umax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: umax.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.umax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.umax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.umax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.umax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @smin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: smin_8b:
;CHECK: smin.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.smin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -136,7 +136,7 @@ define <16 x i8> @smin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: smin.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.smin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -145,7 +145,7 @@ define <4 x i16> @smin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: smin.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.smin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -154,7 +154,7 @@ define <8 x i16> @smin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: smin.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.smin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -163,7 +163,7 @@ define <2 x i32> @smin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: smin.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.smin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -172,23 +172,23 @@ define <4 x i32> @smin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: smin.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.smin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.smin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.smin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.smin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @umin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: umin_8b:
;CHECK: umin.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.umin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -197,7 +197,7 @@ define <16 x i8> @umin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: umin.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.umin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -206,7 +206,7 @@ define <4 x i16> @umin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: umin.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.umin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -215,7 +215,7 @@ define <8 x i16> @umin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: umin.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.umin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -224,7 +224,7 @@ define <2 x i32> @umin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: umin.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.umin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -233,25 +233,25 @@ define <4 x i32> @umin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: umin.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.umin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.umin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.umin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.umin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @smaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: smaxp_8b:
;CHECK: smaxp.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.smaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -260,7 +260,7 @@ define <16 x i8> @smaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: smaxp.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.smaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -269,7 +269,7 @@ define <4 x i16> @smaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: smaxp.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.smaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -278,7 +278,7 @@ define <8 x i16> @smaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: smaxp.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.smaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -287,7 +287,7 @@ define <2 x i32> @smaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: smaxp.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.smaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -296,23 +296,23 @@ define <4 x i32> @smaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: smaxp.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.smaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.smaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.smaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.smaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.smaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.smaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.smaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @umaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: umaxp_8b:
;CHECK: umaxp.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.umaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -321,7 +321,7 @@ define <16 x i8> @umaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: umaxp.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.umaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -330,7 +330,7 @@ define <4 x i16> @umaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: umaxp.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.umaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -339,7 +339,7 @@ define <8 x i16> @umaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: umaxp.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.umaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -348,7 +348,7 @@ define <2 x i32> @umaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: umaxp.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.umaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -357,25 +357,25 @@ define <4 x i32> @umaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: umaxp.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.umaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.umaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.umaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.umaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.umaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.umaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @sminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sminp_8b:
;CHECK: sminp.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -384,7 +384,7 @@ define <16 x i8> @sminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: sminp.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -393,7 +393,7 @@ define <4 x i16> @sminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: sminp.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -402,7 +402,7 @@ define <8 x i16> @sminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: sminp.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -411,7 +411,7 @@ define <2 x i32> @sminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sminp.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -420,23 +420,23 @@ define <4 x i32> @sminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: sminp.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.sminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @uminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uminp_8b:
;CHECK: uminp.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -445,7 +445,7 @@ define <16 x i8> @uminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: uminp.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -454,7 +454,7 @@ define <4 x i16> @uminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: uminp.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -463,7 +463,7 @@ define <8 x i16> @uminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: uminp.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -472,7 +472,7 @@ define <2 x i32> @uminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: uminp.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -481,23 +481,23 @@ define <4 x i32> @uminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: uminp.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.uminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x float> @fmax_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmax_2s:
;CHECK: fmax.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.fmax.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -506,7 +506,7 @@ define <4 x float> @fmax_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: fmax.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.fmax.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -515,20 +515,20 @@ define <2 x double> @fmax_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: fmax.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.fmax.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmax.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x float> @fmaxp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmaxp_2s:
;CHECK: fmaxp.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.fmaxp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -537,7 +537,7 @@ define <4 x float> @fmaxp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: fmaxp.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.fmaxp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -546,20 +546,20 @@ define <2 x double> @fmaxp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: fmaxp.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.fmaxp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.fmaxp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmaxp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmaxp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x float> @fmin_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmin_2s:
;CHECK: fmin.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.fmin.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -568,7 +568,7 @@ define <4 x float> @fmin_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: fmin.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.fmin.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -577,20 +577,20 @@ define <2 x double> @fmin_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: fmin.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.fmin.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmin.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x float> @fminp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fminp_2s:
;CHECK: fminp.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.fminp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -599,7 +599,7 @@ define <4 x float> @fminp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: fminp.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.fminp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -608,20 +608,20 @@ define <2 x double> @fminp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: fminp.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.fminp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.fminp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fminp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fminp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x float> @fminnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fminnmp_2s:
;CHECK: fminnmp.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.fminnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -630,7 +630,7 @@ define <4 x float> @fminnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: fminnmp.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.fminnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -639,20 +639,20 @@ define <2 x double> @fminnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: fminnmp.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.fminnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.fminnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fminnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fminnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x float> @fmaxnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmaxnmp_2s:
;CHECK: fmaxnmp.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.fmaxnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -661,7 +661,7 @@ define <4 x float> @fmaxnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: fmaxnmp.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.fmaxnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -670,10 +670,10 @@ define <2 x double> @fmaxnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: fmaxnmp.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.fmaxnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.fmaxnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vminmaxnm.ll b/test/CodeGen/AArch64/arm64-vminmaxnm.ll
new file mode 100644
index 0000000..b5aca45
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vminmaxnm.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.2s v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
+ ret <2 x float> %vmaxnm2.i
+}
+
+define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.4s v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
+ ret <4 x float> %vmaxnm2.i
+}
+
+define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.2d v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
+ ret <2 x double> %vmaxnm2.i
+}
+
+define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
+; CHECK: fminnm.2s v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
+ ret <2 x float> %vminnm2.i
+}
+
+define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
+; CHECK: fminnm.4s v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
+ ret <4 x float> %vminnm2.i
+}
+
+define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
+; CHECK: fminnm.2d v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
+ ret <2 x double> %vminnm2.i
+}
+
+declare <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+
+define double @test_fmaxnmv(<2 x double> %in) {
+; CHECK-LABEL: test_fmaxnmv:
+; CHECK: fmaxnmp.2d d0, v0
+ %max = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+ ret double %max
+}
+
+define double @test_fminnmv(<2 x double> %in) {
+; CHECK-LABEL: test_fminnmv:
+; CHECK: fminnmp.2d d0, v0
+ %min = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+ ret double %min
+}
+
+declare double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double>)
+declare double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/ARM64/vmovn.ll b/test/CodeGen/AArch64/arm64-vmovn.ll
index 675633b..67e2816 100644
--- a/test/CodeGen/ARM64/vmovn.ll
+++ b/test/CodeGen/AArch64/arm64-vmovn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @xtn8b(<8 x i16> %A) nounwind {
;CHECK-LABEL: xtn8b:
@@ -62,7 +62,7 @@ define <8 x i8> @sqxtn8b(<8 x i16> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtn.8b v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16> %A)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A)
ret <8 x i8> %tmp3
}
@@ -71,7 +71,7 @@ define <4 x i16> @sqxtn4h(<4 x i32> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtn.4h v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32> %A)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %A)
ret <4 x i16> %tmp3
}
@@ -80,7 +80,7 @@ define <2 x i32> @sqxtn2s(<2 x i64> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtn.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A)
ret <2 x i32> %tmp3
}
@@ -89,7 +89,7 @@ define <16 x i8> @sqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtn2.16b v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16> %A)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A)
%res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %res
}
@@ -99,7 +99,7 @@ define <8 x i16> @sqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtn2.8h v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32> %A)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %A)
%res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
@@ -109,21 +109,21 @@ define <4 x i32> @sqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtn2.4s v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A)
%res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %res
}
-declare <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64>) nounwind readnone
define <8 x i8> @uqxtn8b(<8 x i16> %A) nounwind {
;CHECK-LABEL: uqxtn8b:
;CHECK-NOT: ld1
;CHECK: uqxtn.8b v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16> %A)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A)
ret <8 x i8> %tmp3
}
@@ -132,7 +132,7 @@ define <4 x i16> @uqxtn4h(<4 x i32> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: uqxtn.4h v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32> %A)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A)
ret <4 x i16> %tmp3
}
@@ -141,7 +141,7 @@ define <2 x i32> @uqxtn2s(<2 x i64> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: uqxtn.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A)
ret <2 x i32> %tmp3
}
@@ -150,7 +150,7 @@ define <16 x i8> @uqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: uqxtn2.16b v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16> %A)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A)
%res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %res
}
@@ -160,7 +160,7 @@ define <8 x i16> @uqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: uqxtn2.8h v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32> %A)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A)
%res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
@@ -170,21 +170,21 @@ define <4 x i32> @uqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: uqxtn2.4s v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A)
%res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %res
}
-declare <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64>) nounwind readnone
define <8 x i8> @sqxtun8b(<8 x i16> %A) nounwind {
;CHECK-LABEL: sqxtun8b:
;CHECK-NOT: ld1
;CHECK: sqxtun.8b v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16> %A)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A)
ret <8 x i8> %tmp3
}
@@ -193,7 +193,7 @@ define <4 x i16> @sqxtun4h(<4 x i32> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtun.4h v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32> %A)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A)
ret <4 x i16> %tmp3
}
@@ -202,7 +202,7 @@ define <2 x i32> @sqxtun2s(<2 x i64> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtun.2s v0, v0
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A)
ret <2 x i32> %tmp3
}
@@ -211,7 +211,7 @@ define <16 x i8> @sqxtun2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtun2.16b v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16> %A)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A)
%res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %res
}
@@ -221,7 +221,7 @@ define <8 x i16> @sqxtun2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtun2.8h v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32> %A)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A)
%res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
@@ -231,12 +231,12 @@ define <4 x i32> @sqxtun2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
;CHECK-NOT: ld1
;CHECK: sqxtun2.4s v0, v1
;CHECK-NEXT: ret
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64> %A)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A)
%res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %res
}
-declare <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll
index 3ef0a76..6fa60fe 100644
--- a/test/CodeGen/ARM64/vmul.ll
+++ b/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -6,7 +6,7 @@ define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: smull.8h
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
@@ -15,7 +15,7 @@ define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: smull.4s
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
@@ -24,20 +24,20 @@ define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: smull.2d
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: umull8h:
;CHECK: umull.8h
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
@@ -46,7 +46,7 @@ define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: umull.4s
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
@@ -55,20 +55,20 @@ define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: umull.2d
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull4s:
;CHECK: sqdmull.4s
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
@@ -77,7 +77,7 @@ define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sqdmull.2d
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
@@ -88,7 +88,7 @@ define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%load2 = load <8 x i16>* %B
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
@@ -99,31 +99,31 @@ define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%load2 = load <4 x i32>* %B
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %tmp3 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
-declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: pmull8h:
;CHECK: pmull.8h
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
-declare <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_4h:
;CHECK: sqdmulh.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -132,7 +132,7 @@ define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: sqdmulh.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -141,7 +141,7 @@ define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sqdmulh.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -150,7 +150,7 @@ define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: sqdmulh.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -159,22 +159,22 @@ define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
%tmp1 = load i32* %A
%tmp2 = load i32* %B
- %tmp3 = call i32 @llvm.arm64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
+ %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
ret i32 %tmp3
}
-declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare i32 @llvm.arm64.neon.sqdmulh.i32(i32, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_4h:
;CHECK: sqrdmulh.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -183,7 +183,7 @@ define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: sqrdmulh.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -192,7 +192,7 @@ define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sqrdmulh.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -201,7 +201,7 @@ define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: sqrdmulh.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -210,22 +210,22 @@ define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
%tmp1 = load i32* %A
%tmp2 = load i32* %B
- %tmp3 = call i32 @llvm.arm64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
+ %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
ret i32 %tmp3
}
-declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare i32 @llvm.arm64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_2s:
;CHECK: fmulx.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -234,7 +234,7 @@ define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: fmulx.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -243,13 +243,13 @@ define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: fmulx.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlal4s:
@@ -257,7 +257,7 @@ define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
@@ -268,7 +268,7 @@ define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
- %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp5 = add <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}
@@ -279,7 +279,7 @@ define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp5 = sub <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
@@ -290,15 +290,15 @@ define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
- %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp5 = sub <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}
-declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal4s:
@@ -306,8 +306,8 @@ define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwin
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- %tmp5 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
ret <4 x i32> %tmp5
}
@@ -317,8 +317,8 @@ define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
- %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- %tmp5 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
ret <2 x i64> %tmp5
}
@@ -330,8 +330,8 @@ define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw
%tmp3 = load <4 x i32>* %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- %tmp5 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
ret <4 x i32> %tmp5
}
@@ -343,8 +343,8 @@ define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounw
%tmp3 = load <2 x i64>* %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- %tmp5 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
ret <2 x i64> %tmp5
}
@@ -354,8 +354,8 @@ define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwin
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- %tmp5 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
ret <4 x i32> %tmp5
}
@@ -365,8 +365,8 @@ define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
- %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- %tmp5 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
ret <2 x i64> %tmp5
}
@@ -378,8 +378,8 @@ define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw
%tmp3 = load <4 x i32>* %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- %tmp5 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
ret <4 x i32> %tmp5
}
@@ -391,8 +391,8 @@ define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounw
%tmp3 = load <2 x i64>* %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- %tmp5 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
ret <2 x i64> %tmp5
}
@@ -402,7 +402,7 @@ define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
@@ -413,7 +413,7 @@ define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
- %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp5 = add <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}
@@ -424,7 +424,7 @@ define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
- %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp5 = sub <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
}
@@ -435,7 +435,7 @@ define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
- %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp5 = sub <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
}
@@ -717,7 +717,7 @@ define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp4 = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
+ %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
ret <2 x float> %tmp4
}
@@ -728,7 +728,7 @@ define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
+ %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
ret <4 x float> %tmp4
}
@@ -739,7 +739,7 @@ define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
%tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp4 = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
+ %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
ret <2 x double> %tmp4
}
@@ -750,7 +750,7 @@ define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
@@ -761,7 +761,7 @@ define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
+ %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
@@ -772,7 +772,7 @@ define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp4 = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
@@ -783,7 +783,7 @@ define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
@@ -792,7 +792,7 @@ define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
;CHECK-NOT: dup
;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
%tmp1 = extractelement <4 x i32> %B, i32 1
- %tmp2 = call i32 @llvm.arm64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
+ %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
ret i32 %tmp2
}
@@ -803,7 +803,7 @@ define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
@@ -814,7 +814,7 @@ define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
+ %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
@@ -825,7 +825,7 @@ define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp4 = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
@@ -836,7 +836,7 @@ define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
@@ -845,7 +845,7 @@ define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
;CHECK-NOT: dup
;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
%tmp1 = extractelement <4 x i32> %B, i32 1
- %tmp2 = call i32 @llvm.arm64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
+ %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
ret i32 %tmp2
}
@@ -856,7 +856,7 @@ define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
@@ -867,7 +867,7 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
@@ -879,7 +879,7 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%load2 = load <8 x i16>* %B
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp4
}
@@ -891,7 +891,7 @@ define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%load2 = load <4 x i32>* %B
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
- %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp4
}
@@ -902,7 +902,7 @@ define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
@@ -913,7 +913,7 @@ define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
@@ -924,7 +924,7 @@ define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
@@ -935,7 +935,7 @@ define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
@@ -947,7 +947,7 @@ define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp5 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = add <4 x i32> %tmp3, %tmp5
ret <4 x i32> %tmp6
}
@@ -960,7 +960,7 @@ define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp5 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = add <2 x i64> %tmp3, %tmp5
ret <2 x i64> %tmp6
}
@@ -973,8 +973,8 @@ define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) n
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
- %tmp6 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
@@ -986,8 +986,8 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
- %tmp6 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
@@ -1000,8 +1000,8 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C)
%tmp3 = load <4 x i32>* %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- %tmp6 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
@@ -1014,8 +1014,8 @@ define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C)
%tmp3 = load <2 x i64>* %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
- %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- %tmp6 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
@@ -1024,45 +1024,45 @@ define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
;CHECK: sqdmlal.4s
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
%rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %prod.vec = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
%prod = extractelement <4 x i32> %prod.vec, i32 0
- %res = call i32 @llvm.arm64.neon.sqadd.i32(i32 %A, i32 %prod)
+ %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
ret i32 %res
}
-declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32)
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_1s:
;CHECK: sqdmlsl.4s
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
%rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %prod.vec = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
%prod = extractelement <4 x i32> %prod.vec, i32 0
- %res = call i32 @llvm.arm64.neon.sqsub.i32(i32 %A, i32 %prod)
+ %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
ret i32 %res
}
-declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32)
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_1d:
;CHECK: sqdmlal.s
%rhs = extractelement <2 x i32> %C, i32 1
- %prod = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
- %res = call i64 @llvm.arm64.neon.sqadd.i64(i64 %A, i64 %prod)
+ %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+ %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
ret i64 %res
}
-declare i64 @llvm.arm64.neon.sqdmulls.scalar(i32, i32)
-declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64)
+declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
+declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_1d:
;CHECK: sqdmlsl.s
%rhs = extractelement <2 x i32> %C, i32 1
- %prod = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
- %res = call i64 @llvm.arm64.neon.sqsub.i64(i64 %A, i64 %prod)
+ %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+ %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
ret i64 %res
}
-declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64)
+declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
@@ -1073,7 +1073,7 @@ define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp5 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = add <4 x i32> %tmp3, %tmp5
ret <4 x i32> %tmp6
}
@@ -1086,7 +1086,7 @@ define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp5 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = add <2 x i64> %tmp3, %tmp5
ret <2 x i64> %tmp6
}
@@ -1100,7 +1100,7 @@ define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp5 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = sub <4 x i32> %tmp3, %tmp5
ret <4 x i32> %tmp6
}
@@ -1113,7 +1113,7 @@ define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp5 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = sub <2 x i64> %tmp3, %tmp5
ret <2 x i64> %tmp6
}
@@ -1126,8 +1126,8 @@ define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) n
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
- %tmp6 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
@@ -1139,8 +1139,8 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
- %tmp6 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
@@ -1153,8 +1153,8 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C)
%tmp3 = load <4 x i32>* %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
- %tmp6 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
@@ -1167,8 +1167,8 @@ define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C)
%tmp3 = load <2 x i64>* %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
- %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
- %tmp6 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
@@ -1180,7 +1180,7 @@ define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %tmp5 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = sub <4 x i32> %tmp3, %tmp5
ret <4 x i32> %tmp6
}
@@ -1193,7 +1193,7 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
- %tmp5 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = sub <2 x i64> %tmp3, %tmp5
ret <2 x i64> %tmp6
}
@@ -1202,7 +1202,7 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou
define float @fmulxs(float %a, float %b) nounwind {
; CHECK-LABEL: fmulxs:
; CHECK: fmulx s0, s0, s1
- %fmulx.i = tail call float @llvm.arm64.neon.fmulx.f32(float %a, float %b) nounwind
+ %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; CHECK-NEXT: ret
ret float %fmulx.i
}
@@ -1210,7 +1210,7 @@ define float @fmulxs(float %a, float %b) nounwind {
define double @fmulxd(double %a, double %b) nounwind {
; CHECK-LABEL: fmulxd:
; CHECK: fmulx d0, d0, d1
- %fmulx.i = tail call double @llvm.arm64.neon.fmulx.f64(double %a, double %b) nounwind
+ %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
; CHECK-NEXT: ret
ret double %fmulx.i
}
@@ -1219,7 +1219,7 @@ define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
; CHECK-LABEL: fmulxs_lane:
; CHECK: fmulx.s s0, s0, v1[3]
%b = extractelement <4 x float> %vec, i32 3
- %fmulx.i = tail call float @llvm.arm64.neon.fmulx.f32(float %a, float %b) nounwind
+ %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; CHECK-NEXT: ret
ret float %fmulx.i
}
@@ -1228,13 +1228,13 @@ define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
; CHECK-LABEL: fmulxd_lane:
; CHECK: fmulx d0, d0, v1[1]
%b = extractelement <2 x double> %vec, i32 1
- %fmulx.i = tail call double @llvm.arm64.neon.fmulx.f64(double %a, double %b) nounwind
+ %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
; CHECK-NEXT: ret
ret double %fmulx.i
}
-declare double @llvm.arm64.neon.fmulx.f64(double, double) nounwind readnone
-declare float @llvm.arm64.neon.fmulx.f32(float, float) nounwind readnone
+declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
@@ -1243,7 +1243,7 @@ define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-NEXT: ret
%1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %3 = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
+ %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
ret <8 x i16> %3
}
@@ -1256,7 +1256,7 @@ define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
%tmp2 = bitcast <16 x i8> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
- %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
ret <8 x i16> %vmull.i.i
}
@@ -1269,7 +1269,7 @@ define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
%tmp2 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
- %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
ret <4 x i32> %vmull2.i.i
}
@@ -1282,7 +1282,7 @@ define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
%tmp2 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
- %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
ret <2 x i64> %vmull2.i.i
}
@@ -1295,7 +1295,7 @@ define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
%tmp2 = bitcast <16 x i8> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
- %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
ret <8 x i16> %vmull.i.i
}
@@ -1308,7 +1308,7 @@ define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
%tmp2 = bitcast <8 x i16> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
- %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
ret <4 x i32> %vmull2.i.i
}
@@ -1321,7 +1321,7 @@ define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
%tmp2 = bitcast <4 x i32> %b to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
- %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
ret <2 x i64> %vmull2.i.i
}
@@ -1334,7 +1334,7 @@ entry:
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
%1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
ret <4 x i32> %vmull2.i
}
@@ -1347,7 +1347,7 @@ entry:
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
%1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
ret <2 x i64> %vmull2.i
}
@@ -1360,7 +1360,7 @@ entry:
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
%1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
%shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
ret <4 x i32> %vmull2.i
}
@@ -1373,7 +1373,7 @@ entry:
%shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
%1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
%shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
ret <2 x i64> %vmull2.i
}
@@ -1388,7 +1388,7 @@ define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
%tmp2 = bitcast <16 x i8> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
%add.i = add <8 x i16> %vmull.i.i.i, %a
ret <8 x i16> %add.i
}
@@ -1404,7 +1404,7 @@ define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
%tmp2 = bitcast <8 x i16> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i
}
@@ -1420,7 +1420,7 @@ define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
%tmp2 = bitcast <4 x i32> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
%add.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i
}
@@ -1436,7 +1436,7 @@ define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
%tmp2 = bitcast <16 x i8> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
%add.i = add <8 x i16> %vmull.i.i.i, %a
ret <8 x i16> %add.i
}
@@ -1452,7 +1452,7 @@ define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
%tmp2 = bitcast <8 x i16> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i
}
@@ -1468,7 +1468,7 @@ define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
%tmp2 = bitcast <4 x i32> %c to <2 x i64>
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
%add.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i
}
@@ -1484,7 +1484,7 @@ define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
%tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
- %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add = add <4 x i32> %vmull2.i.i, %a
ret <4 x i32> %add
}
@@ -1500,7 +1500,7 @@ define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
%tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
- %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
%add = add <2 x i64> %vmull2.i.i, %a
ret <2 x i64> %add
}
@@ -1517,7 +1517,7 @@ define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
%tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
- %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add = add <4 x i32> %vmull2.i.i, %a
ret <4 x i32> %add
}
@@ -1533,7 +1533,7 @@ define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
%tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
%shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
%tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
- %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
%add = add <2 x i64> %vmull2.i.i, %a
ret <2 x i64> %add
}
@@ -1598,6 +1598,32 @@ entry:
ret <2 x i32> %add
}
+define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
+entry:
+; CHECK: not_really_vmlaq_laneq_s16_test
+; CHECK-NOT: ext
+; CHECK: mla.8h v0, v1, v2[5]
+; CHECK-NEXT: ret
+ %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <8 x i16> %shuffle2, %b
+ %add = add <8 x i16> %mul, %a
+ ret <8 x i16> %add
+}
+
+define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
+entry:
+; CHECK: not_really_vmlaq_laneq_s32_test
+; CHECK-NOT: ext
+; CHECK: mla.4s v0, v1, v2[3]
+; CHECK-NEXT: ret
+ %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <4 x i32> %shuffle2, %b
+ %add = add <4 x i32> %mul, %a
+ ret <4 x i32> %add
+}
+
define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
entry:
; CHECK: vmull_laneq_s16_test
@@ -1605,7 +1631,7 @@ entry:
; CHECK: smull.4s v0, v0, v1[6]
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
- %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
ret <4 x i32> %vmull2.i
}
@@ -1616,7 +1642,7 @@ entry:
; CHECK: smull.2d v0, v0, v1[2]
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
- %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
ret <2 x i64> %vmull2.i
}
define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
@@ -1626,7 +1652,7 @@ entry:
; CHECK: umull.4s v0, v0, v1[6]
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
- %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
ret <4 x i32> %vmull2.i
}
@@ -1637,7 +1663,7 @@ entry:
; CHECK: umull.2d v0, v0, v1[2]
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
- %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
ret <2 x i64> %vmull2.i
}
@@ -1655,7 +1681,7 @@ entry:
%vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
%vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
%vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
- %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
ret <4 x i32> %vmull2.i.i
}
@@ -1670,7 +1696,7 @@ entry:
%1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
%vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
%vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
- %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
ret <2 x i64> %vmull2.i.i
}
@@ -1688,7 +1714,7 @@ entry:
%vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
%vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
%vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
- %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
ret <4 x i32> %vmull2.i.i
}
@@ -1703,7 +1729,7 @@ entry:
%1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
%vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
%vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
- %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
ret <2 x i64> %vmull2.i.i
}
@@ -1761,7 +1787,7 @@ define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
ret <2 x i64> %res
}
@@ -1773,8 +1799,8 @@ define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
- %sum = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
ret <2 x i64> %sum
}
@@ -1787,7 +1813,7 @@ define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
ret <2 x i64> %res
}
@@ -1800,7 +1826,7 @@ define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
%lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %res = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
ret <8 x i16> %res
}
@@ -1812,7 +1838,7 @@ define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
%lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %res = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
ret <8 x i16> %res
}
@@ -1824,7 +1850,7 @@ define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
- %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
ret <2 x i64> %res
}
@@ -1836,8 +1862,8 @@ define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs,
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
- %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
- %sum = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
ret <2 x i64> %sum
}
@@ -1849,7 +1875,7 @@ define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
- %res = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
%sum = add <2 x i64> %accum, %res
ret <2 x i64> %sum
}
@@ -1971,23 +1997,23 @@ define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind
define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
;CHECK-LABEL: sqdmlal_d:
;CHECK: sqdmlal
- %tmp4 = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %A, i32 %B)
- %tmp5 = call i64 @llvm.arm64.neon.sqadd.i64(i64 %C, i64 %tmp4)
+ %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+ %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
ret i64 %tmp5
}
define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
;CHECK-LABEL: sqdmlsl_d:
;CHECK: sqdmlsl
- %tmp4 = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %A, i32 %B)
- %tmp5 = call i64 @llvm.arm64.neon.sqsub.i64(i64 %C, i64 %tmp4)
+ %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+ %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
ret i64 %tmp5
}
define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: test_pmull_64:
; CHECK: pmull.1q
- %val = call <16 x i8> @llvm.arm64.neon.pmull64(i64 %l, i64 %r)
+ %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
ret <16 x i8> %val
}
@@ -1996,8 +2022,15 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
; CHECK: pmull2.1q
%l_hi = extractelement <2 x i64> %l, i32 1
%r_hi = extractelement <2 x i64> %r, i32 1
- %val = call <16 x i8> @llvm.arm64.neon.pmull64(i64 %l_hi, i64 %r_hi)
+ %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
ret <16 x i8> %val
}
-declare <16 x i8> @llvm.arm64.neon.pmull64(i64, i64)
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
+
+define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
+; CHECK-LABEL: test_mul_v1i64:
+; CHECK: mul
+ %prod = mul <1 x i64> %lhs, %rhs
+ ret <1 x i64> %prod
+}
diff --git a/test/CodeGen/ARM64/volatile.ll b/test/CodeGen/AArch64/arm64-volatile.ll
index e00ac5a..e00ac5a 100644
--- a/test/CodeGen/ARM64/volatile.ll
+++ b/test/CodeGen/AArch64/arm64-volatile.ll
diff --git a/test/CodeGen/AArch64/arm64-vpopcnt.ll b/test/CodeGen/AArch64/arm64-vpopcnt.ll
new file mode 100644
index 0000000..25306eb
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vpopcnt.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+target triple = "arm64-apple-ios"
+
+; The non-byte ones used to fail with "Cannot select"
+
+; CHECK-LABEL: ctpopv8i8
+; CHECK: cnt.8b
+define <8 x i8> @ctpopv8i8(<8 x i8> %x) nounwind readnone {
+ %cnt = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %x)
+ ret <8 x i8> %cnt
+}
+
+declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
+
+; CHECK-LABEL: ctpopv4i16
+; CHECK: cnt.8b
+define <4 x i16> @ctpopv4i16(<4 x i16> %x) nounwind readnone {
+ %cnt = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %x)
+ ret <4 x i16> %cnt
+}
+
+declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
+
+; CHECK-LABEL: ctpopv2i32
+; CHECK: cnt.8b
+define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone {
+ %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
+ ret <2 x i32> %cnt
+}
+
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
+
+
+; CHECK-LABEL: ctpopv16i8
+; CHECK: cnt.16b
+define <16 x i8> @ctpopv16i8(<16 x i8> %x) nounwind readnone {
+ %cnt = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %x)
+ ret <16 x i8> %cnt
+}
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
+
+; CHECK-LABEL: ctpopv8i16
+; CHECK: cnt.8b
+define <8 x i16> @ctpopv8i16(<8 x i16> %x) nounwind readnone {
+ %cnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x)
+ ret <8 x i16> %cnt
+}
+
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
+
+; CHECK-LABEL: ctpopv4i32
+; CHECK: cnt.8b
+define <4 x i32> @ctpopv4i32(<4 x i32> %x) nounwind readnone {
+ %cnt = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
+ ret <4 x i32> %cnt
+}
+
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
+
+; CHECK-LABEL: ctpopv2i64
+; CHECK: cnt.8b
+define <2 x i64> @ctpopv2i64(<2 x i64> %x) nounwind readnone {
+ %cnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
+ ret <2 x i64> %cnt
+}
+
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vqadd.ll b/test/CodeGen/AArch64/arm64-vqadd.ll
index 0b7f7e5..20f7e2c 100644
--- a/test/CodeGen/ARM64/vqadd.ll
+++ b/test/CodeGen/AArch64/arm64-vqadd.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @sqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sqadd8b:
;CHECK: sqadd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <4 x i16> @sqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: sqadd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -23,7 +23,7 @@ define <2 x i32> @sqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sqadd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -32,7 +32,7 @@ define <8 x i8> @uqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: uqadd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -41,7 +41,7 @@ define <4 x i16> @uqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: uqadd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -50,7 +50,7 @@ define <2 x i32> @uqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: uqadd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -59,7 +59,7 @@ define <16 x i8> @sqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: sqadd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -68,7 +68,7 @@ define <8 x i16> @sqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: sqadd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -77,7 +77,7 @@ define <4 x i32> @sqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: sqadd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -86,7 +86,7 @@ define <2 x i64> @sqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: sqadd.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
@@ -95,7 +95,7 @@ define <16 x i8> @uqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: uqadd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -104,7 +104,7 @@ define <8 x i16> @uqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: uqadd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -113,7 +113,7 @@ define <4 x i32> @uqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: uqadd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -122,36 +122,36 @@ define <2 x i64> @uqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: uqadd.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.uqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.uqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i8> @usqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: usqadd8b:
;CHECK: usqadd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.usqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -160,7 +160,7 @@ define <4 x i16> @usqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: usqadd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.usqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -169,7 +169,7 @@ define <2 x i32> @usqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: usqadd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.usqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -178,7 +178,7 @@ define <16 x i8> @usqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: usqadd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.usqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -187,7 +187,7 @@ define <8 x i16> @usqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: usqadd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.usqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -196,7 +196,7 @@ define <4 x i32> @usqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: usqadd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.usqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -205,42 +205,42 @@ define <2 x i64> @usqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: usqadd.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.usqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define i64 @usqadd_d(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: usqadd_d:
; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
- %sum = call i64 @llvm.arm64.neon.usqadd.i64(i64 %l, i64 %r)
+ %sum = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %l, i64 %r)
ret i64 %sum
}
define i32 @usqadd_s(i32 %l, i32 %r) nounwind {
; CHECK-LABEL: usqadd_s:
; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
- %sum = call i32 @llvm.arm64.neon.usqadd.i32(i32 %l, i32 %r)
+ %sum = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %l, i32 %r)
ret i32 %sum
}
-declare <8 x i8> @llvm.arm64.neon.usqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.usqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.usqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare i64 @llvm.arm64.neon.usqadd.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.usqadd.i32(i32, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.usqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.usqadd.i32(i32, i32) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.usqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.usqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.usqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.usqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i8> @suqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: suqadd8b:
;CHECK: suqadd.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.suqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -249,7 +249,7 @@ define <4 x i16> @suqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: suqadd.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.suqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -258,7 +258,7 @@ define <2 x i32> @suqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: suqadd.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.suqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -267,7 +267,7 @@ define <16 x i8> @suqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: suqadd.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.suqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -276,7 +276,7 @@ define <8 x i16> @suqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: suqadd.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.suqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -285,7 +285,7 @@ define <4 x i32> @suqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: suqadd.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.suqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -294,39 +294,39 @@ define <2 x i64> @suqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: suqadd.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.suqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <1 x i64> @suqadd_1d(<1 x i64> %l, <1 x i64> %r) nounwind {
; CHECK-LABEL: suqadd_1d:
; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
- %sum = call <1 x i64> @llvm.arm64.neon.suqadd.v1i64(<1 x i64> %l, <1 x i64> %r)
+ %sum = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %l, <1 x i64> %r)
ret <1 x i64> %sum
}
define i64 @suqadd_d(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: suqadd_d:
; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
- %sum = call i64 @llvm.arm64.neon.suqadd.i64(i64 %l, i64 %r)
+ %sum = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %l, i64 %r)
ret i64 %sum
}
define i32 @suqadd_s(i32 %l, i32 %r) nounwind {
; CHECK-LABEL: suqadd_s:
; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
- %sum = call i32 @llvm.arm64.neon.suqadd.i32(i32 %l, i32 %r)
+ %sum = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %l, i32 %r)
ret i32 %sum
}
-declare <8 x i8> @llvm.arm64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare i64 @llvm.arm64.neon.suqadd.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.suqadd.i32(i32, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.suqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.suqadd.i32(i32, i32) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vqsub.ll b/test/CodeGen/AArch64/arm64-vqsub.ll
new file mode 100644
index 0000000..dde3ac3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vqsub.ll
@@ -0,0 +1,147 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @sqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub8b:
+;CHECK: sqsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub4h:
+;CHECK: sqsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub2s:
+;CHECK: sqsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub8b:
+;CHECK: uqsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub4h:
+;CHECK: uqsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub2s:
+;CHECK: uqsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub16b:
+;CHECK: sqsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub8h:
+;CHECK: sqsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub4s:
+;CHECK: sqsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqsub2d:
+;CHECK: sqsub.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub16b:
+;CHECK: uqsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub8h:
+;CHECK: uqsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub4s:
+;CHECK: uqsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqsub2d:
+;CHECK: uqsub.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vselect.ll b/test/CodeGen/AArch64/arm64-vselect.ll
index 07274a0..9988512 100644
--- a/test/CodeGen/ARM64/vselect.ll
+++ b/test/CodeGen/AArch64/arm64-vselect.ll
@@ -1,8 +1,15 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
;CHECK: @func63
;CHECK: cmeq.4h v0, v0, v1
-;CHECK: sshll.4s v0, v0, #0
+
+;FIXME: currently, it will generate 3 instructions:
+; ushll.4s v0, v0, #0
+; shl.4s v0, v0, #31
+; sshr.4s v0, v0, #31
+;But these instructions can be optimized into 1 instruction:
+; sshll.4s v0, v0, #0
+
;CHECK: bsl.16b v0, v2, v3
;CHECK: str q0, [x0]
;CHECK: ret
diff --git a/test/CodeGen/ARM64/vsetcc_fp.ll b/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
index c93aad5..f4f4714 100644
--- a/test/CodeGen/ARM64/vsetcc_fp.ll
+++ b/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
define <2 x i32> @fcmp_one(<2 x float> %x, <2 x float> %y) nounwind optsize readnone {
; CHECK-LABEL: fcmp_one:
; CHECK-NEXT: fcmgt.2s [[REG:v[0-9]+]], v0, v1
diff --git a/test/CodeGen/ARM64/vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll
index ae5da38..82ae486 100644
--- a/test/CodeGen/ARM64/vshift.ll
+++ b/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -enable-misched=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -enable-misched=false | FileCheck %s
define <8 x i8> @sqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sqshl8b:
;CHECK: sqshl.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <4 x i16> @sqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: sqshl.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -23,7 +23,7 @@ define <2 x i32> @sqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sqshl.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -32,7 +32,7 @@ define <8 x i8> @uqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: uqshl.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -41,7 +41,7 @@ define <4 x i16> @uqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: uqshl.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -50,7 +50,7 @@ define <2 x i32> @uqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: uqshl.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -59,7 +59,7 @@ define <16 x i8> @sqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: sqshl.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -68,7 +68,7 @@ define <8 x i16> @sqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: sqshl.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -77,7 +77,7 @@ define <4 x i32> @sqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: sqshl.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -86,7 +86,7 @@ define <2 x i64> @sqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: sqshl.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
@@ -95,7 +95,7 @@ define <16 x i8> @uqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: uqshl.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -104,7 +104,7 @@ define <8 x i16> @uqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: uqshl.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -113,7 +113,7 @@ define <4 x i32> @uqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: uqshl.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -122,36 +122,36 @@ define <2 x i64> @uqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: uqshl.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i8> @srshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: srshl8b:
;CHECK: srshl.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -160,7 +160,7 @@ define <4 x i16> @srshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: srshl.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -169,7 +169,7 @@ define <2 x i32> @srshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: srshl.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -178,7 +178,7 @@ define <8 x i8> @urshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: urshl.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -187,7 +187,7 @@ define <4 x i16> @urshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: urshl.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -196,7 +196,7 @@ define <2 x i32> @urshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: urshl.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -205,7 +205,7 @@ define <16 x i8> @srshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: srshl.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -214,7 +214,7 @@ define <8 x i16> @srshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: srshl.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -223,7 +223,7 @@ define <4 x i32> @srshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: srshl.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -232,7 +232,7 @@ define <2 x i64> @srshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: srshl.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
@@ -241,7 +241,7 @@ define <16 x i8> @urshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: urshl.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -250,7 +250,7 @@ define <8 x i16> @urshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: urshl.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -259,7 +259,7 @@ define <4 x i32> @urshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: urshl.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -268,36 +268,36 @@ define <2 x i64> @urshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: urshl.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.urshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i8> @sqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sqrshl8b:
;CHECK: sqrshl.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -306,7 +306,7 @@ define <4 x i16> @sqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: sqrshl.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -315,7 +315,7 @@ define <2 x i32> @sqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sqrshl.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -324,7 +324,7 @@ define <8 x i8> @uqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: uqrshl.8b
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
@@ -333,7 +333,7 @@ define <4 x i16> @uqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: uqrshl.4h
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
@@ -342,7 +342,7 @@ define <2 x i32> @uqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: uqrshl.2s
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
@@ -351,7 +351,7 @@ define <16 x i8> @sqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: sqrshl.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -360,7 +360,7 @@ define <8 x i16> @sqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: sqrshl.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -369,7 +369,7 @@ define <4 x i32> @sqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: sqrshl.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -378,7 +378,7 @@ define <2 x i64> @sqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: sqrshl.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
@@ -387,7 +387,7 @@ define <16 x i8> @uqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: uqrshl.16b
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
@@ -396,7 +396,7 @@ define <8 x i16> @uqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: uqrshl.8h
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
@@ -405,7 +405,7 @@ define <4 x i32> @uqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: uqrshl.4s
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
@@ -414,35 +414,35 @@ define <2 x i64> @uqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: uqrshl.2d
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i8> @urshr8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: urshr8b:
;CHECK: urshr.8b
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
ret <8 x i8> %tmp3
}
@@ -450,7 +450,7 @@ define <4 x i16> @urshr4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: urshr4h:
;CHECK: urshr.4h
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
ret <4 x i16> %tmp3
}
@@ -458,7 +458,7 @@ define <2 x i32> @urshr2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: urshr2s:
;CHECK: urshr.2s
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
ret <2 x i32> %tmp3
}
@@ -466,7 +466,7 @@ define <16 x i8> @urshr16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: urshr16b:
;CHECK: urshr.16b
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
ret <16 x i8> %tmp3
}
@@ -474,7 +474,7 @@ define <8 x i16> @urshr8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: urshr8h:
;CHECK: urshr.8h
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
ret <8 x i16> %tmp3
}
@@ -482,7 +482,7 @@ define <4 x i32> @urshr4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: urshr4s:
;CHECK: urshr.4s
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
ret <4 x i32> %tmp3
}
@@ -490,7 +490,7 @@ define <2 x i64> @urshr2d(<2 x i64>* %A) nounwind {
;CHECK-LABEL: urshr2d:
;CHECK: urshr.2d
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
ret <2 x i64> %tmp3
}
@@ -498,7 +498,7 @@ define <8 x i8> @srshr8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: srshr8b:
;CHECK: srshr.8b
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
ret <8 x i8> %tmp3
}
@@ -506,7 +506,7 @@ define <4 x i16> @srshr4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: srshr4h:
;CHECK: srshr.4h
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
ret <4 x i16> %tmp3
}
@@ -514,7 +514,7 @@ define <2 x i32> @srshr2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: srshr2s:
;CHECK: srshr.2s
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
ret <2 x i32> %tmp3
}
@@ -522,7 +522,7 @@ define <16 x i8> @srshr16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: srshr16b:
;CHECK: srshr.16b
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
ret <16 x i8> %tmp3
}
@@ -530,7 +530,7 @@ define <8 x i16> @srshr8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: srshr8h:
;CHECK: srshr.8h
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
ret <8 x i16> %tmp3
}
@@ -538,7 +538,7 @@ define <4 x i32> @srshr4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: srshr4s:
;CHECK: srshr.4s
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
ret <4 x i32> %tmp3
}
@@ -546,7 +546,7 @@ define <2 x i64> @srshr2d(<2 x i64>* %A) nounwind {
;CHECK-LABEL: srshr2d:
;CHECK: srshr.2d
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
ret <2 x i64> %tmp3
}
@@ -554,7 +554,7 @@ define <8 x i8> @sqshlu8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqshlu8b:
;CHECK: sqshlu.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
ret <8 x i8> %tmp3
}
@@ -562,7 +562,7 @@ define <4 x i16> @sqshlu4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqshlu4h:
;CHECK: sqshlu.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
ret <4 x i16> %tmp3
}
@@ -570,7 +570,7 @@ define <2 x i32> @sqshlu2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqshlu2s:
;CHECK: sqshlu.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
ret <2 x i32> %tmp3
}
@@ -578,7 +578,7 @@ define <16 x i8> @sqshlu16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqshlu16b:
;CHECK: sqshlu.16b v0, {{v[0-9]+}}, #1
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
ret <16 x i8> %tmp3
}
@@ -586,7 +586,7 @@ define <8 x i16> @sqshlu8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqshlu8h:
;CHECK: sqshlu.8h v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
ret <8 x i16> %tmp3
}
@@ -594,7 +594,7 @@ define <4 x i32> @sqshlu4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqshlu4s:
;CHECK: sqshlu.4s v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
ret <4 x i32> %tmp3
}
@@ -602,25 +602,25 @@ define <2 x i64> @sqshlu2d(<2 x i64>* %A) nounwind {
;CHECK-LABEL: sqshlu2d:
;CHECK: sqshlu.2d v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i8> @rshrn8b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: rshrn8b:
;CHECK: rshrn.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
ret <8 x i8> %tmp3
}
@@ -628,7 +628,7 @@ define <4 x i16> @rshrn4h(<4 x i32>* %A) nounwind {
;CHECK-LABEL: rshrn4h:
;CHECK: rshrn.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
ret <4 x i16> %tmp3
}
@@ -636,7 +636,7 @@ define <2 x i32> @rshrn2s(<2 x i64>* %A) nounwind {
;CHECK-LABEL: rshrn2s:
;CHECK: rshrn.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
ret <2 x i32> %tmp3
}
@@ -645,7 +645,7 @@ define <16 x i8> @rshrn16b(<8 x i8> *%ret, <8 x i16>* %A) nounwind {
;CHECK: rshrn2.16b v0, {{v[0-9]+}}, #1
%out = load <8 x i8>* %ret
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
%tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %tmp4
}
@@ -655,7 +655,7 @@ define <8 x i16> @rshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
;CHECK: rshrn2.8h v0, {{v[0-9]+}}, #1
%out = load <4 x i16>* %ret
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
%tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %tmp4
}
@@ -665,14 +665,14 @@ define <4 x i32> @rshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
;CHECK: rshrn2.4s v0, {{v[0-9]+}}, #1
%out = load <2 x i32>* %ret
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
%tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %tmp4
}
-declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone
define <8 x i8> @shrn8b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: shrn8b:
@@ -734,14 +734,14 @@ define <4 x i32> @shrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
ret <4 x i32> %tmp4
}
-declare <8 x i8> @llvm.arm64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
define i32 @sqshrn1s(i64 %A) nounwind {
; CHECK-LABEL: sqshrn1s:
; CHECK: sqshrn {{s[0-9]+}}, d0, #1
- %tmp = call i32 @llvm.arm64.neon.sqshrn.i32(i64 %A, i32 1)
+ %tmp = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %A, i32 1)
ret i32 %tmp
}
@@ -749,7 +749,7 @@ define <8 x i8> @sqshrn8b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqshrn8b:
;CHECK: sqshrn.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
ret <8 x i8> %tmp3
}
@@ -757,7 +757,7 @@ define <4 x i16> @sqshrn4h(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqshrn4h:
;CHECK: sqshrn.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
ret <4 x i16> %tmp3
}
@@ -765,7 +765,7 @@ define <2 x i32> @sqshrn2s(<2 x i64>* %A) nounwind {
;CHECK-LABEL: sqshrn2s:
;CHECK: sqshrn.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
ret <2 x i32> %tmp3
}
@@ -775,7 +775,7 @@ define <16 x i8> @sqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
;CHECK: sqshrn2.16b v0, {{v[0-9]+}}, #1
%out = load <8 x i8>* %ret
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
%tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %tmp4
}
@@ -785,7 +785,7 @@ define <8 x i16> @sqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
;CHECK: sqshrn2.8h v0, {{v[0-9]+}}, #1
%out = load <4 x i16>* %ret
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
%tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %tmp4
}
@@ -795,20 +795,20 @@ define <4 x i32> @sqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
;CHECK: sqshrn2.4s v0, {{v[0-9]+}}, #1
%out = load <2 x i32>* %ret
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
%tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %tmp4
}
-declare i32 @llvm.arm64.neon.sqshrn.i32(i64, i32) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone
define i32 @sqshrun1s(i64 %A) nounwind {
; CHECK-LABEL: sqshrun1s:
; CHECK: sqshrun {{s[0-9]+}}, d0, #1
- %tmp = call i32 @llvm.arm64.neon.sqshrun.i32(i64 %A, i32 1)
+ %tmp = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %A, i32 1)
ret i32 %tmp
}
@@ -816,7 +816,7 @@ define <8 x i8> @sqshrun8b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqshrun8b:
;CHECK: sqshrun.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
ret <8 x i8> %tmp3
}
@@ -824,7 +824,7 @@ define <4 x i16> @sqshrun4h(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqshrun4h:
;CHECK: sqshrun.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
ret <4 x i16> %tmp3
}
@@ -832,7 +832,7 @@ define <2 x i32> @sqshrun2s(<2 x i64>* %A) nounwind {
;CHECK-LABEL: sqshrun2s:
;CHECK: sqshrun.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
ret <2 x i32> %tmp3
}
@@ -841,7 +841,7 @@ define <16 x i8> @sqshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
;CHECK: sqshrun2.16b v0, {{v[0-9]+}}, #1
%out = load <8 x i8>* %ret
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
%tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %tmp4
}
@@ -851,7 +851,7 @@ define <8 x i16> @sqshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
;CHECK: sqshrun2.8h v0, {{v[0-9]+}}, #1
%out = load <4 x i16>* %ret
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
%tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %tmp4
}
@@ -861,20 +861,20 @@ define <4 x i32> @sqshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
;CHECK: sqshrun2.4s v0, {{v[0-9]+}}, #1
%out = load <2 x i32>* %ret
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
%tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %tmp4
}
-declare i32 @llvm.arm64.neon.sqshrun.i32(i64, i32) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqshrun.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone
define i32 @sqrshrn1s(i64 %A) nounwind {
; CHECK-LABEL: sqrshrn1s:
; CHECK: sqrshrn {{s[0-9]+}}, d0, #1
- %tmp = call i32 @llvm.arm64.neon.sqrshrn.i32(i64 %A, i32 1)
+ %tmp = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %A, i32 1)
ret i32 %tmp
}
@@ -882,7 +882,7 @@ define <8 x i8> @sqrshrn8b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqrshrn8b:
;CHECK: sqrshrn.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
ret <8 x i8> %tmp3
}
@@ -890,7 +890,7 @@ define <4 x i16> @sqrshrn4h(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqrshrn4h:
;CHECK: sqrshrn.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
ret <4 x i16> %tmp3
}
@@ -898,7 +898,7 @@ define <2 x i32> @sqrshrn2s(<2 x i64>* %A) nounwind {
;CHECK-LABEL: sqrshrn2s:
;CHECK: sqrshrn.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
ret <2 x i32> %tmp3
}
@@ -907,7 +907,7 @@ define <16 x i8> @sqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
;CHECK: sqrshrn2.16b v0, {{v[0-9]+}}, #1
%out = load <8 x i8>* %ret
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
%tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %tmp4
}
@@ -917,7 +917,7 @@ define <8 x i16> @sqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
;CHECK: sqrshrn2.8h v0, {{v[0-9]+}}, #1
%out = load <4 x i16>* %ret
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
%tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %tmp4
}
@@ -927,20 +927,20 @@ define <4 x i32> @sqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
;CHECK: sqrshrn2.4s v0, {{v[0-9]+}}, #1
%out = load <2 x i32>* %ret
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
%tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %tmp4
}
-declare i32 @llvm.arm64.neon.sqrshrn.i32(i64, i32) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqrshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
define i32 @sqrshrun1s(i64 %A) nounwind {
; CHECK-LABEL: sqrshrun1s:
; CHECK: sqrshrun {{s[0-9]+}}, d0, #1
- %tmp = call i32 @llvm.arm64.neon.sqrshrun.i32(i64 %A, i32 1)
+ %tmp = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %A, i32 1)
ret i32 %tmp
}
@@ -948,7 +948,7 @@ define <8 x i8> @sqrshrun8b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqrshrun8b:
;CHECK: sqrshrun.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
ret <8 x i8> %tmp3
}
@@ -956,7 +956,7 @@ define <4 x i16> @sqrshrun4h(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqrshrun4h:
;CHECK: sqrshrun.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
ret <4 x i16> %tmp3
}
@@ -964,7 +964,7 @@ define <2 x i32> @sqrshrun2s(<2 x i64>* %A) nounwind {
;CHECK-LABEL: sqrshrun2s:
;CHECK: sqrshrun.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
ret <2 x i32> %tmp3
}
@@ -973,7 +973,7 @@ define <16 x i8> @sqrshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
;CHECK: sqrshrun2.16b v0, {{v[0-9]+}}, #1
%out = load <8 x i8>* %ret
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
%tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %tmp4
}
@@ -983,7 +983,7 @@ define <8 x i16> @sqrshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
;CHECK: sqrshrun2.8h v0, {{v[0-9]+}}, #1
%out = load <4 x i16>* %ret
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
%tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %tmp4
}
@@ -993,20 +993,20 @@ define <4 x i32> @sqrshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
;CHECK: sqrshrun2.4s v0, {{v[0-9]+}}, #1
%out = load <2 x i32>* %ret
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
%tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %tmp4
}
-declare i32 @llvm.arm64.neon.sqrshrun.i32(i64, i32) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqrshrun.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone
define i32 @uqrshrn1s(i64 %A) nounwind {
; CHECK-LABEL: uqrshrn1s:
; CHECK: uqrshrn {{s[0-9]+}}, d0, #1
- %tmp = call i32 @llvm.arm64.neon.uqrshrn.i32(i64 %A, i32 1)
+ %tmp = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %A, i32 1)
ret i32 %tmp
}
@@ -1014,7 +1014,7 @@ define <8 x i8> @uqrshrn8b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uqrshrn8b:
;CHECK: uqrshrn.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
ret <8 x i8> %tmp3
}
@@ -1022,7 +1022,7 @@ define <4 x i16> @uqrshrn4h(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uqrshrn4h:
;CHECK: uqrshrn.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
ret <4 x i16> %tmp3
}
@@ -1030,7 +1030,7 @@ define <2 x i32> @uqrshrn2s(<2 x i64>* %A) nounwind {
;CHECK-LABEL: uqrshrn2s:
;CHECK: uqrshrn.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
ret <2 x i32> %tmp3
}
@@ -1039,7 +1039,7 @@ define <16 x i8> @uqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
;CHECK: uqrshrn2.16b v0, {{v[0-9]+}}, #1
%out = load <8 x i8>* %ret
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
%tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %tmp4
}
@@ -1049,7 +1049,7 @@ define <8 x i16> @uqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
;CHECK: uqrshrn2.8h v0, {{v[0-9]+}}, #1
%out = load <4 x i16>* %ret
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
%tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %tmp4
}
@@ -1059,20 +1059,20 @@ define <4 x i32> @uqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
;CHECK: uqrshrn2.4s v0, {{v[0-9]+}}, #1
%out = load <2 x i32>* %ret
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
%tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %tmp4
}
-declare i32 @llvm.arm64.neon.uqrshrn.i32(i64, i32) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.uqrshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
define i32 @uqshrn1s(i64 %A) nounwind {
; CHECK-LABEL: uqshrn1s:
; CHECK: uqshrn {{s[0-9]+}}, d0, #1
- %tmp = call i32 @llvm.arm64.neon.uqshrn.i32(i64 %A, i32 1)
+ %tmp = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %A, i32 1)
ret i32 %tmp
}
@@ -1080,7 +1080,7 @@ define <8 x i8> @uqshrn8b(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uqshrn8b:
;CHECK: uqshrn.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
ret <8 x i8> %tmp3
}
@@ -1088,7 +1088,7 @@ define <4 x i16> @uqshrn4h(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uqshrn4h:
;CHECK: uqshrn.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
ret <4 x i16> %tmp3
}
@@ -1096,7 +1096,7 @@ define <2 x i32> @uqshrn2s(<2 x i64>* %A) nounwind {
;CHECK-LABEL: uqshrn2s:
;CHECK: uqshrn.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
ret <2 x i32> %tmp3
}
@@ -1105,7 +1105,7 @@ define <16 x i8> @uqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
;CHECK: uqshrn2.16b v0, {{v[0-9]+}}, #1
%out = load <8 x i8>* %ret
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
%tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %tmp4
}
@@ -1115,7 +1115,7 @@ define <8 x i16> @uqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
;CHECK: uqshrn2.8h v0, {{v[0-9]+}}, #1
%out = load <4 x i16>* %ret
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
%tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %tmp4
}
@@ -1125,15 +1125,15 @@ define <4 x i32> @uqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
;CHECK: uqshrn2.4s v0, {{v[0-9]+}}, #1
%out = load <2 x i32>* %ret
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
%tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %tmp4
}
-declare i32 @llvm.arm64.neon.uqshrn.i32(i64, i32) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.uqshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone
define <8 x i16> @ushll8h(<8 x i8>* %A) nounwind {
;CHECK-LABEL: ushll8h:
@@ -1253,7 +1253,7 @@ define <8 x i8> @sqshli8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqshli8b:
;CHECK: sqshl.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
ret <8 x i8> %tmp3
}
@@ -1261,7 +1261,7 @@ define <4 x i16> @sqshli4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqshli4h:
;CHECK: sqshl.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
ret <4 x i16> %tmp3
}
@@ -1269,7 +1269,7 @@ define <2 x i32> @sqshli2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqshli2s:
;CHECK: sqshl.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
ret <2 x i32> %tmp3
}
@@ -1277,7 +1277,7 @@ define <16 x i8> @sqshli16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqshli16b:
;CHECK: sqshl.16b v0, {{v[0-9]+}}, #1
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
ret <16 x i8> %tmp3
}
@@ -1285,7 +1285,7 @@ define <8 x i16> @sqshli8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqshli8h:
;CHECK: sqshl.8h v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
ret <8 x i16> %tmp3
}
@@ -1293,7 +1293,7 @@ define <4 x i32> @sqshli4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqshli4s:
;CHECK: sqshl.4s v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
ret <4 x i32> %tmp3
}
@@ -1301,7 +1301,7 @@ define <2 x i64> @sqshli2d(<2 x i64>* %A) nounwind {
;CHECK-LABEL: sqshli2d:
;CHECK: sqshl.2d v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
ret <2 x i64> %tmp3
}
@@ -1309,7 +1309,7 @@ define <8 x i8> @uqshli8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uqshli8b:
;CHECK: uqshl.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
ret <8 x i8> %tmp3
}
@@ -1317,7 +1317,7 @@ define <4 x i16> @uqshli4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uqshli4h:
;CHECK: uqshl.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
ret <4 x i16> %tmp3
}
@@ -1325,7 +1325,7 @@ define <2 x i32> @uqshli2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: uqshli2s:
;CHECK: uqshl.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
ret <2 x i32> %tmp3
}
@@ -1333,7 +1333,7 @@ define <16 x i8> @uqshli16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: uqshli16b:
;CHECK: uqshl.16b
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
ret <16 x i8> %tmp3
}
@@ -1341,7 +1341,7 @@ define <8 x i16> @uqshli8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uqshli8h:
;CHECK: uqshl.8h v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
ret <8 x i16> %tmp3
}
@@ -1349,7 +1349,7 @@ define <4 x i32> @uqshli4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uqshli4s:
;CHECK: uqshl.4s v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
ret <4 x i32> %tmp3
}
@@ -1357,7 +1357,7 @@ define <2 x i64> @uqshli2d(<2 x i64>* %A) nounwind {
;CHECK-LABEL: uqshli2d:
;CHECK: uqshl.2d v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
ret <2 x i64> %tmp3
}
@@ -1365,7 +1365,7 @@ define <8 x i8> @ursra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: ursra8b:
;CHECK: ursra.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%tmp4 = load <8 x i8>* %B
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
@@ -1375,7 +1375,7 @@ define <4 x i16> @ursra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: ursra4h:
;CHECK: ursra.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
%tmp4 = load <4 x i16>* %B
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
@@ -1385,7 +1385,7 @@ define <2 x i32> @ursra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: ursra2s:
;CHECK: ursra.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
%tmp4 = load <2 x i32>* %B
%tmp5 = add <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
@@ -1395,7 +1395,7 @@ define <16 x i8> @ursra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: ursra16b:
;CHECK: ursra.16b v0, {{v[0-9]+}}, #1
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%tmp4 = load <16 x i8>* %B
%tmp5 = add <16 x i8> %tmp3, %tmp4
ret <16 x i8> %tmp5
@@ -1405,7 +1405,7 @@ define <8 x i16> @ursra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: ursra8h:
;CHECK: ursra.8h v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
%tmp4 = load <8 x i16>* %B
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
@@ -1415,7 +1415,7 @@ define <4 x i32> @ursra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: ursra4s:
;CHECK: ursra.4s v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
%tmp4 = load <4 x i32>* %B
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
@@ -1425,7 +1425,7 @@ define <2 x i64> @ursra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: ursra2d:
;CHECK: ursra.2d v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
%tmp4 = load <2 x i64>* %B
%tmp5 = add <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
@@ -1435,7 +1435,7 @@ define <8 x i8> @srsra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: srsra8b:
;CHECK: srsra.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%tmp4 = load <8 x i8>* %B
%tmp5 = add <8 x i8> %tmp3, %tmp4
ret <8 x i8> %tmp5
@@ -1445,7 +1445,7 @@ define <4 x i16> @srsra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: srsra4h:
;CHECK: srsra.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
%tmp4 = load <4 x i16>* %B
%tmp5 = add <4 x i16> %tmp3, %tmp4
ret <4 x i16> %tmp5
@@ -1455,7 +1455,7 @@ define <2 x i32> @srsra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: srsra2s:
;CHECK: srsra.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
%tmp4 = load <2 x i32>* %B
%tmp5 = add <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
@@ -1465,7 +1465,7 @@ define <16 x i8> @srsra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: srsra16b:
;CHECK: srsra.16b v0, {{v[0-9]+}}, #1
%tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%tmp4 = load <16 x i8>* %B
%tmp5 = add <16 x i8> %tmp3, %tmp4
ret <16 x i8> %tmp5
@@ -1475,7 +1475,7 @@ define <8 x i16> @srsra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: srsra8h:
;CHECK: srsra.8h v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
%tmp4 = load <8 x i16>* %B
%tmp5 = add <8 x i16> %tmp3, %tmp4
ret <8 x i16> %tmp5
@@ -1485,7 +1485,7 @@ define <4 x i32> @srsra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: srsra4s:
;CHECK: srsra.4s v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
%tmp4 = load <4 x i32>* %B
%tmp5 = add <4 x i32> %tmp3, %tmp4
ret <4 x i32> %tmp5
@@ -1495,7 +1495,7 @@ define <2 x i64> @srsra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: srsra2d:
;CHECK: srsra.2d v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
- %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
%tmp4 = load <2 x i64>* %B
%tmp5 = add <2 x i64> %tmp3, %tmp4
ret <2 x i64> %tmp5
@@ -1831,7 +1831,7 @@ define <8 x i8> @sli8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: sli.8b v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1)
ret <8 x i8> %tmp3
}
@@ -1840,7 +1840,7 @@ define <4 x i16> @sli4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: sli.4h v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1)
ret <4 x i16> %tmp3
}
@@ -1849,7 +1849,7 @@ define <2 x i32> @sli2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: sli.2s v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1)
ret <2 x i32> %tmp3
}
@@ -1858,7 +1858,7 @@ define <1 x i64> @sli1d(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: sli d0, {{d[0-9]+}}, #1
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
- %tmp3 = call <1 x i64> @llvm.arm64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1)
+ %tmp3 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1)
ret <1 x i64> %tmp3
}
@@ -1867,7 +1867,7 @@ define <16 x i8> @sli16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: sli.16b v0, {{v[0-9]+}}, #1
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1)
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1)
ret <16 x i8> %tmp3
}
@@ -1876,7 +1876,7 @@ define <8 x i16> @sli8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: sli.8h v0, {{v[0-9]+}}, #1
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1)
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1)
ret <8 x i16> %tmp3
}
@@ -1885,7 +1885,7 @@ define <4 x i32> @sli4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: sli.4s v0, {{v[0-9]+}}, #1
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1)
ret <4 x i32> %tmp3
}
@@ -1894,16 +1894,24 @@ define <2 x i64> @sli2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: sli.2d v0, {{v[0-9]+}}, #1
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1)
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1)
ret <2 x i64> %tmp3
}
-declare <8 x i8> @llvm.arm64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone
+
+define <1 x i64> @ashr_v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: ashr_v1i64:
+; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: sshl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %c = ashr <1 x i64> %a, %b
+ ret <1 x i64> %c
+}
diff --git a/test/CodeGen/ARM64/vshr.ll b/test/CodeGen/AArch64/arm64-vshr.ll
index 6adb81c..21eb579 100644
--- a/test/CodeGen/ARM64/vshr.ll
+++ b/test/CodeGen/AArch64/arm64-vshr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
define <8 x i16> @testShiftRightArith_v8i16(<8 x i16> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: testShiftRightArith_v8i16:
diff --git a/test/CodeGen/ARM64/vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
index f90200c..62fd961 100644
--- a/test/CodeGen/ARM64/vshuffle.ll
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s
; The mask:
@@ -15,7 +15,7 @@
; CHECK: .byte 0 ; 0x0
; CHECK: test1
; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0
-; CHECK: movi.8h v[[REG1:[0-9]+]], #1, lsl #8
+; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8
; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
define <8 x i1> @test1() {
entry:
@@ -71,11 +71,11 @@ bb:
; CHECK: test3
; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE
; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF]
-; CHECK: movi.2d v[[REG1:[0-9]+]], #0000000000000000
+; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF]
; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
define <16 x i1> @test3(i1* %ptr, i32 %v) {
bb:
- %Shuff = shufflevector <16 x i1> zeroinitializer, <16 x i1> undef,
+ %Shuff = shufflevector <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i1> undef,
<16 x i32> <i32 2, i32 undef, i32 6, i32 undef, i32 10, i32 12, i32 14,
i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 10, i32 12,
i32 14, i32 0>
diff --git a/test/CodeGen/ARM64/vsqrt.ll b/test/CodeGen/AArch64/arm64-vsqrt.ll
index 094d704..02b7c7e 100644
--- a/test/CodeGen/ARM64/vsqrt.ll
+++ b/test/CodeGen/AArch64/arm64-vsqrt.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <2 x float> @frecps_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: frecps_2s:
;CHECK: frecps.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.frecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -14,7 +14,7 @@ define <4 x float> @frecps_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: frecps.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.frecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -23,13 +23,13 @@ define <2 x double> @frecps_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: frecps.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.frecps.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frecps.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x float> @frsqrts_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
@@ -37,7 +37,7 @@ define <2 x float> @frsqrts_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: frsqrts.2s
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
@@ -46,7 +46,7 @@ define <4 x float> @frsqrts_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: frsqrts.4s
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -55,19 +55,19 @@ define <2 x double> @frsqrts_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK: frsqrts.2d
%tmp1 = load <2 x double>* %A
%tmp2 = load <2 x double>* %B
- %tmp3 = call <2 x double> @llvm.arm64.neon.frsqrts.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
ret <2 x double> %tmp3
}
-declare <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frsqrts.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <2 x float> @frecpe_2s(<2 x float>* %A) nounwind {
;CHECK-LABEL: frecpe_2s:
;CHECK: frecpe.2s
%tmp1 = load <2 x float>* %A
- %tmp3 = call <2 x float> @llvm.arm64.neon.frecpe.v2f32(<2 x float> %tmp1)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp3
}
@@ -75,7 +75,7 @@ define <4 x float> @frecpe_4s(<4 x float>* %A) nounwind {
;CHECK-LABEL: frecpe_4s:
;CHECK: frecpe.4s
%tmp1 = load <4 x float>* %A
- %tmp3 = call <4 x float> @llvm.arm64.neon.frecpe.v4f32(<4 x float> %tmp1)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp3
}
@@ -83,7 +83,7 @@ define <2 x double> @frecpe_2d(<2 x double>* %A) nounwind {
;CHECK-LABEL: frecpe_2d:
;CHECK: frecpe.2d
%tmp1 = load <2 x double>* %A
- %tmp3 = call <2 x double> @llvm.arm64.neon.frecpe.v2f64(<2 x double> %tmp1)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %tmp1)
ret <2 x double> %tmp3
}
@@ -91,7 +91,7 @@ define float @frecpe_s(float* %A) nounwind {
;CHECK-LABEL: frecpe_s:
;CHECK: frecpe s0, {{s[0-9]+}}
%tmp1 = load float* %A
- %tmp3 = call float @llvm.arm64.neon.frecpe.f32(float %tmp1)
+ %tmp3 = call float @llvm.aarch64.neon.frecpe.f32(float %tmp1)
ret float %tmp3
}
@@ -99,21 +99,21 @@ define double @frecpe_d(double* %A) nounwind {
;CHECK-LABEL: frecpe_d:
;CHECK: frecpe d0, {{d[0-9]+}}
%tmp1 = load double* %A
- %tmp3 = call double @llvm.arm64.neon.frecpe.f64(double %tmp1)
+ %tmp3 = call double @llvm.aarch64.neon.frecpe.f64(double %tmp1)
ret double %tmp3
}
-declare <2 x float> @llvm.arm64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frecpe.v4f32(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frecpe.v2f64(<2 x double>) nounwind readnone
-declare float @llvm.arm64.neon.frecpe.f32(float) nounwind readnone
-declare double @llvm.arm64.neon.frecpe.f64(double) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double>) nounwind readnone
+declare float @llvm.aarch64.neon.frecpe.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frecpe.f64(double) nounwind readnone
define float @frecpx_s(float* %A) nounwind {
;CHECK-LABEL: frecpx_s:
;CHECK: frecpx s0, {{s[0-9]+}}
%tmp1 = load float* %A
- %tmp3 = call float @llvm.arm64.neon.frecpx.f32(float %tmp1)
+ %tmp3 = call float @llvm.aarch64.neon.frecpx.f32(float %tmp1)
ret float %tmp3
}
@@ -121,18 +121,18 @@ define double @frecpx_d(double* %A) nounwind {
;CHECK-LABEL: frecpx_d:
;CHECK: frecpx d0, {{d[0-9]+}}
%tmp1 = load double* %A
- %tmp3 = call double @llvm.arm64.neon.frecpx.f64(double %tmp1)
+ %tmp3 = call double @llvm.aarch64.neon.frecpx.f64(double %tmp1)
ret double %tmp3
}
-declare float @llvm.arm64.neon.frecpx.f32(float) nounwind readnone
-declare double @llvm.arm64.neon.frecpx.f64(double) nounwind readnone
+declare float @llvm.aarch64.neon.frecpx.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frecpx.f64(double) nounwind readnone
define <2 x float> @frsqrte_2s(<2 x float>* %A) nounwind {
;CHECK-LABEL: frsqrte_2s:
;CHECK: frsqrte.2s
%tmp1 = load <2 x float>* %A
- %tmp3 = call <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float> %tmp1)
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp3
}
@@ -140,7 +140,7 @@ define <4 x float> @frsqrte_4s(<4 x float>* %A) nounwind {
;CHECK-LABEL: frsqrte_4s:
;CHECK: frsqrte.4s
%tmp1 = load <4 x float>* %A
- %tmp3 = call <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float> %tmp1)
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp3
}
@@ -148,7 +148,7 @@ define <2 x double> @frsqrte_2d(<2 x double>* %A) nounwind {
;CHECK-LABEL: frsqrte_2d:
;CHECK: frsqrte.2d
%tmp1 = load <2 x double>* %A
- %tmp3 = call <2 x double> @llvm.arm64.neon.frsqrte.v2f64(<2 x double> %tmp1)
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %tmp1)
ret <2 x double> %tmp3
}
@@ -156,7 +156,7 @@ define float @frsqrte_s(float* %A) nounwind {
;CHECK-LABEL: frsqrte_s:
;CHECK: frsqrte s0, {{s[0-9]+}}
%tmp1 = load float* %A
- %tmp3 = call float @llvm.arm64.neon.frsqrte.f32(float %tmp1)
+ %tmp3 = call float @llvm.aarch64.neon.frsqrte.f32(float %tmp1)
ret float %tmp3
}
@@ -164,21 +164,21 @@ define double @frsqrte_d(double* %A) nounwind {
;CHECK-LABEL: frsqrte_d:
;CHECK: frsqrte d0, {{d[0-9]+}}
%tmp1 = load double* %A
- %tmp3 = call double @llvm.arm64.neon.frsqrte.f64(double %tmp1)
+ %tmp3 = call double @llvm.aarch64.neon.frsqrte.f64(double %tmp1)
ret double %tmp3
}
-declare <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frsqrte.v2f64(<2 x double>) nounwind readnone
-declare float @llvm.arm64.neon.frsqrte.f32(float) nounwind readnone
-declare double @llvm.arm64.neon.frsqrte.f64(double) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double>) nounwind readnone
+declare float @llvm.aarch64.neon.frsqrte.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frsqrte.f64(double) nounwind readnone
define <2 x i32> @urecpe_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: urecpe_2s:
;CHECK: urecpe.2s
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.urecpe.v2i32(<2 x i32> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp3
}
@@ -186,18 +186,18 @@ define <4 x i32> @urecpe_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: urecpe_4s:
;CHECK: urecpe.4s
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.urecpe.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.urecpe.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.urecpe.v4i32(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32>) nounwind readnone
define <2 x i32> @ursqrte_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: ursqrte_2s:
;CHECK: ursqrte.2s
%tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32> %tmp1)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp3
}
@@ -205,18 +205,18 @@ define <4 x i32> @ursqrte_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: ursqrte_4s:
;CHECK: ursqrte.4s
%tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32> %tmp1)
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp3
}
-declare <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32>) nounwind readnone
define float @f1(float %a, float %b) nounwind readnone optsize ssp {
; CHECK-LABEL: f1:
; CHECK: frsqrts s0, s0, s1
; CHECK-NEXT: ret
- %vrsqrtss.i = tail call float @llvm.arm64.neon.frsqrts.f32(float %a, float %b) nounwind
+ %vrsqrtss.i = tail call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) nounwind
ret float %vrsqrtss.i
}
@@ -224,9 +224,9 @@ define double @f2(double %a, double %b) nounwind readnone optsize ssp {
; CHECK-LABEL: f2:
; CHECK: frsqrts d0, d0, d1
; CHECK-NEXT: ret
- %vrsqrtsd.i = tail call double @llvm.arm64.neon.frsqrts.f64(double %a, double %b) nounwind
+ %vrsqrtsd.i = tail call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) nounwind
ret double %vrsqrtsd.i
}
-declare double @llvm.arm64.neon.frsqrts.f64(double, double) nounwind readnone
-declare float @llvm.arm64.neon.frsqrts.f32(float, float) nounwind readnone
+declare double @llvm.aarch64.neon.frsqrts.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.neon.frsqrts.f32(float, float) nounwind readnone
diff --git a/test/CodeGen/ARM64/vsra.ll b/test/CodeGen/AArch64/arm64-vsra.ll
index 3611eb3..5e9cef3 100644
--- a/test/CodeGen/ARM64/vsra.ll
+++ b/test/CodeGen/AArch64/arm64-vsra.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vsras8:
diff --git a/test/CodeGen/ARM64/vsub.ll b/test/CodeGen/AArch64/arm64-vsub.ll
index 5c7e84f..c2c8755 100644
--- a/test/CodeGen/ARM64/vsub.ll
+++ b/test/CodeGen/AArch64/arm64-vsub.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: subhn8b:
;CHECK: subhn.8b
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
@@ -14,7 +14,7 @@ define <4 x i16> @subhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: subhn.4h
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
@@ -23,7 +23,7 @@ define <2 x i32> @subhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: subhn.2s
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
@@ -31,8 +31,8 @@ define <16 x i8> @subhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: subhn2_16b:
;CHECK: subhn.8b
;CHECK-NEXT: subhn2.16b
- %vsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
- %vsubhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
%res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %res
}
@@ -41,8 +41,8 @@ define <8 x i16> @subhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: subhn2_8h:
;CHECK: subhn.4h
;CHECK-NEXT: subhn2.8h
- %vsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
- %vsubhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
%res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
@@ -51,22 +51,22 @@ define <4 x i32> @subhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: subhn2_4s:
;CHECK: subhn.2s
;CHECK-NEXT: subhn2.4s
- %vsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
- %vsubhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
%res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %res
}
-declare <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: rsubhn8b:
;CHECK: rsubhn.8b
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
@@ -75,7 +75,7 @@ define <4 x i16> @rsubhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: rsubhn.4h
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
@@ -84,7 +84,7 @@ define <2 x i32> @rsubhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: rsubhn.2s
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
@@ -92,8 +92,8 @@ define <16 x i8> @rsubhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
;CHECK-LABEL: rsubhn2_16b:
;CHECK: rsubhn.8b
;CHECK-NEXT: rsubhn2.16b
- %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
- %vrsubhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vrsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
%res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %res
}
@@ -102,8 +102,8 @@ define <8 x i16> @rsubhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
;CHECK-LABEL: rsubhn2_8h:
;CHECK: rsubhn.4h
;CHECK-NEXT: rsubhn2.8h
- %vrsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
- %vrsubhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vrsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
%res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %res
}
@@ -112,15 +112,15 @@ define <4 x i32> @rsubhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
;CHECK-LABEL: rsubhn2_4s:
;CHECK: rsubhn.2s
;CHECK-NEXT: rsubhn2.4s
- %vrsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
- %vrsubhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vrsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
%res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %res
}
-declare <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: ssubl8h:
diff --git a/test/CodeGen/ARM64/weak-reference.ll b/test/CodeGen/AArch64/arm64-weak-reference.ll
index b2135e0..b2135e0 100644
--- a/test/CodeGen/ARM64/weak-reference.ll
+++ b/test/CodeGen/AArch64/arm64-weak-reference.ll
diff --git a/test/CodeGen/ARM64/xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll
index 6a8520d..6cffbde 100644
--- a/test/CodeGen/ARM64/xaluo.ll
+++ b/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -7,7 +7,7 @@ define i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; CHECK-LABEL: saddo.i32
; CHECK: adds w8, w0, w1
-; CHECK-NEXT: csinc w0, wzr, wzr, vc
+; CHECK-NEXT: cset w0, vs
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -19,7 +19,7 @@ define i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: saddo.i64
; CHECK: adds x8, x0, x1
-; CHECK-NEXT: csinc w0, wzr, wzr, vc
+; CHECK-NEXT: cset w0, vs
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -31,7 +31,7 @@ define i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; CHECK-LABEL: uaddo.i32
; CHECK: adds w8, w0, w1
-; CHECK-NEXT: csinc w0, wzr, wzr, cc
+; CHECK-NEXT: cset w0, hs
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -43,7 +43,7 @@ define i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: uaddo.i64
; CHECK: adds x8, x0, x1
-; CHECK-NEXT: csinc w0, wzr, wzr, cc
+; CHECK-NEXT: cset w0, hs
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -55,7 +55,7 @@ define i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; CHECK-LABEL: ssubo.i32
; CHECK: subs w8, w0, w1
-; CHECK-NEXT: csinc w0, wzr, wzr, vc
+; CHECK-NEXT: cset w0, vs
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -67,7 +67,7 @@ define i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: ssubo.i64
; CHECK: subs x8, x0, x1
-; CHECK-NEXT: csinc w0, wzr, wzr, vc
+; CHECK-NEXT: cset w0, vs
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -79,7 +79,7 @@ define i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
; CHECK-LABEL: usubo.i32
; CHECK: subs w8, w0, w1
-; CHECK-NEXT: csinc w0, wzr, wzr, cs
+; CHECK-NEXT: cset w0, lo
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -91,7 +91,7 @@ define i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
; CHECK-LABEL: usubo.i64
; CHECK: subs x8, x0, x1
-; CHECK-NEXT: csinc w0, wzr, wzr, cs
+; CHECK-NEXT: cset w0, lo
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -105,7 +105,7 @@ entry:
; CHECK: smull x8, w0, w1
; CHECK-NEXT: lsr x9, x8, #32
; CHECK-NEXT: cmp w9, w8, asr #31
-; CHECK-NEXT: csinc w0, wzr, wzr, eq
+; CHECK-NEXT: cset w0, ne
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -119,7 +119,7 @@ entry:
; CHECK: mul x8, x0, x1
; CHECK-NEXT: smulh x9, x0, x1
; CHECK-NEXT: cmp x9, x8, asr #63
-; CHECK-NEXT: csinc w0, wzr, wzr, eq
+; CHECK-NEXT: cset w0, ne
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -132,7 +132,7 @@ entry:
; CHECK-LABEL: umulo.i32
; CHECK: umull x8, w0, w1
; CHECK-NEXT: cmp xzr, x8, lsr #32
-; CHECK-NEXT: csinc w0, wzr, wzr, eq
+; CHECK-NEXT: cset w0, ne
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -145,7 +145,7 @@ entry:
; CHECK-LABEL: umulo.i64
; CHECK: umulh x8, x0, x1
; CHECK-NEXT: cmp xzr, x8
-; CHECK-NEXT: csinc w8, wzr, wzr, eq
+; CHECK-NEXT: cset w8, ne
; CHECK-NEXT: mul x9, x0, x1
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
@@ -184,7 +184,7 @@ define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: uaddo.select.i32
; CHECK: cmn w0, w1
-; CHECK-NEXT: csel w0, w0, w1, cs
+; CHECK-NEXT: csel w0, w0, w1, hs
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -195,7 +195,7 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: uaddo.select.i64
; CHECK: cmn x0, x1
-; CHECK-NEXT: csel x0, x0, x1, cs
+; CHECK-NEXT: csel x0, x0, x1, hs
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -228,7 +228,7 @@ define i32 @usubo.select.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: usubo.select.i32
; CHECK: cmp w0, w1
-; CHECK-NEXT: csel w0, w0, w1, cc
+; CHECK-NEXT: csel w0, w0, w1, lo
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -239,7 +239,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: usubo.select.i64
; CHECK: cmp x0, x1
-; CHECK-NEXT: csel x0, x0, x1, cc
+; CHECK-NEXT: csel x0, x0, x1, lo
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -338,7 +338,7 @@ define i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: uaddo.br.i32
; CHECK: cmn w0, w1
-; CHECK-NEXT: b.cc
+; CHECK-NEXT: b.lo
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -355,7 +355,7 @@ define i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: uaddo.br.i64
; CHECK: cmn x0, x1
-; CHECK-NEXT: b.cc
+; CHECK-NEXT: b.lo
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -406,7 +406,7 @@ define i1 @usubo.br.i32(i32 %v1, i32 %v2) {
entry:
; CHECK-LABEL: usubo.br.i32
; CHECK: cmp w0, w1
-; CHECK-NEXT: b.cs
+; CHECK-NEXT: b.hs
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -423,7 +423,7 @@ define i1 @usubo.br.i64(i64 %v1, i64 %v2) {
entry:
; CHECK-LABEL: usubo.br.i64
; CHECK: cmp x0, x1
-; CHECK-NEXT: b.cs
+; CHECK-NEXT: b.hs
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
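(The expectation changes in arm64-xaluo.ll above are spelling-only: presumably the merged backend now prints the preferred architectural aliases, where cset Wd, cond is the alias for csinc Wd, wzr, wzr with the condition inverted — hence vc becomes vs and eq becomes ne — and hs/lo are the preferred names for the identical conditions cs/cc. A minimal sketch of a test that tolerates either spelling; the function below is a hypothetical illustration, not part of the patch:

define i1 @overflow_bit(i32 %a, i32 %b) {
; Either encoding-equivalent spelling satisfies this hedged pattern:
; CHECK: {{cset w[0-9]+, vs|csinc w[0-9]+, wzr, wzr, vc}}
  %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  %o = extractvalue { i32, i1 } %s, 1
  ret i1 %o
}
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
)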
diff --git a/test/CodeGen/ARM64/zero-cycle-regmov.ll b/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
index c56d607..c56d607 100644
--- a/test/CodeGen/ARM64/zero-cycle-regmov.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
diff --git a/test/CodeGen/ARM64/zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
index 349bb6f..349bb6f 100644
--- a/test/CodeGen/ARM64/zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
diff --git a/test/CodeGen/ARM64/zext.ll b/test/CodeGen/AArch64/arm64-zext.ll
index 8d9e5ea..8d9e5ea 100644
--- a/test/CodeGen/ARM64/zext.ll
+++ b/test/CodeGen/AArch64/arm64-zext.ll
diff --git a/test/CodeGen/ARM64/zextload-unscaled.ll b/test/CodeGen/AArch64/arm64-zextload-unscaled.ll
index c475dbd..c475dbd 100644
--- a/test/CodeGen/ARM64/zextload-unscaled.ll
+++ b/test/CodeGen/AArch64/arm64-zextload-unscaled.ll
diff --git a/test/CodeGen/ARM64/zip.ll b/test/CodeGen/AArch64/arm64-zip.ll
index d06a9f8..304b280 100644
--- a/test/CodeGen/ARM64/zip.ll
+++ b/test/CodeGen/AArch64/arm64-zip.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vzipi8:
diff --git a/test/CodeGen/AArch64/asm-large-immediate.ll b/test/CodeGen/AArch64/asm-large-immediate.ll
new file mode 100644
index 0000000..05e4ddd
--- /dev/null
+++ b/test/CodeGen/AArch64/asm-large-immediate.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=aarch64 -no-integrated-as < %s | FileCheck %s
+
+define void @test() {
+entry:
+; CHECK: /* result: 68719476738 */
+ tail call void asm sideeffect "/* result: ${0:c} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
+; CHECK: /* result: -68719476738 */
+ tail call void asm sideeffect "/* result: ${0:n} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
+ ret void
+}
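(The two CHECK strings in the new asm-large-immediate.ll test differ only in sign because, per the GCC-style inline-asm operand modifiers LLVM follows, ${0:c} prints the immediate as a bare constant and ${0:n} prints its negation; 68719476738 therefore appears as -68719476738 in the second comment. A standalone sketch along the same lines, with a hypothetical function name and constant that are not part of the patch:

define void @modifier_demo() {
; The 'n' modifier would be expected to print -42 into the emitted comment.
  call void asm sideeffect "/* got: ${0:n} */", "i"(i64 42)
  ret void
}
)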
diff --git a/test/CodeGen/AArch64/assertion-rc-mismatch.ll b/test/CodeGen/AArch64/assertion-rc-mismatch.ll
index 02b0c0e..bcf206e 100644
--- a/test/CodeGen/AArch64/assertion-rc-mismatch.ll
+++ b/test/CodeGen/AArch64/assertion-rc-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
; Test case related to <rdar://problem/15633429>.
; CHECK-LABEL: small
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index 5fe2936..58b5d1d 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -1,5 +1,11 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-REG %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG
+
+
+; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
+; (i.e. reusing a register for status & data in store exclusive).
+; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], w[[NEW]], [x{{[0-9]+}}]
+; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], x[[NEW]], [x{{[0-9]+}}]
@var8 = global i8 0
@var16 = global i16 0
@@ -11,20 +17,18 @@ define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
%old = atomicrmw add i8* @var8, i8 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -33,20 +37,18 @@ define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
%old = atomicrmw add i16* @var16, i16 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -55,20 +57,18 @@ define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
%old = atomicrmw add i32* @var32, i32 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -77,15 +77,13 @@ define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
%old = atomicrmw add i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: add x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -99,20 +97,18 @@ define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
%old = atomicrmw sub i8* @var8, i8 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -121,20 +117,18 @@ define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
%old = atomicrmw sub i16* @var16, i16 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -143,20 +137,18 @@ define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
%old = atomicrmw sub i32* @var32, i32 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -165,15 +157,13 @@ define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
%old = atomicrmw sub i64* @var64, i64 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: sub x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -187,20 +177,18 @@ define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
%old = atomicrmw and i8* @var8, i8 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -209,20 +197,18 @@ define i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
%old = atomicrmw and i16* @var16, i16 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -231,20 +217,18 @@ define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
%old = atomicrmw and i32* @var32, i32 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -253,15 +237,13 @@ define i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
%old = atomicrmw and i64* @var64, i64 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: and x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -275,20 +257,18 @@ define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
%old = atomicrmw or i8* @var8, i8 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -297,20 +277,18 @@ define i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
%old = atomicrmw or i16* @var16, i16 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -319,20 +297,18 @@ define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
%old = atomicrmw or i32* @var32, i32 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -341,15 +317,13 @@ define i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
%old = atomicrmw or i64* @var64, i64 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: orr x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -363,20 +337,18 @@ define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
%old = atomicrmw xor i8* @var8, i8 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -385,20 +357,18 @@ define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
%old = atomicrmw xor i16* @var16, i16 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -407,20 +377,18 @@ define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
%old = atomicrmw xor i32* @var32, i32 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -429,15 +397,13 @@ define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
%old = atomicrmw xor i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: eor x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -451,18 +417,17 @@ define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
%old = atomicrmw xchg i8* @var8, i8 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-REG-NOT: stxrb w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -471,18 +436,17 @@ define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
%old = atomicrmw xchg i16* @var16, i16 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-REG-NOT: stlxrh w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -491,18 +455,17 @@ define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
%old = atomicrmw xchg i32* @var32, i32 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-REG-NOT: stlxr w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -511,13 +474,12 @@ define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
%old = atomicrmw xchg i64* @var64, i64 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; ; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-REG-NOT: stxr w0, x0, [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], x0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -532,21 +494,22 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
%old = atomicrmw min i8* @var8, i8 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], sxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
-; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -555,21 +518,23 @@ define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
%old = atomicrmw min i16* @var16, i16 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], sxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
-; CHECK-REG-NOT: stlxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+
+
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -578,21 +543,22 @@ define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
%old = atomicrmw min i32* @var32, i32 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]]
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
-; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+
+
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -601,16 +567,17 @@ define i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
%old = atomicrmw min i64* @var64, i64 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp x0, x[[OLD]]
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt
-; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, gt
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, le
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -624,21 +591,23 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
%old = atomicrmw max i8* @var8, i8 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], sxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+
+
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -647,21 +616,23 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
%old = atomicrmw max i16* @var16, i16 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], sxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+
+
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -670,21 +641,22 @@ define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
%old = atomicrmw max i32* @var32, i32 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]]
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -693,16 +665,17 @@ define i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
%old = atomicrmw max i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp x0, x[[OLD]]
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lt
-; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, lt
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt
+
+
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -716,21 +689,22 @@ define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
%old = atomicrmw umin i8* @var8, i8 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], uxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0, uxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+
+
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -739,21 +713,22 @@ define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
%old = atomicrmw umin i16* @var16, i16 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], uxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0, uxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+
+
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -762,21 +737,22 @@ define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
%old = atomicrmw umin i32* @var32, i32 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]]
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -785,16 +761,17 @@ define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
%old = atomicrmw umin i64* @var64, i64 %offset acq_rel
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp x0, x[[OLD]]
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi
-; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, hi
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, ls
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -808,21 +785,22 @@ define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
%old = atomicrmw umax i8* @var8, i8 %offset acq_rel
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], uxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0, uxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+
+
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -831,21 +809,22 @@ define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
%old = atomicrmw umax i16* @var16, i16 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], uxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0, uxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+
+
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -854,21 +833,22 @@ define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
%old = atomicrmw umax i32* @var32, i32 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]]
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -877,16 +857,17 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
%old = atomicrmw umax i64* @var64, i64 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp x0, x[[OLD]]
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lo
-; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, lo
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -900,21 +881,20 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
%old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
-; CHECK-REG-NOT: stxrb w1, w1, [x{{[0-9]+}}]
; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -923,21 +903,20 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
%old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
-; CHECK-REG-NOT: stlxrh w1, w1, [x{{[0-9]+}}]
; CHECK: stlxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -946,45 +925,44 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
%old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
-; CHECK-REG-NOT: stlxr w1, w1, [x{{[0-9]+}}]
; CHECK: stlxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
-define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
+define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i64:
%old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp x[[OLD]], x0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
-; CHECK-REG-NOT: stxr w1, x1, [x{{[0-9]+}}]
; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
- ret i64 %old
+; CHECK: str x[[OLD]],
+ store i64 %old, i64* @var64
+ ret void
}
define i8 @test_atomic_load_monotonic_i8() nounwind {
@@ -992,7 +970,7 @@ define i8 @test_atomic_load_monotonic_i8() nounwind {
%val = load atomic i8* @var8 monotonic, align 1
; CHECK-NOT: dmb
; CHECK: adrp x[[HIADDR:[0-9]+]], var8
-; CHECK: ldrb w0, [x[[HIADDR]], #:lo12:var8]
+; CHECK: ldrb w0, [x[[HIADDR]], {{#?}}:lo12:var8]
; CHECK-NOT: dmb
ret i8 %val
@@ -1017,7 +995,7 @@ define i8 @test_atomic_load_acquire_i8() nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK-NOT: dmb
; CHECK: ldarb w0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1030,7 +1008,7 @@ define i8 @test_atomic_load_seq_cst_i8() nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var8
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
; CHECK-NOT: dmb
; CHECK: ldarb w0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1043,7 +1021,7 @@ define i16 @test_atomic_load_monotonic_i16() nounwind {
; CHECK-NOT: dmb
; CHECK: adrp x[[HIADDR:[0-9]+]], var16
; CHECK-NOT: dmb
-; CHECK: ldrh w0, [x[[HIADDR]], #:lo12:var16]
+; CHECK: ldrh w0, [x[[HIADDR]], {{#?}}:lo12:var16]
; CHECK-NOT: dmb
ret i16 %val
@@ -1068,7 +1046,7 @@ define i64 @test_atomic_load_seq_cst_i64() nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var64
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var64
; CHECK-NOT: dmb
; CHECK: ldar x0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1079,7 +1057,7 @@ define void @test_atomic_store_monotonic_i8(i8 %val) nounwind {
; CHECK-LABEL: test_atomic_store_monotonic_i8:
store atomic i8 %val, i8* @var8 monotonic, align 1
; CHECK: adrp x[[HIADDR:[0-9]+]], var8
-; CHECK: strb w0, [x[[HIADDR]], #:lo12:var8]
+; CHECK: strb w0, [x[[HIADDR]], {{#?}}:lo12:var8]
ret void
}
@@ -1101,7 +1079,7 @@ define void @test_atomic_store_release_i8(i8 %val) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var8
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
; CHECK-NOT: dmb
; CHECK: stlrb w0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1114,7 +1092,7 @@ define void @test_atomic_store_seq_cst_i8(i8 %val) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var8
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
; CHECK-NOT: dmb
; CHECK: stlrb w0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1128,7 +1106,7 @@ define void @test_atomic_store_monotonic_i16(i16 %val) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp x[[HIADDR:[0-9]+]], var16
; CHECK-NOT: dmb
-; CHECK: strh w0, [x[[HIADDR]], #:lo12:var16]
+; CHECK: strh w0, [x[[HIADDR]], {{#?}}:lo12:var16]
; CHECK-NOT: dmb
ret void
}
@@ -1153,7 +1131,7 @@ define void @test_atomic_store_release_i64(i64 %val) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var64
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var64
; CHECK-NOT: dmb
; CHECK: stlr x0, [x[[ADDR]]]
; CHECK-NOT: dmb
diff --git a/test/CodeGen/AArch64/basic-pic.ll b/test/CodeGen/AArch64/basic-pic.ll
index 682b7ba..62d41bc 100644
--- a/test/CodeGen/AArch64/basic-pic.ll
+++ b/test/CodeGen/AArch64/basic-pic.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
@var = global i32 0
@@ -7,7 +7,7 @@ define i32 @get_globalvar() {
%val = load i32* @var
; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
-; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], #:got_lo12:var]
+; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], {{#?}}:got_lo12:var]
; CHECK: ldr w0, [x[[GOTLOC]]]
ret i32 %val
@@ -18,7 +18,7 @@ define i32* @get_globalvaraddr() {
%val = load i32* @var
; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
-; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:var]
+; CHECK: ldr x0, [x[[GOTHI]], {{#?}}:got_lo12:var]
ret i32* @var
}
@@ -30,7 +30,7 @@ define i32 @get_hiddenvar() {
%val = load i32* @hiddenvar
; CHECK: adrp x[[HI:[0-9]+]], hiddenvar
-; CHECK: ldr w0, [x[[HI]], #:lo12:hiddenvar]
+; CHECK: ldr w0, [x[[HI]], {{#?}}:lo12:hiddenvar]
ret i32 %val
}
@@ -40,7 +40,7 @@ define i32* @get_hiddenvaraddr() {
%val = load i32* @hiddenvar
; CHECK: adrp [[HI:x[0-9]+]], hiddenvar
-; CHECK: add x0, [[HI]], #:lo12:hiddenvar
+; CHECK: add x0, [[HI]], {{#?}}:lo12:hiddenvar
ret i32* @hiddenvar
}
@@ -50,5 +50,5 @@ define void()* @get_func() {
ret void()* bitcast(void()*()* @get_func to void()*)
; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func
-; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:get_func]
+; CHECK: ldr x0, [x[[GOTHI]], {{#?}}:got_lo12:get_func]
}
diff --git a/test/CodeGen/AArch64/bitfield-insert-0.ll b/test/CodeGen/AArch64/bitfield-insert-0.ll
index 37a18b7..da0ed8a 100644
--- a/test/CodeGen/AArch64/bitfield-insert-0.ll
+++ b/test/CodeGen/AArch64/bitfield-insert-0.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -disassemble - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -filetype=obj -o - %s | llvm-objdump -disassemble - | FileCheck %s
; The encoding of lsb -> immr in the CGed bitfield instructions was wrong at one
; point, in the edge case where lsb = 0. Just make sure.
diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll
index 1f04608..2369a55 100644
--- a/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/test/CodeGen/AArch64/bitfield-insert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK
; First, a simple example from Clang. The registers could plausibly be
; different, but probably won't be.
@@ -7,8 +7,7 @@
define [1 x i64] @from_clang([1 x i64] %f.coerce, i32 %n) nounwind readnone {
; CHECK-LABEL: from_clang:
-; CHECK: bfi w0, w1, #3, #4
-; CHECK-NEXT: ret
+; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
entry:
%f.coerce.fca.0.extract = extractvalue [1 x i64] %f.coerce, 0
@@ -26,6 +25,7 @@ entry:
define void @test_whole32(i32* %existing, i32* %new) {
; CHECK-LABEL: test_whole32:
+
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5
%oldval = load volatile i32* %existing
@@ -62,8 +62,10 @@ define void @test_whole64(i64* %existing, i64* %new) {
define void @test_whole32_from64(i64* %existing, i64* %new) {
; CHECK-LABEL: test_whole32_from64:
-; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16
-; CHECK-NOT: and
+
+
+; CHECK: bfxil {{x[0-9]+}}, {{x[0-9]+}}, #0, #16
+
; CHECK: ret
%oldval = load volatile i64* %existing
@@ -80,8 +82,9 @@ define void @test_whole32_from64(i64* %existing, i64* %new) {
define void @test_32bit_masked(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_masked:
+
+; CHECK: and
; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
-; CHECK: and {{w[0-9]+}}, [[INSERT]], #0xff
%oldval = load volatile i32* %existing
%oldval_keep = and i32 %oldval, 135 ; = 0x87
@@ -98,8 +101,8 @@ define void @test_32bit_masked(i32 *%existing, i32 *%new) {
define void @test_64bit_masked(i64 *%existing, i64 *%new) {
; CHECK-LABEL: test_64bit_masked:
+; CHECK: and
; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
-; CHECK: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000
%oldval = load volatile i64* %existing
%oldval_keep = and i64 %oldval, 1095216660480 ; = 0xff_0000_0000
@@ -117,8 +120,9 @@ define void @test_64bit_masked(i64 *%existing, i64 *%new) {
; Mask is too complicated for literal ANDwwi, make sure other avenues are tried.
define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_complexmask:
+
+; CHECK: and
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
-; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
%oldval = load volatile i32* %existing
%oldval_keep = and i32 %oldval, 647 ; = 0x287
@@ -137,6 +141,7 @@ define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_badmask:
; CHECK-NOT: bfi
+; CHECK-NOT: bfm
; CHECK: ret
%oldval = load volatile i32* %existing
@@ -156,6 +161,7 @@ define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
define void @test_64bit_badmask(i64 *%existing, i64 *%new) {
; CHECK-LABEL: test_64bit_badmask:
; CHECK-NOT: bfi
+; CHECK-NOT: bfm
; CHECK: ret
%oldval = load volatile i64* %existing
@@ -186,8 +192,7 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) {
%combined = or i32 %oldval_keep, %newval_masked
store volatile i32 %combined, i32* %existing
; CHECK: lsr [[BIT:w[0-9]+]], {{w[0-9]+}}, #14
-; CHECK: bfi {{w[0-9]}}, [[BIT]], #26, #5
+; CHECK: bfi {{w[0-9]+}}, [[BIT]], #26, #5
ret void
}
-
diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll
index 1c84f5d..0e12653 100644
--- a/test/CodeGen/AArch64/bitfield.ll
+++ b/test/CodeGen/AArch64/bitfield.ll
@@ -1,5 +1,4 @@
-
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
@var32 = global i32 0
@var64 = global i64 0
@@ -24,7 +23,7 @@ define void @test_extendb(i8 %var) {
%uxt64 = zext i8 %var to i64
store volatile i64 %uxt64, i64* @var64
-; CHECK: uxtb {{x[0-9]+}}, {{w[0-9]+}}
+; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff
ret void
}
@@ -48,7 +47,7 @@ define void @test_extendh(i16 %var) {
%uxt64 = zext i16 %var to i64
store volatile i64 %uxt64, i64* @var64
-; CHECK: uxth {{x[0-9]+}}, {{w[0-9]+}}
+; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff
ret void
}
@@ -61,7 +60,7 @@ define void @test_extendw(i32 %var) {
%uxt64 = zext i32 %var to i64
store volatile i64 %uxt64, i64* @var64
-; CHECK: ubfx {{w[0-9]+}}, {{w[0-9]+}}, #0, #32
+; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32
ret void
}
@@ -190,7 +189,6 @@ define i32 @test_ubfx32(i32* %addr) {
define i64 @test_ubfx64(i64* %addr) {
; CHECK-LABEL: test_ubfx64:
; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #25, #10
-
%fields = load i64* %addr
%shifted = lshr i64 %fields, 25
%masked = and i64 %shifted, 1023
diff --git a/test/CodeGen/AArch64/blockaddress.ll b/test/CodeGen/AArch64/blockaddress.ll
index 8cda431..1eec4cc 100644
--- a/test/CodeGen/AArch64/blockaddress.ll
+++ b/test/CodeGen/AArch64/blockaddress.ll
@@ -9,7 +9,7 @@ define void @test_blockaddress() {
%val = load volatile i8** @addr
indirectbr i8* %val, [label %block]
; CHECK: adrp [[DEST_HI:x[0-9]+]], [[DEST_LBL:.Ltmp[0-9]+]]
-; CHECK: add [[DEST:x[0-9]+]], [[DEST_HI]], #:lo12:[[DEST_LBL]]
+; CHECK: add [[DEST:x[0-9]+]], [[DEST_HI]], {{#?}}:lo12:[[DEST_LBL]]
; CHECK: str [[DEST]],
; CHECK: ldr [[NEWDEST:x[0-9]+]]
; CHECK: br [[NEWDEST]]
diff --git a/test/CodeGen/AArch64/bool-loads.ll b/test/CodeGen/AArch64/bool-loads.ll
index 5c7640b..881aeaa 100644
--- a/test/CodeGen/AArch64/bool-loads.ll
+++ b/test/CodeGen/AArch64/bool-loads.ll
@@ -1,54 +1,54 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
@var = global i1 0
define i32 @test_sextloadi32() {
-; CHECK: test_sextloadi32
+; CHECK-LABEL: test_sextloadi32
%val = load i1* @var
%ret = sext i1 %val to i32
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
-; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #1
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var]
+; CHECK: {{sbfx x[0-9]+, x[0-9]+, #0, #1|sbfx w[0-9]+, w[0-9]+, #0, #1}}
ret i32 %ret
; CHECK: ret
}
define i64 @test_sextloadi64() {
-; CHECK: test_sextloadi64
+; CHECK-LABEL: test_sextloadi64
%val = load i1* @var
%ret = sext i1 %val to i64
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
-; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #1
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var]
+; CHECK: {{sbfx x[0-9]+, x[0-9]+, #0, #1}}
ret i64 %ret
; CHECK: ret
}
define i32 @test_zextloadi32() {
-; CHECK: test_zextloadi32
+; CHECK-LABEL: test_zextloadi32
; It's not actually necessary that "ret" is next, but as far as LLVM
; is concerned only 0 or 1 should be loadable so no extension is
; necessary.
%val = load i1* @var
%ret = zext i1 %val to i32
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var]
ret i32 %ret
; CHECK-NEXT: ret
}
define i64 @test_zextloadi64() {
-; CHECK: test_zextloadi64
+; CHECK-LABEL: test_zextloadi64
; It's not actually necessary that "ret" is next, but as far as LLVM
; is concerned only 0 or 1 should be loadable so no extension is
; necessary.
%val = load i1* @var
%ret = zext i1 %val to i64
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var]
ret i64 %ret
; CHECK-NEXT: ret
diff --git a/test/CodeGen/AArch64/breg.ll b/test/CodeGen/AArch64/breg.ll
index 1ed5b9b..591f483 100644
--- a/test/CodeGen/AArch64/breg.ll
+++ b/test/CodeGen/AArch64/breg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@stored_label = global i8* null
@@ -7,7 +7,7 @@ define void @foo() {
%lab = load i8** @stored_label
indirectbr i8* %lab, [label %otherlab, label %retlab]
; CHECK: adrp {{x[0-9]+}}, stored_label
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:stored_label]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:stored_label]
; CHECK: br {{x[0-9]+}}
otherlab:
diff --git a/test/CodeGen/AArch64/callee-save.ll b/test/CodeGen/AArch64/callee-save.ll
index 52243b0..046e6ce 100644
--- a/test/CodeGen/AArch64/callee-save.ll
+++ b/test/CodeGen/AArch64/callee-save.ll
@@ -1,14 +1,14 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
@var = global float 0.0
define void @foo() {
; CHECK-LABEL: foo:
-; CHECK: stp d14, d15, [sp
-; CHECK: stp d12, d13, [sp
-; CHECK: stp d10, d11, [sp
-; CHECK: stp d8, d9, [sp
+; CHECK: stp d15, d14, [sp
+; CHECK: stp d13, d12, [sp
+; CHECK: stp d11, d10, [sp
+; CHECK: stp d9, d8, [sp
; Create lots of live variables to exhaust the supply of
; caller-saved registers
@@ -78,9 +78,9 @@ define void @foo() {
store volatile float %val31, float* @var
store volatile float %val32, float* @var
-; CHECK: ldp d8, d9, [sp
-; CHECK: ldp d10, d11, [sp
-; CHECK: ldp d12, d13, [sp
-; CHECK: ldp d14, d15, [sp
+; CHECK: ldp d9, d8, [sp
+; CHECK: ldp d11, d10, [sp
+; CHECK: ldp d13, d12, [sp
+; CHECK: ldp d15, d14, [sp
ret void
}
diff --git a/test/CodeGen/AArch64/code-model-large-abs.ll b/test/CodeGen/AArch64/code-model-large-abs.ll
index b387f28..ca92500 100644
--- a/test/CodeGen/AArch64/code-model-large-abs.ll
+++ b/test/CodeGen/AArch64/code-model-large-abs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -code-model=large -o - %s | FileCheck %s
@var8 = global i8 0
@var16 = global i16 0
diff --git a/test/CodeGen/AArch64/compare-branch.ll b/test/CodeGen/AArch64/compare-branch.ll
index 75efd9d..a1a87cf 100644
--- a/test/CodeGen/AArch64/compare-branch.ll
+++ b/test/CodeGen/AArch64/compare-branch.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/concatvector-bugs.ll b/test/CodeGen/AArch64/concatvector-bugs.ll
deleted file mode 100644
index 5889e22..0000000
--- a/test/CodeGen/AArch64/concatvector-bugs.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon
-; Bug: i8 type in FPR8 register but not registering with register class causes segmentation fault.
-; Fix: Removed i8 type from FPR8 register class.
-
-define void @test_concatvector_v8i8() {
-entry.split:
- br i1 undef, label %if.then, label %if.end
-
-if.then: ; preds = %entry.split
- unreachable
-
-if.end: ; preds = %entry.split
- br i1 undef, label %if.then9, label %if.end18
-
-if.then9: ; preds = %if.end
- unreachable
-
-if.end18: ; preds = %if.end
- br label %for.body
-
-for.body: ; preds = %for.inc, %if.end18
- br i1 false, label %if.then30, label %for.inc
-
-if.then30: ; preds = %for.body
- unreachable
-
-for.inc: ; preds = %for.body
- br i1 undef, label %for.end, label %for.body
-
-for.end: ; preds = %for.inc
- br label %for.body77
-
-for.body77: ; preds = %for.body77, %for.end
- br i1 undef, label %for.end106, label %for.body77
-
-for.end106: ; preds = %for.body77
- br i1 undef, label %for.body130.us.us, label %stmt.for.body130.us.us
-
-stmt.for.body130.us.us: ; preds = %stmt.for.body130.us.us, %for.end106
- %_p_splat.us = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
- store <8 x i8> %_p_splat.us, <8 x i8>* undef, align 1
- br label %stmt.for.body130.us.us
-
-for.body130.us.us: ; preds = %for.body130.us.us, %for.end106
- br label %for.body130.us.us
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32)
-
-define <8 x i16> @test_splat(i32 %l) nounwind {
-; CHECK-LABEL: test_splat:
-; CHECK: ret
- %lhs = insertelement <1 x i32> undef, i32 %l, i32 0
- %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11)
- %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> zeroinitializer
- ret <8 x i16> %vec
-}
-
-
-define <8 x i16> @test_notsplat(<8 x i16> %a, <8 x i16> %b, i32 %l) nounwind {
-; CHECK-LABEL: test_notsplat:
-; CHECK: ret
-entry:
- %lhs = insertelement <1 x i32> undef, i32 %l, i32 0
- %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11)
- %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0>
- ret <8 x i16> %vec
-}
diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll
index 9c1dfeb..5f81cba 100644
--- a/test/CodeGen/AArch64/cond-sel.ll
+++ b/test/CodeGen/AArch64/cond-sel.ll
@@ -1,25 +1,25 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var32 = global i32 0
@var64 = global i64 0
-define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
+define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csel:
%tst1 = icmp ugt i32 %lhs32, %rhs32
%val1 = select i1 %tst1, i32 42, i32 52
store i32 %val1, i32* @var32
-; CHECK-DAG: movz [[W52:w[0-9]+]], #52
-; CHECK-DAG: movz [[W42:w[0-9]+]], #42
+; CHECK-DAG: movz [[W52:w[0-9]+]], #{{52|0x34}}
+; CHECK-DAG: movz [[W42:w[0-9]+]], #{{42|0x2a}}
; CHECK: csel {{w[0-9]+}}, [[W42]], [[W52]], hi
%rhs64 = sext i32 %rhs32 to i64
%tst2 = icmp sle i64 %lhs64, %rhs64
%val2 = select i1 %tst2, i64 %lhs64, i64 %rhs64
store i64 %val2, i64* @var64
-; CHECK-DAG: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw
-; CHECK-DAG: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]]
+; CHECK: sxtw [[EXT_RHS:x[0-9]+]], {{[wx]}}[[RHS:[0-9]+]]
+; CHECK: cmp [[LHS:x[0-9]+]], w[[RHS]], sxtw
; CHECK: csel {{x[0-9]+}}, [[LHS]], [[EXT_RHS]], le
ret void
@@ -34,8 +34,8 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
; CHECK-NOFP-NOT: fcmp
%val1 = select i1 %tst1, i32 42, i32 52
store i32 %val1, i32* @var32
-; CHECK: movz [[W52:w[0-9]+]], #52
-; CHECK: movz [[W42:w[0-9]+]], #42
+; CHECK: movz [[W52:w[0-9]+]], #{{52|0x34}}
+; CHECK: movz [[W42:w[0-9]+]], #{{42|0x2a}}
; CHECK: csel [[MAYBETRUE:w[0-9]+]], [[W42]], [[W52]], mi
; CHECK: csel {{w[0-9]+}}, [[W42]], [[MAYBETRUE]], gt
@@ -45,17 +45,17 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
; CHECK-NOFP-NOT: fcmp
%val2 = select i1 %tst2, i64 9, i64 15
store i64 %val2, i64* @var64
-; CHECK: movz [[CONST15:x[0-9]+]], #15
-; CHECK: movz [[CONST9:x[0-9]+]], #9
-; CHECK: csel [[MAYBETRUE:x[0-9]+]], [[CONST9]], [[CONST15]], eq
-; CHECK: csel {{x[0-9]+}}, [[CONST9]], [[MAYBETRUE]], vs
+; CHECK: orr w[[CONST15:[0-9]+]], wzr, #0xf
+; CHECK: movz {{[wx]}}[[CONST9:[0-9]+]], #{{9|0x9}}
+; CHECK: csel [[MAYBETRUE:x[0-9]+]], x[[CONST9]], x[[CONST15]], eq
+; CHECK: csel {{x[0-9]+}}, x[[CONST9]], [[MAYBETRUE]], vs
ret void
; CHECK: ret
}
-define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
+define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csinc:
; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls).
@@ -95,7 +95,7 @@ define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
; CHECK: ret
}
-define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
+define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csinv:
; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls).
@@ -135,7 +135,7 @@ define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
; CHECK: ret
}
-define void @test_csneg(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
+define void @test_csneg(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csneg:
; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls).
@@ -184,13 +184,13 @@ define void @test_cset(i32 %lhs, i32 %rhs, i64 %lhs64) {
%val1 = zext i1 %tst1 to i32
store i32 %val1, i32* @var32
; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ne
+; CHECK: cset {{w[0-9]+}}, eq
%rhs64 = sext i32 %rhs to i64
%tst2 = icmp ule i64 %lhs64, %rhs64
%val2 = zext i1 %tst2 to i64
store i64 %val2, i64* @var64
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, hi
+; CHECK: cset {{w[0-9]+}}, ls
ret void
; CHECK: ret
@@ -203,13 +203,13 @@ define void @test_csetm(i32 %lhs, i32 %rhs, i64 %lhs64) {
%val1 = sext i1 %tst1 to i32
store i32 %val1, i32* @var32
; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: csinv {{w[0-9]+}}, wzr, wzr, ne
+; CHECK: csetm {{w[0-9]+}}, eq
%rhs64 = sext i32 %rhs to i64
%tst2 = icmp ule i64 %lhs64, %rhs64
%val2 = sext i1 %tst2 to i64
store i64 %val2, i64* @var64
-; CHECK: csinv {{x[0-9]+}}, xzr, xzr, hi
+; CHECK: csetm {{x[0-9]+}}, ls
ret void
; CHECK: ret
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index f0b60f0..f0f36bd 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -1,9 +1,10 @@
; This tests that llc accepts all valid AArch64 CPUs
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
+
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
; CHECK-NOT: {{.*}} is not a recognized processor for this target
; INVALID: {{.*}} is not a recognized processor for this target
diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll
index 12c7b6a..1b51928 100644
--- a/test/CodeGen/AArch64/directcond.ll
+++ b/test/CodeGen/AArch64/directcond.ll
@@ -1,11 +1,10 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
; CHECK-LABEL: test_select_i32:
%val = select i1 %bit, i32 %a, i32 %b
-; CHECK: movz [[ONE:w[0-9]+]], #1
-; CHECK: tst w0, [[ONE]]
+; CHECK: tst w0, #0x1
; CHECK-NEXT: csel w0, w1, w2, ne
ret i32 %val
@@ -14,8 +13,7 @@ define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) {
; CHECK-LABEL: test_select_i64:
%val = select i1 %bit, i64 %a, i64 %b
-; CHECK: movz [[ONE:w[0-9]+]], #1
-; CHECK: tst w0, [[ONE]]
+; CHECK: tst w0, #0x1
; CHECK-NEXT: csel x0, x1, x2, ne
ret i64 %val
@@ -24,8 +22,7 @@ define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) {
define float @test_select_float(i1 %bit, float %a, float %b) {
; CHECK-LABEL: test_select_float:
%val = select i1 %bit, float %a, float %b
-; CHECK: movz [[ONE:w[0-9]+]], #1
-; CHECK: tst w0, [[ONE]]
+; CHECK: tst w0, #0x1
; CHECK-NEXT: fcsel s0, s0, s1, ne
; CHECK-NOFP-NOT: fcsel
ret float %val
@@ -34,8 +31,7 @@ define float @test_select_float(i1 %bit, float %a, float %b) {
define double @test_select_double(i1 %bit, double %a, double %b) {
; CHECK-LABEL: test_select_double:
%val = select i1 %bit, double %a, double %b
-; CHECK: movz [[ONE:w[0-9]+]], #1
-; CHECK: tst w0, [[ONE]]
+; CHECK: tst w0, #0x1
; CHECK-NEXT: fcsel d0, d0, d1, ne
; CHECK-NOFP-NOT: fcsel
@@ -45,7 +41,7 @@ define double @test_select_double(i1 %bit, double %a, double %b) {
define i32 @test_brcond(i1 %bit) {
; CHECK-LABEL: test_brcond:
br i1 %bit, label %true, label %false
-; CHECK: tbz {{w[0-9]+}}, #0, .LBB
+; CHECK: tbz {{w[0-9]+}}, #0, {{.?LBB}}
true:
ret i32 0
@@ -57,7 +53,7 @@ define i1 @test_setcc_float(float %lhs, float %rhs) {
; CHECK: test_setcc_float
%val = fcmp oeq float %lhs, %rhs
; CHECK: fcmp s0, s1
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
; CHECK-NOFP-NOT: fcmp
ret i1 %val
}
@@ -66,7 +62,7 @@ define i1 @test_setcc_double(double %lhs, double %rhs) {
; CHECK: test_setcc_double
%val = fcmp oeq double %lhs, %rhs
; CHECK: fcmp d0, d1
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
; CHECK-NOFP-NOT: fcmp
ret i1 %val
}
@@ -75,7 +71,7 @@ define i1 @test_setcc_i32(i32 %lhs, i32 %rhs) {
; CHECK: test_setcc_i32
%val = icmp ugt i32 %lhs, %rhs
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, ls
+; CHECK: cset w0, hi
ret i1 %val
}
@@ -83,6 +79,6 @@ define i1 @test_setcc_i64(i64 %lhs, i64 %rhs) {
; CHECK: test_setcc_i64
%val = icmp ne i64 %lhs, %rhs
; CHECK: cmp x0, x1
-; CHECK: csinc w0, wzr, wzr, eq
+; CHECK: cset w0, ne
ret i1 %val
}
diff --git a/test/CodeGen/AArch64/dp-3source.ll b/test/CodeGen/AArch64/dp-3source.ll
index 81d9e15..22bd4a8 100644
--- a/test/CodeGen/AArch64/dp-3source.ll
+++ b/test/CodeGen/AArch64/dp-3source.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i32 @test_madd32(i32 %val0, i32 %val1, i32 %val2) {
; CHECK-LABEL: test_madd32:
diff --git a/test/CodeGen/AArch64/dp1.ll b/test/CodeGen/AArch64/dp1.ll
index 6a8d55c..662b415 100644
--- a/test/CodeGen/AArch64/dp1.ll
+++ b/test/CodeGen/AArch64/dp1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/dp2.ll b/test/CodeGen/AArch64/dp2.ll
index 48b0701..71b3169 100644
--- a/test/CodeGen/AArch64/dp2.ll
+++ b/test/CodeGen/AArch64/dp2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64 | FileCheck %s
@var32_0 = global i32 0
@var32_1 = global i32 0
@@ -13,7 +13,7 @@ define void @rorv_i64() {
%val3_tmp = shl i64 %val0_tmp, %val2_tmp
%val4_tmp = lshr i64 %val0_tmp, %val1_tmp
%val5_tmp = or i64 %val3_tmp, %val4_tmp
-; CHECK: ror {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: {{ror|rorv}} {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
store volatile i64 %val5_tmp, i64* @var64_0
ret void
}
@@ -23,7 +23,7 @@ define void @asrv_i64() {
%val0_tmp = load i64* @var64_0
%val1_tmp = load i64* @var64_1
%val4_tmp = ashr i64 %val0_tmp, %val1_tmp
-; CHECK: asr {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: {{asr|asrv}} {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
store volatile i64 %val4_tmp, i64* @var64_1
ret void
}
@@ -33,7 +33,7 @@ define void @lsrv_i64() {
%val0_tmp = load i64* @var64_0
%val1_tmp = load i64* @var64_1
%val4_tmp = lshr i64 %val0_tmp, %val1_tmp
-; CHECK: lsr {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: {{lsr|lsrv}} {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
store volatile i64 %val4_tmp, i64* @var64_0
ret void
}
@@ -43,7 +43,7 @@ define void @lslv_i64() {
%val0_tmp = load i64* @var64_0
%val1_tmp = load i64* @var64_1
%val4_tmp = shl i64 %val0_tmp, %val1_tmp
-; CHECK: lsl {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: {{lsl|lslv}} {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
store volatile i64 %val4_tmp, i64* @var64_1
ret void
}
@@ -75,7 +75,7 @@ define void @lsrv_i32() {
%val1_tmp = load i32* @var32_1
%val2_tmp = add i32 1, %val1_tmp
%val4_tmp = lshr i32 %val0_tmp, %val2_tmp
-; CHECK: lsr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{lsr|lsrv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
store volatile i32 %val4_tmp, i32* @var32_0
ret void
}
@@ -86,7 +86,7 @@ define void @lslv_i32() {
%val1_tmp = load i32* @var32_1
%val2_tmp = add i32 1, %val1_tmp
%val4_tmp = shl i32 %val0_tmp, %val2_tmp
-; CHECK: lsl {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{lsl|lslv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
store volatile i32 %val4_tmp, i32* @var32_1
ret void
}
@@ -100,7 +100,7 @@ define void @rorv_i32() {
%val3_tmp = shl i32 %val0_tmp, %val2_tmp
%val4_tmp = lshr i32 %val0_tmp, %val1_tmp
%val5_tmp = or i32 %val3_tmp, %val4_tmp
-; CHECK: ror {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{ror|rorv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
store volatile i32 %val5_tmp, i32* @var32_0
ret void
}
@@ -111,7 +111,7 @@ define void @asrv_i32() {
%val1_tmp = load i32* @var32_1
%val2_tmp = add i32 1, %val1_tmp
%val4_tmp = ashr i32 %val0_tmp, %val2_tmp
-; CHECK: asr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{asr|asrv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
store volatile i32 %val4_tmp, i32* @var32_1
ret void
}
@@ -143,7 +143,7 @@ define i32 @test_lsl32() {
%val = load i32* @var32_0
%ret = shl i32 1, %val
-; CHECK: lsl {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{lsl|lslv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
ret i32 %ret
}
@@ -153,7 +153,7 @@ define i32 @test_lsr32() {
%val = load i32* @var32_0
%ret = lshr i32 1, %val
-; CHECK: lsr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{lsr|lsrv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
ret i32 %ret
}
@@ -163,7 +163,7 @@ define i32 @test_asr32(i32 %in) {
%val = load i32* @var32_0
%ret = ashr i32 %in, %val
-; CHECK: asr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{asr|asrv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
ret i32 %ret
}
diff --git a/test/CodeGen/AArch64/eliminate-trunc.ll b/test/CodeGen/AArch64/eliminate-trunc.ll
new file mode 100644
index 0000000..ea86a08
--- /dev/null
+++ b/test/CodeGen/AArch64/eliminate-trunc.ll
@@ -0,0 +1,39 @@
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-apple-ios7.0 -mcpu=cyclone | FileCheck %s
+
+; Check that the trunc i64 operation is translated as a subregister access,
+; eliminating an i32 induction variable.
+
+; CHECK-NOT: add {{x[0-9]+}}, {{x[0-9]+}}, #1
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
+; CHECK-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+define void @test1_signed([8 x i8]* nocapture %a, i8* nocapture readonly %box, i8 %limit) minsize {
+entry:
+ %conv = zext i8 %limit to i32
+ %cmp223 = icmp eq i8 %limit, 0
+ br i1 %cmp223, label %for.end15, label %for.body4.lr.ph.us
+
+for.body4.us:
+ %indvars.iv = phi i64 [ 0, %for.body4.lr.ph.us ], [ %indvars.iv.next, %for.body4.us ]
+ %arrayidx6.us = getelementptr inbounds [8 x i8]* %a, i64 %indvars.iv26, i64 %indvars.iv
+ %0 = load i8* %arrayidx6.us, align 1
+ %idxprom7.us = zext i8 %0 to i64
+ %arrayidx8.us = getelementptr inbounds i8* %box, i64 %idxprom7.us
+ %1 = load i8* %arrayidx8.us, align 1
+ store i8 %1, i8* %arrayidx6.us, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %2 = trunc i64 %indvars.iv.next to i32
+ %cmp2.us = icmp slt i32 %2, %conv
+ br i1 %cmp2.us, label %for.body4.us, label %for.cond1.for.inc13_crit_edge.us
+
+for.body4.lr.ph.us:
+ %indvars.iv26 = phi i64 [ %indvars.iv.next27, %for.cond1.for.inc13_crit_edge.us ], [ 0, %entry ]
+ br label %for.body4.us
+
+for.cond1.for.inc13_crit_edge.us:
+ %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1
+ %exitcond28 = icmp eq i64 %indvars.iv26, 3
+ br i1 %exitcond28, label %for.end15, label %for.body4.lr.ph.us
+
+for.end15:
+ ret void
+}
diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll
index 322b3f4..ce5c0f6 100644
--- a/test/CodeGen/AArch64/extern-weak.ll
+++ b/test/CodeGen/AArch64/extern-weak.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -o - < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s
declare extern_weak i32 @var()
@@ -7,10 +7,10 @@ define i32()* @foo() {
; The usual ADRP/ADD pair can't be used for a weak reference because it must
; evaluate to 0 if the symbol is undefined. We use a litpool entry.
ret i32()* @var
-; CHECK: .LCPI0_0:
-; CHECK-NEXT: .xword var
-; CHECK: ldr x0, [{{x[0-9]+}}, #:lo12:.LCPI0_0]
+
+; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[ADDRHI]], :got_lo12:var]
; In the large model, the usual relocations are absolute and can
; materialise 0.
@@ -25,27 +25,29 @@ define i32()* @foo() {
define i32* @bar() {
%addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5
-; CHECK: .LCPI1_0:
-; CHECK-NEXT: .xword arr_var
-; CHECK: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI1_0]
+
+; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:arr_var
+; CHECK: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var]
; CHECK: add x0, [[BASE]], #20
+
ret i32* %addr
; In the large model, the usual relocations are absolute and can
; materialise 0.
-; CHECK-LARGE: movz x0, #:abs_g3:arr_var
-; CHECK-LARGE: movk x0, #:abs_g2_nc:arr_var
-; CHECK-LARGE: movk x0, #:abs_g1_nc:arr_var
-; CHECK-LARGE: movk x0, #:abs_g0_nc:arr_var
+; CHECK-LARGE: movz [[ADDR:x[0-9]+]], #:abs_g3:arr_var
+; CHECK-LARGE: movk [[ADDR]], #:abs_g2_nc:arr_var
+; CHECK-LARGE: movk [[ADDR]], #:abs_g1_nc:arr_var
+; CHECK-LARGE: movk [[ADDR]], #:abs_g0_nc:arr_var
}
@defined_weak_var = internal unnamed_addr global i32 0
define i32* @wibble() {
ret i32* @defined_weak_var
+
; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
-; CHECK: add x0, [[BASE]], #:lo12:defined_weak_var
+; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
diff --git a/test/CodeGen/AArch64/extract.ll b/test/CodeGen/AArch64/extract.ll
index 62d9ed2..1fc9387 100644
--- a/test/CodeGen/AArch64/extract.ll
+++ b/test/CodeGen/AArch64/extract.ll
@@ -1,11 +1,11 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i64 @ror_i64(i64 %in) {
; CHECK-LABEL: ror_i64:
%left = shl i64 %in, 19
%right = lshr i64 %in, 45
%val5 = or i64 %left, %right
-; CHECK: extr {{x[0-9]+}}, x0, x0, #45
+; CHECK: ror {{x[0-9]+}}, x0, #45
ret i64 %val5
}
@@ -14,7 +14,7 @@ define i32 @ror_i32(i32 %in) {
%left = shl i32 %in, 9
%right = lshr i32 %in, 23
%val5 = or i32 %left, %right
-; CHECK: extr {{w[0-9]+}}, w0, w0, #23
+; CHECK: ror {{w[0-9]+}}, w0, #23
ret i32 %val5
}
diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll
index c6c0505..a392619 100644
--- a/test/CodeGen/AArch64/fastcc-reserved.ll
+++ b/test/CodeGen/AArch64/fastcc-reserved.ll
@@ -12,8 +12,8 @@ define fastcc void @foo(i32 %in) {
%addr = alloca i8, i32 %in
; Normal frame setup stuff:
-; CHECK: sub sp, sp,
-; CHECK: stp x29, x30
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
; Reserve space for call-frame:
; CHECK: sub sp, sp, #16
@@ -26,8 +26,8 @@ define fastcc void @foo(i32 %in) {
; CHECK-NOT: sub sp, sp, #16
; CHECK-NOT: add sp, sp,
-; CHECK: ldp x29, x30
-; CHECK: add sp, sp,
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
ret void
}
@@ -38,8 +38,8 @@ define void @foo1(i32 %in) {
%addr = alloca i8, i32 %in
; Normal frame setup again
-; CHECK: sub sp, sp,
-; CHECK: stp x29, x30
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
; Reserve space for call-frame
; CHECK: sub sp, sp, #16
@@ -52,7 +52,7 @@ define void @foo1(i32 %in) {
; Check for epilogue (primarily to make sure sp spotted above wasn't
; part of it).
-; CHECK: ldp x29, x30
-; CHECK: add sp, sp,
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
ret void
}
diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll
index a4cd378..9917fcd 100644
--- a/test/CodeGen/AArch64/fastcc.ll
+++ b/test/CodeGen/AArch64/fastcc.ll
@@ -6,10 +6,13 @@
define fastcc void @func_stack0() {
; CHECK-LABEL: func_stack0:
-; CHECK: sub sp, sp, #48
+; CHECK: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #32
; CHECK-TAIL-LABEL: func_stack0:
-; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-TAIL-NEXT: mov x29, sp
+; CHECK-TAIL-NEXT: sub sp, sp, #32
call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -24,6 +27,7 @@ define fastcc void @func_stack0() {
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
@@ -32,30 +36,39 @@ define fastcc void @func_stack0() {
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
-; CHECK-TAIL: add sp, sp, #48
-; CHECK-TAIL-NEXT: ret
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-TAIL-NEXT: ret
}
define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-LABEL: func_stack8:
-; CHECK: sub sp, sp, #48
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #32
+
; CHECK-TAIL-LABEL: func_stack8:
-; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-TAIL: mov x29, sp
+; CHECK-TAIL: sub sp, sp, #32
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
@@ -64,6 +77,7 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
@@ -76,19 +90,22 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
-; CHECK-TAIL: add sp, sp, #64
+
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-TAIL-NEXT: ret
}
define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK-LABEL: func_stack32:
-; CHECK: sub sp, sp, #48
+; CHECK: mov x29, sp
; CHECK-TAIL-LABEL: func_stack32:
-; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: mov x29, sp
call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -103,6 +120,7 @@ define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
@@ -111,13 +129,16 @@ define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
-; CHECK-TAIL: add sp, sp, #80
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-TAIL-NEXT: ret
}
diff --git a/test/CodeGen/AArch64/fcmp.ll b/test/CodeGen/AArch64/fcmp.ll
index a9518ea..3c74508 100644
--- a/test/CodeGen/AArch64/fcmp.ll
+++ b/test/CodeGen/AArch64/fcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
declare void @bar(i32)
diff --git a/test/CodeGen/AArch64/fcvt-fixed.ll b/test/CodeGen/AArch64/fcvt-fixed.ll
index 9d66da4..ccb3616 100644
--- a/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -1,4 +1,8 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 -O0
+
+; (The O0 test is to make sure FastISel still constrains its operands properly
+; and the verifier doesn't trigger).
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll
index 97427a7..d549c7e 100644
--- a/test/CodeGen/AArch64/fcvt-int.ll
+++ b/test/CodeGen/AArch64/fcvt-int.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i32 @test_floattoi32(float %in) {
; CHECK-LABEL: test_floattoi32:
diff --git a/test/CodeGen/AArch64/flags-multiuse.ll b/test/CodeGen/AArch64/flags-multiuse.ll
index e99c728..c9b0b9f 100644
--- a/test/CodeGen/AArch64/flags-multiuse.ll
+++ b/test/CodeGen/AArch64/flags-multiuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
; LLVM should be able to cope with multiple uses of the same flag-setting
; instruction at different points of a routine. Either by rematerializing the
@@ -15,7 +15,7 @@ define i32 @test_multiflag(i32 %n, i32 %m, i32 %o) {
; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]]
%val = zext i1 %test to i32
-; CHECK: csinc {{[xw][0-9]+}}, {{xzr|wzr}}, {{xzr|wzr}}, eq
+; CHECK: cset {{[xw][0-9]+}}, ne
store i32 %val, i32* @var
diff --git a/test/CodeGen/AArch64/floatdp_1source.ll b/test/CodeGen/AArch64/floatdp_1source.ll
index 3d7f8f0..8c02787 100644
--- a/test/CodeGen/AArch64/floatdp_1source.ll
+++ b/test/CodeGen/AArch64/floatdp_1source.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
@varhalf = global half 0.0
@varfloat = global float 0.0
diff --git a/test/CodeGen/AArch64/floatdp_2source.ll b/test/CodeGen/AArch64/floatdp_2source.ll
index bb65528..2622717 100644
--- a/test/CodeGen/AArch64/floatdp_2source.ll
+++ b/test/CodeGen/AArch64/floatdp_2source.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -mcpu=cyclone | FileCheck %s
@varfloat = global float 0.0
@vardouble = global double 0.0
diff --git a/test/CodeGen/AArch64/fp-cond-sel.ll b/test/CodeGen/AArch64/fp-cond-sel.ll
index 572f42e..b4f4d77 100644
--- a/test/CodeGen/AArch64/fp-cond-sel.ll
+++ b/test/CodeGen/AArch64/fp-cond-sel.ll
@@ -1,25 +1,34 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
@varfloat = global float 0.0
@vardouble = global double 0.0
+declare void @use_float(float)
+declare void @use_double(double)
+
define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
; CHECK-LABEL: test_csel:
%tst1 = icmp ugt i32 %lhs32, %rhs32
%val1 = select i1 %tst1, float 0.0, float 1.0
store float %val1, float* @varfloat
-; CHECK: ldr [[FLT0:s[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI
-; CHECK: fmov [[FLT1:s[0-9]+]], #1.0
-; CHECK: fcsel {{s[0-9]+}}, [[FLT0]], [[FLT1]], hi
+; CHECK: movi v[[FLT0:[0-9]+]].2d, #0
+; CHECK: fmov s[[FLT1:[0-9]+]], #1.0
+; CHECK: fcsel {{s[0-9]+}}, s[[FLT0]], s[[FLT1]], hi
%rhs64 = sext i32 %rhs32 to i64
%tst2 = icmp sle i64 %lhs64, %rhs64
%val2 = select i1 %tst2, double 1.0, double 0.0
store double %val2, double* @vardouble
-; CHECK: ldr [[FLT0:d[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI
-; CHECK: fmov [[FLT1:d[0-9]+]], #1.0
-; CHECK: fcsel {{d[0-9]+}}, [[FLT1]], [[FLT0]], le
+; FLT0 is reused from above on ARM64.
+; CHECK: fmov d[[FLT1:[0-9]+]], #1.0
+; CHECK: fcsel {{d[0-9]+}}, d[[FLT1]], d[[FLT0]], le
+
+ call void @use_float(float 0.0)
+ call void @use_float(float 1.0)
+
+ call void @use_double(double 0.0)
+ call void @use_double(double 1.0)
ret void
; CHECK: ret
diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll
index 2a6790e..10f88fd 100644
--- a/test/CodeGen/AArch64/fp-dp3.ll
+++ b/test/CodeGen/AArch64/fp-dp3.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -check-prefix=CHECK-NOFAST
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s -check-prefix=CHECK-NOFAST
declare float @llvm.fma.f32(float, float, float)
declare double @llvm.fma.f64(double, double, double)
diff --git a/test/CodeGen/AArch64/fp128-folding.ll b/test/CodeGen/AArch64/fp128-folding.ll
index b1c560d..892b19c 100644
--- a/test/CodeGen/AArch64/fp128-folding.ll
+++ b/test/CodeGen/AArch64/fp128-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
declare void @bar(i8*, i8*, i32*)
; SelectionDAG used to try to fold some fp128 operations using the ppc128 type,
@@ -12,6 +12,6 @@ define fp128 @test_folding() {
%fpval = sitofp i32 %val to fp128
; If the value is loaded from a constant pool into an fp128, it's been folded
; successfully.
-; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.LCPI
ret fp128 %fpval
}
diff --git a/test/CodeGen/AArch64/fp128.ll b/test/CodeGen/AArch64/fp128.ll
deleted file mode 100644
index c312bb1..0000000
--- a/test/CodeGen/AArch64/fp128.ll
+++ /dev/null
@@ -1,279 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-
-@lhs = global fp128 zeroinitializer
-@rhs = global fp128 zeroinitializer
-
-define fp128 @test_add() {
-; CHECK-LABEL: test_add:
-
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
-
- %val = fadd fp128 %lhs, %rhs
-; CHECK: bl __addtf3
- ret fp128 %val
-}
-
-define fp128 @test_sub() {
-; CHECK-LABEL: test_sub:
-
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
-
- %val = fsub fp128 %lhs, %rhs
-; CHECK: bl __subtf3
- ret fp128 %val
-}
-
-define fp128 @test_mul() {
-; CHECK-LABEL: test_mul:
-
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
-
- %val = fmul fp128 %lhs, %rhs
-; CHECK: bl __multf3
- ret fp128 %val
-}
-
-define fp128 @test_div() {
-; CHECK-LABEL: test_div:
-
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
-
- %val = fdiv fp128 %lhs, %rhs
-; CHECK: bl __divtf3
- ret fp128 %val
-}
-
-@var32 = global i32 0
-@var64 = global i64 0
-
-define void @test_fptosi() {
-; CHECK-LABEL: test_fptosi:
- %val = load fp128* @lhs
-
- %val32 = fptosi fp128 %val to i32
- store i32 %val32, i32* @var32
-; CHECK: bl __fixtfsi
-
- %val64 = fptosi fp128 %val to i64
- store i64 %val64, i64* @var64
-; CHECK: bl __fixtfdi
-
- ret void
-}
-
-define void @test_fptoui() {
-; CHECK-LABEL: test_fptoui:
- %val = load fp128* @lhs
-
- %val32 = fptoui fp128 %val to i32
- store i32 %val32, i32* @var32
-; CHECK: bl __fixunstfsi
-
- %val64 = fptoui fp128 %val to i64
- store i64 %val64, i64* @var64
-; CHECK: bl __fixunstfdi
-
- ret void
-}
-
-define void @test_sitofp() {
-; CHECK-LABEL: test_sitofp:
-
- %src32 = load i32* @var32
- %val32 = sitofp i32 %src32 to fp128
- store volatile fp128 %val32, fp128* @lhs
-; CHECK: bl __floatsitf
-
- %src64 = load i64* @var64
- %val64 = sitofp i64 %src64 to fp128
- store volatile fp128 %val64, fp128* @lhs
-; CHECK: bl __floatditf
-
- ret void
-}
-
-define void @test_uitofp() {
-; CHECK-LABEL: test_uitofp:
-
- %src32 = load i32* @var32
- %val32 = uitofp i32 %src32 to fp128
- store volatile fp128 %val32, fp128* @lhs
-; CHECK: bl __floatunsitf
-
- %src64 = load i64* @var64
- %val64 = uitofp i64 %src64 to fp128
- store volatile fp128 %val64, fp128* @lhs
-; CHECK: bl __floatunditf
-
- ret void
-}
-
-define i1 @test_setcc1() {
-; CHECK-LABEL: test_setcc1:
-
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
-
-; Technically, everything after the call to __letf2 is redundant, but we'll let
-; LLVM have its fun for now.
- %val = fcmp ole fp128 %lhs, %rhs
-; CHECK: bl __letf2
-; CHECK: cmp w0, #0
-; CHECK: csinc w0, wzr, wzr, gt
-
- ret i1 %val
-; CHECK: ret
-}
-
-define i1 @test_setcc2() {
-; CHECK-LABEL: test_setcc2:
-
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
-
-; Technically, everything after the call to __gttf2 is redundant, but we'll let
-; LLVM have its fun for now.
- %val = fcmp ugt fp128 %lhs, %rhs
-; CHECK: bl __gttf2
-; CHECK: cmp w0, #0
-; CHECK: csinc [[GT:w[0-9]+]], wzr, wzr, le
-
-; CHECK: bl __unordtf2
-; CHECK: cmp w0, #0
-; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
-
-; CHECK: orr w0, [[UNORDERED]], [[GT]]
-
- ret i1 %val
-; CHECK: ret
-}
-
-define i32 @test_br_cc() {
-; CHECK-LABEL: test_br_cc:
-
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
-
- ; olt == !uge, which LLVM unfortunately "optimizes" this to.
- %cond = fcmp olt fp128 %lhs, %rhs
-; CHECK: bl __getf2
-; CHECK: cmp w0, #0
-; CHECK: csinc [[OGE:w[0-9]+]], wzr, wzr, lt
-
-; CHECK: bl __unordtf2
-; CHECK: cmp w0, #0
-; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
-
-; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]]
-; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]]
- br i1 %cond, label %iftrue, label %iffalse
-
-iftrue:
- ret i32 42
-; CHECK-NEXT: BB#
-; CHECK-NEXT: movz x0, #42
-; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]]
-
-iffalse:
- ret i32 29
-; CHECK: [[RET29]]:
-; CHECK-NEXT: movz x0, #29
-; CHECK-NEXT: [[REALRET]]:
-; CHECK: ret
-}
-
-define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) {
-; CHECK-LABEL: test_select:
-
- %val = select i1 %cond, fp128 %lhs, fp128 %rhs
- store fp128 %val, fp128* @lhs
-; CHECK: cmp w0, #0
-; CHECK: str q1, [sp]
-; CHECK-NEXT: b.eq [[IFFALSE:.LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: BB#
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: [[IFFALSE]]:
-; CHECK-NEXT: ldr q0, [sp]
-; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs]
- ret void
-; CHECK: ret
-}
-
-@varfloat = global float 0.0
-@vardouble = global double 0.0
-
-define void @test_round() {
-; CHECK-LABEL: test_round:
-
- %val = load fp128* @lhs
-
- %float = fptrunc fp128 %val to float
- store float %float, float* @varfloat
-; CHECK: bl __trunctfsf2
-; CHECK: str s0, [{{x[0-9]+}}, #:lo12:varfloat]
-
- %double = fptrunc fp128 %val to double
- store double %double, double* @vardouble
-; CHECK: bl __trunctfdf2
-; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble]
-
- ret void
-}
-
-define void @test_extend() {
-; CHECK-LABEL: test_extend:
-
- %val = load fp128* @lhs
-
- %float = load float* @varfloat
- %fromfloat = fpext float %float to fp128
- store volatile fp128 %fromfloat, fp128* @lhs
-; CHECK: bl __extendsftf2
-; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs]
-
- %double = load double* @vardouble
- %fromdouble = fpext double %double to fp128
- store volatile fp128 %fromdouble, fp128* @lhs
-; CHECK: bl __extenddftf2
-; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs]
-
- ret void
-; CHECK: ret
-}
-
-define fp128 @test_neg(fp128 %in) {
-; CHECK: [[MINUS0:.LCPI[0-9]+_0]]:
-; Make sure the weird hex constant below *is* -0.0
-; CHECK-NEXT: fp128 -0
-
-; CHECK-LABEL: test_neg:
-
- ; Could in principle be optimized to fneg which we can't select, this makes
- ; sure that doesn't happen.
- %ret = fsub fp128 0xL00000000000000008000000000000000, %in
-; CHECK: str q0, [sp, #-16]
-; CHECK-NEXT: ldr q1, [sp], #16
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:[[MINUS0]]]
-; CHECK: bl __subtf3
-
- ret fp128 %ret
-; CHECK: ret
-}
diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll
index b8f7169..e59520c 100644
--- a/test/CodeGen/AArch64/fpimm.ll
+++ b/test/CodeGen/AArch64/fpimm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@varf32 = global float 0.0
@varf64 = global double 0.0
@@ -13,7 +13,7 @@ define void @check_float() {
%newval2 = fadd float %val, 128.0
store volatile float %newval2, float* @varf32
-; CHECK-DAG: ldr [[HARD:s[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI0_0
+; CHECK-DAG: ldr [[HARD:s[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:.LCPI0_0
; CHECK: ret
ret void
@@ -29,7 +29,7 @@ define void @check_double() {
%newval2 = fadd double %val, 128.0
store volatile double %newval2, double* @varf64
-; CHECK-DAG: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0
+; CHECK-DAG: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.LCPI1_0
; CHECK: ret
ret void
diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll
index 182704b..85d95e2 100644
--- a/test/CodeGen/AArch64/frameaddr.ll
+++ b/test/CodeGen/AArch64/frameaddr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i8* @t() nounwind {
entry:
@@ -12,7 +12,7 @@ define i8* @t2() nounwind {
entry:
; CHECK-LABEL: t2:
; CHECK: ldr x[[reg:[0-9]+]], [x29]
-; CHECK: ldr x[[reg]], [x[[reg]]]
+; CHECK: ldr {{x[0-9]+}}, [x[[reg]]]
%0 = call i8* @llvm.frameaddress(i32 2)
ret i8* %0
}
diff --git a/test/CodeGen/AArch64/free-zext.ll b/test/CodeGen/AArch64/free-zext.ll
new file mode 100644
index 0000000..d69105e
--- /dev/null
+++ b/test/CodeGen/AArch64/free-zext.ll
@@ -0,0 +1,14 @@
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+define i64 @test_free_zext(i8* %a, i16* %b) {
+; CHECK-LABEL: test_free_zext
+; CHECK-DAG: ldrb w[[A:[0-9]+]], [x0]
+; CHECK: ldrh w[[B:[0-9]+]], [x1]
+; CHECK: add x0, x[[B]], x[[A]]
+ %1 = load i8* %a, align 1
+ %conv = zext i8 %1 to i64
+ %2 = load i16* %b, align 2
+ %conv1 = zext i16 %2 to i64
+ %add = add nsw i64 %conv1, %conv
+ ret i64 %add
+}
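
The new free-zext.ll test above relies on AArch64's ldrb/ldrh zeroing the upper bits of the destination register, so the zext back to i64 costs no extra instruction. A rough sketch of the codegen the CHECK lines expect, versus the redundant form the test guards against (register numbers here are illustrative, not taken from the patch):

    ldrb w8, [x0]        // byte load already clears bits 8-63 of x8
    ldrh w9, [x1]        // halfword load already clears bits 16-63 of x9
    add  x0, x9, x8
    ret

    // a naive lowering would instead re-extend after each load, e.g.
    //   and x8, x8, #0xff
    //   and x9, x9, #0xffff
    // before the add; that is the redundancy this test is meant to catch.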
diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll
index f307686..abb732c 100644
--- a/test/CodeGen/AArch64/func-argpassing.ll
+++ b/test/CodeGen/AArch64/func-argpassing.ll
@@ -1,7 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
%myStruct = type { i64 , i8, i32 }
@@ -18,7 +16,7 @@ define void @take_i8s(i8 %val1, i8 %val2) {
store i8 %val2, i8* @var8
; Not using w1 may be technically allowed, but it would indicate a
; problem in itself.
-; CHECK: strb w1, [{{x[0-9]+}}, #:lo12:var8]
+; CHECK: strb w1, [{{x[0-9]+}}, {{#?}}:lo12:var8]
ret void
}
@@ -28,7 +26,7 @@ define void @add_floats(float %val1, float %val2) {
; CHECK: fadd [[ADDRES:s[0-9]+]], s0, s1
; CHECK-NOFP-NOT: fadd
store float %newval, float* @varfloat
-; CHECK: str [[ADDRES]], [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK: str [[ADDRES]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat]
ret void
}
@@ -43,12 +41,12 @@ define void @take_struct(%myStruct* byval %structval) {
; Some weird move means x0 is used for one access
; CHECK: ldr [[REG32:w[0-9]+]], [{{x[0-9]+|sp}}, #12]
store volatile i32 %val0, i32* @var32
-; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: str [[REG32]], [{{x[0-9]+}}, {{#?}}:lo12:var32]
%val1 = load volatile i64* %addr1
; CHECK: ldr [[REG64:x[0-9]+]], [{{x[0-9]+|sp}}]
store volatile i64 %val1, i64* @var64
-; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64]
+; CHECK: str [[REG64]], [{{x[0-9]+}}, {{#?}}:lo12:var64]
ret void
}
@@ -62,15 +60,14 @@ define void @check_byval_align(i32* byval %ignore, %myStruct* byval align 16 %st
%val0 = load volatile i32* %addr0
; Some weird move means x0 is used for one access
-; CHECK: add x[[STRUCTVAL_ADDR:[0-9]+]], sp, #16
-; CHECK: ldr [[REG32:w[0-9]+]], [x[[STRUCTVAL_ADDR]], #12]
+; CHECK: ldr [[REG32:w[0-9]+]], [sp, #28]
store i32 %val0, i32* @var32
-; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: str [[REG32]], [{{x[0-9]+}}, {{#?}}:lo12:var32]
%val1 = load volatile i64* %addr1
; CHECK: ldr [[REG64:x[0-9]+]], [sp, #16]
store i64 %val1, i64* @var64
-; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64]
+; CHECK: str [[REG64]], [{{x[0-9]+}}, {{#?}}:lo12:var64]
ret void
}
@@ -79,7 +76,7 @@ define i32 @return_int() {
; CHECK-LABEL: return_int:
%val = load i32* @var32
ret i32 %val
-; CHECK: ldr w0, [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: ldr w0, [{{x[0-9]+}}, {{#?}}:lo12:var32]
; Make sure epilogue follows
; CHECK-NEXT: ret
}
@@ -87,7 +84,7 @@ define i32 @return_int() {
define double @return_double() {
; CHECK-LABEL: return_double:
ret double 3.14
-; CHECK: ldr d0, [{{x[0-9]+}}, #:lo12:.LCPI
+; CHECK: ldr d0, [{{x[0-9]+}}, {{#?}}:lo12:.LCPI
; CHECK-NOFP-NOT: ldr d0,
}
@@ -99,10 +96,10 @@ define [2 x i64] @return_struct() {
%addr = bitcast %myStruct* @varstruct to [2 x i64]*
%val = load [2 x i64]* %addr
ret [2 x i64] %val
-; CHECK: ldr x0, [{{x[0-9]+}}, #:lo12:varstruct]
+; CHECK-DAG: ldr x0, [{{x[0-9]+}}, {{#?}}:lo12:varstruct]
; Odd register regex below disallows x0 which we want to be live now.
-; CHECK: add {{x[1-9][0-9]*}}, {{x[1-9][0-9]*}}, #:lo12:varstruct
-; CHECK-NEXT: ldr x1, [{{x[1-9][0-9]*}}, #8]
+; CHECK-DAG: add {{x[1-9][0-9]*}}, {{x[1-9][0-9]*}}, {{#?}}:lo12:varstruct
+; CHECK: ldr x1, [{{x[1-9][0-9]*}}, #8]
; Make sure epilogue immediately follows
; CHECK-NEXT: ret
}
@@ -139,17 +136,16 @@ define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var
store volatile i64 %val64, i64* @var64
; Currently nothing on local stack, so struct should be at sp
; CHECK: ldr [[VAL64:x[0-9]+]], [sp]
-; CHECK: str [[VAL64]], [{{x[0-9]+}}, #:lo12:var64]
+; CHECK: str [[VAL64]], [{{x[0-9]+}}, {{#?}}:lo12:var64]
store volatile double %notstacked, double* @vardouble
; CHECK-NOT: ldr d0
-; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble
+; CHECK: str d0, [{{x[0-9]+}}, {{#?}}:lo12:vardouble
; CHECK-NOFP-NOT: str d0,
%retval = load volatile i32* %stacked
ret i32 %retval
; CHECK-LE: ldr w0, [sp, #16]
-; CHECK-BE: ldr w0, [sp, #20]
}
define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
@@ -159,36 +155,36 @@ define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
store float %var8, float* @varfloat
; Beware as above: the offset would be different on big-endian
; machines if the first ldr were changed to use s-registers.
-; CHECK: ldr d[[VALFLOAT:[0-9]+]], [sp]
-; CHECK: str s[[VALFLOAT]], [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK: ldr {{[ds]}}[[VALFLOAT:[0-9]+]], [sp]
+; CHECK: str s[[VALFLOAT]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat]
ret void
}
; 128-bit integer types should be passed in xEVEN, xODD rather than
; the reverse. In this case x2 and x3. Nothing should use x1.
-define i32 @check_i128_regalign(i32 %val0, i128 %val1, i32 %val2) {
-; CHECK: check_i128_regalign
+define i64 @check_i128_regalign(i32 %val0, i128 %val1, i64 %val2) {
+; CHECK-LABEL: check_i128_regalign
store i128 %val1, i128* @var128
-; CHECK: str x2, [{{x[0-9]+}}, #:lo12:var128]
-; CHECK: str x3, [{{x[0-9]+}}, #8]
+; CHECK-DAG: str x2, [{{x[0-9]+}}, {{#?}}:lo12:var128]
+; CHECK-DAG: str x3, [{{x[0-9]+}}, #8]
- ret i32 %val2
+ ret i64 %val2
; CHECK: mov x0, x4
}
define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3,
i32 %val4, i32 %val5, i32 %val6, i32 %val7,
i32 %stack1, i128 %stack2) {
-; CHECK: check_i128_stackalign
+; CHECK-LABEL: check_i128_stackalign
store i128 %stack2, i128* @var128
; Nothing local on stack in current codegen, so first stack is 16 away
; CHECK-LE: add x[[REG:[0-9]+]], sp, #16
; CHECK-LE: ldr {{x[0-9]+}}, [x[[REG]], #8]
-; CHECK-BE: ldr {{x[0-9]+}}, [sp, #24]
; Important point is that we address sp+24 for second dword
-; CHECK: ldr {{x[0-9]+}}, [sp, #16]
+
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
ret void
}
@@ -200,3 +196,13 @@ define i32 @test_extern() {
; CHECK: bl memcpy
ret i32 0
}
+
+
+; A sub-i32 stack argument must be loaded on big endian with ldr{h,b}, not just
+; implicitly extended to a 32-bit load.
+define i16 @stacked_i16(i32 %val0, i32 %val1, i32 %val2, i32 %val3,
+ i32 %val4, i32 %val5, i32 %val6, i32 %val7,
+ i16 %stack1) {
+; CHECK-LABEL: stacked_i16
+ ret i16 %stack1
+}
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index f029bf2..422c576 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-BE --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
%myStruct = type { i64 , i8, i32 }
@@ -24,15 +24,15 @@ define void @simple_args() {
%char1 = load i8* @var8
%char2 = load i8* @var8_2
call void @take_i8s(i8 %char1, i8 %char2)
-; CHECK-DAG: ldrb w0, [{{x[0-9]+}}, #:lo12:var8]
-; CHECK-DAG: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2]
+; CHECK-DAG: ldrb w0, [{{x[0-9]+}}, {{#?}}:lo12:var8]
+; CHECK-DAG: ldrb w1, [{{x[0-9]+}}, {{#?}}:lo12:var8_2]
; CHECK: bl take_i8s
%float1 = load float* @varfloat
%float2 = load float* @varfloat_2
call void @take_floats(float %float1, float %float2)
-; CHECK-DAG: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2]
-; CHECK-DAG: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK-DAG: ldr s1, [{{x[0-9]+}}, {{#?}}:lo12:varfloat_2]
+; CHECK-DAG: ldr s0, [{{x[0-9]+}}, {{#?}}:lo12:varfloat]
; CHECK: bl take_floats
; CHECK-NOFP-NOT: ldr s1,
; CHECK-NOFP-NOT: ldr s0,
@@ -51,22 +51,22 @@ define void @simple_rets() {
%int = call i32 @return_int()
store i32 %int, i32* @var32
; CHECK: bl return_int
-; CHECK: str w0, [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: str w0, [{{x[0-9]+}}, {{#?}}:lo12:var32]
%dbl = call double @return_double()
store double %dbl, double* @vardouble
; CHECK: bl return_double
-; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble]
+; CHECK: str d0, [{{x[0-9]+}}, {{#?}}:lo12:vardouble]
; CHECK-NOFP-NOT: str d0,
%arr = call [2 x i64] @return_smallstruct()
store [2 x i64] %arr, [2 x i64]* @varsmallstruct
; CHECK: bl return_smallstruct
; CHECK: str x1, [{{x[0-9]+}}, #8]
-; CHECK: str x0, [{{x[0-9]+}}, #:lo12:varsmallstruct]
+; CHECK: str x0, [{{x[0-9]+}}, {{#?}}:lo12:varsmallstruct]
call void @return_large_struct(%myStruct* sret @varstruct)
-; CHECK: add x8, {{x[0-9]+}}, #:lo12:varstruct
+; CHECK: add x8, {{x[0-9]+}}, {{#?}}:lo12:varstruct
; CHECK: bl return_large_struct
ret void
@@ -88,19 +88,28 @@ define void @check_stack_args() {
; Want to check that the final double is passed in registers and
; that varstruct is passed on the stack. Rather dependent on how a
; memcpy gets created, but the following works for now.
-; CHECK: mov x[[SPREG:[0-9]+]], sp
-; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]]]
-; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]], #12]
-; CHECK-DAG: fmov d0,
+
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
+; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
+; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
+
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
+; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
+; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
+
; CHECK: bl struct_on_stack
; CHECK-NOFP-NOT: fmov
call void @stacked_fpu(float -1.0, double 1.0, float 4.0, float 2.0,
float -2.0, float -8.0, float 16.0, float 1.0,
float 64.0)
-; CHECK: ldr s[[STACKEDREG:[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI
-; CHECK: mov x0, sp
-; CHECK: str d[[STACKEDREG]], [x0]
+
+; CHECK: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
+; CHECK: str [[SIXTY_FOUR]], [sp]
+
+; CHECK-NONEON: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
+; CHECK-NONEON: str [[SIXTY_FOUR]], [sp]
+
; CHECK: bl stacked_fpu
ret void
}
@@ -119,18 +128,20 @@ define void @check_i128_align() {
call void @check_i128_stackalign(i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7,
i32 42, i128 %val)
-; CHECK: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var128]
+; CHECK: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:var128]
; CHECK: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8]
-; CHECK: mov x[[SPREG:[0-9]+]], sp
-; CHECK: str [[I128HI]], [x[[SPREG]], #24]
-; CHECK: str [[I128LO]], [x[[SPREG]], #16]
+; CHECK: stp [[I128LO]], [[I128HI]], [sp, #16]
+
+; CHECK-NONEON: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, :lo12:var128]
+; CHECK-NONEON: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8]
+; CHECK-NONEON: stp [[I128LO]], [[I128HI]], [sp, #16]
; CHECK: bl check_i128_stackalign
call void @check_i128_regalign(i32 0, i128 42)
; CHECK-NOT: mov x1
-; CHECK-LE: movz x2, #42
+; CHECK-LE: movz x2, #{{0x2a|42}}
; CHECK-LE: mov x3, xzr
-; CHECK-BE: movz x3, #42
+; CHECK-BE: movz {{x|w}}3, #{{0x2a|42}}
; CHECK-BE: mov x2, xzr
; CHECK: bl check_i128_regalign
@@ -143,7 +154,7 @@ define void @check_indirect_call() {
; CHECK-LABEL: check_indirect_call:
%func = load void()** @fptr
call void %func()
-; CHECK: ldr [[FPTR:x[0-9]+]], [{{x[0-9]+}}, #:lo12:fptr]
+; CHECK: ldr [[FPTR:x[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:fptr]
; CHECK: blr [[FPTR]]
ret void
diff --git a/test/CodeGen/AArch64/global-alignment.ll b/test/CodeGen/AArch64/global-alignment.ll
index 56e5cba..451b9d6 100644
--- a/test/CodeGen/AArch64/global-alignment.ll
+++ b/test/CodeGen/AArch64/global-alignment.ll
@@ -1,8 +1,9 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
@var32 = global [3 x i32] zeroinitializer
@var64 = global [3 x i64] zeroinitializer
@var32_align64 = global [3 x i32] zeroinitializer, align 8
+@alias = alias [3 x i32]* @var32_align64
define i64 @test_align32() {
; CHECK-LABEL: test_align32:
@@ -12,7 +13,7 @@ define i64 @test_align32() {
; emit an "LDR x0, [x0, #:lo12:var32] instruction to implement this load.
%val = load i64* %addr
; CHECK: adrp [[HIBITS:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], {{#?}}:lo12:var32
; CHECK: ldr x0, [x[[ADDR]]]
ret i64 %val
@@ -27,7 +28,7 @@ define i64 @test_align64() {
%val = load i64* %addr
; CHECK: adrp x[[HIBITS:[0-9]+]], var64
; CHECK-NOT: add x[[HIBITS]]
-; CHECK: ldr x0, [x[[HIBITS]], #:lo12:var64]
+; CHECK: ldr x0, [x[[HIBITS]], {{#?}}:lo12:var64]
ret i64 %val
}
@@ -41,7 +42,20 @@ define i64 @test_var32_align64() {
%val = load i64* %addr
; CHECK: adrp x[[HIBITS:[0-9]+]], var32_align64
; CHECK-NOT: add x[[HIBITS]]
-; CHECK: ldr x0, [x[[HIBITS]], #:lo12:var32_align64]
+; CHECK: ldr x0, [x[[HIBITS]], {{#?}}:lo12:var32_align64]
+
+ ret i64 %val
+}
+
+define i64 @test_var32_alias() {
+; CHECK-LABEL: test_var32_alias:
+ %addr = bitcast [3 x i32]* @alias to i64*
+
+ ; Test that we can find the alignment for aliases.
+ %val = load i64* %addr
+; CHECK: adrp x[[HIBITS:[0-9]+]], alias
+; CHECK-NOT: add x[[HIBITS]]
+; CHECK: ldr x0, [x[[HIBITS]], {{#?}}:lo12:alias]
ret i64 %val
}
@@ -56,7 +70,7 @@ define i64 @test_yet_another_var() {
; so we can't fold the load.
%val = load i64* bitcast({i32, i32}* @yet_another_var to i64*)
; CHECK: adrp [[HIBITS:x[0-9]+]], yet_another_var
-; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], #:lo12:yet_another_var
+; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], {{#?}}:lo12:yet_another_var
; CHECK: ldr x0, [x[[ADDR]]]
ret i64 %val
}
@@ -65,5 +79,5 @@ define i64()* @test_functions() {
; CHECK-LABEL: test_functions:
ret i64()* @test_yet_another_var
; CHECK: adrp [[HIBITS:x[0-9]+]], test_yet_another_var
-; CHECK: add x0, [[HIBITS]], #:lo12:test_yet_another_var
+; CHECK: add x0, [[HIBITS]], {{#?}}:lo12:test_yet_another_var
}
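
The global-alignment.ll changes keep testing when a 64-bit load may fold the :lo12: offset straight into the ldr. As the file's own comment notes for var32, that folding is only safe when the global is known to be sufficiently aligned; otherwise the low bits have to be materialised with a separate add. A rough sketch of the two shapes the CHECK lines distinguish (register numbers are illustrative):

    // only 4-byte alignment known: low bits go on an add first
    adrp x8, var32
    add  x8, x8, :lo12:var32
    ldr  x0, [x8]

    // 8-byte alignment known (or an alias of such a global): fold into the load
    adrp x8, var64
    ldr  x0, [x8, :lo12:var64]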
diff --git a/test/CodeGen/AArch64/got-abuse.ll b/test/CodeGen/AArch64/got-abuse.ll
index 8b06031..7a02b10 100644
--- a/test/CodeGen/AArch64/got-abuse.ll
+++ b/test/CodeGen/AArch64/got-abuse.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj -o - %s
; LLVM gives well-defined semantics to this horrible construct (though C says
; it's undefined). Regardless, we shouldn't crash. The important feature here is
@@ -17,7 +17,7 @@ define void @foo() nounwind {
entry:
call void @consume(i32 ptrtoint (void ()* @func to i32))
; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:func
-; CHECK: ldr {{x[0-9]+}}, [x[[ADDRHI]], #:got_lo12:func]
+; CHECK: ldr {{x[0-9]+}}, [x[[ADDRHI]], {{#?}}:got_lo12:func]
ret void
}
diff --git a/test/CodeGen/AArch64/i1-contents.ll b/test/CodeGen/AArch64/i1-contents.ll
new file mode 100644
index 0000000..7f133fc
--- /dev/null
+++ b/test/CodeGen/AArch64/i1-contents.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+%big = type i32
+
+@var = global %big 0
+
+; AAPCS: low 8 bits of %in (== w0) will be either 0 or 1. Need to extend to
+; 32-bits.
+define void @consume_i1_arg(i1 %in) {
+; CHECK-LABEL: consume_i1_arg:
+; CHECK: and [[BOOL32:w[0-9]+]], w0, #{{0x1|0xff}}
+; CHECK: str [[BOOL32]], [{{x[0-9]+}}, :lo12:var]
+ %val = zext i1 %in to %big
+ store %big %val, %big* @var
+ ret void
+}
+
+; AAPCS: low 8 bits of %val1 (== w0) will be either 0 or 1. Need to extend to
+; 32-bits (doesn't really matter if it's from 1 or 8 bits).
+define void @consume_i1_ret() {
+; CHECK-LABEL: consume_i1_ret:
+; CHECK: bl produce_i1_ret
+; CHECK: and [[BOOL32:w[0-9]+]], w0, #{{0x1|0xff}}
+; CHECK: str [[BOOL32]], [{{x[0-9]+}}, :lo12:var]
+ %val1 = call i1 @produce_i1_ret()
+ %val = zext i1 %val1 to %big
+ store %big %val, %big* @var
+ ret void
+}
+
+; AAPCS: low 8 bits of w0 must be either 0 or 1. Need to mask them off.
+define i1 @produce_i1_ret() {
+; CHECK-LABEL: produce_i1_ret:
+; CHECK: ldr [[VAR32:w[0-9]+]], [{{x[0-9]+}}, :lo12:var]
+; CHECK: and w0, [[VAR32]], #{{0x1|0xff}}
+ %val = load %big* @var
+ %val1 = trunc %big %val to i1
+ ret i1 %val1
+}
+
+define void @produce_i1_arg() {
+; CHECK-LABEL: produce_i1_arg:
+; CHECK: ldr [[VAR32:w[0-9]+]], [{{x[0-9]+}}, :lo12:var]
+; CHECK: and w0, [[VAR32]], #{{0x1|0xff}}
+; CHECK: bl consume_i1_arg
+ %val = load %big* @var
+ %val1 = trunc %big %val to i1
+ call void @consume_i1_arg(i1 %val1)
+ ret void
+}
+
+
+;define zeroext i1 @foo(i8 %in) {
+; %val = trunc i8 %in to i1
+; ret i1 %val
+;}
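
The comments in the new i1-contents.ll spell out the AAPCS contract being tested: only the low byte of the register carrying an i1 is defined (and it holds 0 or 1), so a producer masks before returning and a consumer re-extends before using the value as an i32. A rough sketch of both sides of that contract; register numbers are illustrative and either a #0x1 or #0xff mask satisfies the CHECK patterns above:

    // producer (returning an i1 in w0)
    ldr w8, [x9, :lo12:var]
    and w0, w8, #0x1          // leave only the bits the ABI promises
    ret

    // consumer (receiving an i1 in w0)
    and w8, w0, #0x1          // bits 8-31 of w0 are not guaranteed meaningful
    str w8, [x9, :lo12:var]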
diff --git a/test/CodeGen/AArch64/i128-align.ll b/test/CodeGen/AArch64/i128-align.ll
index 21ca7ed..a1b4d6f 100644
--- a/test/CodeGen/AArch64/i128-align.ll
+++ b/test/CodeGen/AArch64/i128-align.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios7.0 -verify-machineinstrs -o - %s | FileCheck %s
%struct = type { i32, i128, i8 }
@@ -13,7 +13,7 @@ define i64 @check_size() {
%diff = sub i64 %endi, %starti
ret i64 %diff
-; CHECK: movz x0, #48
+; CHECK: {{movz x0, #48|orr w0, wzr, #0x30}}
}
define i64 @check_field() {
@@ -25,5 +25,5 @@ define i64 @check_field() {
%diff = sub i64 %endi, %starti
ret i64 %diff
-; CHECK: movz x0, #16
+; CHECK: {{movz x0, #16|orr w0, wzr, #0x10}}
}
diff --git a/test/CodeGen/AArch64/i128-shift.ll b/test/CodeGen/AArch64/i128-shift.ll
deleted file mode 100644
index d786d44..0000000
--- a/test/CodeGen/AArch64/i128-shift.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-
-define i128 @test_i128_lsl(i128 %a, i32 %shift) {
-; CHECK-LABEL: test_i128_lsl:
-
- %sh_prom = zext i32 %shift to i128
- %shl = shl i128 %a, %sh_prom
-
-; CHECK: movz [[SIXTYFOUR:x[0-9]+]], #64
-; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw
-; CHECK-NEXT: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[REVSHAMT]]
-; CHECK: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[SHAMT:x[0-9]+]]
-; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]]
-; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64
-; CHECK-NEXT: lsl [[TMP3:x[0-9]+]], [[LO]], [[EXTRASHAMT]]
-; CHECK-NEXT: cmp [[EXTRASHAMT]], #0
-; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], [[TMP3]], [[FALSEVAL]], ge
-; CHECK-NEXT: lsl [[TMP4:x[0-9]+]], [[LO]], [[SHAMT]]
-; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], xzr, [[TMP4]], ge
-
- ret i128 %shl
-}
-
-define i128 @test_i128_shr(i128 %a, i32 %shift) {
-; CHECK-LABEL: test_i128_shr:
-
- %sh_prom = zext i32 %shift to i128
- %shr = lshr i128 %a, %sh_prom
-
-; CHECK: movz [[SIXTYFOUR]], #64
-; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw
-; CHECK-NEXT: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[REVSHAMT]]
-; CHECK: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[SHAMT:x[0-9]+]]
-; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]]
-; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64
-; CHECK-NEXT: lsr [[TRUEVAL:x[0-9]+]], [[HI]], [[EXTRASHAMT]]
-; CHECK-NEXT: cmp [[EXTRASHAMT]], #0
-; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], [[TRUEVAL]], [[FALSEVAL]], ge
-; CHECK-NEXT: lsr [[TMP3:x[0-9]+]], [[HI]], [[SHAMT]]
-; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], xzr, [[TMP3]], ge
-
- ret i128 %shr
-}
diff --git a/test/CodeGen/AArch64/illegal-float-ops.ll b/test/CodeGen/AArch64/illegal-float-ops.ll
index 03c6d8d..9f7dd99 100644
--- a/test/CodeGen/AArch64/illegal-float-ops.ll
+++ b/test/CodeGen/AArch64/illegal-float-ops.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
@varfloat = global float 0.0
@vardouble = global double 0.0
diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll
index 076ae27..f47b490 100644
--- a/test/CodeGen/AArch64/init-array.ll
+++ b/test/CodeGen/AArch64/init-array.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
define internal void @_GLOBAL__I_a() section ".text.startup" {
ret void
diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
index 61bbfc2..9d833d9 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
@@ -1,7 +1,7 @@
-; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s
+; RUN: not llc -mtriple=aarch64-none-linux-gnu -o - %s
define void @foo() {
; Out of range immediate for I.
- call void asm sideeffect "add x0, x0, $0", "I"(i32 4096)
+ call void asm sideeffect "add x0, x0, $0", "I"(i32 4097)
ret void
}
diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll
index 40746e1..6ffc05d 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s
+; RUN: not llc -mtriple=arm64-apple-ios7.0 -o - %s
define void @foo() {
; 32-bit bitpattern ending in 1101 can't be produced.
diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
index 2c53381..1726013 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s
+; RUN: not llc -mtriple=aarch64-none-linux-gnu -o - %s
define void @foo() {
; 32-bit bitpattern ending in 1101 can't be produced.
diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll
index d82d5a2..3c2f60c 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s
+; RUN: not llc -mtriple=arm64-apple-ios7.0 -o - %s
define void @foo() {
; 32-bit bitpattern ending in 1101 can't be produced.
diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll
deleted file mode 100644
index 365453c..0000000
--- a/test/CodeGen/AArch64/inline-asm-constraints.ll
+++ /dev/null
@@ -1,137 +0,0 @@
-;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -no-integrated-as < %s | FileCheck %s
-
-define i64 @test_inline_constraint_r(i64 %base, i32 %offset) {
-; CHECK-LABEL: test_inline_constraint_r:
- %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 %base, i32 %offset)
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw
- ret i64 %val
-}
-
-define i16 @test_small_reg(i16 %lhs, i16 %rhs) {
-; CHECK-LABEL: test_small_reg:
- %val = call i16 asm sideeffect "add $0, $1, $2, sxth", "=r,r,r"(i16 %lhs, i16 %rhs)
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth
- ret i16 %val
-}
-
-define i64 @test_inline_constraint_r_imm(i64 %base, i32 %offset) {
-; CHECK-LABEL: test_inline_constraint_r_imm:
- %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 4, i32 12)
-; CHECK: movz [[FOUR:x[0-9]+]], #4
-; CHECK: movz [[TWELVE:w[0-9]+]], #12
-; CHECK: add {{x[0-9]+}}, [[FOUR]], [[TWELVE]], sxtw
- ret i64 %val
-}
-
-; m is permitted to have a base/offset form. We don't do that
-; currently though.
-define i32 @test_inline_constraint_m(i32 *%ptr) {
-; CHECK-LABEL: test_inline_constraint_m:
- %val = call i32 asm "ldr $0, $1", "=r,m"(i32 *%ptr)
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
- ret i32 %val
-}
-
-@arr = global [8 x i32] zeroinitializer
-
-; Q should *never* have base/offset form even if given the chance.
-define i32 @test_inline_constraint_Q(i32 *%ptr) {
-; CHECK-LABEL: test_inline_constraint_Q:
- %val = call i32 asm "ldr $0, $1", "=r,Q"(i32* getelementptr([8 x i32]* @arr, i32 0, i32 1))
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
- ret i32 %val
-}
-
-@dump = global fp128 zeroinitializer
-
-define void @test_inline_constraint_w(<8 x i8> %vec64, <4 x float> %vec128, half %hlf, float %flt, double %dbl, fp128 %quad) {
-; CHECK: test_inline_constraint_w:
- call <8 x i8> asm sideeffect "add $0.8b, $1.8b, $1.8b", "=w,w"(<8 x i8> %vec64)
- call <8 x i8> asm sideeffect "fadd $0.4s, $1.4s, $1.4s", "=w,w"(<4 x float> %vec128)
-; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-
- ; Arguably semantically dodgy to output "vN", but it's what GCC does
- ; so purely for compatibility we want vector registers to be output.
- call float asm sideeffect "fcvt ${0:s}, ${1:h}", "=w,w"(half undef)
- call float asm sideeffect "fadd $0.2s, $0.2s, $0.2s", "=w,w"(float %flt)
- call double asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(double %dbl)
- call fp128 asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(fp128 %quad)
-; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}}
-; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- ret void
-}
-
-define void @test_inline_constraint_I() {
-; CHECK-LABEL: test_inline_constraint_I:
- call void asm sideeffect "add x0, x0, $0", "I"(i32 0)
- call void asm sideeffect "add x0, x0, $0", "I"(i64 4095)
-; CHECK: add x0, x0, #0
-; CHECK: add x0, x0, #4095
-
- ret void
-}
-
-; Skip J because it's useless
-
-define void @test_inline_constraint_K() {
-; CHECK-LABEL: test_inline_constraint_K:
- call void asm sideeffect "and w0, w0, $0", "K"(i32 2863311530) ; = 0xaaaaaaaa
- call void asm sideeffect "and w0, w0, $0", "K"(i32 65535)
-; CHECK: and w0, w0, #-1431655766
-; CHECK: and w0, w0, #65535
-
- ret void
-}
-
-define void @test_inline_constraint_L() {
-; CHECK-LABEL: test_inline_constraint_L:
- call void asm sideeffect "and x0, x0, $0", "L"(i64 4294967296) ; = 0xaaaaaaaa
- call void asm sideeffect "and x0, x0, $0", "L"(i64 65535)
-; CHECK: and x0, x0, #4294967296
-; CHECK: and x0, x0, #65535
-
- ret void
-}
-
-; Skip M and N because we don't support MOV pseudo-instructions yet.
-
-@var = global i32 0
-
-define void @test_inline_constraint_S() {
-; CHECK-LABEL: test_inline_constraint_S:
- call void asm sideeffect "adrp x0, $0", "S"(i32* @var)
- call void asm sideeffect "adrp x0, ${0:A}", "S"(i32* @var)
- call void asm sideeffect "add x0, x0, ${0:L}", "S"(i32* @var)
-; CHECK: adrp x0, var
-; CHECK: adrp x0, var
-; CHECK: add x0, x0, #:lo12:var
- ret void
-}
-
-define i32 @test_inline_constraint_S_label(i1 %in) {
-; CHECK-LABEL: test_inline_constraint_S_label:
- call void asm sideeffect "adr x0, $0", "S"(i8* blockaddress(@test_inline_constraint_S_label, %loc))
-; CHECK: adr x0, .Ltmp{{[0-9]+}}
- br i1 %in, label %loc, label %loc2
-loc:
- ret i32 0
-loc2:
- ret i32 42
-}
-
-define void @test_inline_constraint_Y() {
-; CHECK-LABEL: test_inline_constraint_Y:
- call void asm sideeffect "fcmp s0, $0", "Y"(float 0.0)
-; CHECK: fcmp s0, #0.0
- ret void
-}
-
-define void @test_inline_constraint_Z() {
-; CHECK-LABEL: test_inline_constraint_Z:
- call void asm sideeffect "cmp w0, $0", "Z"(i32 0)
-; CHECK: cmp w0, #0
- ret void
-}
diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll
deleted file mode 100644
index cb66335..0000000
--- a/test/CodeGen/AArch64/inline-asm-modifiers.ll
+++ /dev/null
@@ -1,147 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -no-integrated-as < %s | FileCheck %s
-
-@var_simple = hidden global i32 0
-@var_got = global i32 0
-@var_tlsgd = thread_local global i32 0
-@var_tlsld = thread_local(localdynamic) global i32 0
-@var_tlsie = thread_local(initialexec) global i32 0
-@var_tlsle = thread_local(localexec) global i32 0
-
-define void @test_inline_modifier_L() nounwind {
-; CHECK-LABEL: test_inline_modifier_L:
- call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_simple)
- call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_got)
- call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsgd)
- call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsld)
- call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_tlsie)
- call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsle)
-; CHECK: add x0, x0, #:lo12:var_simple
-; CHECK: ldr x0, [x0, #:got_lo12:var_got]
-; CHECK: add x0, x0, #:tlsdesc_lo12:var_tlsgd
-; CHECK: add x0, x0, #:dtprel_lo12:var_tlsld
-; CHECK: ldr x0, [x0, #:gottprel_lo12:var_tlsie]
-; CHECK: add x0, x0, #:tprel_lo12:var_tlsle
-
- call void asm sideeffect "add x0, x0, ${0:L}", "Si,~{x0}"(i32 64)
- call void asm sideeffect "ldr x0, [x0, ${0:L}]", "Si,~{x0}"(i32 64)
-; CHECK: add x0, x0, #64
-; CHECK: ldr x0, [x0, #64]
-
- ret void
-}
-
-define void @test_inline_modifier_G() nounwind {
-; CHECK-LABEL: test_inline_modifier_G:
- call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsld)
- call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsle)
-; CHECK: add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12
-; CHECK: add x0, x0, #:tprel_hi12:var_tlsle, lsl #12
-
- call void asm sideeffect "add x0, x0, ${0:G}", "Si,~{x0}"(i32 42)
-; CHECK: add x0, x0, #42
- ret void
-}
-
-define void @test_inline_modifier_A() nounwind {
-; CHECK-LABEL: test_inline_modifier_A:
- call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_simple)
- call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_got)
- call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_tlsgd)
- call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_tlsie)
- ; N.b. All tprel and dtprel relocs are modified: lo12 or granules.
-; CHECK: adrp x0, var_simple
-; CHECK: adrp x0, :got:var_got
-; CHECK: adrp x0, :tlsdesc:var_tlsgd
-; CHECK: adrp x0, :gottprel:var_tlsie
-
- call void asm sideeffect "adrp x0, ${0:A}", "Si,~{x0}"(i32 40)
-; CHECK: adrp x0, #40
-
- ret void
-}
-
-define void @test_inline_modifier_wx(i32 %small, i64 %big) nounwind {
-; CHECK-LABEL: test_inline_modifier_wx:
- call i32 asm sideeffect "add $0, $0, $0", "=r,0"(i32 %small)
- call i32 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i32 %small)
- call i32 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i32 %small)
-; CHECK: //APP
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-
- call i64 asm sideeffect "add $0, $0, $0", "=r,0"(i64 %big)
- call i64 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i64 %big)
- call i64 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i64 %big)
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-
- call i32 asm sideeffect "add ${0:w}, ${1:w}, ${1:w}", "=r,r"(i32 0)
- call i32 asm sideeffect "add ${0:x}, ${1:x}, ${1:x}", "=r,r"(i32 0)
-; CHECK: add {{w[0-9]+}}, wzr, wzr
-; CHECK: add {{x[0-9]+}}, xzr, xzr
-
- call i32 asm sideeffect "add ${0:w}, ${0:w}, ${1:w}", "=r,Ir,0"(i32 123, i32 %small)
- call i64 asm sideeffect "add ${0:x}, ${0:x}, ${1:x}", "=r,Ir,0"(i32 456, i64 %big)
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #123
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #456
-
- ret void
-}
-
-define void @test_inline_modifier_bhsdq() nounwind {
-; CHECK-LABEL: test_inline_modifier_bhsdq:
- call float asm sideeffect "ldr ${0:b}, [sp]", "=w"()
- call float asm sideeffect "ldr ${0:h}, [sp]", "=w"()
- call float asm sideeffect "ldr ${0:s}, [sp]", "=w"()
- call float asm sideeffect "ldr ${0:d}, [sp]", "=w"()
- call float asm sideeffect "ldr ${0:q}, [sp]", "=w"()
-; CHECK: ldr b0, [sp]
-; CHECK: ldr h0, [sp]
-; CHECK: ldr s0, [sp]
-; CHECK: ldr d0, [sp]
-; CHECK: ldr q0, [sp]
-
- call double asm sideeffect "ldr ${0:b}, [sp]", "=w"()
- call double asm sideeffect "ldr ${0:h}, [sp]", "=w"()
- call double asm sideeffect "ldr ${0:s}, [sp]", "=w"()
- call double asm sideeffect "ldr ${0:d}, [sp]", "=w"()
- call double asm sideeffect "ldr ${0:q}, [sp]", "=w"()
-; CHECK: ldr b0, [sp]
-; CHECK: ldr h0, [sp]
-; CHECK: ldr s0, [sp]
-; CHECK: ldr d0, [sp]
-; CHECK: ldr q0, [sp]
-
- call void asm sideeffect "fcmp b0, ${0:b}", "Yw"(float 0.0)
- call void asm sideeffect "fcmp h0, ${0:h}", "Yw"(float 0.0)
- call void asm sideeffect "fcmp s0, ${0:s}", "Yw"(float 0.0)
- call void asm sideeffect "fcmp d0, ${0:d}", "Yw"(float 0.0)
- call void asm sideeffect "fcmp q0, ${0:q}", "Yw"(float 0.0)
-; CHECK: fcmp b0, #0
-; CHECK: fcmp h0, #0
-; CHECK: fcmp s0, #0
-; CHECK: fcmp d0, #0
-; CHECK: fcmp q0, #0
-
- ret void
-}
-
-define void @test_inline_modifier_c() nounwind {
-; CHECK-LABEL: test_inline_modifier_c:
- call void asm sideeffect "adr x0, ${0:c}", "i"(i32 3)
-; CHECK: adr x0, 3
-
- ret void
-}
-
-define void @test_inline_modifier_a() nounwind {
-; CHECK-LABEL: test_inline_modifier_a:
- call void asm sideeffect "prfm pldl1keep, ${0:a}", "r"(i32* @var_simple)
-; CHECK: adrp [[VARHI:x[0-9]+]], var_simple
-; CHECK: add x[[VARADDR:[0-9]+]], [[VARHI]], #:lo12:var_simple
-; CHECK: prfm pldl1keep, [x[[VARADDR]]]
- ret void
-}
-
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 94717f5..1dfb789 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic <%s | FileCheck --check-prefix=CHECK-PIC %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -o - %s | FileCheck --check-prefix=CHECK-PIC %s
define i32 @test_jumptable(i32 %in) {
; CHECK: test_jumptable
@@ -12,7 +12,7 @@ define i32 @test_jumptable(i32 %in) {
i32 4, label %lbl4
]
; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], #:lo12:.LJTI0_0
+; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
; CHECK: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3]
; CHECK: br [[DEST]]
@@ -24,7 +24,7 @@ define i32 @test_jumptable(i32 %in) {
; CHECK-LARGE: br [[DEST]]
; CHECK-PIC: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], #:lo12:.LJTI0_0
+; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
; CHECK-PIC: ldrsw [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #2]
; CHECK-PIC: add [[TABLE:x[0-9]+]], [[DEST]], x[[JT]]
; CHECK-PIC: br [[TABLE]]
diff --git a/test/CodeGen/AArch64/large-consts.ll b/test/CodeGen/AArch64/large-consts.ll
index 1b769c6..6bf85e8 100644
--- a/test/CodeGen/AArch64/large-consts.ll
+++ b/test/CodeGen/AArch64/large-consts.ll
@@ -4,10 +4,11 @@
; it's not the linker's job to put it there.
define double @foo() {
-; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [A,A,0xe0'A',0xd2'A']
-; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [A,A,0xc0'A',0xf2'A']
-; CHECK: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [A,A,0xa0'A',0xf2'A']
-; CHECK: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [A,A,0x80'A',0xf2'A']
+
+; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [0bAAA01000,A,0b111AAAAA,0xd2]
+; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b110AAAAA,0xf2]
+; CHECK: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b101AAAAA,0xf2]
+; CHECK: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b100AAAAA,0xf2]
ret double 3.14159
}
diff --git a/test/CodeGen/AArch64/large-frame.ll b/test/CodeGen/AArch64/large-frame.ll
deleted file mode 100644
index fde3036..0000000
--- a/test/CodeGen/AArch64/large-frame.ll
+++ /dev/null
@@ -1,119 +0,0 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
-declare void @use_addr(i8*)
-
-@addr = global i8* null
-
-define void @test_bigframe() {
-; CHECK-LABEL: test_bigframe:
-; CHECK: .cfi_startproc
-
- %var1 = alloca i8, i32 20000000
- %var2 = alloca i8, i32 16
- %var3 = alloca i8, i32 20000000
-; CHECK: sub sp, sp, #496
-; CHECK: .cfi_def_cfa sp, 496
-; CHECK: str x30, [sp, #488]
- ; Total adjust is 39999536
-; CHECK: movz [[SUBCONST:x[0-9]+]], #22576
-; CHECK: movk [[SUBCONST]], #610, lsl #16
-; CHECK: sub sp, sp, [[SUBCONST]]
-; CHECK: .cfi_def_cfa sp, 40000032
-; CHECK: .cfi_offset x30, -8
-
- ; Total offset is 20000024
-; CHECK: movz [[VAR1OFFSET:x[0-9]+]], #11544
-; CHECK: movk [[VAR1OFFSET]], #305, lsl #16
-; CHECK: add {{x[0-9]+}}, sp, [[VAR1OFFSET]]
- store volatile i8* %var1, i8** @addr
-
- %var1plus2 = getelementptr i8* %var1, i32 2
- store volatile i8* %var1plus2, i8** @addr
-
-; CHECK: movz [[VAR2OFFSET:x[0-9]+]], #11528
-; CHECK: movk [[VAR2OFFSET]], #305, lsl #16
-; CHECK: add {{x[0-9]+}}, sp, [[VAR2OFFSET]]
- store volatile i8* %var2, i8** @addr
-
- %var2plus2 = getelementptr i8* %var2, i32 2
- store volatile i8* %var2plus2, i8** @addr
-
- store volatile i8* %var3, i8** @addr
-
- %var3plus2 = getelementptr i8* %var3, i32 2
- store volatile i8* %var3plus2, i8** @addr
-
-; CHECK: movz [[ADDCONST:x[0-9]+]], #22576
-; CHECK: movk [[ADDCONST]], #610, lsl #16
-; CHECK: add sp, sp, [[ADDCONST]]
-; CHECK: .cfi_endproc
- ret void
-}
-
-define void @test_mediumframe() {
-; CHECK-LABEL: test_mediumframe:
- %var1 = alloca i8, i32 1000000
- %var2 = alloca i8, i32 16
- %var3 = alloca i8, i32 1000000
-; CHECK: sub sp, sp, #496
-; CHECK: str x30, [sp, #488]
-; CHECK: sub sp, sp, #688
-; CHECK-NEXT: sub sp, sp, #488, lsl #12
-
- store volatile i8* %var1, i8** @addr
-; CHECK: add [[VAR1ADDR:x[0-9]+]], sp, #600
-; CHECK: add [[VAR1ADDR]], [[VAR1ADDR]], #244, lsl #12
-
- %var1plus2 = getelementptr i8* %var1, i32 2
- store volatile i8* %var1plus2, i8** @addr
-; CHECK: add [[VAR1PLUS2:x[0-9]+]], {{x[0-9]+}}, #2
-
- store volatile i8* %var2, i8** @addr
-; CHECK: add [[VAR2ADDR:x[0-9]+]], sp, #584
-; CHECK: add [[VAR2ADDR]], [[VAR2ADDR]], #244, lsl #12
-
- %var2plus2 = getelementptr i8* %var2, i32 2
- store volatile i8* %var2plus2, i8** @addr
-; CHECK: add [[VAR2PLUS2:x[0-9]+]], {{x[0-9]+}}, #2
-
- store volatile i8* %var3, i8** @addr
-
- %var3plus2 = getelementptr i8* %var3, i32 2
- store volatile i8* %var3plus2, i8** @addr
-
-; CHECK: add sp, sp, #688
-; CHECK: add sp, sp, #488, lsl #12
-; CHECK: ldr x30, [sp, #488]
-; CHECK: add sp, sp, #496
- ret void
-}
-
-
-@bigspace = global [8 x i64] zeroinitializer
-
-; If temporary registers are allocated for adjustment, they should *not* clobber
-; argument registers.
-define void @test_tempallocation([8 x i64] %val) nounwind {
-; CHECK-LABEL: test_tempallocation:
- %var = alloca i8, i32 1000000
-; CHECK: sub sp, sp,
-
-; Make sure the prologue is reasonably efficient
-; CHECK-NEXT: stp x29, x30, [sp,
-; CHECK-NEXT: stp x25, x26, [sp,
-; CHECK-NEXT: stp x23, x24, [sp,
-; CHECK-NEXT: stp x21, x22, [sp,
-; CHECK-NEXT: stp x19, x20, [sp,
-
-; Make sure we don't trash an argument register
-; CHECK-NOT: movz {{x[0-7],}}
-; CHECK: sub sp, sp,
-
-; CHECK-NOT: movz {{x[0-7],}}
-
-; CHECK: bl use_addr
- call void @use_addr(i8* %var)
-
- store [8 x i64] %val, [8 x i64]* @bigspace
- ret void
-; CHECK: ret
-}
diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll
new file mode 100644
index 0000000..1ce5c95
--- /dev/null
+++ b/test/CodeGen/AArch64/ldst-opt.ll
@@ -0,0 +1,301 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+
+; This file contains tests for the AArch64 load/store optimizer.
+
+%padding = type { i8*, i8*, i8*, i8* }
+%s.word = type { i32, i32 }
+%s.doubleword = type { i64, i32 }
+%s.quadword = type { fp128, i32 }
+%s.float = type { float, i32 }
+%s.double = type { double, i32 }
+%struct.word = type { %padding, %s.word }
+%struct.doubleword = type { %padding, %s.doubleword }
+%struct.quadword = type { %padding, %s.quadword }
+%struct.float = type { %padding, %s.float }
+%struct.double = type { %padding, %s.double }
+
+; Check the following transform:
+;
+; (ldr|str) X, [x0, #32]
+; ...
+; add x0, x0, #32
+; ->
+; (ldr|str) X, [x0, #32]!
+;
+; with X being either w1, x1, s0, d0 or q0.
+
+declare void @bar_word(%s.word*, i32)
+
+define void @load-pre-indexed-word(%struct.word* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-word
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1, i32 0
+ %add = load i32* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1
+ tail call void @bar_word(%s.word* %c, i32 %add)
+ ret void
+}
+
+define void @store-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-word
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1, i32 0
+ store i32 %val, i32* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1
+ tail call void @bar_word(%s.word* %c, i32 %val)
+ ret void
+}
+
+declare void @bar_doubleword(%s.doubleword*, i64)
+
+define void @load-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-doubleword
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+ %add = load i64* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1
+ tail call void @bar_doubleword(%s.doubleword* %c, i64 %add)
+ ret void
+}
+
+define void @store-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-doubleword
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+ store i64 %val, i64* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1
+ tail call void @bar_doubleword(%s.doubleword* %c, i64 %val)
+ ret void
+}
+
+declare void @bar_quadword(%s.quadword*, fp128)
+
+define void @load-pre-indexed-quadword(%struct.quadword* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-quadword
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1, i32 0
+ %add = load fp128* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1
+ tail call void @bar_quadword(%s.quadword* %c, fp128 %add)
+ ret void
+}
+
+define void @store-pre-indexed-quadword(%struct.quadword* %ptr, fp128 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-quadword
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1, i32 0
+ store fp128 %val, fp128* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1
+ tail call void @bar_quadword(%s.quadword* %c, fp128 %val)
+ ret void
+}
+
+declare void @bar_float(%s.float*, float)
+
+define void @load-pre-indexed-float(%struct.float* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-float
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1, i32 0
+ %add = load float* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1
+ tail call void @bar_float(%s.float* %c, float %add)
+ ret void
+}
+
+define void @store-pre-indexed-float(%struct.float* %ptr, float %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-float
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1, i32 0
+ store float %val, float* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1
+ tail call void @bar_float(%s.float* %c, float %val)
+ ret void
+}
+
+declare void @bar_double(%s.double*, double)
+
+define void @load-pre-indexed-double(%struct.double* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-double
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1, i32 0
+ %add = load double* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1
+ tail call void @bar_double(%s.double* %c, double %add)
+ ret void
+}
+
+define void @store-pre-indexed-double(%struct.double* %ptr, double %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-double
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1, i32 0
+ store double %val, double* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1
+ tail call void @bar_double(%s.double* %c, double %val)
+ ret void
+}
+
+; Check the following transform:
+;
+; ldr X, [x20]
+; ...
+; add x20, x20, #32
+; ->
+; ldr X, [x20], #32
+;
+; with X being either w0, x0, s0, d0 or q0.
+
+define void @load-post-indexed-word(i32* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-word
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #16
+entry:
+ %gep1 = getelementptr i32* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi i32* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr i32* %iv2, i64 -1
+ %load = load i32* %gep2
+ call void @use-word(i32 %load)
+ %load2 = load i32* %iv2
+ call void @use-word(i32 %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr i32* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @load-post-indexed-doubleword(i64* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-doubleword
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}], #32
+entry:
+ %gep1 = getelementptr i64* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi i64* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr i64* %iv2, i64 -1
+ %load = load i64* %gep2
+ call void @use-doubleword(i64 %load)
+ %load2 = load i64* %iv2
+ call void @use-doubleword(i64 %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr i64* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @load-post-indexed-quadword(<2 x i64>* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-quadword
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}], #64
+entry:
+ %gep1 = getelementptr <2 x i64>* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi <2 x i64>* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr <2 x i64>* %iv2, i64 -1
+ %load = load <2 x i64>* %gep2
+ call void @use-quadword(<2 x i64> %load)
+ %load2 = load <2 x i64>* %iv2
+ call void @use-quadword(<2 x i64> %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr <2 x i64>* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @load-post-indexed-float(float* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-float
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}], #16
+entry:
+ %gep1 = getelementptr float* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi float* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr float* %iv2, i64 -1
+ %load = load float* %gep2
+ call void @use-float(float %load)
+ %load2 = load float* %iv2
+ call void @use-float(float %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr float* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @load-post-indexed-double(double* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-double
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}], #32
+entry:
+ %gep1 = getelementptr double* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi double* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr double* %iv2, i64 -1
+ %load = load double* %gep2
+ call void @use-double(double %load)
+ %load2 = load double* %iv2
+ call void @use-double(double %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr double* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+declare void @use-word(i32)
+declare void @use-doubleword(i64)
+declare void @use-quadword(<2 x i64>)
+declare void @use-float(float)
+declare void @use-double(double)
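
The two comment blocks in the new ldst-opt.ll describe the base-update folding these tests pin down: a separate add of the base register is absorbed into pre- or post-indexed addressing. A rough before/after sketch in the same spirit, with illustrative registers and the offsets used in the tests above:

    // before
    ldr w1, [x0, #32]
    add x0, x0, #32
    // after (pre-indexed: the base update becomes a writeback on the load)
    ldr w1, [x0, #32]!

    // before
    ldr w1, [x20]
    add x20, x20, #32
    // after (post-indexed: the increment is applied once the access completes)
    ldr w1, [x20], #32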
diff --git a/test/CodeGen/AArch64/ldst-regoffset.ll b/test/CodeGen/AArch64/ldst-regoffset.ll
index db30fd9..e2fa08b 100644
--- a/test/CodeGen/AArch64/ldst-regoffset.ll
+++ b/test/CodeGen/AArch64/ldst-regoffset.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@@ -9,14 +9,14 @@
@var_float = global float 0.0
@var_double = global double 0.0
-define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) {
+define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_8bit:
%addr8_sxtw = getelementptr i8* %base, i32 %off32
%val8_sxtw = load volatile i8* %addr8_sxtw
%val32_signed = sext i8 %val8_sxtw to i32
store volatile i32 %val32_signed, i32* @var_32bit
-; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{[wx][0-9]+}}, sxtw]
%addr_lsl = getelementptr i8* %base, i64 %off64
%val8_lsl = load volatile i8* %addr_lsl
@@ -31,20 +31,20 @@ define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) {
%val8_uxtw = load volatile i8* %addr_uxtw
%newval8 = add i8 %val8_uxtw, 1
store volatile i8 %newval8, i8* @var_8bit
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
ret void
}
-define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) {
+define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_16bit:
%addr8_sxtwN = getelementptr i16* %base, i32 %off32
%val8_sxtwN = load volatile i16* %addr8_sxtwN
%val32_signed = sext i16 %val8_sxtwN to i32
store volatile i32 %val32_signed, i32* @var_32bit
-; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #1]
+; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #1]
%addr_lslN = getelementptr i16* %base, i64 %off64
%val8_lslN = load volatile i16* %addr_lslN
@@ -59,7 +59,7 @@ define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) {
%val8_uxtw = load volatile i16* %addr_uxtw
%newval8 = add i16 %val8_uxtw, 1
store volatile i16 %newval8, i16* @var_16bit
-; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
%base_sxtw = ptrtoint i16* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -68,7 +68,7 @@ define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) {
%val16_sxtw = load volatile i16* %addr_sxtw
%val64_signed = sext i16 %val16_sxtw to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{[wx][0-9]+}}, sxtw]
%base_lsl = ptrtoint i16* %base to i64
@@ -87,17 +87,17 @@ define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) {
%val32 = load volatile i32* @var_32bit
%val16_trunc32 = trunc i32 %val32 to i16
store volatile i16 %val16_trunc32, i16* %addr_uxtwN
-; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #1]
+; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #1]
ret void
}
-define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) {
+define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_32bit:
%addr_sxtwN = getelementptr i32* %base, i32 %off32
%val_sxtwN = load volatile i32* %addr_sxtwN
store volatile i32 %val_sxtwN, i32* @var_32bit
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #2]
%addr_lslN = getelementptr i32* %base, i64 %off64
%val_lslN = load volatile i32* %addr_lslN
@@ -111,7 +111,7 @@ define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) {
%val_uxtw = load volatile i32* %addr_uxtw
%newval8 = add i32 %val_uxtw, 1
store volatile i32 %newval8, i32* @var_32bit
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
%base_sxtw = ptrtoint i32* %base to i64
@@ -121,7 +121,7 @@ define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) {
%val16_sxtw = load volatile i32* %addr_sxtw
%val64_signed = sext i32 %val16_sxtw to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
%base_lsl = ptrtoint i32* %base to i64
@@ -139,17 +139,17 @@ define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to i32*
%val32 = load volatile i32* @var_32bit
store volatile i32 %val32, i32* %addr_uxtwN
-; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #2]
ret void
}
-define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) {
+define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_64bit:
%addr_sxtwN = getelementptr i64* %base, i32 %off32
%val_sxtwN = load volatile i64* %addr_sxtwN
store volatile i64 %val_sxtwN, i64* @var_64bit
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #3]
%addr_lslN = getelementptr i64* %base, i64 %off64
%val_lslN = load volatile i64* %addr_lslN
@@ -163,7 +163,7 @@ define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) {
%val8_uxtw = load volatile i64* %addr_uxtw
%newval8 = add i64 %val8_uxtw, 1
store volatile i64 %newval8, i64* @var_64bit
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
%base_sxtw = ptrtoint i64* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -171,7 +171,7 @@ define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) {
%addr_sxtw = inttoptr i64 %addrint_sxtw to i64*
%val64_sxtw = load volatile i64* %addr_sxtw
store volatile i64 %val64_sxtw, i64* @var_64bit
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
%base_lsl = ptrtoint i64* %base to i64
%addrint_lsl = add i64 %base_lsl, %off64
@@ -187,17 +187,17 @@ define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to i64*
%val64 = load volatile i64* @var_64bit
store volatile i64 %val64, i64* %addr_uxtwN
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #3]
ret void
}
-define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
+define void @ldst_float(float* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_float:
%addr_sxtwN = getelementptr float* %base, i32 %off32
%val_sxtwN = load volatile float* %addr_sxtwN
store volatile float %val_sxtwN, float* @var_float
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2]
+; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #2]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%addr_lslN = getelementptr float* %base, i64 %off64
@@ -212,7 +212,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%addr_uxtw = inttoptr i64 %addrint1_uxtw to float*
%val_uxtw = load volatile float* %addr_uxtw
store volatile float %val_uxtw, float* @var_float
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%base_sxtw = ptrtoint float* %base to i64
@@ -221,7 +221,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%addr_sxtw = inttoptr i64 %addrint_sxtw to float*
%val64_sxtw = load volatile float* %addr_sxtw
store volatile float %val64_sxtw, float* @var_float
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%base_lsl = ptrtoint float* %base to i64
@@ -239,18 +239,18 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to float*
%val64 = load volatile float* @var_float
store volatile float %val64, float* %addr_uxtwN
-; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2]
+; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #2]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
ret void
}
-define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
+define void @ldst_double(double* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_double:
%addr_sxtwN = getelementptr double* %base, i32 %off32
%val_sxtwN = load volatile double* %addr_sxtwN
store volatile double %val_sxtwN, double* @var_double
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3]
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #3]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%addr_lslN = getelementptr double* %base, i64 %off64
@@ -265,7 +265,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%addr_uxtw = inttoptr i64 %addrint1_uxtw to double*
%val_uxtw = load volatile double* %addr_uxtw
store volatile double %val_uxtw, double* @var_double
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%base_sxtw = ptrtoint double* %base to i64
@@ -274,7 +274,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%addr_sxtw = inttoptr i64 %addrint_sxtw to double*
%val64_sxtw = load volatile double* %addr_sxtw
store volatile double %val64_sxtw, double* @var_double
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%base_lsl = ptrtoint double* %base to i64
@@ -292,26 +292,26 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to double*
%val64 = load volatile double* @var_double
store volatile double %val64, double* %addr_uxtwN
-; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3]
+; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #3]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
ret void
}
-define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
+define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_128bit:
%addr_sxtwN = getelementptr fp128* %base, i32 %off32
%val_sxtwN = load volatile fp128* %addr_sxtwN
store volatile fp128 %val_sxtwN, fp128* %base
-; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%addr_lslN = getelementptr fp128* %base, i64 %off64
%val_lslN = load volatile fp128* %addr_lslN
store volatile fp128 %val_lslN, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #4]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%addrint_uxtw = ptrtoint fp128* %base to i64
%offset_uxtw = zext i32 %off32 to i64
@@ -319,8 +319,8 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%addr_uxtw = inttoptr i64 %addrint1_uxtw to fp128*
%val_uxtw = load volatile fp128* %addr_uxtw
store volatile fp128 %val_uxtw, fp128* %base
-; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%base_sxtw = ptrtoint fp128* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -328,8 +328,8 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%addr_sxtw = inttoptr i64 %addrint_sxtw to fp128*
%val64_sxtw = load volatile fp128* %addr_sxtw
store volatile fp128 %val64_sxtw, fp128* %base
-; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%base_lsl = ptrtoint fp128* %base to i64
%addrint_lsl = add i64 %base_lsl, %off64
@@ -337,7 +337,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%val64_lsl = load volatile fp128* %addr_lsl
store volatile fp128 %val64_lsl, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%base_uxtwN = ptrtoint fp128* %base to i64
%offset_uxtwN = zext i32 %off32 to i64
@@ -346,7 +346,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to fp128*
%val64 = load volatile fp128* %base
store volatile fp128 %val64, fp128* %addr_uxtwN
-; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #4]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
ret void
}
diff --git a/test/CodeGen/AArch64/ldst-unscaledimm.ll b/test/CodeGen/AArch64/ldst-unscaledimm.ll
index bea5bb5..1de8443 100644
--- a/test/CodeGen/AArch64/ldst-unscaledimm.ll
+++ b/test/CodeGen/AArch64/ldst-unscaledimm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@@ -160,7 +160,7 @@ define void @ldst_32bit() {
%val64_unsigned = zext i32 %val32_zext to i64
store volatile i64 %val64_unsigned, i64* @var_64bit
; CHECK: ldur {{w[0-9]+}}, [{{x[0-9]+}}, #-256]
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
; Sign-extension to 64-bits
%addr32_8_sext = getelementptr i8* %addr_8bit, i64 -12
@@ -169,7 +169,7 @@ define void @ldst_32bit() {
%val64_signed = sext i32 %val32_sext to i64
store volatile i64 %val64_signed, i64* @var_64bit
; CHECK: ldursw {{x[0-9]+}}, [{{x[0-9]+}}, #-12]
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
; Truncation from 64-bits
%addr64_8_trunc = getelementptr i8* %addr_8bit, i64 255
diff --git a/test/CodeGen/AArch64/ldst-unsignedimm.ll b/test/CodeGen/AArch64/ldst-unsignedimm.ll
index 44c1586..e171d22 100644
--- a/test/CodeGen/AArch64/ldst-unsignedimm.ll
+++ b/test/CodeGen/AArch64/ldst-unsignedimm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@@ -20,25 +20,25 @@ define void @ldst_8bit() {
%val32_signed = sext i8 %val8_sext32 to i32
store volatile i32 %val32_signed, i32* @var_32bit
; CHECK: adrp {{x[0-9]+}}, var_8bit
-; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; match a zero-extending load volatile 8-bit -> 32-bit
%val8_zext32 = load volatile i8* @var_8bit
%val32_unsigned = zext i8 %val8_zext32 to i32
store volatile i32 %val32_unsigned, i32* @var_32bit
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; match an any-extending load volatile 8-bit -> 32-bit
%val8_anyext = load volatile i8* @var_8bit
%newval8 = add i8 %val8_anyext, 1
store volatile i8 %newval8, i8* @var_8bit
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; match a sign-extending load volatile 8-bit -> 64-bit
%val8_sext64 = load volatile i8* @var_8bit
%val64_signed = sext i8 %val8_sext64 to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsb {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrsb {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; match a zero-extending load volatile 8-bit -> 64-bit.
; This uses the fact that ldrb w0, [x0] will zero out the high 32-bits
@@ -46,19 +46,19 @@ define void @ldst_8bit() {
%val8_zext64 = load volatile i8* @var_8bit
%val64_unsigned = zext i8 %val8_zext64 to i64
store volatile i64 %val64_unsigned, i64* @var_64bit
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; truncating store volatile 32-bits to 8-bits
%val32 = load volatile i32* @var_32bit
%val8_trunc32 = trunc i32 %val32 to i8
store volatile i8 %val8_trunc32, i8* @var_8bit
-; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; truncating store volatile 64-bits to 8-bits
%val64 = load volatile i64* @var_64bit
%val8_trunc64 = trunc i64 %val64 to i8
store volatile i8 %val8_trunc64, i8* @var_8bit
-; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
ret void
}
@@ -74,25 +74,25 @@ define void @ldst_16bit() {
%val32_signed = sext i16 %val16_sext32 to i32
store volatile i32 %val32_signed, i32* @var_32bit
; CHECK: adrp {{x[0-9]+}}, var_16bit
-; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; match a zero-extending load volatile 16-bit -> 32-bit
%val16_zext32 = load volatile i16* @var_16bit
%val32_unsigned = zext i16 %val16_zext32 to i32
store volatile i32 %val32_unsigned, i32* @var_32bit
-; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; match an any-extending load volatile 16-bit -> 32-bit
%val16_anyext = load volatile i16* @var_16bit
%newval16 = add i16 %val16_anyext, 1
store volatile i16 %newval16, i16* @var_16bit
-; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; match a sign-extending load volatile 16-bit -> 64-bit
%val16_sext64 = load volatile i16* @var_16bit
%val64_signed = sext i16 %val16_sext64 to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; match a zero-extending load volatile 16-bit -> 64-bit.
; This uses the fact that ldrh w0, [x0] will zero out the high 32-bits
@@ -100,19 +100,19 @@ define void @ldst_16bit() {
%val16_zext64 = load volatile i16* @var_16bit
%val64_unsigned = zext i16 %val16_zext64 to i64
store volatile i64 %val64_unsigned, i64* @var_64bit
-; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; truncating store volatile 32-bits to 16-bits
%val32 = load volatile i32* @var_32bit
%val16_trunc32 = trunc i32 %val32 to i16
store volatile i16 %val16_trunc32, i16* @var_16bit
-; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; truncating store volatile 64-bits to 16-bits
%val64 = load volatile i64* @var_64bit
%val16_trunc64 = trunc i64 %val64 to i16
store volatile i16 %val16_trunc64, i16* @var_16bit
-; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
ret void
}
@@ -124,29 +124,29 @@ define void @ldst_32bit() {
%val32_noext = load volatile i32* @var_32bit
store volatile i32 %val32_noext, i32* @var_32bit
; CHECK: adrp {{x[0-9]+}}, var_32bit
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
-; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
; Zero-extension to 64-bits
%val32_zext = load volatile i32* @var_32bit
%val64_unsigned = zext i32 %val32_zext to i64
store volatile i64 %val64_unsigned, i64* @var_64bit
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
; Sign-extension to 64-bits
%val32_sext = load volatile i32* @var_32bit
%val64_signed = sext i32 %val32_sext to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
; Truncation from 64-bits
%val64_trunc = load volatile i64* @var_64bit
%val32_trunc = trunc i64 %val64_trunc to i32
store volatile i32 %val32_trunc, i32* @var_32bit
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
-; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
ret void
}
@@ -165,7 +165,7 @@ define void @ldst_complex_offsets() {
; CHECK: ldst_complex_offsets
%arr8_addr = load volatile i8** @arr8
; CHECK: adrp {{x[0-9]+}}, arr8
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr8]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:arr8]
%arr8_sub1_addr = getelementptr i8* %arr8_addr, i64 1
%arr8_sub1 = load volatile i8* %arr8_sub1_addr
@@ -180,7 +180,7 @@ define void @ldst_complex_offsets() {
%arr16_addr = load volatile i16** @arr16
; CHECK: adrp {{x[0-9]+}}, arr16
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr16]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:arr16]
%arr16_sub1_addr = getelementptr i16* %arr16_addr, i64 1
%arr16_sub1 = load volatile i16* %arr16_sub1_addr
@@ -195,7 +195,7 @@ define void @ldst_complex_offsets() {
%arr32_addr = load volatile i32** @arr32
; CHECK: adrp {{x[0-9]+}}, arr32
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr32]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:arr32]
%arr32_sub1_addr = getelementptr i32* %arr32_addr, i64 1
%arr32_sub1 = load volatile i32* %arr32_sub1_addr
@@ -210,7 +210,7 @@ define void @ldst_complex_offsets() {
%arr64_addr = load volatile i64** @arr64
; CHECK: adrp {{x[0-9]+}}, arr64
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr64]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:arr64]
%arr64_sub1_addr = getelementptr i64* %arr64_addr, i64 1
%arr64_sub1 = load volatile i64* %arr64_sub1_addr
@@ -230,11 +230,11 @@ define void @ldst_float() {
%valfp = load volatile float* @var_float
; CHECK: adrp {{x[0-9]+}}, var_float
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float]
+; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_float]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
store volatile float %valfp, float* @var_float
-; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float]
+; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_float]
; CHECK-NOFP-NOT: str {{s[0-9]+}},
ret void
@@ -245,11 +245,11 @@ define void @ldst_double() {
%valfp = load volatile double* @var_double
; CHECK: adrp {{x[0-9]+}}, var_double
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double]
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_double]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
store volatile double %valfp, double* @var_double
-; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double]
+; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_double]
; CHECK-NOFP-NOT: str {{d[0-9]+}},
ret void
diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg
index 9a66a00..77493d8 100644
--- a/test/CodeGen/AArch64/lit.local.cfg
+++ b/test/CodeGen/AArch64/lit.local.cfg
@@ -1,4 +1,11 @@
+import re
+
+config.suffixes = ['.ll']
+
targets = set(config.root.targets_to_build.split())
if not 'AArch64' in targets:
config.unsupported = True
+# For now we don't test arm64-win32.
+if re.search(r'cygwin|mingw32|win32', config.target_triple):
+ config.unsupported = True
diff --git a/test/CodeGen/AArch64/literal_pools.ll b/test/CodeGen/AArch64/literal_pools.ll
deleted file mode 100644
index fc33aee..0000000
--- a/test/CodeGen/AArch64/literal_pools.ll
+++ /dev/null
@@ -1,103 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
-
-@var32 = global i32 0
-@var64 = global i64 0
-
-define void @foo() {
-; CHECK-LABEL: foo:
- %val32 = load i32* @var32
- %val64 = load i64* @var64
-
- %val32_lit32 = and i32 %val32, 123456785
- store volatile i32 %val32_lit32, i32* @var32
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
-; CHECK: ldr {{w[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{w[0-9]+}}, [x[[LITADDR]]]
-
- %val64_lit32 = and i64 %val64, 305402420
- store volatile i64 %val64_lit32, i64* @var64
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
-; CHECK: ldr {{w[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{w[0-9]+}}, [x[[LITADDR]]]
-
- %val64_lit32signed = and i64 %val64, -12345678
- store volatile i64 %val64_lit32signed, i64* @var64
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
-; CHECK: ldrsw {{x[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldrsw {{x[0-9]+}}, [x[[LITADDR]]]
-
- %val64_lit64 = and i64 %val64, 1234567898765432
- store volatile i64 %val64_lit64, i64* @var64
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
-; CHECK: ldr {{x[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{x[0-9]+}}, [x[[LITADDR]]]
-
- ret void
-}
-
-@varfloat = global float 0.0
-@vardouble = global double 0.0
-
-define void @floating_lits() {
-; CHECK-LABEL: floating_lits:
-
- %floatval = load float* @varfloat
- %newfloat = fadd float %floatval, 128.0
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
-; CHECK: ldr [[LIT128:s[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]]
-; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{s[0-9]+}}, [x[[LITADDR]]]
-; CHECK-LARGE: fadd
-; CHECK-NOFP-LARGE-NOT: ldr {{s[0-9]+}},
-; CHECK-NOFP-LARGE-NOT: fadd
-
- store float %newfloat, float* @varfloat
-
- %doubleval = load double* @vardouble
- %newdouble = fadd double %doubleval, 129.0
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
-; CHECK: ldr [[LIT129:d[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]]
-; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, [[LIT128]]
-; CHECK: fadd {{d[0-9]+}}, {{d[0-9]+}}, [[LIT129]]
-; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
-; CHECK-NOFP-NOT: fadd
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{d[0-9]+}}, [x[[LITADDR]]]
-; CHECK-NOFP-LARGE-NOT: ldr {{d[0-9]+}},
-
- store double %newdouble, double* @vardouble
-
- ret void
-}
diff --git a/test/CodeGen/AArch64/literal_pools_float.ll b/test/CodeGen/AArch64/literal_pools_float.ll
new file mode 100644
index 0000000..e53b8b6
--- /dev/null
+++ b/test/CodeGen/AArch64/literal_pools_float.ll
@@ -0,0 +1,46 @@
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -code-model=large -mcpu=cyclone | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
+
+@varfloat = global float 0.0
+@vardouble = global double 0.0
+
+define void @floating_lits() {
+; CHECK-LABEL: floating_lits:
+
+ %floatval = load float* @varfloat
+ %newfloat = fadd float %floatval, 128.0
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI[0-9]+_[0-9]+]]
+; CHECK: ldr [[LIT128:s[0-9]+]], [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
+
+; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI[0-9]+_[0-9]+]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
+; CHECK-LARGE: ldr {{s[0-9]+}}, [x[[LITADDR]]]
+; CHECK-LARGE: fadd
+; CHECK-NOFP-LARGE-NOT: ldr {{s[0-9]+}},
+; CHECK-NOFP-LARGE-NOT: fadd
+
+ store float %newfloat, float* @varfloat
+
+ %doubleval = load double* @vardouble
+ %newdouble = fadd double %doubleval, 129.0
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI[0-9]+_[0-9]+]]
+; CHECK: ldr [[LIT129:d[0-9]+]], [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
+; CHECK-NOFP-NOT: fadd
+
+; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI[0-9]+_[0-9]+]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
+; CHECK-LARGE: ldr {{d[0-9]+}}, [x[[LITADDR]]]
+; CHECK-NOFP-LARGE-NOT: ldr {{d[0-9]+}},
+
+ store double %newdouble, double* @vardouble
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/local_vars.ll b/test/CodeGen/AArch64/local_vars.ll
index b5cef85..2f5b9f2 100644
--- a/test/CodeGen/AArch64/local_vars.ll
+++ b/test/CodeGen/AArch64/local_vars.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP-ARM64 %s
; Make sure a reasonably sane prologue and epilogue are
; generated. This test is not robust in the face of a frame-handling
@@ -16,7 +16,7 @@
declare void @foo()
define void @trivial_func() nounwind {
-; CHECK: trivial_func: // @trivial_func
+; CHECK-LABEL: trivial_func: // @trivial_func
; CHECK-NEXT: // BB#0
; CHECK-NEXT: ret
@@ -24,11 +24,14 @@ define void @trivial_func() nounwind {
}
define void @trivial_fp_func() {
-; CHECK-WITHFP-LABEL: trivial_fp_func:
+; CHECK-WITHFP-AARCH64-LABEL: trivial_fp_func:
+; CHECK-WITHFP-AARCH64: sub sp, sp, #16
+; CHECK-WITHFP-AARCH64: stp x29, x30, [sp]
+; CHECK-WITHFP-AARCH64-NEXT: mov x29, sp
-; CHECK-WITHFP: sub sp, sp, #16
-; CHECK-WITHFP: stp x29, x30, [sp]
-; CHECK-WITHFP-NEXT: mov x29, sp
+; CHECK-WITHFP-ARM64-LABEL: trivial_fp_func:
+; CHECK-WITHFP-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-WITHFP-ARM64-NEXT: mov x29, sp
; Don't really care, but it would be a Bad Thing if this came after the epilogue.
; CHECK: bl foo
@@ -48,10 +51,10 @@ define void @stack_local() {
%val = load i64* @var
store i64 %val, i64* %local_var
-; CHECK: str {{x[0-9]+}}, [sp, #{{[0-9]+}}]
+; CHECK-DAG: str {{x[0-9]+}}, [sp, #{{[0-9]+}}]
store i64* %local_var, i64** @local_addr
-; CHECK: add {{x[0-9]+}}, sp, #{{[0-9]+}}
+; CHECK-DAG: add {{x[0-9]+}}, sp, #{{[0-9]+}}
ret void
}
diff --git a/test/CodeGen/AArch64/logical-imm.ll b/test/CodeGen/AArch64/logical-imm.ll
index e04bb51..a5e4a99 100644
--- a/test/CodeGen/AArch64/logical-imm.ll
+++ b/test/CodeGen/AArch64/logical-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/logical_shifted_reg.ll b/test/CodeGen/AArch64/logical_shifted_reg.ll
index a08ba20..b249d72 100644
--- a/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ b/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
@var1_32 = global i32 0
@var2_32 = global i32 0
@@ -6,7 +6,7 @@
@var1_64 = global i64 0
@var2_64 = global i64 0
-define void @logical_32bit() {
+define void @logical_32bit() minsize {
; CHECK-LABEL: logical_32bit:
%val1 = load i32* @var1_32
%val2 = load i32* @var2_32
@@ -96,7 +96,7 @@ define void @logical_32bit() {
ret void
}
-define void @logical_64bit() {
+define void @logical_64bit() minsize {
; CHECK-LABEL: logical_64bit:
%val1 = load i64* @var1_64
%val2 = load i64* @var2_64
diff --git a/test/CodeGen/AArch64/mature-mc-support.ll b/test/CodeGen/AArch64/mature-mc-support.ll
index 06e3cc7..276c54d 100644
--- a/test/CodeGen/AArch64/mature-mc-support.ll
+++ b/test/CodeGen/AArch64/mature-mc-support.ll
@@ -1,11 +1,11 @@
; Test that inline assembly is parsed by the MC layer when MC support is mature
; (even when the output is assembly).
-; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t1
-; RUN: FileCheck %s < %t1
+; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t3
+; RUN: FileCheck %s < %t3
-; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t2
-; RUN: FileCheck %s < %t2
+; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t4
+; RUN: FileCheck %s < %t4
module asm " .this_directive_is_very_unlikely_to_exist"
diff --git a/test/CodeGen/AArch64/movw-consts.ll b/test/CodeGen/AArch64/movw-consts.ll
index 38e37db..93c1812 100644
--- a/test/CodeGen/AArch64/movw-consts.ll
+++ b/test/CodeGen/AArch64/movw-consts.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
define i64 @test0() {
; CHECK-LABEL: test0:
@@ -9,43 +9,43 @@ define i64 @test0() {
define i64 @test1() {
; CHECK-LABEL: test1:
-; CHECK: movz x0, #1
+; CHECK: orr w0, wzr, #0x1
ret i64 1
}
define i64 @test2() {
; CHECK-LABEL: test2:
-; CHECK: movz x0, #65535
+; CHECK: orr w0, wzr, #0xffff
ret i64 65535
}
define i64 @test3() {
; CHECK-LABEL: test3:
-; CHECK: movz x0, #1, lsl #16
+; CHECK: orr w0, wzr, #0x10000
ret i64 65536
}
define i64 @test4() {
; CHECK-LABEL: test4:
-; CHECK: movz x0, #65535, lsl #16
+; CHECK: orr w0, wzr, #0xffff0000
ret i64 4294901760
}
define i64 @test5() {
; CHECK-LABEL: test5:
-; CHECK: movz x0, #1, lsl #32
+; CHECK: orr x0, xzr, #0x100000000
ret i64 4294967296
}
define i64 @test6() {
; CHECK-LABEL: test6:
-; CHECK: movz x0, #65535, lsl #32
+; CHECK: orr x0, xzr, #0xffff00000000
ret i64 281470681743360
}
define i64 @test7() {
; CHECK-LABEL: test7:
-; CHECK: movz x0, #1, lsl #48
+; CHECK: orr x0, xzr, #0x1000000000000
ret i64 281474976710656
}
@@ -53,7 +53,7 @@ define i64 @test7() {
; couldn't. Useful even for i64
define i64 @test8() {
; CHECK-LABEL: test8:
-; CHECK: movn w0, #60875
+; CHECK: movn w0, #{{60875|0xedcb}}
ret i64 4294906420
}
@@ -65,7 +65,7 @@ define i64 @test9() {
define i64 @test10() {
; CHECK-LABEL: test10:
-; CHECK: movn x0, #60875, lsl #16
+; CHECK: movn x0, #{{60875|0xedcb}}, lsl #16
ret i64 18446744069720047615
}
@@ -75,35 +75,35 @@ define i64 @test10() {
define void @test11() {
; CHECK-LABEL: test11:
-; CHECK: mov {{w[0-9]+}}, wzr
+; CHECK: str wzr
store i32 0, i32* @var32
ret void
}
define void @test12() {
; CHECK-LABEL: test12:
-; CHECK: movz {{w[0-9]+}}, #1
+; CHECK: orr {{w[0-9]+}}, wzr, #0x1
store i32 1, i32* @var32
ret void
}
define void @test13() {
; CHECK-LABEL: test13:
-; CHECK: movz {{w[0-9]+}}, #65535
+; CHECK: orr {{w[0-9]+}}, wzr, #0xffff
store i32 65535, i32* @var32
ret void
}
define void @test14() {
; CHECK-LABEL: test14:
-; CHECK: movz {{w[0-9]+}}, #1, lsl #16
+; CHECK: orr {{w[0-9]+}}, wzr, #0x10000
store i32 65536, i32* @var32
ret void
}
define void @test15() {
; CHECK-LABEL: test15:
-; CHECK: movz {{w[0-9]+}}, #65535, lsl #16
+; CHECK: orr {{w[0-9]+}}, wzr, #0xffff0000
store i32 4294901760, i32* @var32
ret void
}
@@ -119,6 +119,6 @@ define i64 @test17() {
; CHECK-LABEL: test17:
; Mustn't MOVN w0 here.
-; CHECK: movn x0, #2
+; CHECK: orr x0, xzr, #0xfffffffffffffffd
ret i64 -3
}
diff --git a/test/CodeGen/AArch64/movw-shift-encoding.ll b/test/CodeGen/AArch64/movw-shift-encoding.ll
index ec133bd..178fccc 100644
--- a/test/CodeGen/AArch64/movw-shift-encoding.ll
+++ b/test/CodeGen/AArch64/movw-shift-encoding.ll
@@ -7,8 +7,9 @@
define i32* @get_var() {
ret i32* @var
-; CHECK: movz x0, #:abs_g3:var // encoding: [A,A,0xe0'A',0xd2'A']
-; CHECK: movk x0, #:abs_g2_nc:var // encoding: [A,A,0xc0'A',0xf2'A']
-; CHECK: movk x0, #:abs_g1_nc:var // encoding: [A,A,0xa0'A',0xf2'A']
-; CHECK: movk x0, #:abs_g0_nc:var // encoding: [A,A,0x80'A',0xf2'A']
+
+; CHECK: movz x0, #:abs_g3:var // encoding: [0bAAA00000,A,0b111AAAAA,0xd2]
+; CHECK: movk x0, #:abs_g2_nc:var // encoding: [0bAAA00000,A,0b110AAAAA,0xf2]
+; CHECK: movk x0, #:abs_g1_nc:var // encoding: [0bAAA00000,A,0b101AAAAA,0xf2]
+; CHECK: movk x0, #:abs_g0_nc:var // encoding: [0bAAA00000,A,0b100AAAAA,0xf2]
}
diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll
index f58c598..0689fbd 100644
--- a/test/CodeGen/AArch64/mul-lohi.ll
+++ b/test/CodeGen/AArch64/mul-lohi.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: test_128bitmul:
diff --git a/test/CodeGen/AArch64/neon-across.ll b/test/CodeGen/AArch64/neon-across.ll
deleted file mode 100644
index 6d30c95..0000000
--- a/test/CodeGen/AArch64/neon-across.ll
+++ /dev/null
@@ -1,472 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare float @llvm.aarch64.neon.vminnmv(<4 x float>)
-
-declare float @llvm.aarch64.neon.vmaxnmv(<4 x float>)
-
-declare float @llvm.aarch64.neon.vminv(<4 x float>)
-
-declare float @llvm.aarch64.neon.vmaxv(<4 x float>)
-
-declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32>)
-
-declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16>)
-
-declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8>)
-
-declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32>)
-
-declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16>)
-
-declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16>)
-
-declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16>)
-
-declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8>)
-
-define i16 @test_vaddlv_s8(<8 x i8> %a) {
-; CHECK: test_vaddlv_s8:
-; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i16> %saddlv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddlv_s16(<4 x i16> %a) {
-; CHECK: test_vaddlv_s16:
-; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i32> %saddlv.i, i32 0
- ret i32 %0
-}
-
-define i16 @test_vaddlv_u8(<8 x i8> %a) {
-; CHECK: test_vaddlv_u8:
-; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i16> %uaddlv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddlv_u16(<4 x i16> %a) {
-; CHECK: test_vaddlv_u16:
-; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i32> %uaddlv.i, i32 0
- ret i32 %0
-}
-
-define i16 @test_vaddlvq_s8(<16 x i8> %a) {
-; CHECK: test_vaddlvq_s8:
-; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i16> %saddlv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddlvq_s16(<8 x i16> %a) {
-; CHECK: test_vaddlvq_s16:
-; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i32> %saddlv.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vaddlvq_s32(<4 x i32> %a) {
-; CHECK: test_vaddlvq_s32:
-; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %saddlv.i = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i64> %saddlv.i, i32 0
- ret i64 %0
-}
-
-define i16 @test_vaddlvq_u8(<16 x i8> %a) {
-; CHECK: test_vaddlvq_u8:
-; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i16> %uaddlv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddlvq_u16(<8 x i16> %a) {
-; CHECK: test_vaddlvq_u16:
-; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i32> %uaddlv.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vaddlvq_u32(<4 x i32> %a) {
-; CHECK: test_vaddlvq_u32:
-; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %uaddlv.i = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i64> %uaddlv.i, i32 0
- ret i64 %0
-}
-
-define i8 @test_vmaxv_s8(<8 x i8> %a) {
-; CHECK: test_vmaxv_s8:
-; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %smaxv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vmaxv_s16(<4 x i16> %a) {
-; CHECK: test_vmaxv_s16:
-; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %smaxv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vmaxv_u8(<8 x i8> %a) {
-; CHECK: test_vmaxv_u8:
-; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %umaxv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vmaxv_u16(<4 x i16> %a) {
-; CHECK: test_vmaxv_u16:
-; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %umaxv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vmaxvq_s8(<16 x i8> %a) {
-; CHECK: test_vmaxvq_s8:
-; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %smaxv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vmaxvq_s16(<8 x i16> %a) {
-; CHECK: test_vmaxvq_s16:
-; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %smaxv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vmaxvq_s32(<4 x i32> %a) {
-; CHECK: test_vmaxvq_s32:
-; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %smaxv.i = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %smaxv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vmaxvq_u8(<16 x i8> %a) {
-; CHECK: test_vmaxvq_u8:
-; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %umaxv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vmaxvq_u16(<8 x i16> %a) {
-; CHECK: test_vmaxvq_u16:
-; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %umaxv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vmaxvq_u32(<4 x i32> %a) {
-; CHECK: test_vmaxvq_u32:
-; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %umaxv.i = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %umaxv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vminv_s8(<8 x i8> %a) {
-; CHECK: test_vminv_s8:
-; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %sminv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vminv_s16(<4 x i16> %a) {
-; CHECK: test_vminv_s16:
-; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %sminv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vminv_u8(<8 x i8> %a) {
-; CHECK: test_vminv_u8:
-; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %uminv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vminv_u16(<4 x i16> %a) {
-; CHECK: test_vminv_u16:
-; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %uminv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vminvq_s8(<16 x i8> %a) {
-; CHECK: test_vminvq_s8:
-; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %sminv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vminvq_s16(<8 x i16> %a) {
-; CHECK: test_vminvq_s16:
-; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %sminv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vminvq_s32(<4 x i32> %a) {
-; CHECK: test_vminvq_s32:
-; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sminv.i = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %sminv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vminvq_u8(<16 x i8> %a) {
-; CHECK: test_vminvq_u8:
-; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %uminv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vminvq_u16(<8 x i16> %a) {
-; CHECK: test_vminvq_u16:
-; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %uminv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vminvq_u32(<4 x i32> %a) {
-; CHECK: test_vminvq_u32:
-; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %uminv.i = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %uminv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vaddv_s8(<8 x i8> %a) {
-; CHECK: test_vaddv_s8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %vaddv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vaddv_s16(<4 x i16> %a) {
-; CHECK: test_vaddv_s16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %vaddv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vaddv_u8(<8 x i8> %a) {
-; CHECK: test_vaddv_u8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %vaddv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vaddv_u16(<4 x i16> %a) {
-; CHECK: test_vaddv_u16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %vaddv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vaddvq_s8(<16 x i8> %a) {
-; CHECK: test_vaddvq_s8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %vaddv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vaddvq_s16(<8 x i16> %a) {
-; CHECK: test_vaddvq_s16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %vaddv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddvq_s32(<4 x i32> %a) {
-; CHECK: test_vaddvq_s32:
-; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %vaddv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vaddvq_u8(<16 x i8> %a) {
-; CHECK: test_vaddvq_u8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %vaddv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vaddvq_u16(<8 x i16> %a) {
-; CHECK: test_vaddvq_u16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %vaddv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddvq_u32(<4 x i32> %a) {
-; CHECK: test_vaddvq_u32:
-; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %vaddv.i, i32 0
- ret i32 %0
-}
-
-define float @test_vmaxvq_f32(<4 x float> %a) {
-; CHECK: test_vmaxvq_f32:
-; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %0 = call float @llvm.aarch64.neon.vmaxv(<4 x float> %a)
- ret float %0
-}
-
-define float @test_vminvq_f32(<4 x float> %a) {
-; CHECK: test_vminvq_f32:
-; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %0 = call float @llvm.aarch64.neon.vminv(<4 x float> %a)
- ret float %0
-}
-
-define float @test_vmaxnmvq_f32(<4 x float> %a) {
-; CHECK: test_vmaxnmvq_f32:
-; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %0 = call float @llvm.aarch64.neon.vmaxnmv(<4 x float> %a)
- ret float %0
-}
-
-define float @test_vminnmvq_f32(<4 x float> %a) {
-; CHECK: test_vminnmvq_f32:
-; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %0 = call float @llvm.aarch64.neon.vminnmv(<4 x float> %a)
- ret float %0
-}
-
diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll
deleted file mode 100644
index 32d8222..0000000
--- a/test/CodeGen/AArch64/neon-add-pairwise.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 is possible, but would be odd.
-; CHECK: test_addp_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: addp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_addp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: addp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_addp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: addp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_addp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: addp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_addp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: addp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_addp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: addp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-
-declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_addp_v2i64:
- %val = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: addp v0.2d, v0.2d, v1.2d
- ret <2 x i64> %val
-}
-
-declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_faddp_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: faddp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_faddp_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: faddp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_faddp_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: faddp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-define i32 @test_vaddv.v2i32(<2 x i32> %a) {
-; CHECK-LABEL: test_vaddv.v2i32
-; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32>) \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 7e5b693..6497856 100644
--- a/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1,45 +1,52 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: and8xi8:
+; CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: and16xi8:
+; CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orr8xi8:
+; CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orr16xi8:
+; CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: xor8xi8:
+; CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: xor16xi8:
+; CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl8xi8_const:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 >
%tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 >
%tmp3 = or <8 x i8> %tmp1, %tmp2
@@ -47,7 +54,8 @@ define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) {
}
define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl16xi8_const:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 >
%tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp3 = or <16 x i8> %tmp1, %tmp2
@@ -55,398 +63,461 @@ define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) {
}
define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orn8xi8:
+; CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = or <8 x i8> %a, %tmp1
ret <8 x i8> %tmp2
}
define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orn16xi8:
+; CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = or <16 x i8> %a, %tmp1
ret <16 x i8> %tmp2
}
define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bic8xi8:
+; CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = and <8 x i8> %a, %tmp1
ret <8 x i8> %tmp2
}
define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bic16xi8:
+; CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = and <16 x i8> %a, %tmp1
ret <16 x i8> %tmp2
}
define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: orrimm2s_lsl0:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = or <2 x i32> %a, < i32 255, i32 255>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: orrimm2s_lsl8:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = or <2 x i32> %a, < i32 65280, i32 65280>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: orrimm2s_lsl16:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl24(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: orrimm2s_lsl24:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #24
%tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080>
ret <2 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: orrimm4s_lsl0:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: orrimm4s_lsl8:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl16(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: orrimm4s_lsl16:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: orrimm4s_lsl24:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #24
%tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080>
ret <4 x i32> %tmp1
}
define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orrimm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 >
ret <4 x i16> %tmp1
}
define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orrimm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
ret <4 x i16> %tmp1
}
define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orrimm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
ret <8 x i16> %tmp1
}
define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orrimm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
ret <8 x i16> %tmp1
}
define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0x10
+; CHECK-LABEL: bicimm2s_lsl0:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0x10|16}}
%tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #8
+; CHECK-LABEL: bicimm2s_lsl8:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #8
%tmp1 = and <2 x i32> %a, < i32 4294963199, i32 4294963199 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #16
+; CHECK-LABEL: bicimm2s_lsl16:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #16
%tmp1 = and <2 x i32> %a, < i32 4293918719, i32 4293918719 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl124(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #24
+; CHECK-LABEL: bicimm2s_lsl124:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #24
%tmp1 = and <2 x i32> %a, < i32 4026531839, i32 4026531839>
ret <2 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0x10
+; CHECK-LABEL: bicimm4s_lsl0:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0x10|16}}
%tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #8
+; CHECK-LABEL: bicimm4s_lsl8:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #8
%tmp1 = and <4 x i32> %a, < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #16
+; CHECK-LABEL: bicimm4s_lsl16:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #16
%tmp1 = and <4 x i32> %a, < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl124(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #24
+; CHECK-LABEL: bicimm4s_lsl124:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #24
%tmp1 = and <4 x i32> %a, < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839>
ret <4 x i32> %tmp1
}
define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0x10
+; CHECK-LABEL: bicimm4h_lsl0_a:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0x10|16}}
%tmp1 = and <4 x i16> %a, < i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279 >
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: bicimm4h_lsl0_b:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = and <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0x10, lsl #8
+; CHECK-LABEL: bicimm4h_lsl8_a:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0x10|16}}, lsl #8
%tmp1 = and <4 x i16> %a, < i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199>
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: bicimm4h_lsl8_b:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255>
ret <4 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0x10
+; CHECK-LABEL: bicimm8h_lsl0_a:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0x10|16}}
%tmp1 = and <8 x i16> %a, < i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279,
i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279 >
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: bicimm8h_lsl0_b:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = and <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0x10, lsl #8
+; CHECK-LABEL: bicimm8h_lsl8_a:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0x10|16}}, lsl #8
%tmp1 = and <8 x i16> %a, < i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199,
i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199>
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: bicimm8h_lsl8_b:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
ret <8 x i16> %tmp1
}
define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: and2xi32:
+; CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: and4xi16:
+; CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: and1xi64:
+; CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: and4xi32:
+; CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: and8xi16:
+; CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: and2xi64:
+; CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orr2xi32:
+; CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orr4xi16:
+; CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orr1xi64:
+; CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orr4xi32:
+; CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orr8xi16:
+; CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orr2xi64:
+; CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: eor2xi32:
+; CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: eor4xi16:
+; CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: eor1xi64:
+; CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: eor4xi32:
+; CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: eor8xi16:
+; CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: eor2xi64:
+; CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bic2xi32:
+; CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
%tmp2 = and <2 x i32> %a, %tmp1
ret <2 x i32> %tmp2
}
define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bic4xi16:
+; CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 >
%tmp2 = and <4 x i16> %a, %tmp1
ret <4 x i16> %tmp2
}
define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bic1xi64:
+; CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %b, < i64 -1>
%tmp2 = and <1 x i64> %a, %tmp1
ret <1 x i64> %tmp2
}
define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bic4xi32:
+; CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
%tmp2 = and <4 x i32> %a, %tmp1
ret <4 x i32> %tmp2
}
define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bic8xi16:
+; CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp2 = and <8 x i16> %a, %tmp1
ret <8 x i16> %tmp2
}
define <2 x i64> @bic2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bic2xi64:
+; CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
%tmp2 = and <2 x i64> %a, %tmp1
ret <2 x i64> %tmp2
}
define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orn2xi32:
+; CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
%tmp2 = or <2 x i32> %a, %tmp1
ret <2 x i32> %tmp2
}
define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orn4xi16:
+; CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 >
%tmp2 = or <4 x i16> %a, %tmp1
ret <4 x i16> %tmp2
}
define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orn1xi64:
+; CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %b, < i64 -1>
%tmp2 = or <1 x i64> %a, %tmp1
ret <1 x i64> %tmp2
}
define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orn4xi32:
+; CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
%tmp2 = or <4 x i32> %a, %tmp1
ret <4 x i32> %tmp2
}
define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orn8xi16:
+; CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp2 = or <8 x i16> %a, %tmp1
ret <8 x i16> %tmp2
}
define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orn2xi64:
+; CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
%tmp2 = or <2 x i64> %a, %tmp1
ret <2 x i64> %tmp2
}
define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl2xi32_const:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <2 x i32> %a, < i32 -1, i32 0 >
%tmp2 = and <2 x i32> %b, < i32 0, i32 -1 >
%tmp3 = or <2 x i32> %tmp1, %tmp2
@@ -455,7 +526,8 @@ define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) {
define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl4xi16_const:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 >
%tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 >
%tmp3 = or <4 x i16> %tmp1, %tmp2
@@ -463,7 +535,8 @@ define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) {
}
define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl1xi64_const:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <1 x i64> %a, < i64 -16 >
%tmp2 = and <1 x i64> %b, < i64 15 >
%tmp3 = or <1 x i64> %tmp1, %tmp2
@@ -471,7 +544,8 @@ define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) {
}
define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl4xi32_const:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 >
%tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 >
%tmp3 = or <4 x i32> %tmp1, %tmp2
@@ -479,7 +553,8 @@ define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) {
}
define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl8xi16_const:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 >
%tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 >
%tmp3 = or <8 x i16> %tmp1, %tmp2
@@ -487,7 +562,8 @@ define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) {
}
define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl2xi64_const:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <2 x i64> %a, < i64 -1, i64 0 >
%tmp2 = and <2 x i64> %b, < i64 0, i64 -1 >
%tmp3 = or <2 x i64> %tmp1, %tmp2
@@ -496,7 +572,8 @@ define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) {
define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl8xi8:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <8 x i8> %v1, %v2
%2 = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%3 = and <8 x i8> %2, %v3
@@ -505,7 +582,8 @@ define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
}
define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl4xi16:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <4 x i16> %v1, %v2
%2 = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1>
%3 = and <4 x i16> %2, %v3
@@ -514,7 +592,8 @@ define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
}
define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl2xi32:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <2 x i32> %v1, %v2
%2 = xor <2 x i32> %v1, <i32 -1, i32 -1>
%3 = and <2 x i32> %2, %v3
@@ -523,7 +602,8 @@ define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
}
define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl1xi64:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <1 x i64> %v1, %v2
%2 = xor <1 x i64> %v1, <i64 -1>
%3 = and <1 x i64> %2, %v3
@@ -532,7 +612,8 @@ define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
}
define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl16xi8:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <16 x i8> %v1, %v2
%2 = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%3 = and <16 x i8> %2, %v3
@@ -541,7 +622,8 @@ define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
}
define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl8xi16:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <8 x i16> %v1, %v2
%2 = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
%3 = and <8 x i16> %2, %v3
@@ -550,7 +632,8 @@ define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
}
define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl4xi32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <4 x i32> %v1, %v2
%2 = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1>
%3 = and <4 x i32> %2, %v3
@@ -559,56 +642,63 @@ define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
}
define <8 x i8> @vselect_v8i8(<8 x i8> %a) {
-;CHECK: movi {{d[0-9]+}}, #0xffff
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_v8i8:
+; CHECK: movi {{d[0-9]+}}, #0x{{0*}}ffff
+; CHECK-NEXT: {{bsl v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b|and v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b}}
%b = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i8> %a, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
ret <8 x i8> %b
}
define <4 x i16> @vselect_v4i16(<4 x i16> %a) {
-;CHECK: movi {{d[0-9]+}}, #0xffff
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_v4i16:
+; CHECK: movi {{d[0-9]+}}, #0x{{0*}}ffff
+; CHECK-NEXT: {{bsl v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b|and v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b}}
%b = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i16> %a, <4 x i16> <i16 undef, i16 0, i16 0, i16 0>
ret <4 x i16> %b
}
define <8 x i8> @vselect_cmp_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_cmp_ne:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%cmp = icmp ne <8 x i8> %a, %b
%d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
ret <8 x i8> %d
}
define <8 x i8> @vselect_cmp_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_cmp_eq:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%cmp = icmp eq <8 x i8> %a, %b
%d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
ret <8 x i8> %d
}
define <8 x i8> @vselect_cmpz_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_cmpz_ne:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%cmp = icmp ne <8 x i8> %a, zeroinitializer
%d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
ret <8 x i8> %d
}
define <8 x i8> @vselect_cmpz_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_cmpz_eq:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%cmp = icmp eq <8 x i8> %a, zeroinitializer
%d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
ret <8 x i8> %d
}
define <8 x i8> @vselect_tst(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_tst:
+; CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = and <8 x i8> %a, %b
%tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
%d = select <8 x i1> %tmp4, <8 x i8> %b, <8 x i8> %c
@@ -616,7 +706,8 @@ define <8 x i8> @vselect_tst(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
}
define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl2xi64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <2 x i64> %v1, %v2
%2 = xor <2 x i64> %v1, <i64 -1, i64 -1>
%3 = and <2 x i64> %2, %v3
@@ -625,458 +716,534 @@ define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
}
define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orrimm8b_as_orrimm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%val = or <8 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <8 x i8> %val
}
define <8 x i8> @orrimm8b_as_orimm4h_lsl8(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orrimm8b_as_orimm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%val = or <8 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <8 x i8> %val
}
define <16 x i8> @orimm16b_as_orrimm8h_lsl0(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orimm16b_as_orrimm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%val = or <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <16 x i8> %val
}
define <16 x i8> @orimm16b_as_orrimm8h_lsl8(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orimm16b_as_orrimm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%val = or <16 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <16 x i8> %val
}
define <8 x i8> @and8imm2s_lsl0(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: and8imm2s_lsl0:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = and <8 x i8> %a, < i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255>
ret <8 x i8> %tmp1
}
define <8 x i8> @and8imm2s_lsl8(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: and8imm2s_lsl8:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = and <8 x i8> %a, < i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255>
ret <8 x i8> %tmp1
}
define <8 x i8> @and8imm2s_lsl16(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: and8imm2s_lsl16:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = and <8 x i8> %a, < i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255>
ret <8 x i8> %tmp1
}
define <8 x i8> @and8imm2s_lsl24(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+; CHECK-LABEL: and8imm2s_lsl24:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xfe|254}}, lsl #24
%tmp1 = and <8 x i8> %a, < i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1>
ret <8 x i8> %tmp1
}
define <4 x i16> @and16imm2s_lsl0(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: and16imm2s_lsl0:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = and <4 x i16> %a, < i16 65280, i16 65535, i16 65280, i16 65535>
ret <4 x i16> %tmp1
}
define <4 x i16> @and16imm2s_lsl8(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: and16imm2s_lsl8:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = and <4 x i16> %a, < i16 255, i16 65535, i16 255, i16 65535>
ret <4 x i16> %tmp1
}
define <4 x i16> @and16imm2s_lsl16(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: and16imm2s_lsl16:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = and <4 x i16> %a, < i16 65535, i16 65280, i16 65535, i16 65280>
ret <4 x i16> %tmp1
}
define <4 x i16> @and16imm2s_lsl24(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+; CHECK-LABEL: and16imm2s_lsl24:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xfe|254}}, lsl #24
%tmp1 = and <4 x i16> %a, < i16 65535, i16 511, i16 65535, i16 511>
ret <4 x i16> %tmp1
}
define <1 x i64> @and64imm2s_lsl0(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: and64imm2s_lsl0:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = and <1 x i64> %a, < i64 -1095216660736>
ret <1 x i64> %tmp1
}
define <1 x i64> @and64imm2s_lsl8(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: and64imm2s_lsl8:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = and <1 x i64> %a, < i64 -280375465148161>
ret <1 x i64> %tmp1
}
define <1 x i64> @and64imm2s_lsl16(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: and64imm2s_lsl16:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = and <1 x i64> %a, < i64 -71776119077928961>
ret <1 x i64> %tmp1
}
define <1 x i64> @and64imm2s_lsl24(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+; CHECK-LABEL: and64imm2s_lsl24:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xfe|254}}, lsl #24
%tmp1 = and <1 x i64> %a, < i64 144115183814443007>
ret <1 x i64> %tmp1
}
define <16 x i8> @and8imm4s_lsl0(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: and8imm4s_lsl0:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = and <16 x i8> %a, < i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255>
ret <16 x i8> %tmp1
}
define <16 x i8> @and8imm4s_lsl8(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: and8imm4s_lsl8:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = and <16 x i8> %a, < i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255>
ret <16 x i8> %tmp1
}
define <16 x i8> @and8imm4s_lsl16(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: and8imm4s_lsl16:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = and <16 x i8> %a, < i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255>
ret <16 x i8> %tmp1
}
define <16 x i8> @and8imm4s_lsl24(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+; CHECK-LABEL: and8imm4s_lsl24:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xfe|254}}, lsl #24
%tmp1 = and <16 x i8> %a, < i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1>
ret <16 x i8> %tmp1
}
define <8 x i16> @and16imm4s_lsl0(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: and16imm4s_lsl0:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = and <8 x i16> %a, < i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535>
ret <8 x i16> %tmp1
}
define <8 x i16> @and16imm4s_lsl8(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: and16imm4s_lsl8:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = and <8 x i16> %a, < i16 255, i16 65535, i16 255, i16 65535, i16 255, i16 65535, i16 255, i16 65535>
ret <8 x i16> %tmp1
}
define <8 x i16> @and16imm4s_lsl16(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: and16imm4s_lsl16:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = and <8 x i16> %a, < i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280>
ret <8 x i16> %tmp1
}
define <8 x i16> @and16imm4s_lsl24(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+; CHECK-LABEL: and16imm4s_lsl24:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xfe|254}}, lsl #24
%tmp1 = and <8 x i16> %a, < i16 65535, i16 511, i16 65535, i16 511, i16 65535, i16 511, i16 65535, i16 511>
ret <8 x i16> %tmp1
}
define <2 x i64> @and64imm4s_lsl0(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: and64imm4s_lsl0:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = and <2 x i64> %a, < i64 -1095216660736, i64 -1095216660736>
ret <2 x i64> %tmp1
}
define <2 x i64> @and64imm4s_lsl8(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: and64imm4s_lsl8:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = and <2 x i64> %a, < i64 -280375465148161, i64 -280375465148161>
ret <2 x i64> %tmp1
}
define <2 x i64> @and64imm4s_lsl16(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: and64imm4s_lsl16:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = and <2 x i64> %a, < i64 -71776119077928961, i64 -71776119077928961>
ret <2 x i64> %tmp1
}
define <2 x i64> @and64imm4s_lsl24(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+; CHECK-LABEL: and64imm4s_lsl24:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xfe|254}}, lsl #24
%tmp1 = and <2 x i64> %a, < i64 144115183814443007, i64 144115183814443007>
ret <2 x i64> %tmp1
}
define <8 x i8> @and8imm4h_lsl0(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: and8imm4h_lsl0:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = and <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <8 x i8> %tmp1
}
define <8 x i8> @and8imm4h_lsl8(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: and8imm4h_lsl8:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = and <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <8 x i8> %tmp1
}
define <2 x i32> @and16imm4h_lsl0(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: and16imm4h_lsl0:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = and <2 x i32> %a, < i32 4278255360, i32 4278255360>
ret <2 x i32> %tmp1
}
define <2 x i32> @and16imm4h_lsl8(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: and16imm4h_lsl8:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = and <2 x i32> %a, < i32 16711935, i32 16711935>
ret <2 x i32> %tmp1
}
define <1 x i64> @and64imm4h_lsl0(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: and64imm4h_lsl0:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = and <1 x i64> %a, < i64 -71777214294589696>
ret <1 x i64> %tmp1
}
define <1 x i64> @and64imm4h_lsl8(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: and64imm4h_lsl8:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = and <1 x i64> %a, < i64 71777214294589695>
ret <1 x i64> %tmp1
}
define <16 x i8> @and8imm8h_lsl0(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: and8imm8h_lsl0:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = and <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255 >
ret <16 x i8> %tmp1
}
define <16 x i8> @and8imm8h_lsl8(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: and8imm8h_lsl8:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = and <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0 >
ret <16 x i8> %tmp1
}
define <4 x i32> @and16imm8h_lsl0(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: and16imm8h_lsl0:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = and <4 x i32> %a, < i32 4278255360, i32 4278255360, i32 4278255360, i32 4278255360>
ret <4 x i32> %tmp1
}
define <4 x i32> @and16imm8h_lsl8(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: and16imm8h_lsl8:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = and <4 x i32> %a, < i32 16711935, i32 16711935, i32 16711935, i32 16711935>
ret <4 x i32> %tmp1
}
define <2 x i64> @and64imm8h_lsl0(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: and64imm8h_lsl0:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = and <2 x i64> %a, < i64 -71777214294589696, i64 -71777214294589696>
ret <2 x i64> %tmp1
}
define <2 x i64> @and64imm8h_lsl8(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: and64imm8h_lsl8:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = and <2 x i64> %a, < i64 71777214294589695, i64 71777214294589695>
ret <2 x i64> %tmp1
}
define <8 x i8> @orr8imm2s_lsl0(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: orr8imm2s_lsl0:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = or <8 x i8> %a, < i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0>
ret <8 x i8> %tmp1
}
define <8 x i8> @orr8imm2s_lsl8(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: orr8imm2s_lsl8:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0>
ret <8 x i8> %tmp1
}
define <8 x i8> @orr8imm2s_lsl16(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: orr8imm2s_lsl16:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = or <8 x i8> %a, < i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0>
ret <8 x i8> %tmp1
}
define <8 x i8> @orr8imm2s_lsl24(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: orr8imm2s_lsl24:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #24
%tmp1 = or <8 x i8> %a, < i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255>
ret <8 x i8> %tmp1
}
define <4 x i16> @orr16imm2s_lsl0(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: orr16imm2s_lsl0:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = or <4 x i16> %a, < i16 255, i16 0, i16 255, i16 0>
ret <4 x i16> %tmp1
}
define <4 x i16> @orr16imm2s_lsl8(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: orr16imm2s_lsl8:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = or <4 x i16> %a, < i16 65280, i16 0, i16 65280, i16 0>
ret <4 x i16> %tmp1
}
define <4 x i16> @orr16imm2s_lsl16(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: orr16imm2s_lsl16:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = or <4 x i16> %a, < i16 0, i16 255, i16 0, i16 255>
ret <4 x i16> %tmp1
}
define <4 x i16> @orr16imm2s_lsl24(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: orr16imm2s_lsl24:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #24
%tmp1 = or <4 x i16> %a, < i16 0, i16 65280, i16 0, i16 65280>
ret <4 x i16> %tmp1
}
define <1 x i64> @orr64imm2s_lsl0(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: orr64imm2s_lsl0:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = or <1 x i64> %a, < i64 1095216660735>
ret <1 x i64> %tmp1
}
define <1 x i64> @orr64imm2s_lsl8(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: orr64imm2s_lsl8:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = or <1 x i64> %a, < i64 280375465148160>
ret <1 x i64> %tmp1
}
define <1 x i64> @orr64imm2s_lsl16(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: orr64imm2s_lsl16:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = or <1 x i64> %a, < i64 71776119077928960>
ret <1 x i64> %tmp1
}
define <1 x i64> @orr64imm2s_lsl24(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: orr64imm2s_lsl24:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #24
%tmp1 = or <1 x i64> %a, < i64 -72057589759737856>
ret <1 x i64> %tmp1
}
define <16 x i8> @orr8imm4s_lsl0(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: orr8imm4s_lsl0:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = or <16 x i8> %a, < i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0>
ret <16 x i8> %tmp1
}
define <16 x i8> @orr8imm4s_lsl8(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: orr8imm4s_lsl8:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0>
ret <16 x i8> %tmp1
}
define <16 x i8> @orr8imm4s_lsl16(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: orr8imm4s_lsl16:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = or <16 x i8> %a, < i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0>
ret <16 x i8> %tmp1
}
define <16 x i8> @orr8imm4s_lsl24(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: orr8imm4s_lsl24:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #24
%tmp1 = or <16 x i8> %a, < i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255>
ret <16 x i8> %tmp1
}
define <8 x i16> @orr16imm4s_lsl0(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: orr16imm4s_lsl0:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = or <8 x i16> %a, < i16 255, i16 0, i16 255, i16 0, i16 255, i16 0, i16 255, i16 0>
ret <8 x i16> %tmp1
}
define <8 x i16> @orr16imm4s_lsl8(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: orr16imm4s_lsl8:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = or <8 x i16> %a, < i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0>
ret <8 x i16> %tmp1
}
define <8 x i16> @orr16imm4s_lsl16(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: orr16imm4s_lsl16:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = or <8 x i16> %a, < i16 0, i16 255, i16 0, i16 255, i16 0, i16 255, i16 0, i16 255>
ret <8 x i16> %tmp1
}
define <8 x i16> @orr16imm4s_lsl24(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: orr16imm4s_lsl24:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #24
%tmp1 = or <8 x i16> %a, < i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280>
ret <8 x i16> %tmp1
}
define <2 x i64> @orr64imm4s_lsl0(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: orr64imm4s_lsl0:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = or <2 x i64> %a, < i64 1095216660735, i64 1095216660735>
ret <2 x i64> %tmp1
}
define <2 x i64> @orr64imm4s_lsl8(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: orr64imm4s_lsl8:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = or <2 x i64> %a, < i64 280375465148160, i64 280375465148160>
ret <2 x i64> %tmp1
}
define <2 x i64> @orr64imm4s_lsl16(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: orr64imm4s_lsl16:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = or <2 x i64> %a, < i64 71776119077928960, i64 71776119077928960>
ret <2 x i64> %tmp1
}
define <2 x i64> @orr64imm4s_lsl24(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: orr64imm4s_lsl24:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #24
%tmp1 = or <2 x i64> %a, < i64 -72057589759737856, i64 -72057589759737856>
ret <2 x i64> %tmp1
}
define <8 x i8> @orr8imm4h_lsl0(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orr8imm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = or <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <8 x i8> %tmp1
}
define <8 x i8> @orr8imm4h_lsl8(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orr8imm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <8 x i8> %tmp1
}
define <2 x i32> @orr16imm4h_lsl0(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orr16imm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = or <2 x i32> %a, < i32 16711935, i32 16711935>
ret <2 x i32> %tmp1
}
define <2 x i32> @orr16imm4h_lsl8(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orr16imm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = or <2 x i32> %a, < i32 4278255360, i32 4278255360>
ret <2 x i32> %tmp1
}
define <1 x i64> @orr64imm4h_lsl0(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orr64imm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = or <1 x i64> %a, < i64 71777214294589695>
ret <1 x i64> %tmp1
}
define <1 x i64> @orr64imm4h_lsl8(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orr64imm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = or <1 x i64> %a, < i64 -71777214294589696>
ret <1 x i64> %tmp1
}
define <16 x i8> @orr8imm8h_lsl0(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orr8imm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = or <16 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <16 x i8> %tmp1
}
define <16 x i8> @orr8imm8h_lsl8(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orr8imm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <16 x i8> %tmp1
}
define <4 x i32> @orr16imm8h_lsl0(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orr16imm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = or <4 x i32> %a, < i32 16711935, i32 16711935, i32 16711935, i32 16711935>
ret <4 x i32> %tmp1
}
define <4 x i32> @orr16imm8h_lsl8(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orr16imm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = or <4 x i32> %a, < i32 4278255360, i32 4278255360, i32 4278255360, i32 4278255360>
ret <4 x i32> %tmp1
}
define <2 x i64> @orr64imm8h_lsl0(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orr64imm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = or <2 x i64> %a, < i64 71777214294589695, i64 71777214294589695>
ret <2 x i64> %tmp1
}
define <2 x i64> @orr64imm8h_lsl8(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orr64imm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = or <2 x i64> %a, < i64 -71777214294589696, i64 -71777214294589696>
ret <2 x i64> %tmp1
}
diff --git a/test/CodeGen/AArch64/neon-bsl.ll b/test/CodeGen/AArch64/neon-bsl.ll
deleted file mode 100644
index c55fd01..0000000
--- a/test/CodeGen/AArch64/neon-bsl.ll
+++ /dev/null
@@ -1,235 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double>, <2 x double>, <2 x double>)
-
-declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
-
-declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
-
-declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-
-declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
-
-declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
-
-declare <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double>, <1 x double>, <1 x double>)
-
-declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>)
-
-declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>)
-
-declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
-
-define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-; CHECK-LABEL: test_vbsl_s8:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-; CHECK-LABEL: test_vbsl_s16:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
- %0 = bitcast <4 x i16> %vbsl3.i to <8 x i8>
- ret <8 x i8> %0
-}
-
-define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
-; CHECK-LABEL: test_vbsl_s32:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3)
- ret <2 x i32> %vbsl3.i
-}
-
-define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
-; CHECK-LABEL: test_vbsl_s64:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3)
- ret <1 x i64> %vbsl3.i
-}
-
-define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-; CHECK-LABEL: test_vbsl_u8:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
- ret <8 x i8> %vbsl.i
-}
-
-define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-; CHECK-LABEL: test_vbsl_u16:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
- ret <4 x i16> %vbsl3.i
-}
-
-define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
-; CHECK-LABEL: test_vbsl_u32:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3)
- ret <2 x i32> %vbsl3.i
-}
-
-define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
-; CHECK-LABEL: test_vbsl_u64:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3)
- ret <1 x i64> %vbsl3.i
-}
-
-define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) {
-; CHECK-LABEL: test_vbsl_f32:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3)
- ret <2 x float> %vbsl3.i
-}
-
-define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) {
-; CHECK-LABEL: test_vbsl_f64:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl.i = bitcast <1 x i64> %v1 to <1 x double>
- %vbsl3.i = tail call <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double> %vbsl.i, <1 x double> %v2, <1 x double> %v3)
- ret <1 x double> %vbsl3.i
-}
-
-define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-; CHECK-LABEL: test_vbsl_p8:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
- ret <8 x i8> %vbsl.i
-}
-
-define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-; CHECK-LABEL: test_vbsl_p16:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
- ret <4 x i16> %vbsl3.i
-}
-
-define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: test_vbslq_s8:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
- ret <16 x i8> %vbsl.i
-}
-
-define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-; CHECK-LABEL: test_vbslq_s16:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
- ret <8 x i16> %vbsl3.i
-}
-
-define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; CHECK-LABEL: test_vbslq_s32:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3)
- ret <4 x i32> %vbsl3.i
-}
-
-define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
-; CHECK-LABEL: test_vbslq_s64:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3)
- ret <2 x i64> %vbsl3.i
-}
-
-define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: test_vbslq_u8:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
- ret <16 x i8> %vbsl.i
-}
-
-define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-; CHECK-LABEL: test_vbslq_u16:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
- ret <8 x i16> %vbsl3.i
-}
-
-define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; CHECK-LABEL: test_vbslq_u32:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3)
- ret <4 x i32> %vbsl3.i
-}
-
-define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
-; CHECK-LABEL: test_vbslq_u64:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3)
- ret <2 x i64> %vbsl3.i
-}
-
-define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) {
-; CHECK-LABEL: test_vbslq_f32:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = bitcast <4 x i32> %v1 to <4 x float>
- %vbsl3.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %vbsl.i, <4 x float> %v2, <4 x float> %v3)
- ret <4 x float> %vbsl3.i
-}
-
-define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: test_vbslq_p8:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
- ret <16 x i8> %vbsl.i
-}
-
-define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-; CHECK-LABEL: test_vbslq_p16:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
- ret <8 x i16> %vbsl3.i
-}
-
-define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) {
-; CHECK-LABEL: test_vbslq_f64:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = bitcast <2 x i64> %v1 to <2 x double>
- %vbsl3.i = tail call <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double> %vbsl.i, <2 x double> %v2, <2 x double> %v3)
- ret <2 x double> %vbsl3.i
-}
-
-define <2 x double> @test_bsl_v2f64(<2 x i1> %v1, <2 x double> %v2, <2 x double> %v3) {
-; CHECK-LABEL: test_bsl_v2f64:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
- %1 = select <2 x i1> %v1, <2 x double> %v2, <2 x double> %v3
- ret <2 x double> %1
-}
-
-define <4 x float> @test_bsl_v4f32(<4 x i1> %v1, <4 x float> %v2, <4 x float> %v3) {
-; CHECK-LABEL: test_bsl_v4f32:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
- %1 = select <4 x i1> %v1, <4 x float> %v2, <4 x float> %v3
- ret <4 x float> %1
-}
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
index 68f0342..6d89dfb 100644
--- a/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1,560 +1,631 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmeq8xi8:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp eq <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmeq16xi8:
+; CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp eq <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmeq4xi16:
+; CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp eq <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmeq8xi16:
+; CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp eq <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmeq2xi32:
+; CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp eq <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmeq4xi32:
+; CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp eq <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmeq2xi64:
+; CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp eq <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmne8xi8:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmne16xi8:
+; CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmne4xi16:
+; CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmne8xi16:
+; CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmne2xi32:
+; CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmne4xi32:
+; CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmne2xi64:
+; CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmgt8xi8:
+; CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp sgt <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmgt16xi8:
+; CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp sgt <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmgt4xi16:
+; CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp sgt <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmgt8xi16:
+; CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp sgt <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmgt2xi32:
+; CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp sgt <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmgt4xi32:
+; CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp sgt <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmgt2xi64:
+; CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp sgt <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: cmlt8xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp slt <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: cmlt16xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp slt <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: cmlt4xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp slt <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: cmlt8xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp slt <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: cmlt2xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp slt <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: cmlt4xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp slt <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: cmlt2xi64:
; Using registers other than v0 and v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp slt <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmge8xi8:
+; CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp sge <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmge16xi8:
+; CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp sge <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmge4xi16:
+; CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp sge <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmge8xi16:
+; CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp sge <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmge2xi32:
+; CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp sge <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmge4xi32:
+; CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp sge <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmge2xi64:
+; CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp sge <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: cmle8xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp sle <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: cmle16xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp sle <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: cmle4xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp sle <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: cmle8xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp sle <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: cmle2xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp sle <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: cmle4xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp sle <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: cmle2xi64:
; Using registers other than v0 and v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp sle <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmhi8xi8:
+; CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ugt <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmhi16xi8:
+; CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ugt <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmhi4xi16:
+; CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp ugt <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmhi8xi16:
+; CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp ugt <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmhi2xi32:
+; CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp ugt <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmhi4xi32:
+; CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp ugt <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmhi2xi64:
+; CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp ugt <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: cmlo8xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp ult <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: cmlo16xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp ult <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: cmlo4xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp ult <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: cmlo8xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp ult <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: cmlo2xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp ult <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: cmlo4xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp ult <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: cmlo2xi64:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp ult <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmhs8xi8:
+; CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp uge <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmhs16xi8:
+; CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp uge <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmhs4xi16:
+; CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp uge <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmhs8xi16:
+; CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp uge <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmhs2xi32:
+; CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp uge <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmhs4xi32:
+; CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp uge <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmhs2xi64:
+; CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp uge <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: cmls8xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp ule <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: cmls16xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp ule <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: cmls4xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp ule <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: cmls8xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp ule <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: cmls2xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp ule <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: cmls4xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp ule <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: cmls2xi64:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp ule <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmtst8xi8:
+; CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = and <8 x i8> %A, %B
%tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
%tmp5 = sext <8 x i1> %tmp4 to <8 x i8>
@@ -562,7 +633,8 @@ define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) {
}
define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmtst16xi8:
+; CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = and <16 x i8> %A, %B
%tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer
%tmp5 = sext <16 x i1> %tmp4 to <16 x i8>
@@ -570,7 +642,8 @@ define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) {
}
define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmtst4xi16:
+; CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = and <4 x i16> %A, %B
%tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer
%tmp5 = sext <4 x i1> %tmp4 to <4 x i16>
@@ -578,7 +651,8 @@ define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) {
}
define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmtst8xi16:
+; CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = and <8 x i16> %A, %B
%tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer
%tmp5 = sext <8 x i1> %tmp4 to <8 x i16>
@@ -586,7 +660,8 @@ define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) {
}
define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmtst2xi32:
+; CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = and <2 x i32> %A, %B
%tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer
%tmp5 = sext <2 x i1> %tmp4 to <2 x i32>
@@ -594,7 +669,8 @@ define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) {
}
define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmtst4xi32:
+; CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = and <4 x i32> %A, %B
%tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer
%tmp5 = sext <4 x i1> %tmp4 to <4 x i32>
@@ -602,7 +678,8 @@ define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) {
}
define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmtst2xi64:
+; CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = and <2 x i64> %A, %B
%tmp4 = icmp ne <2 x i64> %tmp3, zeroinitializer
%tmp5 = sext <2 x i1> %tmp4 to <2 x i64>
@@ -612,49 +689,56 @@ define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) {
define <8 x i8> @cmeqz8xi8(<8 x i8> %A) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmeqz8xi8:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp eq <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmeqz16xi8(<16 x i8> %A) {
-;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmeqz16xi8:
+; CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp eq <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmeqz4xi16(<4 x i16> %A) {
-;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmeqz4xi16:
+; CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp eq <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmeqz8xi16(<8 x i16> %A) {
-;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmeqz8xi16:
+; CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp eq <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmeqz2xi32(<2 x i32> %A) {
-;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmeqz2xi32:
+; CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp eq <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmeqz4xi32(<4 x i32> %A) {
-;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmeqz4xi32:
+; CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp eq <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
-;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmeqz2xi64:
+; CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp eq <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -662,49 +746,56 @@ define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
define <8 x i8> @cmgez8xi8(<8 x i8> %A) {
-;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmgez8xi8:
+; CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp sge <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmgez16xi8(<16 x i8> %A) {
-;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmgez16xi8:
+; CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp sge <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmgez4xi16(<4 x i16> %A) {
-;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmgez4xi16:
+; CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp sge <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmgez8xi16(<8 x i16> %A) {
-;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmgez8xi16:
+; CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp sge <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmgez2xi32(<2 x i32> %A) {
-;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmgez2xi32:
+; CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp sge <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmgez4xi32(<4 x i32> %A) {
-;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmgez4xi32:
+; CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp sge <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
-;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmgez2xi64:
+; CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp sge <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -712,259 +803,294 @@ define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
define <8 x i8> @cmgtz8xi8(<8 x i8> %A) {
-;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmgtz8xi8:
+; CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp sgt <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmgtz16xi8(<16 x i8> %A) {
-;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmgtz16xi8:
+; CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp sgt <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmgtz4xi16(<4 x i16> %A) {
-;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmgtz4xi16:
+; CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp sgt <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmgtz8xi16(<8 x i16> %A) {
-;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmgtz8xi16:
+; CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp sgt <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmgtz2xi32(<2 x i32> %A) {
-;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmgtz2xi32:
+; CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp sgt <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmgtz4xi32(<4 x i32> %A) {
-;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmgtz4xi32:
+; CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp sgt <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmgtz2xi64(<2 x i64> %A) {
-;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmgtz2xi64:
+; CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp sgt <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmlez8xi8(<8 x i8> %A) {
-;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmlez8xi8:
+; CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp sle <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmlez16xi8(<16 x i8> %A) {
-;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmlez16xi8:
+; CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp sle <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmlez4xi16(<4 x i16> %A) {
-;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmlez4xi16:
+; CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp sle <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmlez8xi16(<8 x i16> %A) {
-;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmlez8xi16:
+; CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp sle <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmlez2xi32(<2 x i32> %A) {
-;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmlez2xi32:
+; CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp sle <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmlez4xi32(<4 x i32> %A) {
-;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmlez4xi32:
+; CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp sle <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmlez2xi64(<2 x i64> %A) {
-;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmlez2xi64:
+; CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp sle <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmltz8xi8(<8 x i8> %A) {
-;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmltz8xi8:
+; CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp slt <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmltz16xi8(<16 x i8> %A) {
-;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmltz16xi8:
+; CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp slt <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmltz4xi16(<4 x i16> %A) {
-;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmltz4xi16:
+; CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp slt <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmltz8xi16(<8 x i16> %A) {
-;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmltz8xi16:
+; CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp slt <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmltz2xi32(<2 x i32> %A) {
-;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmltz2xi32:
+; CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp slt <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmltz4xi32(<4 x i32> %A) {
-;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmltz4xi32:
+; CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp slt <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
-;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmltz2xi64:
+; CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp slt <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmneqz8xi8:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
-;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmneqz16xi8:
+; CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
-;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmneqz4xi16:
+; CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
-;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmneqz8xi16:
+; CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
-;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmneqz2xi32:
+; CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
-;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmneqz4xi32:
+; CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
-;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmneqz2xi64:
+; CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmhsz8xi8(<8 x i8> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmhsz8xi8:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp uge <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmhsz16xi8(<16 x i8> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmhsz16xi8:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp uge <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmhsz4xi16(<4 x i16> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmhsz4xi16:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp uge <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmhsz8xi16(<8 x i16> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmhsz8xi16:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp uge <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmhsz2xi32(<2 x i32> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmhsz2xi32:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp uge <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmhsz4xi32(<4 x i32> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmhsz4xi32:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp uge <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmhsz2xi64:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp uge <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -972,196 +1098,217 @@ define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
define <8 x i8> @cmhiz8xi8(<8 x i8> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmhiz8xi8:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ugt <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmhiz16xi8(<16 x i8> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmhiz16xi8:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ugt <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmhiz4xi16(<4 x i16> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmhiz4xi16:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp ugt <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmhiz8xi16(<8 x i16> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmhiz8xi16:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp ugt <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmhiz2xi32(<2 x i32> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmhiz2xi32:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp ugt <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmhiz4xi32(<4 x i32> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmhiz4xi32:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp ugt <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmhiz2xi64:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp ugt <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
+; CHECK-LABEL: cmlsz8xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
+; CHECK-LABEL: cmlsz16xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp ule <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
+; CHECK-LABEL: cmlsz4xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
+; CHECK-LABEL: cmlsz8xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp ule <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
+; CHECK-LABEL: cmlsz2xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmlsz4xi32(<4 x i32> %A) {
+; CHECK-LABEL: cmlsz4xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp ule <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmlsz2xi64(<2 x i64> %A) {
+; CHECK-LABEL: cmlsz2xi64:
; Using registers other than v0 and v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp ule <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmloz8xi8(<8 x i8> %A) {
+; CHECK-LABEL: cmloz8xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, {{v[0-9]+}}.8b
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ult <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmloz16xi8(<16 x i8> %A) {
+; CHECK-LABEL: cmloz16xi8:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp ult <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmloz4xi16(<4 x i16> %A) {
+; CHECK-LABEL: cmloz4xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp ult <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmloz8xi16(<8 x i16> %A) {
+; CHECK-LABEL: cmloz8xi16:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp ult <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmloz2xi32(<2 x i32> %A) {
+; CHECK-LABEL: cmloz2xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp ult <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmloz4xi32(<4 x i32> %A) {
+; CHECK-LABEL: cmloz4xi32:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp ult <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
+; CHECK-LABEL: cmloz2xi64:
; Using registers other than v0 and v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp ult <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1169,144 +1316,162 @@ define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
define <2 x i32> @fcmoeq2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: fcmoeq2xfloat:
+; CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fcmp oeq <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmoeq4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: fcmoeq4xfloat:
+; CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fcmp oeq <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmoeq2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: fcmoeq2xdouble:
+; CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fcmp oeq <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmoge2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: fcmoge2xfloat:
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fcmp oge <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmoge4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: fcmoge4xfloat:
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fcmp oge <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmoge2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: fcmoge2xdouble:
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fcmp oge <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmogt2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: fcmogt2xfloat:
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fcmp ogt <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmogt4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: fcmogt4xfloat:
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fcmp ogt <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmogt2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: fcmogt2xdouble:
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fcmp ogt <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmole2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmole2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; OLE implemented as OGE, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = fcmp ole <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmole4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmole4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; OLE implemented as OGE, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = fcmp ole <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmole2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmole2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; OLE implemented as OGE, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = fcmp ole <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmolt2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmolt2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = fcmp olt <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmolt4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmolt4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = fcmp olt <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmolt2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmolt2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = fcmp olt <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmone2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmone2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp one <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmone4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmone4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp one <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmone2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; todo check reversed operands
%tmp3 = fcmp one <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
@@ -1315,11 +1480,12 @@ define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmord2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ord <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1327,22 +1493,24 @@ define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
define <4 x i32> @fcmord4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmord4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ord <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmord2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ord <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1350,236 +1518,260 @@ define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) {
define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmuno2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
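+; The bitwise inversion may be printed as mvn or not, so accept either mnemonic.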
+; CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uno <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmuno4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmuno2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmueq2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ueq <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmueq4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmueq2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmuge2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uge <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmuge4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmuge2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmugt2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UGT = ULT with swapped operands, ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ugt <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmugt4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UGT = ULT with swapped operands, ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: fcmugt2xdouble:
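+; Using registers other than v0 and v1 is possible, but would be odd.
+; UGT = ULT with swapped operands, ULT implemented as !OGE.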
+; CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmule2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ule <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmule4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmule2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmult2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ult <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmult4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmult2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmune2xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UNE = !OEQ.
-;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp une <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmune4xfloat:
; Using registers other than v0 and v1 is possible, but would be odd.
; UNE = !OEQ.
-;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmune2xdouble:
; Using registers other than v0 and v1 is possible, but would be odd.
; UNE = !OEQ.
-;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmoeqz2xfloat(<2 x float> %A) {
-;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmoeqz2xfloat:
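+; The zero operand may be printed as #0.0 or #0, so accept either form.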
+; CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp oeq <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmoeqz4xfloat(<4 x float> %A) {
-;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmoeqz4xfloat:
+; CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp oeq <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) {
-;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmoeqz2xdouble:
+; CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp oeq <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1587,250 +1779,280 @@ define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) {
define <2 x i32> @fcmogez2xfloat(<2 x float> %A) {
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmogez2xfloat:
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp oge <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmogez4xfloat(<4 x float> %A) {
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmogez4xfloat:
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp oge <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmogez2xdouble(<2 x double> %A) {
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmogez2xdouble:
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp oge <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmogtz2xfloat(<2 x float> %A) {
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmogtz2xfloat:
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp ogt <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmogtz4xfloat(<4 x float> %A) {
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmogtz4xfloat:
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp ogt <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmogtz2xdouble(<2 x double> %A) {
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmogtz2xdouble:
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp ogt <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmoltz2xfloat(<2 x float> %A) {
-;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmoltz2xfloat:
+; CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp olt <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmoltz4xfloat(<4 x float> %A) {
-;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmoltz4xfloat:
+; CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp olt <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmoltz2xdouble(<2 x double> %A) {
-;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmoltz2xdouble:
+; CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp olt <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmolez2xfloat(<2 x float> %A) {
-;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmolez2xfloat:
+; CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp ole <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmolez4xfloat(<4 x float> %A) {
-;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmolez4xfloat:
+; CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp ole <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmolez2xdouble(<2 x double> %A) {
-;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmolez2xdouble:
+; CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp ole <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmonez2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmonez2xfloat:
; ONE with zero = OLT | OGT
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp one <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmonez4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmonez4xfloat:
; ONE with zero = OLT | OGT
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp one <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmonez2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmonez2xdouble:
; ONE with zero = OLT | OGT
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp one <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmordz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmordz2xfloat:
; ORD with zero = OLT | OGE
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ord <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmordz4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmordz4xfloat:
; ORD with zero = OLT | OGE
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ord <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmordz2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmordz2xdouble:
; ORD with zero = OLT | OGE
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ord <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmueqz2xfloat:
; UEQ with zero = !ONE = !(OLT | OGT)
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ueq <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmueqz4xfloat:
; UEQ with zero = !ONE = !(OLT | OGT)
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmueqz2xdouble:
; UEQ with zero = !ONE = !(OLT | OGT)
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmugez2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmugez2xfloat:
; UGE with zero = !OLT
-;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uge <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmugez4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmugez4xfloat:
; UGE with zero = !OLT
-;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmugez2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmugez2xdouble:
; UGE with zero = !OLT
-;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmugtz2xfloat:
; UGT with zero = !OLE
-;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ugt <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmugtz4xfloat:
; UGT with zero = !OLE
-;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmugtz2xdouble:
; UGT with zero = !OLE
-;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmultz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmultz2xfloat:
; ULT with zero = !OGE
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ult <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmultz4xfloat(<4 x float> %A) {
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: fcmultz4xfloat:
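+; ULT with zero = !OGE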
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: fcmultz2xdouble:
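+; ULT with zero = !OGE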
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1838,53 +2060,59 @@ define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
define <2 x i32> @fcmulez2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmulez2xfloat:
; ULE with zero = !OGT
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ule <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmulez4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmulez4xfloat:
; ULE with zero = !OGT
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmulez2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmulez2xdouble:
; ULE with zero = !OGT
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmunez2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmunez2xfloat:
; UNE with zero = !OEQ with zero
-;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp une <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmunez4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmunez4xfloat:
; UNE with zero = !OEQ with zero
-;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmunez2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmunez2xdouble:
; UNE with zero = !OEQ with zero
-;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1892,33 +2120,36 @@ define <2 x i64> @fcmunez2xdouble(<2 x double> %A) {
define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmunoz2xfloat:
; UNO with zero = !ORD = !(OLT | OGE)
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uno <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmunoz4xfloat:
; UNO with zero = !ORD = !(OLT | OGE)
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmunoz2xdouble:
; UNO with zero = !ORD = !(OLT | OGE)
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
diff --git a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
deleted file mode 100644
index 4dffcd1..0000000
--- a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define <4 x i32> @copyTuple.QPair(i8* %a, i8* %b) {
-; CHECK-LABEL: copyTuple.QPair:
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
- %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i32 0, i32 4)
- %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
- %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 1, i32 4)
- %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
- ret <4 x i32> %vld1.fca.0.extract
-}
-
-define <4 x i32> @copyTuple.QTriple(i8* %a, i8* %b, <4 x i32> %c) {
-; CHECK-LABEL: copyTuple.QTriple:
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
- %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
- %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
- %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i32 1, i32 4)
- %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
- ret <4 x i32> %vld1.fca.0.extract
-}
-
-define <4 x i32> @copyTuple.QQuad(i8* %a, i8* %b, <4 x i32> %c) {
-; CHECK-LABEL: copyTuple.QQuad:
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
- %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
- %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
- %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 1, i32 4)
- %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
- ret <4 x i32> %vld1.fca.0.extract
-}
-
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-crypto.ll b/test/CodeGen/AArch64/neon-crypto.ll
deleted file mode 100644
index c0014fa..0000000
--- a/test/CodeGen/AArch64/neon-crypto.ll
+++ /dev/null
@@ -1,144 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s
-; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
-
-declare <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32>, <4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32>, <4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha256h(<4 x i32>, <4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32>, <4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1m(<4 x i32>, i32, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1p(<4 x i32>, i32, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1c(<4 x i32>, i32, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32>, <4 x i32>) #1
-
-declare i32 @llvm.arm.neon.sha1h(i32) #1
-
-declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8>) #1
-
-declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8>) #1
-
-declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>) #1
-
-declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>) #1
-
-define <16 x i8> @test_vaeseq_u8(<16 x i8> %data, <16 x i8> %key) {
-; CHECK: test_vaeseq_u8:
-; CHECK: aese {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-; CHECK-NO-CRYPTO: Cannot select: intrinsic %llvm.arm.neon.aese
-entry:
- %aese.i = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data, <16 x i8> %key)
- ret <16 x i8> %aese.i
-}
-
-define <16 x i8> @test_vaesdq_u8(<16 x i8> %data, <16 x i8> %key) {
-; CHECK: test_vaesdq_u8:
-; CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data, <16 x i8> %key)
- ret <16 x i8> %aesd.i
-}
-
-define <16 x i8> @test_vaesmcq_u8(<16 x i8> %data) {
-; CHECK: test_vaesmcq_u8:
-; CHECK: aesmc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %data)
- ret <16 x i8> %aesmc.i
-}
-
-define <16 x i8> @test_vaesimcq_u8(<16 x i8> %data) {
-; CHECK: test_vaesimcq_u8:
-; CHECK: aesimc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %data)
- ret <16 x i8> %aesimc.i
-}
-
-define i32 @test_vsha1h_u32(i32 %hash_e) {
-; CHECK: test_vsha1h_u32:
-; CHECK: sha1h {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %sha1h1.i = tail call i32 @llvm.arm.neon.sha1h(i32 %hash_e)
- ret i32 %sha1h1.i
-}
-
-define <4 x i32> @test_vsha1su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w12_15) {
-; CHECK: test_vsha1su1q_u32:
-; CHECK: sha1su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32> %tw0_3, <4 x i32> %w12_15)
- ret <4 x i32> %sha1su12.i
-}
-
-define <4 x i32> @test_vsha256su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7) {
-; CHECK: test_vsha256su0q_u32:
-; CHECK: sha256su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
- ret <4 x i32> %sha256su02.i
-}
-
-define <4 x i32> @test_vsha1cq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK: test_vsha1cq_u32:
-; CHECK: sha1c {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha1c1.i = tail call <4 x i32> @llvm.arm.neon.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
- ret <4 x i32> %sha1c1.i
-}
-
-define <4 x i32> @test_vsha1pq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK: test_vsha1pq_u32:
-; CHECK: sha1p {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha1p1.i = tail call <4 x i32> @llvm.arm.neon.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
- ret <4 x i32> %sha1p1.i
-}
-
-define <4 x i32> @test_vsha1mq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK: test_vsha1mq_u32:
-; CHECK: sha1m {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha1m1.i = tail call <4 x i32> @llvm.arm.neon.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
- ret <4 x i32> %sha1m1.i
-}
-
-define <4 x i32> @test_vsha1su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) {
-; CHECK: test_vsha1su0q_u32:
-; CHECK: sha1su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11)
- ret <4 x i32> %sha1su03.i
-}
-
-define <4 x i32> @test_vsha256hq_u32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
-; CHECK: test_vsha256hq_u32:
-; CHECK: sha256h {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
- ret <4 x i32> %sha256h3.i
-}
-
-define <4 x i32> @test_vsha256h2q_u32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
-; CHECK: test_vsha256h2q_u32:
-; CHECK: sha256h2 {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
- ret <4 x i32> %sha256h23.i
-}
-
-define <4 x i32> @test_vsha256su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
-; CHECK: test_vsha256su1q_u32:
-; CHECK: sha256su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
- ret <4 x i32> %sha256su13.i
-}
-
diff --git a/test/CodeGen/AArch64/neon-diagnostics.ll b/test/CodeGen/AArch64/neon-diagnostics.ll
index f546aa7..099b685 100644
--- a/test/CodeGen/AArch64/neon-diagnostics.ll
+++ b/test/CodeGen/AArch64/neon-diagnostics.ll
@@ -21,4 +21,4 @@ define <4 x i32> @test_vshrn_not_match(<2 x i32> %a, <2 x i64> %b) {
%shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
%4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
ret <4 x i32> %4
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/AArch64/neon-extract.ll b/test/CodeGen/AArch64/neon-extract.ll
index cddc226..f270b54 100644
--- a/test/CodeGen/AArch64/neon-extract.ll
+++ b/test/CodeGen/AArch64/neon-extract.ll
@@ -1,221 +1,221 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vext_s8:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+; CHECK-LABEL: test_vext_s8:
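+; The ext immediate may be printed in hex or decimal, so accept either form.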
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x2|2}}
entry:
%vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x i8> %vext
}
define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vext_s16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+; CHECK-LABEL: test_vext_s16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x6|6}}
entry:
%vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
ret <4 x i16> %vext
}
define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vext_s32:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+; CHECK-LABEL: test_vext_s32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
entry:
%vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i32> %vext
}
define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK: test_vext_s64:
+; CHECK-LABEL: test_vext_s64:
entry:
%vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0>
ret <1 x i64> %vext
}
define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vextq_s8:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+; CHECK-LABEL: test_vextq_s8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x2|2}}
entry:
%vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
ret <16 x i8> %vext
}
define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vextq_s16:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_vextq_s16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
}
define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vextq_s32:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+; CHECK-LABEL: test_vextq_s32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x4|4}}
entry:
%vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x i32> %vext
}
define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vextq_s64:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+; CHECK-LABEL: test_vextq_s64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x8|8}}
entry:
%vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %vext
}
define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vext_u8:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+; CHECK-LABEL: test_vext_u8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x2|2}}
entry:
%vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x i8> %vext
}
define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vext_u16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+; CHECK-LABEL: test_vext_u16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x6|6}}
entry:
%vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
ret <4 x i16> %vext
}
define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vext_u32:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+; CHECK-LABEL: test_vext_u32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
entry:
%vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i32> %vext
}
define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK: test_vext_u64:
+; CHECK-LABEL: test_vext_u64:
entry:
%vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0>
ret <1 x i64> %vext
}
define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vextq_u8:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+; CHECK-LABEL: test_vextq_u8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x2|2}}
entry:
%vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
ret <16 x i8> %vext
}
define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vextq_u16:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_vextq_u16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
}
define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vextq_u32:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+; CHECK-LABEL: test_vextq_u32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x4|4}}
entry:
%vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x i32> %vext
}
define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vextq_u64:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+; CHECK-LABEL: test_vextq_u64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x8|8}}
entry:
%vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %vext
}
define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vext_f32:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+; CHECK-LABEL: test_vext_f32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
entry:
%vext = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 2>
ret <2 x float> %vext
}
define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) {
-; CHECK: test_vext_f64:
+; CHECK-LABEL: test_vext_f64:
entry:
%vext = shufflevector <1 x double> %a, <1 x double> %b, <1 x i32> <i32 0>
ret <1 x double> %vext
}
define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vextq_f32:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+; CHECK-LABEL: test_vextq_f32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x4|4}}
entry:
%vext = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x float> %vext
}
define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vextq_f64:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+; CHECK-LABEL: test_vextq_f64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x8|8}}
entry:
%vext = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
ret <2 x double> %vext
}
define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vext_p8:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+; CHECK-LABEL: test_vext_p8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x2|2}}
entry:
%vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x i8> %vext
}
define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vext_p16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+; CHECK-LABEL: test_vext_p16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x6|6}}
entry:
%vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
ret <4 x i16> %vext
}
define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vextq_p8:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+; CHECK-LABEL: test_vextq_p8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x2|2}}
entry:
%vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
ret <16 x i8> %vext
}
define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vextq_p16:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_vextq_p16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
}
define <8 x i8> @test_undef_vext_s8(<8 x i8> %a) {
-; CHECK: test_undef_vext_s8:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+; CHECK-LABEL: test_undef_vext_s8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x2|2}}
entry:
%vext = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 10, i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x i8> %vext
}
define <16 x i8> @test_undef_vextq_s8(<16 x i8> %a) {
-; CHECK: test_undef_vextq_s8:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_undef_vextq_s8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 20, i32 20, i32 20, i32 20, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 20, i32 20, i32 20, i32 20, i32 20>
ret <16 x i8> %vext
}
define <4 x i16> @test_undef_vext_s16(<4 x i16> %a) {
-; CHECK: test_undef_vext_s16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+; CHECK-LABEL: test_undef_vext_s16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
entry:
%vext = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x i16> %vext
}
define <8 x i16> @test_undef_vextq_s16(<8 x i16> %a) {
-; CHECK: test_undef_vextq_s16:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_undef_vextq_s16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll
deleted file mode 100644
index 28e8212..0000000
--- a/test/CodeGen/AArch64/neon-facge-facgt.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float>, <2 x float>)
-declare <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float>, <4 x float>)
-declare <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double>, <2 x double>)
-
-define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facge_from_intr_v2i32:
- %val = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %A, <2 x float> %B)
-; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- ret <2 x i32> %val
-}
-define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facge_from_intr_v4i32:
- %val = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %A, <4 x float> %B)
-; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- ret <4 x i32> %val
-}
-
-define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facge_from_intr_v2i64:
- %val = call <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double> %A, <2 x double> %B)
-; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- ret <2 x i64> %val
-}
-
-declare <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float>, <2 x float>)
-declare <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float>, <4 x float>)
-declare <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double>, <2 x double>)
-
-define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facgt_from_intr_v2i32:
- %val = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %A, <2 x float> %B)
-; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- ret <2 x i32> %val
-}
-define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facgt_from_intr_v4i32:
- %val = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %A, <4 x float> %B)
-; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- ret <4 x i32> %val
-}
-
-define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facgt_from_intr_v2i64:
- %val = call <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double> %A, <2 x double> %B)
-; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- ret <2 x i64> %val
-}
-
diff --git a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
deleted file mode 100644
index 46fe25d..0000000
--- a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-; Set of tests for when the intrinsic is used.
-
-declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @frsqrts_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frsqrts v0.2s, v0.2s, v1.2s
- %val = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %lhs, <2 x float> %rhs)
- ret <2 x float> %val
-}
-
-define <4 x float> @frsqrts_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frsqrts v0.4s, v0.4s, v1.4s
- %val = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %lhs, <4 x float> %rhs)
- ret <4 x float> %val
-}
-
-define <2 x double> @frsqrts_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frsqrts v0.2d, v0.2d, v1.2d
- %val = call <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double> %lhs, <2 x double> %rhs)
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @frecps_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frecps v0.2s, v0.2s, v1.2s
- %val = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %lhs, <2 x float> %rhs)
- ret <2 x float> %val
-}
-
-define <4 x float> @frecps_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frecps v0.4s, v0.4s, v1.4s
- %val = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %lhs, <4 x float> %rhs)
- ret <4 x float> %val
-}
-
-define <2 x double> @frecps_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frecps v0.2d, v0.2d, v1.2d
- %val = call <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double> %lhs, <2 x double> %rhs)
- ret <2 x double> %val
-}
-
diff --git a/test/CodeGen/AArch64/neon-halving-add-sub.ll b/test/CodeGen/AArch64/neon-halving-add-sub.ll
deleted file mode 100644
index a8f59db..0000000
--- a/test/CodeGen/AArch64/neon-halving-add-sub.ll
+++ /dev/null
@@ -1,207 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uhadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uhadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_shadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_shadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: shadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uhadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uhadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_shadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_shadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: shadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uhadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uhadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_shadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_shadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: shadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uhadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uhadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_shadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_shadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: shadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uhadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uhadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_shadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_shadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: shadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uhadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uhadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_shadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_shadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: shadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-
-declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uhsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uhsub_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uhsub v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_shsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_shsub_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: shsub v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uhsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uhsub_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uhsub v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_shsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_shsub_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: shsub v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uhsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uhsub_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uhsub v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_shsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_shsub_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: shsub v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uhsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uhsub_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uhsub v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_shsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_shsub_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: shsub v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uhsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uhsub_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uhsub v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_shsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_shsub_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: shsub v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uhsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uhsub_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uhsub v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_shsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_shsub_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: shsub v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-idiv.ll b/test/CodeGen/AArch64/neon-idiv.ll
new file mode 100644
index 0000000..de402c4
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-idiv.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %a) {
+ %rem = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %rem
+; CHECK-LABEL: test1
+; FIXME: Can we lower this more efficiently?
+; CHECK: mul
+; CHECK: mul
+; CHECK: mul
+; CHECK: mul
+}
+
diff --git a/test/CodeGen/AArch64/neon-load-store-v1i32.ll b/test/CodeGen/AArch64/neon-load-store-v1i32.ll
deleted file mode 100644
index 92f704d..0000000
--- a/test/CodeGen/AArch64/neon-load-store-v1i32.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-; Test load/store of v1i8, v1i16, v1i32 types can be selected correctly
-define void @load.store.v1i8(<1 x i8>* %ptr, <1 x i8>* %ptr2) {
-; CHECK-LABEL: load.store.v1i8:
-; CHECK: ldr b{{[0-9]+}}, [x{{[0-9]+|sp}}]
-; CHECK: str b{{[0-9]+}}, [x{{[0-9]+|sp}}]
- %a = load <1 x i8>* %ptr
- store <1 x i8> %a, <1 x i8>* %ptr2
- ret void
-}
-
-define void @load.store.v1i16(<1 x i16>* %ptr, <1 x i16>* %ptr2) {
-; CHECK-LABEL: load.store.v1i16:
-; CHECK: ldr h{{[0-9]+}}, [x{{[0-9]+|sp}}]
-; CHECK: str h{{[0-9]+}}, [x{{[0-9]+|sp}}]
- %a = load <1 x i16>* %ptr
- store <1 x i16> %a, <1 x i16>* %ptr2
- ret void
-}
-
-define void @load.store.v1i32(<1 x i32>* %ptr, <1 x i32>* %ptr2) {
-; CHECK-LABEL: load.store.v1i32:
-; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+|sp}}]
-; CHECK: str s{{[0-9]+}}, [x{{[0-9]+|sp}}]
- %a = load <1 x i32>* %ptr
- store <1 x i32> %a, <1 x i32>* %ptr2
- ret void
-}
diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll
deleted file mode 100644
index 3e18077..0000000
--- a/test/CodeGen/AArch64/neon-max-min-pairwise.ll
+++ /dev/null
@@ -1,346 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_smaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_smaxp_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: smaxp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_umaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: umaxp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_smaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_smaxp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: smaxp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_umaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_umaxp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: umaxp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_smaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_smaxp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: smaxp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_umaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_umaxp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: umaxp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-
-declare <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_smaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_smaxp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: smaxp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_umaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_umaxp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: umaxp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-
-declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_smaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_smaxp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: smaxp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_umaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_umaxp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: umaxp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_smaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_smaxp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: smaxp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_umaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_umaxp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: umaxp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_sminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_sminp_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sminp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_uminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uminp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_sminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sminp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sminp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_uminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uminp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uminp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_sminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sminp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sminp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_uminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uminp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uminp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-
-declare <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_sminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sminp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sminp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_uminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uminp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uminp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-
-declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_sminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sminp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sminp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_uminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uminp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uminp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_sminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sminp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sminp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_uminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uminp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uminp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmaxp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmaxp_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmaxp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmaxp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmaxp_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmaxp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmaxp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmaxp_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmaxp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fminp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fminp_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fminp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fminp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fminp_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fminp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fminp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fminp_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fminp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmaxnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmaxnmp_v2f32:
- %val = call <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmaxnmp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmaxnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmaxnmp_v4f32:
- %val = call <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmaxnmp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmaxnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmaxnmp_v2f64:
- %val = call <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmaxnmp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fminnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fminnmp_v2f32:
- %val = call <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fminnmp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fminnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fminnmp_v4f32:
- %val = call <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fminnmp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fminnmp_v2f64:
- %val = call <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fminnmp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-define i32 @test_vminv_s32(<2 x i32> %a) {
-; CHECK-LABEL: test_vminv_s32
-; CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-define i32 @test_vminv_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vminv_u32
-; CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-define i32 @test_vmaxv_s32(<2 x i32> %a) {
-; CHECK-LABEL: test_vmaxv_s32
-; CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-define i32 @test_vmaxv_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vmaxv_u32
-; CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-max-min.ll b/test/CodeGen/AArch64/neon-max-min.ll
deleted file mode 100644
index 7889c77..0000000
--- a/test/CodeGen/AArch64/neon-max-min.ll
+++ /dev/null
@@ -1,310 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_smax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_smax_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: smax v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_umax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
- %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: umax v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_smax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_smax_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: smax v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_umax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_umax_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: umax v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_smax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_smax_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: smax v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_umax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_umax_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: umax v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-
-declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_smax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_smax_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: smax v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_umax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_umax_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: umax v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-
-declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_smax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_smax_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: smax v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_umax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_umax_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: umax v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_smax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_smax_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: smax v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_umax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_umax_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: umax v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_smin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_smin_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: smin v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_umin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
- %tmp1 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: umin v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_smin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_smin_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: smin v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_umin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_umin_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: umin v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_smin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_smin_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: smin v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_umin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_umin_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: umin v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-
-declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_smin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_smin_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: smin v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_umin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_umin_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: umin v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-
-declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_smin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_smin_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: smin v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_umin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_umin_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: umin v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_smin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_smin_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: smin v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_umin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_umin_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: umin v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmax_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmax_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmax v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmax_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmax_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmax v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmax_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmax_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmax v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmin_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmin_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmin v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmin_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmin_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmin v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmin_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmin_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmin v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-
-declare <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmaxnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmaxnm_v2f32:
- %val = call <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmaxnm v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmaxnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmaxnm_v4f32:
- %val = call <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmaxnm v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmaxnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmaxnm_v2f64:
- %val = call <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmaxnm v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fminnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fminnm_v2f32:
- %val = call <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fminnm v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fminnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fminnm_v4f32:
- %val = call <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fminnm v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fminnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fminnm_v2f64:
- %val = call <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fminnm v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
diff --git a/test/CodeGen/AArch64/neon-misc-scalar.ll b/test/CodeGen/AArch64/neon-misc-scalar.ll
deleted file mode 100644
index cca8deb..0000000
--- a/test/CodeGen/AArch64/neon-misc-scalar.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-;RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>)
-
-declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>)
-
-declare <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64>)
-
-declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>)
-
-declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) {
-entry:
- ; CHECK: test_vuqadd_s64
- %vuqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
- ; CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vuqadd2.i
-}
-
-define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) {
-entry:
- ; CHECK: test_vsqadd_u64
- %vsqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
- ; CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vsqadd2.i
-}
-
-define <1 x i64> @test_vabs_s64(<1 x i64> %a) {
- ; CHECK: test_vabs_s64
-entry:
- %vabs1.i = tail call <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64> %a)
- ; CHECK: abs d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vabs1.i
-}
-
-define <1 x i64> @test_vqabs_s64(<1 x i64> %a) {
- ; CHECK: test_vqabs_s64
-entry:
- %vqabs1.i = tail call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %a)
- ; CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vqabs1.i
-}
-
-define <1 x i64> @test_vqneg_s64(<1 x i64> %a) {
- ; CHECK: test_vqneg_s64
-entry:
- %vqneg1.i = tail call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %a)
- ; CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vqneg1.i
-}
-
-define <1 x i64> @test_vneg_s64(<1 x i64> %a) {
- ; CHECK: test_vneg_s64
-entry:
- %sub.i = sub <1 x i64> zeroinitializer, %a
- ; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %sub.i
-}
-
diff --git a/test/CodeGen/AArch64/neon-misc.ll b/test/CodeGen/AArch64/neon-misc.ll
deleted file mode 100644
index 7ec36c2..0000000
--- a/test/CodeGen/AArch64/neon-misc.ll
+++ /dev/null
@@ -1,2014 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-
-
-define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
-; CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
- ret <8 x i8> %shuffle.i
-}
-
-define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
-; CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
-; CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
- ret <8 x i8> %shuffle.i
-}
-
-define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
-; CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- ret <4 x i16> %shuffle.i
-}
-
-define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
-; CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
-; CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
- ret <8 x i16> %shuffle.i
-}
-
-define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- ret <8 x i8> %shuffle.i
-}
-
-define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- ret <4 x i16> %shuffle.i
-}
-
-define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
- ret <2 x i32> %shuffle.i
-}
-
-define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %shuffle.i = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 0>
- ret <2 x float> %shuffle.i
-}
-
-define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- ret <4 x i32> %shuffle.i
-}
-
-define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- ret <4 x float> %shuffle.i
-}
-
-define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
- %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
- ret <4 x i16> %vpaddl.i
-}
-
-define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
- %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) #4
- ret <2 x i32> %vpaddl1.i
-}
-
-define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
- %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) #4
- ret <1 x i64> %vpaddl1.i
-}
-
-define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
- %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
- ret <4 x i16> %vpaddl.i
-}
-
-define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
- %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) #4
- ret <2 x i32> %vpaddl1.i
-}
-
-define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
- %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) #4
- ret <1 x i64> %vpaddl1.i
-}
-
-define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
- %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
- ret <8 x i16> %vpaddl.i
-}
-
-define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) #4
- ret <4 x i32> %vpaddl1.i
-}
-
-define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) #4
- ret <2 x i64> %vpaddl1.i
-}
-
-define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
- %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
- ret <8 x i16> %vpaddl.i
-}
-
-define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) #4
- ret <4 x i32> %vpaddl1.i
-}
-
-define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) #4
- ret <2 x i64> %vpaddl1.i
-}
-
-define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
- %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
- ret <4 x i16> %vpadal1.i
-}
-
-define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
- %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
- ret <2 x i32> %vpadal2.i
-}
-
-define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
- %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
- ret <1 x i64> %vpadal2.i
-}
-
-define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
- %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
- ret <4 x i16> %vpadal1.i
-}
-
-define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
- %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
- ret <2 x i32> %vpadal2.i
-}
-
-define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
- %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
- ret <1 x i64> %vpadal2.i
-}
-
-define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
- %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
- ret <8 x i16> %vpadal1.i
-}
-
-define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
- ret <4 x i32> %vpadal2.i
-}
-
-define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
- ret <2 x i64> %vpadal2.i
-}
-
-define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
- %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
- ret <8 x i16> %vpadal1.i
-}
-
-define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
- ret <4 x i32> %vpadal2.i
-}
-
-define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
- ret <2 x i64> %vpadal2.i
-}
-
-define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vqabs.i = tail call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vqabs.i
-}
-
-define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vqabs.i = tail call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vqabs.i
-}
-
-define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vqabs1.i = tail call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) #4
- ret <4 x i16> %vqabs1.i
-}
-
-define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vqabs1.i = tail call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) #4
- ret <8 x i16> %vqabs1.i
-}
-
-define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vqabs1.i = tail call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vqabs1.i
-}
-
-define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vqabs1.i = tail call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vqabs1.i
-}
-
-define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vqabs1.i = tail call <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64> %a) #4
- ret <2 x i64> %vqabs1.i
-}
-
-define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vqneg.i = tail call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vqneg.i
-}
-
-define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vqneg.i = tail call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vqneg.i
-}
-
-define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vqneg1.i = tail call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) #4
- ret <4 x i16> %vqneg1.i
-}
-
-define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vqneg1.i = tail call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) #4
- ret <8 x i16> %vqneg1.i
-}
-
-define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vqneg1.i = tail call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vqneg1.i
-}
-
-define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vqneg1.i = tail call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vqneg1.i
-}
-
-define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vqneg1.i = tail call <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64> %a) #4
- ret <2 x i64> %vqneg1.i
-}
-
-define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %sub.i = sub <8 x i8> zeroinitializer, %a
- ret <8 x i8> %sub.i
-}
-
-define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %sub.i = sub <16 x i8> zeroinitializer, %a
- ret <16 x i8> %sub.i
-}
-
-define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %sub.i = sub <4 x i16> zeroinitializer, %a
- ret <4 x i16> %sub.i
-}
-
-define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %sub.i = sub <8 x i16> zeroinitializer, %a
- ret <8 x i16> %sub.i
-}
-
-define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %sub.i = sub <2 x i32> zeroinitializer, %a
- ret <2 x i32> %sub.i
-}
-
-define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %sub.i = sub <4 x i32> zeroinitializer, %a
- ret <4 x i32> %sub.i
-}
-
-define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %sub.i = sub <2 x i64> zeroinitializer, %a
- ret <2 x i64> %sub.i
-}
-
-define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
-; CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
- ret <2 x float> %sub.i
-}
-
-define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
-; CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
- ret <4 x float> %sub.i
-}
-
-define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 {
-; CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
- ret <2 x double> %sub.i
-}
-
-define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vabs.i = tail call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vabs.i
-}
-
-define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vabs.i = tail call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vabs.i
-}
-
-define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vabs1.i = tail call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) #4
- ret <4 x i16> %vabs1.i
-}
-
-define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vabs1.i = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) #4
- ret <8 x i16> %vabs1.i
-}
-
-define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vabs1.i = tail call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vabs1.i
-}
-
-define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vabs1.i = tail call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vabs1.i
-}
-
-define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vabs1.i = tail call <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64> %a) #4
- ret <2 x i64> %vabs1.i
-}
-
-define <2 x float> @test_vabs_f32(<2 x float> %a) #1 {
-; CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vabs1.i = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #4
- ret <2 x float> %vabs1.i
-}
-
-define <4 x float> @test_vabsq_f32(<4 x float> %a) #1 {
-; CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vabs1.i = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #4
- ret <4 x float> %vabs1.i
-}
-
-define <2 x double> @test_vabsq_f64(<2 x double> %a) #1 {
-; CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vabs1.i = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %a) #4
- ret <2 x double> %vabs1.i
-}
-
-define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vuqadd.i = tail call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
- ret <8 x i8> %vuqadd.i
-}
-
-define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vuqadd.i = tail call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
- ret <16 x i8> %vuqadd.i
-}
-
-define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vuqadd2.i = tail call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
- ret <4 x i16> %vuqadd2.i
-}
-
-define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vuqadd2.i = tail call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
- ret <8 x i16> %vuqadd2.i
-}
-
-define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vuqadd2.i = tail call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
- ret <2 x i32> %vuqadd2.i
-}
-
-define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vuqadd2.i = tail call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
- ret <4 x i32> %vuqadd2.i
-}
-
-define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vuqadd2.i = tail call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
- ret <2 x i64> %vuqadd2.i
-}
-
-define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vcls.i = tail call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vcls.i
-}
-
-define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vcls.i = tail call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vcls.i
-}
-
-define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vcls1.i = tail call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) #4
- ret <4 x i16> %vcls1.i
-}
-
-define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vcls1.i = tail call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) #4
- ret <8 x i16> %vcls1.i
-}
-
-define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcls1.i = tail call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vcls1.i
-}
-
-define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcls1.i = tail call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vcls1.i
-}
-
-define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
- ret <8 x i8> %vclz.i
-}
-
-define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
- ret <16 x i8> %vclz.i
-}
-
-define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4
- ret <4 x i16> %vclz1.i
-}
-
-define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4
- ret <8 x i16> %vclz1.i
-}
-
-define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4
- ret <2 x i32> %vclz1.i
-}
-
-define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4
- ret <4 x i32> %vclz1.i
-}
-
-define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
-; CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vctpop.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vctpop.i
-}
-
-define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
-; CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vctpop.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vctpop.i
-}
-
-define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
-; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %neg.i = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
- ret <8 x i8> %neg.i
-}
-
-define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
-; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %neg.i = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
- ret <16 x i8> %neg.i
-}
-
-define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
-; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %neg.i = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
- ret <4 x i16> %neg.i
-}
-
-define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
-; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %neg.i = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
- ret <8 x i16> %neg.i
-}
-
-define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
-; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %neg.i = xor <2 x i32> %a, <i32 -1, i32 -1>
- ret <2 x i32> %neg.i
-}
-
-define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
-; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %neg.i = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
- ret <4 x i32> %neg.i
-}
-
-define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 {
-; CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vrbit.i = tail call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vrbit.i
-}
-
-define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 {
-; CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vrbit.i = tail call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vrbit.i
-}
-
-define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
-; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
- %vmovn.i = trunc <8 x i16> %a to <8 x i8>
- ret <8 x i8> %vmovn.i
-}
-
-define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
-; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vmovn.i = trunc <4 x i32> %a to <4 x i16>
- ret <4 x i16> %vmovn.i
-}
-
-define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
-; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vmovn.i = trunc <2 x i64> %a to <2 x i32>
- ret <2 x i32> %vmovn.i
-}
-
-define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
-; CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
- %vmovn.i.i = trunc <8 x i16> %b to <8 x i8>
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vmovn.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
-; CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vmovn.i.i = trunc <4 x i32> %b to <4 x i16>
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vmovn.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
-; CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vmovn.i.i = trunc <2 x i64> %b to <2 x i32>
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vmovn.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %shuffle.i
-}
-
-define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
-; CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
- %vqdmull1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) #4
- ret <8 x i8> %vqdmull1.i
-}
-
-define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
-; CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vqdmull1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) #4
- ret <4 x i16> %vqdmull1.i
-}
-
-define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
-; CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vqdmull1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) #4
- ret <2 x i32> %vqdmull1.i
-}
-
-define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
-; CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
- %vqdmull1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %b) #4
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqdmull1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
-; CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vqdmull1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %b) #4
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqdmull1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
-; CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vqdmull1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %b) #4
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqdmull1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %shuffle.i
-}
-
-define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
-; CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
- %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) #4
- ret <8 x i8> %vqmovn1.i
-}
-
-define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
-; CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) #4
- ret <4 x i16> %vqmovn1.i
-}
-
-define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
-; CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) #4
- ret <2 x i32> %vqmovn1.i
-}
-
-define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
-; CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
- %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %b) #4
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
-; CHECK: test_vqmovn_high_s32
- %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %b) #4
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
-; CHECK: test_vqmovn_high_s64
- %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %b) #4
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %shuffle.i
-}
-
-define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
-; CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
- %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) #4
- ret <8 x i8> %vqmovn1.i
-}
-
-define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
-; CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) #4
- ret <4 x i16> %vqmovn1.i
-}
-
-define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
-; CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) #4
- ret <2 x i32> %vqmovn1.i
-}
-
-define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
-; CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
- %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %b) #4
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
-; CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %b) #4
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
-; CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %b) #4
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %shuffle.i
-}
-
-define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
- %1 = sext <8 x i8> %a to <8 x i16>
- %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- ret <8 x i16> %vshll_n
-}
-
-define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
- %1 = sext <4 x i16> %a to <4 x i32>
- %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
- ret <4 x i32> %vshll_n
-}
-
-define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
- %1 = sext <2 x i32> %a to <2 x i64>
- %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
- ret <2 x i64> %vshll_n
-}
-
-define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
- %1 = zext <8 x i8> %a to <8 x i16>
- %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- ret <8 x i16> %vshll_n
-}
-
-define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
- %1 = zext <4 x i16> %a to <4 x i32>
- %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
- ret <4 x i32> %vshll_n
-}
-
-define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
- %1 = zext <2 x i32> %a to <2 x i64>
- %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
- ret <2 x i64> %vshll_n
-}
-
-define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %1 = sext <8 x i8> %shuffle.i to <8 x i16>
- %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- ret <8 x i16> %vshll_n
-}
-
-define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
- %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %1 = sext <4 x i16> %shuffle.i to <4 x i32>
- %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
- ret <4 x i32> %vshll_n
-}
-
-define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
- %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %1 = sext <2 x i32> %shuffle.i to <2 x i64>
- %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
- ret <2 x i64> %vshll_n
-}
-
-define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %1 = zext <8 x i8> %shuffle.i to <8 x i16>
- %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- ret <8 x i16> %vshll_n
-}
-
-define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
- %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %1 = zext <4 x i16> %shuffle.i to <4 x i32>
- %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
- ret <4 x i32> %vshll_n
-}
-
-define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
- %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %1 = zext <2 x i32> %shuffle.i to <2 x i64>
- %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
- ret <2 x i64> %vshll_n
-}
-
-define <4 x i16> @test_vcvt_f16_f32(<4 x float> %a) #0 {
-; CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vcvt1.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) #4
- ret <4 x i16> %vcvt1.i
-}
-
-define <8 x i16> @test_vcvt_high_f16_f32(<4 x i16> %a, <4 x float> %b) #0 {
-; CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vcvt1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %b) #4
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vcvt1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x float> @test_vcvt_f32_f16(<4 x i16> %a) #0 {
-; CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h
- %vcvt1.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %a) #4
- ret <4 x float> %vcvt1.i
-}
-
-define <4 x float> @test_vcvt_high_f32_f16(<8 x i16> %a) #0 {
-; CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vcvt1.i.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %shuffle.i.i) #4
- ret <4 x float> %vcvt1.i.i
-}
-
-define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 {
-; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = fptrunc <2 x double> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
-; CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vcvt.i.i = fptrunc <2 x double> %b to <2 x float>
- %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvt.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %shuffle.i
-}
-
-define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 {
-; CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvtx_f32_f641.i = call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %a) #4
- ret <2 x float> %vcvtx_f32_f641.i
-}
-
-define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
-; CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vcvtx_f32_f641.i.i = tail call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %b) #4
- %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvtx_f32_f641.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %shuffle.i
-}
-
-define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 {
-; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
- %vcvt.i = fpext <2 x float> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 {
-; CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %shuffle.i.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
- %vcvt.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
- ret <2 x double> %vcvt.i.i
-}
-
-define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
-; CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndn1.i = tail call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndn1.i
-}
-
-define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
-; CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndn1.i = tail call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndn1.i
-}
-
-define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 {
-; CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndn1.i = tail call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndn1.i
-}
-
-define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
-; CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrnda1.i = tail call <2 x float> @llvm.round.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrnda1.i
-}
-
-define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
-; CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrnda1.i = tail call <4 x float> @llvm.round.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrnda1.i
-}
-
-define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
-; CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrnda1.i = tail call <2 x double> @llvm.round.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrnda1.i
-}
-
-define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
-; CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndp1.i = tail call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndp1.i
-}
-
-define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
-; CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndp1.i = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndp1.i
-}
-
-define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
-; CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndp1.i = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndp1.i
-}
-
-define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
-; CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndm1.i = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndm1.i
-}
-
-define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
-; CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndm1.i = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndm1.i
-}
-
-define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 {
-; CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndm1.i = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndm1.i
-}
-
-define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
-; CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndx1.i = tail call <2 x float> @llvm.rint.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndx1.i
-}
-
-define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
-; CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndx1.i = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndx1.i
-}
-
-define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 {
-; CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndx1.i = tail call <2 x double> @llvm.rint.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndx1.i
-}
-
-define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
-; CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrnd1.i = tail call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrnd1.i
-}
-
-define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
-; CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrnd1.i = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrnd1.i
-}
-
-define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 {
-; CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrnd1.i = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrnd1.i
-}
-
-define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 {
-; CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndi1.i = tail call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndi1.i
-}
-
-define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 {
-; CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndi1.i = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndi1.i
-}
-
-define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 {
-; CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndi1.i = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndi1.i
-}
-
-define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvt.i = fptosi <2 x float> %a to <2 x i32>
- ret <2 x i32> %vcvt.i
-}
-
-define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = fptosi <4 x float> %a to <4 x i32>
- ret <4 x i32> %vcvt.i
-}
-
-define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = fptosi <2 x double> %a to <2 x i64>
- ret <2 x i64> %vcvt.i
-}
-
-define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvt.i = fptoui <2 x float> %a to <2 x i32>
- ret <2 x i32> %vcvt.i
-}
-
-define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = fptoui <4 x float> %a to <4 x i32>
- ret <4 x i32> %vcvt.i
-}
-
-define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = fptoui <2 x double> %a to <2 x i64>
- ret <2 x i64> %vcvt.i
-}
-
-define <2 x i64> @test_vcvt_s64_f32(<2 x float> %a) #0 {
-; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
-; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = fptosi <2 x float> %a to <2 x i64>
- ret <2 x i64> %vcvt.i
-}
-
-define <2 x i64> @test_vcvt_u64_f32(<2 x float> %a) #0 {
-; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
-; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = fptoui <2 x float> %a to <2 x i64>
- ret <2 x i64> %vcvt.i
-}
-
-define <4 x i16> @test_vcvt_s16_f32(<4 x float> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vcvt.i = fptosi <4 x float> %a to <4 x i16>
- ret <4 x i16> %vcvt.i
-}
-
-define <4 x i16> @test_vcvt_u16_f32(<4 x float> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vcvt.i = fptoui <4 x float> %a to <4 x i16>
- ret <4 x i16> %vcvt.i
-}
-
-define <2 x i32> @test_vcvt_s32_f64(<2 x double> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = fptosi <2 x double> %a to <2 x i32>
- ret <2 x i32> %vcvt.i
-}
-
-define <2 x i32> @test_vcvt_u32_f64(<2 x double> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = fptoui <2 x double> %a to <2 x i32>
- ret <2 x i32> %vcvt.i
-}
-
-define <1 x i8> @test_vcvt_s8_f64(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}}
- %vcvt.i = fptosi <1 x double> %a to <1 x i8>
- ret <1 x i8> %vcvt.i
-}
-
-define <1 x i8> @test_vcvt_u8_f64(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}}
- %vcvt.i = fptoui <1 x double> %a to <1 x i8>
- ret <1 x i8> %vcvt.i
-}
-
-define <1 x i16> @test_vcvt_s16_f64(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}}
- %vcvt.i = fptosi <1 x double> %a to <1 x i16>
- ret <1 x i16> %vcvt.i
-}
-
-define <1 x i16> @test_vcvt_u16_f64(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}}
- %vcvt.i = fptoui <1 x double> %a to <1 x i16>
- ret <1 x i16> %vcvt.i
-}
-
-define <1 x i32> @test_vcvt_s32_f64_v1(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = fptosi <1 x double> %a to <1 x i32>
- ret <1 x i32> %vcvt.i
-}
-
-define <1 x i32> @test_vcvt_u32_f64_v1(<1 x double> %a) #0 {
-; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = fptoui <1 x double> %a to <1 x i32>
- ret <1 x i32> %vcvt.i
-}
-
-define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtn_s32_f32
-; CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtns_f321.i = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtns_f321.i
-}
-
-define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtnq_s32_f32
-; CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtns_f321.i = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtns_f321.i
-}
-
-define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtnq_s64_f64
-; CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtns_f641.i = call <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtns_f641.i
-}
-
-define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtn_u32_f32
-; CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtnu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtnu_f321.i
-}
-
-define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtnq_u32_f32
-; CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtnu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtnu_f321.i
-}
-
-define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtnq_u64_f64
-; CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtnu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtnu_f641.i
-}
-
-define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtp_s32_f32
-; CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtps_f321.i = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtps_f321.i
-}
-
-define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtpq_s32_f32
-; CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtps_f321.i = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtps_f321.i
-}
-
-define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtpq_s64_f64
-; CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtps_f641.i = call <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtps_f641.i
-}
-
-define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtp_u32_f32
-; CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtpu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtpu_f321.i
-}
-
-define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtpq_u32_f32
-; CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtpu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtpu_f321.i
-}
-
-define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtpq_u64_f64
-; CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtpu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtpu_f641.i
-}
-
-define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtm_s32_f32
-; CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtms_f321.i = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtms_f321.i
-}
-
-define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtmq_s32_f32
-; CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtms_f321.i = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtms_f321.i
-}
-
-define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtmq_s64_f64
-; CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtms_f641.i = call <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtms_f641.i
-}
-
-define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtm_u32_f32
-; CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtmu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtmu_f321.i
-}
-
-define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtmq_u32_f32
-; CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtmu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtmu_f321.i
-}
-
-define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtmq_u64_f64
-; CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtmu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtmu_f641.i
-}
-
-define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvta_s32_f32
-; CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtas_f321.i = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtas_f321.i
-}
-
-define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtaq_s32_f32
-; CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtas_f321.i = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtas_f321.i
-}
-
-define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtaq_s64_f64
-; CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtas_f641.i = call <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtas_f641.i
-}
-
-define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvta_u32_f32
-; CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtau_f321.i = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtau_f321.i
-}
-
-define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtaq_u32_f32
-; CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtau_f321.i = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtau_f321.i
-}
-
-define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtaq_u64_f64
-; CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtau_f641.i = call <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtau_f641.i
-}
-
-define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
-; CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrsqrte1.i = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrsqrte1.i
-}
-
-define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
-; CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrsqrte1.i = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrsqrte1.i
-}
-
-define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 {
-; CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrsqrte1.i = tail call <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrsqrte1.i
-}
-
-define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
-; CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrecpe1.i = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrecpe1.i
-}
-
-define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
-; CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrecpe1.i = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrecpe1.i
-}
-
-define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 {
-; CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrecpe1.i = tail call <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrecpe1.i
-}
-
-define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
-; CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrecpe1.i = tail call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vrecpe1.i
-}
-
-define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
-; CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrecpe1.i = tail call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vrecpe1.i
-}
-
-define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 {
-; CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vsqrt1.i = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #4
- ret <2 x float> %vsqrt1.i
-}
-
-define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
-; CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vsqrt1.i = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #4
- ret <4 x float> %vsqrt1.i
-}
-
-define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
-; CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vsqrt1.i = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #4
- ret <2 x double> %vsqrt1.i
-}
-
-define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
-; CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvt.i = sitofp <2 x i32> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
-; CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvt.i = uitofp <2 x i32> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
-; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = sitofp <4 x i32> %a to <4 x float>
- ret <4 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
-; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = uitofp <4 x i32> %a to <4 x float>
- ret <4 x float> %vcvt.i
-}
-
-define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 {
-; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = sitofp <2 x i64> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 {
-; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = uitofp <2 x i64> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <2 x float> @test_vcvt_f32_s64(<2 x i64> %a) #0 {
-; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = sitofp <2 x i64> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <2 x float> @test_vcvt_f32_u64(<2 x i64> %a) #0 {
-; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = uitofp <2 x i64> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvt_f32_s16(<4 x i16> %a) #0 {
-; CHECK: sshll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0
-; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = sitofp <4 x i16> %a to <4 x float>
- ret <4 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvt_f32_u16(<4 x i16> %a) #0 {
-; CHECK: ushll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0
-; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = uitofp <4 x i16> %a to <4 x float>
- ret <4 x float> %vcvt.i
-}
-
-define <2 x double> @test_vcvt_f64_s32(<2 x i32> %a) #0 {
-; CHECK: sshll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0
-; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = sitofp <2 x i32> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <2 x double> @test_vcvt_f64_u32(<2 x i32> %a) #0 {
-; CHECK: ushll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0
-; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = uitofp <2 x i32> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_s8(<1 x i8> %a) #0 {
-; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0]
-; CHECK: sxtb w{{[0-9]+}}, w{{[0-9]+}}
-; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = sitofp <1 x i8> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_u8(<1 x i8> %a) #0 {
-; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0]
-; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xff
-; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = uitofp <1 x i8> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_s16(<1 x i16> %a) #0 {
-; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0]
-; CHECK: sxth w{{[0-9]+}}, w{{[0-9]+}}
-; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = sitofp <1 x i16> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_u16(<1 x i16> %a) #0 {
-; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0]
-; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xffff
-; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = uitofp <1 x i16> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_s32_v1(<1 x i32> %a) #0 {
-; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = sitofp <1 x i32> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_u32_v1(<1 x i32> %a) #0 {
-; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = uitofp <1 x i32> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #2
-
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2
-
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #2
-
-declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) #2
-
-declare <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double>) #2
-
-declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) #2
-
-declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) #2
-
-declare <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double>) #2
-
-declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) #2
-
-declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) #2
-
-declare <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float>)
-
-declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.trunc.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.rint.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.rint.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.floor.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.floor.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.ceil.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.ceil.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.round.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.round.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.round.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) #2
-
-declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) #2
-
-declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) #2
-
-declare <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double>) #2
-
-declare <2 x float> @llvm.aarch64.neon.fcvtn.v2f32.v2f64(<2 x double>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) #2
-
-declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) #2
-
-declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) #2
-
-declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #2
-
-declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) #2
-
-declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) #2
-
-declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) #2
-
-declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) #2
-
-declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) #2
-
-declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) #2
-
-declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) #2
-
-declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) #2
-
-declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) #2
-
-declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) #2
-
-declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) #2
-
-declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) #2
-
-declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) #2
-
-declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) #2
-
-declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) #2
-
-declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #3
-
-declare <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64>) #2
-
-declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) #2
-
-declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) #2
-
-declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64>) #2
-
-declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) #2
-
-declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) #2
-
-declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64>) #2
-
-declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) #2
-
-declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) #2
-
-declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) #2
-
-declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) #2
-
-declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) #2
-
-declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) #2
-
-declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) #2
-
-declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) #2
-
-declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) #2
-
-declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) #2
-
-declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) #2
-
-declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) #2
-
-declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) #2
-
-declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) #2
-
-declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) #2
-
-declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) #2
-
-
-define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvt_s64_f64
-; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}
- %1 = fptosi <1 x double> %a to <1 x i64>
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvt_u64_f64
-; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = fptoui <1 x double> %a to <1 x i64>
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtn_s64_f64
-; CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtn_u64_f64
-; CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtp_s64_f64
-; CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtp_u64_f64
-; CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtm_s64_f64
-; CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtm_u64_f64
-; CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvta_s64_f64
-; CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvta_u64_f64
-; CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) {
-; CHECK-LABEL: test_vcvt_f64_s64
-; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}
- %1 = sitofp <1 x i64> %a to <1 x double>
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) {
-; CHECK-LABEL: test_vcvt_f64_u64
-; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}
- %1 = uitofp <1 x i64> %a to <1 x double>
- ret <1 x double> %1
-}
-
-declare <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double>)
-
-define <1 x double> @test_vrndn_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndn_f64
-; CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrnda_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrnda_f64
-; CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.round.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrndp_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndp_f64
-; CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.ceil.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrndm_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndm_f64
-; CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.floor.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrndx_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndx_f64
-; CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.rint.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrnd_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrnd_f64
-; CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.trunc.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrndi_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndi_f64
-; CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>)
-declare <1 x double> @llvm.trunc.v1f64(<1 x double>)
-declare <1 x double> @llvm.rint.v1f64(<1 x double>)
-declare <1 x double> @llvm.floor.v1f64(<1 x double>)
-declare <1 x double> @llvm.ceil.v1f64(<1 x double>)
-declare <1 x double> @llvm.round.v1f64(<1 x double>)
-declare <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double>)
-
-define <1 x double> @test_vrsqrte_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrsqrte_f64
-; CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrecpe_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrecpe_f64
-; CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vsqrt_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vsqrt_f64
-; CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) {
-; CHECK-LABEL: test_vrecps_f64
-; CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %a, <1 x double> %b)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) {
-; CHECK-LABEL: test_vrsqrts_f64
-; CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %a, <1 x double> %b)
- ret <1 x double> %1
-}
-
-declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
-declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>)
-declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
-
-define i64 @test_vaddlv_s32(<2 x i32> %a) {
-; CHECK-LABEL: test_vaddlv_s32
-; CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
- %1 = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i64> %1, i32 0
- ret i64 %2
-}
-
-define i64 @test_vaddlv_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vaddlv_u32
-; CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
- %1 = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i64> %1, i32 0
- ret i64 %2
-}
-
-declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32>)
-declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll
index 4035b91..40649ae 100644
--- a/test/CodeGen/AArch64/neon-mov.ll
+++ b/test/CodeGen/AArch64/neon-mov.ll
@@ -1,218 +1,259 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
define <8 x i8> @movi8b() {
-;CHECK: movi {{v[0-9]+}}.8b, #0x8
+; CHECK-LABEL: movi8b:
+; CHECK: movi {{v[0-9]+}}.8b, #{{0x8|8}}
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <16 x i8> @movi16b() {
-;CHECK: movi {{v[0-9]+}}.16b, #0x8
+; CHECK-LABEL: movi16b:
+; CHECK: movi {{v[0-9]+}}.16b, #{{0x8|8}}
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <2 x i32> @movi2s_lsl0() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: movi2s_lsl0:
+; CHECK: movi {{d[0-9]+}}, #0x0000ff000000ff
ret <2 x i32> < i32 255, i32 255 >
}
define <2 x i32> @movi2s_lsl8() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: movi2s_lsl8:
+; CHECK: movi {{d[0-9]+}}, #0x00ff000000ff00
ret <2 x i32> < i32 65280, i32 65280 >
}
define <2 x i32> @movi2s_lsl16() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: movi2s_lsl16:
+; CHECK: movi {{d[0-9]+}}, #0xff000000ff0000
ret <2 x i32> < i32 16711680, i32 16711680 >
}
define <2 x i32> @movi2s_lsl24() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: movi2s_lsl24:
+; CHECK: movi {{d[0-9]+}}, #0xff000000ff000000
ret <2 x i32> < i32 4278190080, i32 4278190080 >
}
define <4 x i32> @movi4s_lsl0() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: movi4s_lsl0:
+; CHECK: movi {{v[0-9]+}}.2d, #0x0000ff000000ff
ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 >
}
define <4 x i32> @movi4s_lsl8() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: movi4s_lsl8:
+; CHECK: movi {{v[0-9]+}}.2d, #0x00ff000000ff00
ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 >
}
define <4 x i32> @movi4s_lsl16() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: movi4s_lsl16:
+; CHECK: movi {{v[0-9]+}}.2d, #0xff000000ff0000
ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 >
}
define <4 x i32> @movi4s_lsl24() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: movi4s_lsl24:
+; CHECK: movi {{v[0-9]+}}.2d, #0xff000000ff000000
ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 >
}
define <4 x i16> @movi4h_lsl0() {
-;CHECK: movi {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: movi4h_lsl0:
+; CHECK: movi {{d[0-9]+}}, #0xff00ff00ff00ff
ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 >
}
define <4 x i16> @movi4h_lsl8() {
-;CHECK: movi {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: movi4h_lsl8:
+; CHECK: movi d0, #0xff00ff00ff00ff00
ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 >
}
define <8 x i16> @movi8h_lsl0() {
-;CHECK: movi {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: movi8h_lsl0:
+; CHECK: movi v0.2d, #0xff00ff00ff00ff
ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
}
define <8 x i16> @movi8h_lsl8() {
-;CHECK: movi {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: movi8h_lsl8:
+; CHECK: movi v0.2d, #0xff00ff00ff00ff00
ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
}
define <2 x i32> @mvni2s_lsl0() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10
+; CHECK-LABEL: mvni2s_lsl0:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}
ret <2 x i32> < i32 4294967279, i32 4294967279 >
}
define <2 x i32> @mvni2s_lsl8() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #8
+; CHECK-LABEL: mvni2s_lsl8:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #8
ret <2 x i32> < i32 4294963199, i32 4294963199 >
}
define <2 x i32> @mvni2s_lsl16() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #16
+; CHECK-LABEL: mvni2s_lsl16:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #16
ret <2 x i32> < i32 4293918719, i32 4293918719 >
}
define <2 x i32> @mvni2s_lsl24() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #24
+; CHECK-LABEL: mvni2s_lsl24:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #24
ret <2 x i32> < i32 4026531839, i32 4026531839 >
}
define <4 x i32> @mvni4s_lsl0() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10
+; CHECK-LABEL: mvni4s_lsl0:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}
ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
}
define <4 x i32> @mvni4s_lsl8() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #8
+; CHECK-LABEL: mvni4s_lsl8:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #8
ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
}
define <4 x i32> @mvni4s_lsl16() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #16
+; CHECK-LABEL: mvni4s_lsl16:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #16
ret <4 x i32> < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
}
define <4 x i32> @mvni4s_lsl24() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #24
+; CHECK-LABEL: mvni4s_lsl24:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #24
ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 >
}
define <4 x i16> @mvni4h_lsl0() {
-;CHECK: mvni {{v[0-9]+}}.4h, #0x10
+; CHECK-LABEL: mvni4h_lsl0:
+; CHECK: mvni {{v[0-9]+}}.4h, #{{0x10|16}}
ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
}
define <4 x i16> @mvni4h_lsl8() {
-;CHECK: mvni {{v[0-9]+}}.4h, #0x10, lsl #8
+; CHECK-LABEL: mvni4h_lsl8:
+; CHECK: mvni {{v[0-9]+}}.4h, #{{0x10|16}}, lsl #8
ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
}
define <8 x i16> @mvni8h_lsl0() {
-;CHECK: mvni {{v[0-9]+}}.8h, #0x10
+; CHECK-LABEL: mvni8h_lsl0:
+; CHECK: mvni {{v[0-9]+}}.8h, #{{0x10|16}}
ret <8 x i16> < i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 >
}
define <8 x i16> @mvni8h_lsl8() {
-;CHECK: mvni {{v[0-9]+}}.8h, #0x10, lsl #8
+; CHECK-LABEL: mvni8h_lsl8:
+; CHECK: mvni {{v[0-9]+}}.8h, #{{0x10|16}}, lsl #8
ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 >
}
define <2 x i32> @movi2s_msl8(<2 x i32> %a) {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #8
+; CHECK-LABEL: movi2s_msl8:
+; CHECK: movi {{d[0-9]+}}, #0x00ffff0000ffff
ret <2 x i32> < i32 65535, i32 65535 >
}
define <2 x i32> @movi2s_msl16() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #16
+; CHECK-LABEL: movi2s_msl16:
+; CHECK: movi d0, #0xffffff00ffffff
ret <2 x i32> < i32 16777215, i32 16777215 >
}
define <4 x i32> @movi4s_msl8() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #8
+; CHECK-LABEL: movi4s_msl8:
+; CHECK: movi v0.2d, #0x00ffff0000ffff
ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 >
}
define <4 x i32> @movi4s_msl16() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #16
+; CHECK-LABEL: movi4s_msl16:
+; CHECK: movi v0.2d, #0xffffff00ffffff
ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 >
}
define <2 x i32> @mvni2s_msl8() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #8
+; CHECK-LABEL: mvni2s_msl8:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, msl #8
ret <2 x i32> < i32 18446744073709547264, i32 18446744073709547264>
}
define <2 x i32> @mvni2s_msl16() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #16
+; CHECK-LABEL: mvni2s_msl16:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, msl #16
ret <2 x i32> < i32 18446744073708437504, i32 18446744073708437504>
}
define <4 x i32> @mvni4s_msl8() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #8
+; CHECK-LABEL: mvni4s_msl8:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, msl #8
ret <4 x i32> < i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264>
}
define <4 x i32> @mvni4s_msl16() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #16
+; CHECK-LABEL: mvni4s_msl16:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, msl #16
ret <4 x i32> < i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504>
}
define <2 x i64> @movi2d() {
-;CHECK: movi {{v[0-9]+}}.2d, #0xff0000ff0000ffff
+; CHECK-LABEL: movi2d:
+; CHECK: movi {{v[0-9]+}}.2d, #0xff0000ff0000ffff
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}
define <1 x i64> @movid() {
-;CHECK: movi {{d[0-9]+}}, #0xff0000ff0000ffff
+; CHECK-LABEL: movid:
+; CHECK: movi {{d[0-9]+}}, #0xff0000ff0000ffff
ret <1 x i64> < i64 18374687574888349695 >
}
define <2 x float> @fmov2s() {
-;CHECK: fmov {{v[0-9]+}}.2s, #-12.00000000
+; CHECK-LABEL: fmov2s:
+; CHECK: fmov {{v[0-9]+}}.2s, #{{-12.00000000|-1.200000e\+01}}
ret <2 x float> < float -1.2e1, float -1.2e1>
}
define <4 x float> @fmov4s() {
-;CHECK: fmov {{v[0-9]+}}.4s, #-12.00000000
+; CHECK-LABEL: fmov4s:
+; CHECK: fmov {{v[0-9]+}}.4s, #{{-12.00000000|-1.200000e\+01}}
ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1>
}
define <2 x double> @fmov2d() {
-;CHECK: fmov {{v[0-9]+}}.2d, #-12.00000000
+; CHECK-LABEL: fmov2d:
+; CHECK: fmov {{v[0-9]+}}.2d, #{{-12.00000000|-1.200000e\+01}}
ret <2 x double> < double -1.2e1, double -1.2e1>
}
define <2 x i32> @movi1d_1() {
-; CHECK: movi d0, #0xffffffff0000
+; CHECK-LABEL: movi1d_1:
+; CHECK: movi d0, #0x{{0*}}ffffffff0000
ret <2 x i32> < i32 -65536, i32 65535>
}
declare <2 x i32> @test_movi1d(<2 x i32>, <2 x i32>)
define <2 x i32> @movi1d() {
+; CHECK-LABEL: movi1d:
; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-; CHECK-NEXT: movi d1, #0xffffffff0000
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+; CHECK-NEXT: movi d1, #0x{{0*}}ffffffff0000
%1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>)
ret <2 x i32> %1
}
diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll
deleted file mode 100644
index da22ce8..0000000
--- a/test/CodeGen/AArch64/neon-mul-div.ll
+++ /dev/null
@@ -1,754 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-
-define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- %tmp3 = mul <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
- %tmp3 = mul <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
- %tmp3 = mul <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
- %tmp3 = mul <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %tmp3 = mul <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- %tmp3 = mul <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK-LABEL: mul1xi64:
-;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
- %tmp3 = mul <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK-LABEL: mul2xi64:
-;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
-;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
- %tmp3 = mul <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
- define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %tmp3 = fmul <2 x float> %A, %B;
- ret <2 x float> %tmp3
-}
-
-define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- %tmp3 = fmul <4 x float> %A, %B;
- ret <4 x float> %tmp3
-}
-define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %tmp3 = fmul <2 x double> %A, %B;
- ret <2 x double> %tmp3
-}
-
-
- define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %tmp3 = fdiv <2 x float> %A, %B;
- ret <2 x float> %tmp3
-}
-
-define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- %tmp3 = fdiv <4 x float> %A, %B;
- ret <4 x float> %tmp3
-}
-define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %tmp3 = fdiv <2 x double> %A, %B;
- ret <2 x double> %tmp3
-}
-
-define <1 x i8> @sdiv1x8(<1 x i8> %A, <1 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <1 x i8> %A, %B;
- ret <1 x i8> %tmp3
-}
-
-define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <1 x i16> @sdiv1x16(<1 x i16> %A, <1 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <1 x i16> %A, %B;
- ret <1 x i16> %tmp3
-}
-
-define <4 x i16> @sdiv4x16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <1 x i32> @sdiv1x32(<1 x i32> %A, <1 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <1 x i32> %A, %B;
- ret <1 x i32> %tmp3
-}
-
-define <2 x i32> @sdiv2x32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @sdiv1x64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = sdiv <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = sdiv <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
-define <1 x i8> @udiv1x8(<1 x i8> %A, <1 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <1 x i8> %A, %B;
- ret <1 x i8> %tmp3
-}
-
-define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <1 x i16> @udiv1x16(<1 x i16> %A, <1 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <1 x i16> %A, %B;
- ret <1 x i16> %tmp3
-}
-
-define <4 x i16> @udiv4x16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <1 x i32> @udiv1x32(<1 x i32> %A, <1 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <1 x i32> %A, %B;
- ret <1 x i32> %tmp3
-}
-
-define <2 x i32> @udiv2x32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @udiv1x64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = udiv <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = udiv <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
-define <1 x i8> @srem1x8(<1 x i8> %A, <1 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <1 x i8> %A, %B;
- ret <1 x i8> %tmp3
-}
-
-define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <1 x i16> @srem1x16(<1 x i16> %A, <1 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <1 x i16> %A, %B;
- ret <1 x i16> %tmp3
-}
-
-define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <1 x i32> @srem1x32(<1 x i32> %A, <1 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <1 x i32> %A, %B;
- ret <1 x i32> %tmp3
-}
-
-define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @srem1x64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = srem <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = srem <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
-define <1 x i8> @urem1x8(<1 x i8> %A, <1 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <1 x i8> %A, %B;
- ret <1 x i8> %tmp3
-}
-
-define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <1 x i16> @urem1x16(<1 x i16> %A, <1 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <1 x i16> %A, %B;
- ret <1 x i16> %tmp3
-}
-
-define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <1 x i32> @urem1x32(<1 x i32> %A, <1 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <1 x i32> %A, %B;
- ret <1 x i32> %tmp3
-}
-
-define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @urem1x64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = urem <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = urem <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
-define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) {
-; CHECK: bl fmodf
-; CHECK: bl fmodf
- %tmp3 = frem <2 x float> %A, %B;
- ret <2 x float> %tmp3
-}
-
-define <4 x float> @frem4f32(<4 x float> %A, <4 x float> %B) {
-; CHECK: bl fmodf
-; CHECK: bl fmodf
-; CHECK: bl fmodf
-; CHECK: bl fmodf
- %tmp3 = frem <4 x float> %A, %B;
- ret <4 x float> %tmp3
-}
-
-define <1 x double> @frem1d64(<1 x double> %A, <1 x double> %B) {
-; CHECK: bl fmod
- %tmp3 = frem <1 x double> %A, %B;
- ret <1 x double> %tmp3
-}
-
-define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) {
-; CHECK: bl fmod
-; CHECK: bl fmod
- %tmp3 = frem <2 x double> %A, %B;
- ret <2 x double> %tmp3
-}
-
-declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>)
-declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>)
-
-define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: poly_mulv8i8:
- %prod = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: pmul v0.8b, v0.8b, v1.8b
- ret <8 x i8> %prod
-}
-
-define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: poly_mulv16i8:
- %prod = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: pmul v0.16b, v0.16b, v1.16b
- ret <16 x i8> %prod
-}
-
-declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqdmulh_v4i16:
- %prod = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqdmulh v0.4h, v0.4h, v1.4h
- ret <4 x i16> %prod
-}
-
-define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqdmulh_v8i16:
- %prod = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqdmulh v0.8h, v0.8h, v1.8h
- ret <8 x i16> %prod
-}
-
-define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqdmulh_v2i32:
- %prod = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqdmulh v0.2s, v0.2s, v1.2s
- ret <2 x i32> %prod
-}
-
-define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqdmulh_v4i32:
- %prod = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqdmulh v0.4s, v0.4s, v1.4s
- ret <4 x i32> %prod
-}
-
-declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqrdmulh_v4i16:
- %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h
- ret <4 x i16> %prod
-}
-
-define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqrdmulh_v8i16:
- %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h
- ret <8 x i16> %prod
-}
-
-define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqrdmulh_v2i32:
- %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s
- ret <2 x i32> %prod
-}
-
-define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqrdmulh_v4i32:
- %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s
- ret <4 x i32> %prod
-}
-
-declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; Using registers other than v0, v1 and v2 is possible, but would be odd.
-; CHECK: fmulx v0.2s, v0.2s, v1.2s
- %val = call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
- ret <2 x float> %val
-}
-
-define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; Using registers other than v0, v1 and v2 is possible, but would be odd.
-; CHECK: fmulx v0.4s, v0.4s, v1.4s
- %val = call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
- ret <4 x float> %val
-}
-
-define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; Using registers other than v0, v1 and v2 is possible, but would be odd.
-; CHECK: fmulx v0.2d, v0.2d, v1.2d
- %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
- ret <2 x double> %val
-}
-
-define <1 x i8> @test_mul_v1i8(<1 x i8> %a, <1 x i8> %b) {
-;CHECK-LABEL: test_mul_v1i8:
-;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- %c = mul <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @test_mul_v1i16(<1 x i16> %a, <1 x i16> %b) {
-;CHECK-LABEL: test_mul_v1i16:
-;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
- %c = mul <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @test_mul_v1i32(<1 x i32> %a, <1 x i32> %b) {
-;CHECK-LABEL: test_mul_v1i32:
-;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %c = mul <1 x i32> %a, %b
- ret <1 x i32> %c
-}
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
index a0b17e1..4f8571d 100644
--- a/test/CodeGen/AArch64/neon-perm.ll
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
%struct.int8x8x2_t = type { [2 x <8 x i8>] }
%struct.int16x4x2_t = type { [2 x <4 x i16>] }
@@ -20,7 +20,7 @@
%struct.poly16x8x2_t = type { [2 x <8 x i16>] }
define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp1_s8:
+; CHECK-LABEL: test_vuzp1_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -28,7 +28,7 @@ entry:
}
define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp1q_s8:
+; CHECK-LABEL: test_vuzp1q_s8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -36,7 +36,7 @@ entry:
}
define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp1_s16:
+; CHECK-LABEL: test_vuzp1_s16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -44,7 +44,7 @@ entry:
}
define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp1q_s16:
+; CHECK-LABEL: test_vuzp1q_s16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -52,15 +52,15 @@ entry:
}
define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp1_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vuzp1_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzp1q_s32:
+; CHECK-LABEL: test_vuzp1q_s32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -68,15 +68,15 @@ entry:
}
define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vuzp1q_s64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vuzp1q_s64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp1_u8:
+; CHECK-LABEL: test_vuzp1_u8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -84,7 +84,7 @@ entry:
}
define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp1q_u8:
+; CHECK-LABEL: test_vuzp1q_u8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -92,7 +92,7 @@ entry:
}
define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp1_u16:
+; CHECK-LABEL: test_vuzp1_u16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -100,7 +100,7 @@ entry:
}
define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp1q_u16:
+; CHECK-LABEL: test_vuzp1q_u16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -108,15 +108,15 @@ entry:
}
define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp1_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vuzp1_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzp1q_u32:
+; CHECK-LABEL: test_vuzp1q_u32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -124,23 +124,23 @@ entry:
}
define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vuzp1q_u64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vuzp1q_u64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vuzp1_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vuzp1_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vuzp1q_f32:
+; CHECK-LABEL: test_vuzp1q_f32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -148,15 +148,15 @@ entry:
}
define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vuzp1q_f64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vuzp1q_f64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp1_p8:
+; CHECK-LABEL: test_vuzp1_p8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -164,7 +164,7 @@ entry:
}
define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp1q_p8:
+; CHECK-LABEL: test_vuzp1q_p8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -172,7 +172,7 @@ entry:
}
define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp1_p16:
+; CHECK-LABEL: test_vuzp1_p16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -180,7 +180,7 @@ entry:
}
define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp1q_p16:
+; CHECK-LABEL: test_vuzp1q_p16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -188,7 +188,7 @@ entry:
}
define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp2_s8:
+; CHECK-LABEL: test_vuzp2_s8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -196,7 +196,7 @@ entry:
}
define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp2q_s8:
+; CHECK-LABEL: test_vuzp2q_s8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -204,7 +204,7 @@ entry:
}
define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp2_s16:
+; CHECK-LABEL: test_vuzp2_s16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -212,7 +212,7 @@ entry:
}
define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp2q_s16:
+; CHECK-LABEL: test_vuzp2q_s16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -220,15 +220,15 @@ entry:
}
define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp2_s32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp2_s32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzp2q_s32:
+; CHECK-LABEL: test_vuzp2q_s32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -236,16 +236,15 @@ entry:
}
define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vuzp2q_s64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
-; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: test_vuzp2q_s64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp2_u8:
+; CHECK-LABEL: test_vuzp2_u8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -253,7 +252,7 @@ entry:
}
define <16 x i8> @test_vuzp2q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp2q_u8:
+; CHECK-LABEL: test_vuzp2q_u8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -261,7 +260,7 @@ entry:
}
define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp2_u16:
+; CHECK-LABEL: test_vuzp2_u16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -269,7 +268,7 @@ entry:
}
define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp2q_u16:
+; CHECK-LABEL: test_vuzp2q_u16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -277,15 +276,15 @@ entry:
}
define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp2_u32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp2_u32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzp2q_u32:
+; CHECK-LABEL: test_vuzp2q_u32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -293,24 +292,23 @@ entry:
}
define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vuzp2q_u64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
-; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: test_vuzp2q_u64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vuzp2_f32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp2_f32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vuzp2q_f32:
+; CHECK-LABEL: test_vuzp2q_f32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -318,16 +316,15 @@ entry:
}
define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vuzp2q_f64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
-; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: test_vuzp2q_f64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp2_p8:
+; CHECK-LABEL: test_vuzp2_p8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -335,7 +332,7 @@ entry:
}
define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp2q_p8:
+; CHECK-LABEL: test_vuzp2q_p8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -343,7 +340,7 @@ entry:
}
define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp2_p16:
+; CHECK-LABEL: test_vuzp2_p16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -351,7 +348,7 @@ entry:
}
define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp2q_p16:
+; CHECK-LABEL: test_vuzp2q_p16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -359,7 +356,7 @@ entry:
}
define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip1_s8:
+; CHECK-LABEL: test_vzip1_s8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -367,7 +364,7 @@ entry:
}
define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip1q_s8:
+; CHECK-LABEL: test_vzip1q_s8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -375,7 +372,7 @@ entry:
}
define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip1_s16:
+; CHECK-LABEL: test_vzip1_s16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -383,7 +380,7 @@ entry:
}
define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip1q_s16:
+; CHECK-LABEL: test_vzip1q_s16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -391,15 +388,15 @@ entry:
}
define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip1_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vzip1_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzip1q_s32:
+; CHECK-LABEL: test_vzip1q_s32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -407,15 +404,15 @@ entry:
}
define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vzip1q_s64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vzip1q_s64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip1_u8:
+; CHECK-LABEL: test_vzip1_u8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -423,7 +420,7 @@ entry:
}
define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip1q_u8:
+; CHECK-LABEL: test_vzip1q_u8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -431,7 +428,7 @@ entry:
}
define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip1_u16:
+; CHECK-LABEL: test_vzip1_u16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -439,7 +436,7 @@ entry:
}
define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip1q_u16:
+; CHECK-LABEL: test_vzip1q_u16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -447,15 +444,15 @@ entry:
}
define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip1_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vzip1_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzip1q_u32:
+; CHECK-LABEL: test_vzip1q_u32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -463,23 +460,23 @@ entry:
}
define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vzip1q_u64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vzip1q_u64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vzip1_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vzip1_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vzip1q_f32:
+; CHECK-LABEL: test_vzip1q_f32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -487,15 +484,15 @@ entry:
}
define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vzip1q_f64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vzip1q_f64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip1_p8:
+; CHECK-LABEL: test_vzip1_p8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -503,7 +500,7 @@ entry:
}
define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip1q_p8:
+; CHECK-LABEL: test_vzip1q_p8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -511,7 +508,7 @@ entry:
}
define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip1_p16:
+; CHECK-LABEL: test_vzip1_p16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -519,7 +516,7 @@ entry:
}
define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip1q_p16:
+; CHECK-LABEL: test_vzip1q_p16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -527,7 +524,7 @@ entry:
}
define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip2_s8:
+; CHECK-LABEL: test_vzip2_s8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -535,7 +532,7 @@ entry:
}
define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip2q_s8:
+; CHECK-LABEL: test_vzip2q_s8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -543,7 +540,7 @@ entry:
}
define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip2_s16:
+; CHECK-LABEL: test_vzip2_s16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -551,7 +548,7 @@ entry:
}
define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip2q_s16:
+; CHECK-LABEL: test_vzip2q_s16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -559,15 +556,15 @@ entry:
}
define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip2_s32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip2_s32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzip2q_s32:
+; CHECK-LABEL: test_vzip2q_s32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -575,15 +572,15 @@ entry:
}
define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vzip2q_s64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vzip2q_s64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip2_u8:
+; CHECK-LABEL: test_vzip2_u8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -591,7 +588,7 @@ entry:
}
define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip2q_u8:
+; CHECK-LABEL: test_vzip2q_u8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -599,7 +596,7 @@ entry:
}
define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip2_u16:
+; CHECK-LABEL: test_vzip2_u16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -607,7 +604,7 @@ entry:
}
define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip2q_u16:
+; CHECK-LABEL: test_vzip2q_u16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -615,15 +612,15 @@ entry:
}
define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip2_u32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip2_u32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzip2q_u32:
+; CHECK-LABEL: test_vzip2q_u32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -631,23 +628,23 @@ entry:
}
define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vzip2q_u64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vzip2q_u64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vzip2_f32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip2_f32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vzip2q_f32:
+; CHECK-LABEL: test_vzip2q_f32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -655,15 +652,15 @@ entry:
}
define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vzip2q_f64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vzip2q_f64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip2_p8:
+; CHECK-LABEL: test_vzip2_p8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -671,7 +668,7 @@ entry:
}
define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip2q_p8:
+; CHECK-LABEL: test_vzip2q_p8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -679,7 +676,7 @@ entry:
}
define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip2_p16:
+; CHECK-LABEL: test_vzip2_p16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -687,7 +684,7 @@ entry:
}
define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip2q_p16:
+; CHECK-LABEL: test_vzip2q_p16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -695,7 +692,7 @@ entry:
}
define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn1_s8:
+; CHECK-LABEL: test_vtrn1_s8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -703,7 +700,7 @@ entry:
}
define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn1q_s8:
+; CHECK-LABEL: test_vtrn1q_s8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -711,7 +708,7 @@ entry:
}
define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn1_s16:
+; CHECK-LABEL: test_vtrn1_s16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -719,7 +716,7 @@ entry:
}
define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn1q_s16:
+; CHECK-LABEL: test_vtrn1q_s16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -727,15 +724,15 @@ entry:
}
define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn1_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vtrn1_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrn1q_s32:
+; CHECK-LABEL: test_vtrn1q_s32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -743,15 +740,15 @@ entry:
}
define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vtrn1q_s64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vtrn1q_s64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn1_u8:
+; CHECK-LABEL: test_vtrn1_u8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -759,7 +756,7 @@ entry:
}
define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn1q_u8:
+; CHECK-LABEL: test_vtrn1q_u8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -767,7 +764,7 @@ entry:
}
define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn1_u16:
+; CHECK-LABEL: test_vtrn1_u16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -775,7 +772,7 @@ entry:
}
define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn1q_u16:
+; CHECK-LABEL: test_vtrn1q_u16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -783,15 +780,15 @@ entry:
}
define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn1_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vtrn1_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrn1q_u32:
+; CHECK-LABEL: test_vtrn1q_u32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -799,23 +796,23 @@ entry:
}
define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vtrn1q_u64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vtrn1q_u64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vtrn1_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vtrn1_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vtrn1q_f32:
+; CHECK-LABEL: test_vtrn1q_f32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -823,15 +820,15 @@ entry:
}
define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vtrn1q_f64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vtrn1q_f64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn1_p8:
+; CHECK-LABEL: test_vtrn1_p8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -839,7 +836,7 @@ entry:
}
define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn1q_p8:
+; CHECK-LABEL: test_vtrn1q_p8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -847,7 +844,7 @@ entry:
}
define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn1_p16:
+; CHECK-LABEL: test_vtrn1_p16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -855,7 +852,7 @@ entry:
}
define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn1q_p16:
+; CHECK-LABEL: test_vtrn1q_p16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -863,7 +860,7 @@ entry:
}
define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn2_s8:
+; CHECK-LABEL: test_vtrn2_s8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -871,7 +868,7 @@ entry:
}
define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn2q_s8:
+; CHECK-LABEL: test_vtrn2q_s8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -879,7 +876,7 @@ entry:
}
define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn2_s16:
+; CHECK-LABEL: test_vtrn2_s16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -887,7 +884,7 @@ entry:
}
define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn2q_s16:
+; CHECK-LABEL: test_vtrn2q_s16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -895,15 +892,15 @@ entry:
}
define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn2_s32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn2_s32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrn2q_s32:
+; CHECK-LABEL: test_vtrn2q_s32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -911,15 +908,15 @@ entry:
}
define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vtrn2q_s64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vtrn2q_s64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn2_u8:
+; CHECK-LABEL: test_vtrn2_u8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -927,7 +924,7 @@ entry:
}
define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn2q_u8:
+; CHECK-LABEL: test_vtrn2q_u8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -935,7 +932,7 @@ entry:
}
define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn2_u16:
+; CHECK-LABEL: test_vtrn2_u16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -943,7 +940,7 @@ entry:
}
define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn2q_u16:
+; CHECK-LABEL: test_vtrn2q_u16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -951,15 +948,15 @@ entry:
}
define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn2_u32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn2_u32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrn2q_u32:
+; CHECK-LABEL: test_vtrn2q_u32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -967,23 +964,23 @@ entry:
}
define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vtrn2q_u64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vtrn2q_u64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vtrn2_f32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn2_f32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vtrn2q_f32:
+; CHECK-LABEL: test_vtrn2q_f32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -991,15 +988,15 @@ entry:
}
define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vtrn2q_f64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vtrn2q_f64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn2_p8:
+; CHECK-LABEL: test_vtrn2_p8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1007,7 +1004,7 @@ entry:
}
define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn2q_p8:
+; CHECK-LABEL: test_vtrn2q_p8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -1015,7 +1012,7 @@ entry:
}
define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn2_p16:
+; CHECK-LABEL: test_vtrn2_p16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1023,7 +1020,7 @@ entry:
}
define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn2q_p16:
+; CHECK-LABEL: test_vtrn2q_p16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1031,7 +1028,7 @@ entry:
}
define <8 x i8> @test_same_vuzp1_s8(<8 x i8> %a) {
-; CHECK: test_same_vuzp1_s8:
+; CHECK-LABEL: test_same_vuzp1_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1039,7 +1036,7 @@ entry:
}
define <16 x i8> @test_same_vuzp1q_s8(<16 x i8> %a) {
-; CHECK: test_same_vuzp1q_s8:
+; CHECK-LABEL: test_same_vuzp1q_s8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1047,7 +1044,7 @@ entry:
}
define <4 x i16> @test_same_vuzp1_s16(<4 x i16> %a) {
-; CHECK: test_same_vuzp1_s16:
+; CHECK-LABEL: test_same_vuzp1_s16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1055,7 +1052,7 @@ entry:
}
define <8 x i16> @test_same_vuzp1q_s16(<8 x i16> %a) {
-; CHECK: test_same_vuzp1q_s16:
+; CHECK-LABEL: test_same_vuzp1q_s16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1063,7 +1060,7 @@ entry:
}
define <4 x i32> @test_same_vuzp1q_s32(<4 x i32> %a) {
-; CHECK: test_same_vuzp1q_s32:
+; CHECK-LABEL: test_same_vuzp1q_s32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1071,7 +1068,7 @@ entry:
}
define <8 x i8> @test_same_vuzp1_u8(<8 x i8> %a) {
-; CHECK: test_same_vuzp1_u8:
+; CHECK-LABEL: test_same_vuzp1_u8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1079,7 +1076,7 @@ entry:
}
define <16 x i8> @test_same_vuzp1q_u8(<16 x i8> %a) {
-; CHECK: test_same_vuzp1q_u8:
+; CHECK-LABEL: test_same_vuzp1q_u8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1087,7 +1084,7 @@ entry:
}
define <4 x i16> @test_same_vuzp1_u16(<4 x i16> %a) {
-; CHECK: test_same_vuzp1_u16:
+; CHECK-LABEL: test_same_vuzp1_u16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1095,7 +1092,7 @@ entry:
}
define <8 x i16> @test_same_vuzp1q_u16(<8 x i16> %a) {
-; CHECK: test_same_vuzp1q_u16:
+; CHECK-LABEL: test_same_vuzp1q_u16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1103,7 +1100,7 @@ entry:
}
define <4 x i32> @test_same_vuzp1q_u32(<4 x i32> %a) {
-; CHECK: test_same_vuzp1q_u32:
+; CHECK-LABEL: test_same_vuzp1q_u32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1111,7 +1108,7 @@ entry:
}
define <4 x float> @test_same_vuzp1q_f32(<4 x float> %a) {
-; CHECK: test_same_vuzp1q_f32:
+; CHECK-LABEL: test_same_vuzp1q_f32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1119,7 +1116,7 @@ entry:
}
define <8 x i8> @test_same_vuzp1_p8(<8 x i8> %a) {
-; CHECK: test_same_vuzp1_p8:
+; CHECK-LABEL: test_same_vuzp1_p8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1127,7 +1124,7 @@ entry:
}
define <16 x i8> @test_same_vuzp1q_p8(<16 x i8> %a) {
-; CHECK: test_same_vuzp1q_p8:
+; CHECK-LABEL: test_same_vuzp1q_p8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1135,7 +1132,7 @@ entry:
}
define <4 x i16> @test_same_vuzp1_p16(<4 x i16> %a) {
-; CHECK: test_same_vuzp1_p16:
+; CHECK-LABEL: test_same_vuzp1_p16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1143,7 +1140,7 @@ entry:
}
define <8 x i16> @test_same_vuzp1q_p16(<8 x i16> %a) {
-; CHECK: test_same_vuzp1q_p16:
+; CHECK-LABEL: test_same_vuzp1q_p16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1151,7 +1148,7 @@ entry:
}
define <8 x i8> @test_same_vuzp2_s8(<8 x i8> %a) {
-; CHECK: test_same_vuzp2_s8:
+; CHECK-LABEL: test_same_vuzp2_s8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1159,7 +1156,7 @@ entry:
}
define <16 x i8> @test_same_vuzp2q_s8(<16 x i8> %a) {
-; CHECK: test_same_vuzp2q_s8:
+; CHECK-LABEL: test_same_vuzp2q_s8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1167,7 +1164,7 @@ entry:
}
define <4 x i16> @test_same_vuzp2_s16(<4 x i16> %a) {
-; CHECK: test_same_vuzp2_s16:
+; CHECK-LABEL: test_same_vuzp2_s16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1175,7 +1172,7 @@ entry:
}
define <8 x i16> @test_same_vuzp2q_s16(<8 x i16> %a) {
-; CHECK: test_same_vuzp2q_s16:
+; CHECK-LABEL: test_same_vuzp2q_s16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1183,7 +1180,7 @@ entry:
}
define <4 x i32> @test_same_vuzp2q_s32(<4 x i32> %a) {
-; CHECK: test_same_vuzp2q_s32:
+; CHECK-LABEL: test_same_vuzp2q_s32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1191,7 +1188,7 @@ entry:
}
define <8 x i8> @test_same_vuzp2_u8(<8 x i8> %a) {
-; CHECK: test_same_vuzp2_u8:
+; CHECK-LABEL: test_same_vuzp2_u8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1199,7 +1196,7 @@ entry:
}
define <16 x i8> @test_same_vuzp2q_u8(<16 x i8> %a) {
-; CHECK: test_same_vuzp2q_u8:
+; CHECK-LABEL: test_same_vuzp2q_u8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1207,7 +1204,7 @@ entry:
}
define <4 x i16> @test_same_vuzp2_u16(<4 x i16> %a) {
-; CHECK: test_same_vuzp2_u16:
+; CHECK-LABEL: test_same_vuzp2_u16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1215,7 +1212,7 @@ entry:
}
define <8 x i16> @test_same_vuzp2q_u16(<8 x i16> %a) {
-; CHECK: test_same_vuzp2q_u16:
+; CHECK-LABEL: test_same_vuzp2q_u16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1223,7 +1220,7 @@ entry:
}
define <4 x i32> @test_same_vuzp2q_u32(<4 x i32> %a) {
-; CHECK: test_same_vuzp2q_u32:
+; CHECK-LABEL: test_same_vuzp2q_u32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1231,7 +1228,7 @@ entry:
}
define <4 x float> @test_same_vuzp2q_f32(<4 x float> %a) {
-; CHECK: test_same_vuzp2q_f32:
+; CHECK-LABEL: test_same_vuzp2q_f32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1239,7 +1236,7 @@ entry:
}
define <8 x i8> @test_same_vuzp2_p8(<8 x i8> %a) {
-; CHECK: test_same_vuzp2_p8:
+; CHECK-LABEL: test_same_vuzp2_p8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1247,7 +1244,7 @@ entry:
}
define <16 x i8> @test_same_vuzp2q_p8(<16 x i8> %a) {
-; CHECK: test_same_vuzp2q_p8:
+; CHECK-LABEL: test_same_vuzp2q_p8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1255,7 +1252,7 @@ entry:
}
define <4 x i16> @test_same_vuzp2_p16(<4 x i16> %a) {
-; CHECK: test_same_vuzp2_p16:
+; CHECK-LABEL: test_same_vuzp2_p16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1263,7 +1260,7 @@ entry:
}
define <8 x i16> @test_same_vuzp2q_p16(<8 x i16> %a) {
-; CHECK: test_same_vuzp2q_p16:
+; CHECK-LABEL: test_same_vuzp2q_p16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1271,7 +1268,7 @@ entry:
}
define <8 x i8> @test_same_vzip1_s8(<8 x i8> %a) {
-; CHECK: test_same_vzip1_s8:
+; CHECK-LABEL: test_same_vzip1_s8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1279,7 +1276,7 @@ entry:
}
define <16 x i8> @test_same_vzip1q_s8(<16 x i8> %a) {
-; CHECK: test_same_vzip1q_s8:
+; CHECK-LABEL: test_same_vzip1q_s8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -1287,7 +1284,7 @@ entry:
}
define <4 x i16> @test_same_vzip1_s16(<4 x i16> %a) {
-; CHECK: test_same_vzip1_s16:
+; CHECK-LABEL: test_same_vzip1_s16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1295,7 +1292,7 @@ entry:
}
define <8 x i16> @test_same_vzip1q_s16(<8 x i16> %a) {
-; CHECK: test_same_vzip1q_s16:
+; CHECK-LABEL: test_same_vzip1q_s16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1303,7 +1300,7 @@ entry:
}
define <4 x i32> @test_same_vzip1q_s32(<4 x i32> %a) {
-; CHECK: test_same_vzip1q_s32:
+; CHECK-LABEL: test_same_vzip1q_s32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1311,7 +1308,7 @@ entry:
}
define <8 x i8> @test_same_vzip1_u8(<8 x i8> %a) {
-; CHECK: test_same_vzip1_u8:
+; CHECK-LABEL: test_same_vzip1_u8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1319,7 +1316,7 @@ entry:
}
define <16 x i8> @test_same_vzip1q_u8(<16 x i8> %a) {
-; CHECK: test_same_vzip1q_u8:
+; CHECK-LABEL: test_same_vzip1q_u8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -1327,7 +1324,7 @@ entry:
}
define <4 x i16> @test_same_vzip1_u16(<4 x i16> %a) {
-; CHECK: test_same_vzip1_u16:
+; CHECK-LABEL: test_same_vzip1_u16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1335,7 +1332,7 @@ entry:
}
define <8 x i16> @test_same_vzip1q_u16(<8 x i16> %a) {
-; CHECK: test_same_vzip1q_u16:
+; CHECK-LABEL: test_same_vzip1q_u16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1343,7 +1340,7 @@ entry:
}
define <4 x i32> @test_same_vzip1q_u32(<4 x i32> %a) {
-; CHECK: test_same_vzip1q_u32:
+; CHECK-LABEL: test_same_vzip1q_u32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1351,7 +1348,7 @@ entry:
}
define <4 x float> @test_same_vzip1q_f32(<4 x float> %a) {
-; CHECK: test_same_vzip1q_f32:
+; CHECK-LABEL: test_same_vzip1q_f32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1359,7 +1356,7 @@ entry:
}
define <8 x i8> @test_same_vzip1_p8(<8 x i8> %a) {
-; CHECK: test_same_vzip1_p8:
+; CHECK-LABEL: test_same_vzip1_p8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1367,7 +1364,7 @@ entry:
}
define <16 x i8> @test_same_vzip1q_p8(<16 x i8> %a) {
-; CHECK: test_same_vzip1q_p8:
+; CHECK-LABEL: test_same_vzip1q_p8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -1375,7 +1372,7 @@ entry:
}
define <4 x i16> @test_same_vzip1_p16(<4 x i16> %a) {
-; CHECK: test_same_vzip1_p16:
+; CHECK-LABEL: test_same_vzip1_p16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1383,7 +1380,7 @@ entry:
}
define <8 x i16> @test_same_vzip1q_p16(<8 x i16> %a) {
-; CHECK: test_same_vzip1q_p16:
+; CHECK-LABEL: test_same_vzip1q_p16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1391,7 +1388,7 @@ entry:
}
define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) {
-; CHECK: test_same_vzip2_s8:
+; CHECK-LABEL: test_same_vzip2_s8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1399,7 +1396,7 @@ entry:
}
define <16 x i8> @test_same_vzip2q_s8(<16 x i8> %a) {
-; CHECK: test_same_vzip2q_s8:
+; CHECK-LABEL: test_same_vzip2q_s8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -1407,7 +1404,7 @@ entry:
}
define <4 x i16> @test_same_vzip2_s16(<4 x i16> %a) {
-; CHECK: test_same_vzip2_s16:
+; CHECK-LABEL: test_same_vzip2_s16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1415,7 +1412,7 @@ entry:
}
define <8 x i16> @test_same_vzip2q_s16(<8 x i16> %a) {
-; CHECK: test_same_vzip2q_s16:
+; CHECK-LABEL: test_same_vzip2q_s16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1423,7 +1420,7 @@ entry:
}
define <4 x i32> @test_same_vzip2q_s32(<4 x i32> %a) {
-; CHECK: test_same_vzip2q_s32:
+; CHECK-LABEL: test_same_vzip2q_s32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1431,7 +1428,7 @@ entry:
}
define <8 x i8> @test_same_vzip2_u8(<8 x i8> %a) {
-; CHECK: test_same_vzip2_u8:
+; CHECK-LABEL: test_same_vzip2_u8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1439,7 +1436,7 @@ entry:
}
define <16 x i8> @test_same_vzip2q_u8(<16 x i8> %a) {
-; CHECK: test_same_vzip2q_u8:
+; CHECK-LABEL: test_same_vzip2q_u8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -1447,7 +1444,7 @@ entry:
}
define <4 x i16> @test_same_vzip2_u16(<4 x i16> %a) {
-; CHECK: test_same_vzip2_u16:
+; CHECK-LABEL: test_same_vzip2_u16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1455,7 +1452,7 @@ entry:
}
define <8 x i16> @test_same_vzip2q_u16(<8 x i16> %a) {
-; CHECK: test_same_vzip2q_u16:
+; CHECK-LABEL: test_same_vzip2q_u16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1463,7 +1460,7 @@ entry:
}
define <4 x i32> @test_same_vzip2q_u32(<4 x i32> %a) {
-; CHECK: test_same_vzip2q_u32:
+; CHECK-LABEL: test_same_vzip2q_u32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1471,7 +1468,7 @@ entry:
}
define <4 x float> @test_same_vzip2q_f32(<4 x float> %a) {
-; CHECK: test_same_vzip2q_f32:
+; CHECK-LABEL: test_same_vzip2q_f32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1479,7 +1476,7 @@ entry:
}
define <8 x i8> @test_same_vzip2_p8(<8 x i8> %a) {
-; CHECK: test_same_vzip2_p8:
+; CHECK-LABEL: test_same_vzip2_p8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1487,7 +1484,7 @@ entry:
}
define <16 x i8> @test_same_vzip2q_p8(<16 x i8> %a) {
-; CHECK: test_same_vzip2q_p8:
+; CHECK-LABEL: test_same_vzip2q_p8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -1495,7 +1492,7 @@ entry:
}
define <4 x i16> @test_same_vzip2_p16(<4 x i16> %a) {
-; CHECK: test_same_vzip2_p16:
+; CHECK-LABEL: test_same_vzip2_p16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1503,7 +1500,7 @@ entry:
}
define <8 x i16> @test_same_vzip2q_p16(<8 x i16> %a) {
-; CHECK: test_same_vzip2q_p16:
+; CHECK-LABEL: test_same_vzip2q_p16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1511,7 +1508,7 @@ entry:
}
define <8 x i8> @test_same_vtrn1_s8(<8 x i8> %a) {
-; CHECK: test_same_vtrn1_s8:
+; CHECK-LABEL: test_same_vtrn1_s8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1519,7 +1516,7 @@ entry:
}
define <16 x i8> @test_same_vtrn1q_s8(<16 x i8> %a) {
-; CHECK: test_same_vtrn1q_s8:
+; CHECK-LABEL: test_same_vtrn1q_s8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -1527,7 +1524,7 @@ entry:
}
define <4 x i16> @test_same_vtrn1_s16(<4 x i16> %a) {
-; CHECK: test_same_vtrn1_s16:
+; CHECK-LABEL: test_same_vtrn1_s16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1535,7 +1532,7 @@ entry:
}
define <8 x i16> @test_same_vtrn1q_s16(<8 x i16> %a) {
-; CHECK: test_same_vtrn1q_s16:
+; CHECK-LABEL: test_same_vtrn1q_s16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1543,7 +1540,7 @@ entry:
}
define <4 x i32> @test_same_vtrn1q_s32(<4 x i32> %a) {
-; CHECK: test_same_vtrn1q_s32:
+; CHECK-LABEL: test_same_vtrn1q_s32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1551,7 +1548,7 @@ entry:
}
define <8 x i8> @test_same_vtrn1_u8(<8 x i8> %a) {
-; CHECK: test_same_vtrn1_u8:
+; CHECK-LABEL: test_same_vtrn1_u8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1559,7 +1556,7 @@ entry:
}
define <16 x i8> @test_same_vtrn1q_u8(<16 x i8> %a) {
-; CHECK: test_same_vtrn1q_u8:
+; CHECK-LABEL: test_same_vtrn1q_u8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -1567,7 +1564,7 @@ entry:
}
define <4 x i16> @test_same_vtrn1_u16(<4 x i16> %a) {
-; CHECK: test_same_vtrn1_u16:
+; CHECK-LABEL: test_same_vtrn1_u16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1575,7 +1572,7 @@ entry:
}
define <8 x i16> @test_same_vtrn1q_u16(<8 x i16> %a) {
-; CHECK: test_same_vtrn1q_u16:
+; CHECK-LABEL: test_same_vtrn1q_u16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1583,7 +1580,7 @@ entry:
}
define <4 x i32> @test_same_vtrn1q_u32(<4 x i32> %a) {
-; CHECK: test_same_vtrn1q_u32:
+; CHECK-LABEL: test_same_vtrn1q_u32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1591,7 +1588,7 @@ entry:
}
define <4 x float> @test_same_vtrn1q_f32(<4 x float> %a) {
-; CHECK: test_same_vtrn1q_f32:
+; CHECK-LABEL: test_same_vtrn1q_f32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1599,7 +1596,7 @@ entry:
}
define <8 x i8> @test_same_vtrn1_p8(<8 x i8> %a) {
-; CHECK: test_same_vtrn1_p8:
+; CHECK-LABEL: test_same_vtrn1_p8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1607,7 +1604,7 @@ entry:
}
define <16 x i8> @test_same_vtrn1q_p8(<16 x i8> %a) {
-; CHECK: test_same_vtrn1q_p8:
+; CHECK-LABEL: test_same_vtrn1q_p8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -1615,7 +1612,7 @@ entry:
}
define <4 x i16> @test_same_vtrn1_p16(<4 x i16> %a) {
-; CHECK: test_same_vtrn1_p16:
+; CHECK-LABEL: test_same_vtrn1_p16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1623,7 +1620,7 @@ entry:
}
define <8 x i16> @test_same_vtrn1q_p16(<8 x i16> %a) {
-; CHECK: test_same_vtrn1q_p16:
+; CHECK-LABEL: test_same_vtrn1q_p16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1631,7 +1628,7 @@ entry:
}
define <8 x i8> @test_same_vtrn2_s8(<8 x i8> %a) {
-; CHECK: test_same_vtrn2_s8:
+; CHECK-LABEL: test_same_vtrn2_s8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1639,7 +1636,7 @@ entry:
}
define <16 x i8> @test_same_vtrn2q_s8(<16 x i8> %a) {
-; CHECK: test_same_vtrn2q_s8:
+; CHECK-LABEL: test_same_vtrn2q_s8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -1647,7 +1644,7 @@ entry:
}
define <4 x i16> @test_same_vtrn2_s16(<4 x i16> %a) {
-; CHECK: test_same_vtrn2_s16:
+; CHECK-LABEL: test_same_vtrn2_s16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1655,7 +1652,7 @@ entry:
}
define <8 x i16> @test_same_vtrn2q_s16(<8 x i16> %a) {
-; CHECK: test_same_vtrn2q_s16:
+; CHECK-LABEL: test_same_vtrn2q_s16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1663,7 +1660,7 @@ entry:
}
define <4 x i32> @test_same_vtrn2q_s32(<4 x i32> %a) {
-; CHECK: test_same_vtrn2q_s32:
+; CHECK-LABEL: test_same_vtrn2q_s32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1671,7 +1668,7 @@ entry:
}
define <8 x i8> @test_same_vtrn2_u8(<8 x i8> %a) {
-; CHECK: test_same_vtrn2_u8:
+; CHECK-LABEL: test_same_vtrn2_u8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1679,7 +1676,7 @@ entry:
}
define <16 x i8> @test_same_vtrn2q_u8(<16 x i8> %a) {
-; CHECK: test_same_vtrn2q_u8:
+; CHECK-LABEL: test_same_vtrn2q_u8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -1687,7 +1684,7 @@ entry:
}
define <4 x i16> @test_same_vtrn2_u16(<4 x i16> %a) {
-; CHECK: test_same_vtrn2_u16:
+; CHECK-LABEL: test_same_vtrn2_u16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1695,7 +1692,7 @@ entry:
}
define <8 x i16> @test_same_vtrn2q_u16(<8 x i16> %a) {
-; CHECK: test_same_vtrn2q_u16:
+; CHECK-LABEL: test_same_vtrn2q_u16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1703,7 +1700,7 @@ entry:
}
define <4 x i32> @test_same_vtrn2q_u32(<4 x i32> %a) {
-; CHECK: test_same_vtrn2q_u32:
+; CHECK-LABEL: test_same_vtrn2q_u32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1711,7 +1708,7 @@ entry:
}
define <4 x float> @test_same_vtrn2q_f32(<4 x float> %a) {
-; CHECK: test_same_vtrn2q_f32:
+; CHECK-LABEL: test_same_vtrn2q_f32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1719,7 +1716,7 @@ entry:
}
define <8 x i8> @test_same_vtrn2_p8(<8 x i8> %a) {
-; CHECK: test_same_vtrn2_p8:
+; CHECK-LABEL: test_same_vtrn2_p8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1727,7 +1724,7 @@ entry:
}
define <16 x i8> @test_same_vtrn2q_p8(<16 x i8> %a) {
-; CHECK: test_same_vtrn2q_p8:
+; CHECK-LABEL: test_same_vtrn2q_p8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -1735,7 +1732,7 @@ entry:
}
define <4 x i16> @test_same_vtrn2_p16(<4 x i16> %a) {
-; CHECK: test_same_vtrn2_p16:
+; CHECK-LABEL: test_same_vtrn2_p16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1743,7 +1740,7 @@ entry:
}
define <8 x i16> @test_same_vtrn2q_p16(<8 x i16> %a) {
-; CHECK: test_same_vtrn2q_p16:
+; CHECK-LABEL: test_same_vtrn2q_p16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1752,7 +1749,7 @@ entry:
define <8 x i8> @test_undef_vuzp1_s8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp1_s8:
+; CHECK-LABEL: test_undef_vuzp1_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1760,7 +1757,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp1q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp1q_s8:
+; CHECK-LABEL: test_undef_vuzp1q_s8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1768,7 +1765,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp1_s16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp1_s16:
+; CHECK-LABEL: test_undef_vuzp1_s16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1776,7 +1773,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp1q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp1q_s16:
+; CHECK-LABEL: test_undef_vuzp1q_s16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1784,7 +1781,7 @@ entry:
}
define <4 x i32> @test_undef_vuzp1q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vuzp1q_s32:
+; CHECK-LABEL: test_undef_vuzp1q_s32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1792,7 +1789,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp1_u8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp1_u8:
+; CHECK-LABEL: test_undef_vuzp1_u8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1800,7 +1797,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp1q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp1q_u8:
+; CHECK-LABEL: test_undef_vuzp1q_u8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1808,7 +1805,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp1_u16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp1_u16:
+; CHECK-LABEL: test_undef_vuzp1_u16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1816,7 +1813,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp1q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp1q_u16:
+; CHECK-LABEL: test_undef_vuzp1q_u16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1824,7 +1821,7 @@ entry:
}
define <4 x i32> @test_undef_vuzp1q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vuzp1q_u32:
+; CHECK-LABEL: test_undef_vuzp1q_u32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1832,7 +1829,7 @@ entry:
}
define <4 x float> @test_undef_vuzp1q_f32(<4 x float> %a) {
-; CHECK: test_undef_vuzp1q_f32:
+; CHECK-LABEL: test_undef_vuzp1q_f32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1840,7 +1837,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp1_p8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp1_p8:
+; CHECK-LABEL: test_undef_vuzp1_p8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1848,7 +1845,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp1q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp1q_p8:
+; CHECK-LABEL: test_undef_vuzp1q_p8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1856,7 +1853,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp1_p16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp1_p16:
+; CHECK-LABEL: test_undef_vuzp1_p16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1864,7 +1861,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp1q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp1q_p16:
+; CHECK-LABEL: test_undef_vuzp1q_p16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1872,7 +1869,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp2_s8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp2_s8:
+; CHECK-LABEL: test_undef_vuzp2_s8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1880,7 +1877,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp2q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp2q_s8:
+; CHECK-LABEL: test_undef_vuzp2q_s8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1888,7 +1885,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp2_s16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp2_s16:
+; CHECK-LABEL: test_undef_vuzp2_s16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1896,7 +1893,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp2q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp2q_s16:
+; CHECK-LABEL: test_undef_vuzp2q_s16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1904,7 +1901,7 @@ entry:
}
define <4 x i32> @test_undef_vuzp2q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vuzp2q_s32:
+; CHECK-LABEL: test_undef_vuzp2q_s32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1912,7 +1909,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp2_u8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp2_u8:
+; CHECK-LABEL: test_undef_vuzp2_u8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1920,7 +1917,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp2q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp2q_u8:
+; CHECK-LABEL: test_undef_vuzp2q_u8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1928,7 +1925,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp2_u16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp2_u16:
+; CHECK-LABEL: test_undef_vuzp2_u16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1936,7 +1933,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp2q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp2q_u16:
+; CHECK-LABEL: test_undef_vuzp2q_u16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1944,7 +1941,7 @@ entry:
}
define <4 x i32> @test_undef_vuzp2q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vuzp2q_u32:
+; CHECK-LABEL: test_undef_vuzp2q_u32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1952,7 +1949,7 @@ entry:
}
define <4 x float> @test_undef_vuzp2q_f32(<4 x float> %a) {
-; CHECK: test_undef_vuzp2q_f32:
+; CHECK-LABEL: test_undef_vuzp2q_f32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1960,7 +1957,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp2_p8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp2_p8:
+; CHECK-LABEL: test_undef_vuzp2_p8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1968,7 +1965,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp2q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp2q_p8:
+; CHECK-LABEL: test_undef_vuzp2q_p8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1976,7 +1973,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp2_p16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp2_p16:
+; CHECK-LABEL: test_undef_vuzp2_p16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1984,7 +1981,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp2q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp2q_p16:
+; CHECK-LABEL: test_undef_vuzp2q_p16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1992,7 +1989,7 @@ entry:
}
define <8 x i8> @test_undef_vzip1_s8(<8 x i8> %a) {
-; CHECK: test_undef_vzip1_s8:
+; CHECK-LABEL: test_undef_vzip1_s8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2000,7 +1997,7 @@ entry:
}
define <16 x i8> @test_undef_vzip1q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vzip1q_s8:
+; CHECK-LABEL: test_undef_vzip1q_s8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -2008,7 +2005,7 @@ entry:
}
define <4 x i16> @test_undef_vzip1_s16(<4 x i16> %a) {
-; CHECK: test_undef_vzip1_s16:
+; CHECK-LABEL: test_undef_vzip1_s16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2016,7 +2013,7 @@ entry:
}
define <8 x i16> @test_undef_vzip1q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vzip1q_s16:
+; CHECK-LABEL: test_undef_vzip1q_s16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2024,7 +2021,7 @@ entry:
}
define <4 x i32> @test_undef_vzip1q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vzip1q_s32:
+; CHECK-LABEL: test_undef_vzip1q_s32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2032,7 +2029,7 @@ entry:
}
define <8 x i8> @test_undef_vzip1_u8(<8 x i8> %a) {
-; CHECK: test_undef_vzip1_u8:
+; CHECK-LABEL: test_undef_vzip1_u8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2040,7 +2037,7 @@ entry:
}
define <16 x i8> @test_undef_vzip1q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vzip1q_u8:
+; CHECK-LABEL: test_undef_vzip1q_u8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -2048,7 +2045,7 @@ entry:
}
define <4 x i16> @test_undef_vzip1_u16(<4 x i16> %a) {
-; CHECK: test_undef_vzip1_u16:
+; CHECK-LABEL: test_undef_vzip1_u16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2056,7 +2053,7 @@ entry:
}
define <8 x i16> @test_undef_vzip1q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vzip1q_u16:
+; CHECK-LABEL: test_undef_vzip1q_u16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2064,7 +2061,7 @@ entry:
}
define <4 x i32> @test_undef_vzip1q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vzip1q_u32:
+; CHECK-LABEL: test_undef_vzip1q_u32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2072,7 +2069,7 @@ entry:
}
define <4 x float> @test_undef_vzip1q_f32(<4 x float> %a) {
-; CHECK: test_undef_vzip1q_f32:
+; CHECK-LABEL: test_undef_vzip1q_f32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2080,7 +2077,7 @@ entry:
}
define <8 x i8> @test_undef_vzip1_p8(<8 x i8> %a) {
-; CHECK: test_undef_vzip1_p8:
+; CHECK-LABEL: test_undef_vzip1_p8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2088,7 +2085,7 @@ entry:
}
define <16 x i8> @test_undef_vzip1q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vzip1q_p8:
+; CHECK-LABEL: test_undef_vzip1q_p8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -2096,7 +2093,7 @@ entry:
}
define <4 x i16> @test_undef_vzip1_p16(<4 x i16> %a) {
-; CHECK: test_undef_vzip1_p16:
+; CHECK-LABEL: test_undef_vzip1_p16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2104,7 +2101,7 @@ entry:
}
define <8 x i16> @test_undef_vzip1q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vzip1q_p16:
+; CHECK-LABEL: test_undef_vzip1q_p16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2112,7 +2109,7 @@ entry:
}
define <8 x i8> @test_undef_vzip2_s8(<8 x i8> %a) {
-; CHECK: test_undef_vzip2_s8:
+; CHECK-LABEL: test_undef_vzip2_s8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2120,7 +2117,7 @@ entry:
}
define <16 x i8> @test_undef_vzip2q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vzip2q_s8:
+; CHECK-LABEL: test_undef_vzip2q_s8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -2128,7 +2125,7 @@ entry:
}
define <4 x i16> @test_undef_vzip2_s16(<4 x i16> %a) {
-; CHECK: test_undef_vzip2_s16:
+; CHECK-LABEL: test_undef_vzip2_s16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2136,7 +2133,7 @@ entry:
}
define <8 x i16> @test_undef_vzip2q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vzip2q_s16:
+; CHECK-LABEL: test_undef_vzip2q_s16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2144,7 +2141,7 @@ entry:
}
define <4 x i32> @test_undef_vzip2q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vzip2q_s32:
+; CHECK-LABEL: test_undef_vzip2q_s32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2152,7 +2149,7 @@ entry:
}
define <8 x i8> @test_undef_vzip2_u8(<8 x i8> %a) {
-; CHECK: test_undef_vzip2_u8:
+; CHECK-LABEL: test_undef_vzip2_u8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2160,7 +2157,7 @@ entry:
}
define <16 x i8> @test_undef_vzip2q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vzip2q_u8:
+; CHECK-LABEL: test_undef_vzip2q_u8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -2168,7 +2165,7 @@ entry:
}
define <4 x i16> @test_undef_vzip2_u16(<4 x i16> %a) {
-; CHECK: test_undef_vzip2_u16:
+; CHECK-LABEL: test_undef_vzip2_u16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2176,7 +2173,7 @@ entry:
}
define <8 x i16> @test_undef_vzip2q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vzip2q_u16:
+; CHECK-LABEL: test_undef_vzip2q_u16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2184,7 +2181,7 @@ entry:
}
define <4 x i32> @test_undef_vzip2q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vzip2q_u32:
+; CHECK-LABEL: test_undef_vzip2q_u32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2192,7 +2189,7 @@ entry:
}
define <4 x float> @test_undef_vzip2q_f32(<4 x float> %a) {
-; CHECK: test_undef_vzip2q_f32:
+; CHECK-LABEL: test_undef_vzip2q_f32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2200,7 +2197,7 @@ entry:
}
define <8 x i8> @test_undef_vzip2_p8(<8 x i8> %a) {
-; CHECK: test_undef_vzip2_p8:
+; CHECK-LABEL: test_undef_vzip2_p8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2208,7 +2205,7 @@ entry:
}
define <16 x i8> @test_undef_vzip2q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vzip2q_p8:
+; CHECK-LABEL: test_undef_vzip2q_p8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -2216,7 +2213,7 @@ entry:
}
define <4 x i16> @test_undef_vzip2_p16(<4 x i16> %a) {
-; CHECK: test_undef_vzip2_p16:
+; CHECK-LABEL: test_undef_vzip2_p16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2224,7 +2221,7 @@ entry:
}
define <8 x i16> @test_undef_vzip2q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vzip2q_p16:
+; CHECK-LABEL: test_undef_vzip2q_p16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2232,7 +2229,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn1_s8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn1_s8:
+; CHECK-LABEL: test_undef_vtrn1_s8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2240,7 +2237,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn1q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn1q_s8:
+; CHECK-LABEL: test_undef_vtrn1q_s8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -2248,7 +2245,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn1_s16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn1_s16:
+; CHECK-LABEL: test_undef_vtrn1_s16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2256,7 +2253,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn1q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn1q_s16:
+; CHECK-LABEL: test_undef_vtrn1q_s16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2264,7 +2261,7 @@ entry:
}
define <4 x i32> @test_undef_vtrn1q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vtrn1q_s32:
+; CHECK-LABEL: test_undef_vtrn1q_s32:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2272,7 +2269,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn1_u8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn1_u8:
+; CHECK-LABEL: test_undef_vtrn1_u8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2280,7 +2277,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn1q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn1q_u8:
+; CHECK-LABEL: test_undef_vtrn1q_u8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -2288,7 +2285,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn1_u16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn1_u16:
+; CHECK-LABEL: test_undef_vtrn1_u16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2296,7 +2293,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn1q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn1q_u16:
+; CHECK-LABEL: test_undef_vtrn1q_u16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2304,7 +2301,7 @@ entry:
}
define <4 x i32> @test_undef_vtrn1q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vtrn1q_u32:
+; CHECK-LABEL: test_undef_vtrn1q_u32:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2312,7 +2309,7 @@ entry:
}
define <4 x float> @test_undef_vtrn1q_f32(<4 x float> %a) {
-; CHECK: test_undef_vtrn1q_f32:
+; CHECK-LABEL: test_undef_vtrn1q_f32:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2320,7 +2317,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn1_p8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn1_p8:
+; CHECK-LABEL: test_undef_vtrn1_p8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2328,7 +2325,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn1q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn1q_p8:
+; CHECK-LABEL: test_undef_vtrn1q_p8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -2336,7 +2333,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn1_p16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn1_p16:
+; CHECK-LABEL: test_undef_vtrn1_p16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2344,7 +2341,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn1q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn1q_p16:
+; CHECK-LABEL: test_undef_vtrn1q_p16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2352,7 +2349,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn2_s8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn2_s8:
+; CHECK-LABEL: test_undef_vtrn2_s8:
; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2360,7 +2357,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn2q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn2q_s8:
+; CHECK-LABEL: test_undef_vtrn2q_s8:
; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -2368,7 +2365,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn2_s16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn2_s16:
+; CHECK-LABEL: test_undef_vtrn2_s16:
; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2376,7 +2373,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn2q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn2q_s16:
+; CHECK-LABEL: test_undef_vtrn2q_s16:
; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2384,7 +2381,7 @@ entry:
}
define <4 x i32> @test_undef_vtrn2q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vtrn2q_s32:
+; CHECK-LABEL: test_undef_vtrn2q_s32:
; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2392,7 +2389,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn2_u8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn2_u8:
+; CHECK-LABEL: test_undef_vtrn2_u8:
; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2400,7 +2397,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn2q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn2q_u8:
+; CHECK-LABEL: test_undef_vtrn2q_u8:
; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -2408,7 +2405,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn2_u16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn2_u16:
+; CHECK-LABEL: test_undef_vtrn2_u16:
; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2416,7 +2413,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn2q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn2q_u16:
+; CHECK-LABEL: test_undef_vtrn2q_u16:
; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2424,7 +2421,7 @@ entry:
}
define <4 x i32> @test_undef_vtrn2q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vtrn2q_u32:
+; CHECK-LABEL: test_undef_vtrn2q_u32:
; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2432,7 +2429,7 @@ entry:
}
define <4 x float> @test_undef_vtrn2q_f32(<4 x float> %a) {
-; CHECK: test_undef_vtrn2q_f32:
+; CHECK-LABEL: test_undef_vtrn2q_f32:
; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2440,7 +2437,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn2_p8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn2_p8:
+; CHECK-LABEL: test_undef_vtrn2_p8:
; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2448,7 +2445,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn2q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn2q_p8:
+; CHECK-LABEL: test_undef_vtrn2q_p8:
; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -2456,7 +2453,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn2_p16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn2_p16:
+; CHECK-LABEL: test_undef_vtrn2_p16:
; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2464,7 +2461,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn2q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn2q_p16:
+; CHECK-LABEL: test_undef_vtrn2q_p16:
; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2472,7 +2469,7 @@ entry:
}
define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp_s8:
+; CHECK-LABEL: test_vuzp_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2484,7 +2481,7 @@ entry:
}
define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp_s16:
+; CHECK-LABEL: test_vuzp_s16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2496,9 +2493,9 @@ entry:
}
define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2508,7 +2505,7 @@ entry:
}
define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp_u8:
+; CHECK-LABEL: test_vuzp_u8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2520,7 +2517,7 @@ entry:
}
define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp_u16:
+; CHECK-LABEL: test_vuzp_u16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2532,9 +2529,9 @@ entry:
}
define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2544,9 +2541,9 @@ entry:
}
define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vuzp_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vuzp.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
%vuzp1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
@@ -2556,7 +2553,7 @@ entry:
}
define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp_p8:
+; CHECK-LABEL: test_vuzp_p8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2568,7 +2565,7 @@ entry:
}
define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp_p16:
+; CHECK-LABEL: test_vuzp_p16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2580,7 +2577,7 @@ entry:
}
define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzpq_s8:
+; CHECK-LABEL: test_vuzpq_s8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2592,7 +2589,7 @@ entry:
}
define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzpq_s16:
+; CHECK-LABEL: test_vuzpq_s16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2604,7 +2601,7 @@ entry:
}
define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzpq_s32:
+; CHECK-LABEL: test_vuzpq_s32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2616,7 +2613,7 @@ entry:
}
define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzpq_u8:
+; CHECK-LABEL: test_vuzpq_u8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2628,7 +2625,7 @@ entry:
}
define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzpq_u16:
+; CHECK-LABEL: test_vuzpq_u16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2640,7 +2637,7 @@ entry:
}
define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzpq_u32:
+; CHECK-LABEL: test_vuzpq_u32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2652,7 +2649,7 @@ entry:
}
define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vuzpq_f32:
+; CHECK-LABEL: test_vuzpq_f32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2664,7 +2661,7 @@ entry:
}
define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzpq_p8:
+; CHECK-LABEL: test_vuzpq_p8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2676,7 +2673,7 @@ entry:
}
define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzpq_p16:
+; CHECK-LABEL: test_vuzpq_p16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2688,7 +2685,7 @@ entry:
}
define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip_s8:
+; CHECK-LABEL: test_vzip_s8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2700,7 +2697,7 @@ entry:
}
define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip_s16:
+; CHECK-LABEL: test_vzip_s16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2712,9 +2709,9 @@ entry:
}
define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2724,7 +2721,7 @@ entry:
}
define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip_u8:
+; CHECK-LABEL: test_vzip_u8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2736,7 +2733,7 @@ entry:
}
define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip_u16:
+; CHECK-LABEL: test_vzip_u16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2748,9 +2745,9 @@ entry:
}
define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2760,9 +2757,9 @@ entry:
}
define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vzip_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vzip.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
%vzip1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
@@ -2772,7 +2769,7 @@ entry:
}
define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip_p8:
+; CHECK-LABEL: test_vzip_p8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2784,7 +2781,7 @@ entry:
}
define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip_p16:
+; CHECK-LABEL: test_vzip_p16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2796,7 +2793,7 @@ entry:
}
define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzipq_s8:
+; CHECK-LABEL: test_vzipq_s8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2808,7 +2805,7 @@ entry:
}
define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzipq_s16:
+; CHECK-LABEL: test_vzipq_s16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2820,7 +2817,7 @@ entry:
}
define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzipq_s32:
+; CHECK-LABEL: test_vzipq_s32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2832,7 +2829,7 @@ entry:
}
define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzipq_u8:
+; CHECK-LABEL: test_vzipq_u8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2844,7 +2841,7 @@ entry:
}
define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzipq_u16:
+; CHECK-LABEL: test_vzipq_u16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2856,7 +2853,7 @@ entry:
}
define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzipq_u32:
+; CHECK-LABEL: test_vzipq_u32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2868,7 +2865,7 @@ entry:
}
define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vzipq_f32:
+; CHECK-LABEL: test_vzipq_f32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2880,7 +2877,7 @@ entry:
}
define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzipq_p8:
+; CHECK-LABEL: test_vzipq_p8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2892,7 +2889,7 @@ entry:
}
define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzipq_p16:
+; CHECK-LABEL: test_vzipq_p16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2904,7 +2901,7 @@ entry:
}
define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn_s8:
+; CHECK-LABEL: test_vtrn_s8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2916,7 +2913,7 @@ entry:
}
define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn_s16:
+; CHECK-LABEL: test_vtrn_s16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2928,9 +2925,9 @@ entry:
}
define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2940,7 +2937,7 @@ entry:
}
define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn_u8:
+; CHECK-LABEL: test_vtrn_u8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2952,7 +2949,7 @@ entry:
}
define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn_u16:
+; CHECK-LABEL: test_vtrn_u16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2964,9 +2961,9 @@ entry:
}
define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2976,9 +2973,9 @@ entry:
}
define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vtrn_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vtrn.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
%vtrn1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
@@ -2988,7 +2985,7 @@ entry:
}
define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn_p8:
+; CHECK-LABEL: test_vtrn_p8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -3000,7 +2997,7 @@ entry:
}
define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn_p16:
+; CHECK-LABEL: test_vtrn_p16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -3012,7 +3009,7 @@ entry:
}
define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrnq_s8:
+; CHECK-LABEL: test_vtrnq_s8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -3024,7 +3021,7 @@ entry:
}
define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrnq_s16:
+; CHECK-LABEL: test_vtrnq_s16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -3036,7 +3033,7 @@ entry:
}
define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrnq_s32:
+; CHECK-LABEL: test_vtrnq_s32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -3048,7 +3045,7 @@ entry:
}
define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrnq_u8:
+; CHECK-LABEL: test_vtrnq_u8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -3060,7 +3057,7 @@ entry:
}
define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrnq_u16:
+; CHECK-LABEL: test_vtrnq_u16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -3072,7 +3069,7 @@ entry:
}
define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrnq_u32:
+; CHECK-LABEL: test_vtrnq_u32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -3084,7 +3081,7 @@ entry:
}
define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vtrnq_f32:
+; CHECK-LABEL: test_vtrnq_f32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -3096,7 +3093,7 @@ entry:
}
define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrnq_p8:
+; CHECK-LABEL: test_vtrnq_p8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -3108,7 +3105,7 @@ entry:
}
define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrnq_p16:
+; CHECK-LABEL: test_vtrnq_p16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -3120,7 +3117,7 @@ entry:
}
define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
-; CHECK: test_uzp:
+; CHECK-LABEL: test_uzp:
%vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -3128,7 +3125,4 @@ define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
%.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
ret %struct.uint8x8x2_t %.fca.0.1.insert
-; CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NEXT: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
diff --git a/test/CodeGen/AArch64/neon-rounding-halving-add.ll b/test/CodeGen/AArch64/neon-rounding-halving-add.ll
deleted file mode 100644
index 009da3b..0000000
--- a/test/CodeGen/AArch64/neon-rounding-halving-add.ll
+++ /dev/null
@@ -1,105 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_urhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_urhadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: urhadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_srhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_srhadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: srhadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_urhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_urhadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: urhadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_srhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_srhadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: srhadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_urhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_urhadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: urhadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_srhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_srhadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: srhadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_urhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_urhadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: urhadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_srhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_srhadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: srhadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_urhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_urhadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: urhadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_srhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_srhadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: srhadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_urhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_urhadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: urhadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_srhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_srhadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: srhadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-
diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll
deleted file mode 100644
index 5b4ec28..0000000
--- a/test/CodeGen/AArch64/neon-rounding-shift.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_urshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_urshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: urshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_srshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_srshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: srshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_urshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_urshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: urshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_srshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_srshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: srshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_urshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_urshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: urshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_srshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_srshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: srshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_urshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_urshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: urshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_srshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_srshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: srshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_urshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_urshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: urshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_srshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_srshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: srshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_urshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_urshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: urshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_srshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: srshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_urshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_urshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: urshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_srshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_srshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: srshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll
deleted file mode 100644
index fc60d90..0000000
--- a/test/CodeGen/AArch64/neon-saturating-add-sub.ll
+++ /dev/null
@@ -1,241 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uqadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sqadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uqadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uqadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sqadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sqadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uqadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uqadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uqadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uqadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uqadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uqadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uqadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uqadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-
-
-declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_uqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_uqadd_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: uqadd v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sqadd_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sqadd v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqsub_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uqsub v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqsub_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sqsub v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uqsub_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uqsub v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sqsub_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sqsub v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uqsub_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uqsub v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqsub_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqsub v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uqsub_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uqsub v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqsub_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqsub v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uqsub_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uqsub v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqsub_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqsub v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uqsub_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uqsub v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqsub_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqsub v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_uqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_uqsub_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: uqsub v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sqsub_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sqsub v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
deleted file mode 100644
index d89262c..0000000
--- a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqrshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uqrshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqrshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sqrshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uqrshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uqrshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sqrshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sqrshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uqrshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uqrshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqrshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqrshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uqrshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uqrshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqrshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqrshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uqrshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uqrshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqrshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqrshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uqrshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uqrshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqrshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqrshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_uqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_uqrshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: uqrshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sqrshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sqrshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll
deleted file mode 100644
index 11009fb..0000000
--- a/test/CodeGen/AArch64/neon-saturating-shift.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uqshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sqshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uqshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uqshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sqshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sqshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uqshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uqshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uqshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uqshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uqshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uqshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uqshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uqshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_uqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_uqshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: uqshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sqshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sqshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-scalar-abs.ll b/test/CodeGen/AArch64/neon-scalar-abs.ll
deleted file mode 100644
index 03a89e04..0000000
--- a/test/CodeGen/AArch64/neon-scalar-abs.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define i64 @test_vabsd_s64(i64 %a) {
-; CHECK: test_vabsd_s64
-; CHECK: abs {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vabs.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vabs1.i = tail call <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64> %vabs.i)
- %0 = extractelement <1 x i64> %vabs1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64>)
-
-define i8 @test_vqabsb_s8(i8 %a) {
-; CHECK: test_vqabsb_s8
-; CHECK: sqabs {{b[0-9]+}}, {{b[0-9]+}}
-entry:
- %vqabs.i = insertelement <1 x i8> undef, i8 %a, i32 0
- %vqabs1.i = call <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8> %vqabs.i)
- %0 = extractelement <1 x i8> %vqabs1.i, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8>)
-
-define i16 @test_vqabsh_s16(i16 %a) {
-; CHECK: test_vqabsh_s16
-; CHECK: sqabs {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqabs.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqabs1.i = call <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16> %vqabs.i)
- %0 = extractelement <1 x i16> %vqabs1.i, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16>)
-
-define i32 @test_vqabss_s32(i32 %a) {
-; CHECK: test_vqabss_s32
-; CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqabs.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqabs1.i = call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %vqabs.i)
- %0 = extractelement <1 x i32> %vqabs1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>)
-
-define i64 @test_vqabsd_s64(i64 %a) {
-; CHECK: test_vqabsd_s64
-; CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqabs.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqabs1.i = call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %vqabs.i)
- %0 = extractelement <1 x i64> %vqabs1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-add-sub.ll
deleted file mode 100644
index 4f322e0..0000000
--- a/test/CodeGen/AArch64/neon-scalar-add-sub.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %tmp3 = add <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %tmp3 = sub <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_add_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_add_v1i64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_uadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uadd_v1i64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_sub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sub_v1i64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_usub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_usub_v1i64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-
-
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index 247514c..32f5962 100644
--- a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -4,7 +4,7 @@ declare float @llvm.fma.f32(float, float, float)
declare double @llvm.fma.f64(double, double, double)
define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
- ; CHECK: test_fmla_ss4S
+ ; CHECK-LABEL: test_fmla_ss4S
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
@@ -12,7 +12,7 @@ define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
}
define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
- ; CHECK: test_fmla_ss4S_swap
+ ; CHECK-LABEL: test_fmla_ss4S_swap
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a)
@@ -20,7 +20,7 @@ define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
}
define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
- ; CHECK: test_fmla_ss2S
+ ; CHECK-LABEL: test_fmla_ss2S
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
@@ -28,15 +28,15 @@ define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
}
define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) {
- ; CHECK: test_fmla_ddD
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ; CHECK-LABEL: test_fmla_ddD
+ ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
ret double %tmp2
}
define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
- ; CHECK: test_fmla_dd2D
+ ; CHECK-LABEL: test_fmla_dd2D
; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
@@ -44,7 +44,7 @@ define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
}
define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
- ; CHECK: test_fmla_dd2D_swap
+ ; CHECK-LABEL: test_fmla_dd2D_swap
; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
@@ -52,7 +52,7 @@ define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
}
define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
- ; CHECK: test_fmls_ss4S
+ ; CHECK-LABEL: test_fmls_ss4S
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fsub float -0.0, %tmp1
@@ -61,7 +61,7 @@ define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
}
define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
- ; CHECK: test_fmls_ss4S_swap
+ ; CHECK-LABEL: test_fmls_ss4S_swap
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fsub float -0.0, %tmp1
@@ -71,7 +71,7 @@ define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
- ; CHECK: test_fmls_ss2S
+ ; CHECK-LABEL: test_fmls_ss2S
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fsub float -0.0, %tmp1
@@ -80,8 +80,8 @@ define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
}
define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
- ; CHECK: test_fmls_ddD
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ; CHECK-LABEL: test_fmls_ddD
+ ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = fsub double -0.0, %tmp1
%tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
@@ -89,7 +89,7 @@ define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
}
define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
- ; CHECK: test_fmls_dd2D
+ ; CHECK-LABEL: test_fmls_dd2D
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fsub double -0.0, %tmp1
@@ -98,7 +98,7 @@ define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
}
define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) {
- ; CHECK: test_fmls_dd2D_swap
+ ; CHECK-LABEL: test_fmls_dd2D_swap
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fsub double -0.0, %tmp1
diff --git a/test/CodeGen/AArch64/neon-scalar-compare.ll b/test/CodeGen/AArch64/neon-scalar-compare.ll
deleted file mode 100644
index e1f3964..0000000
--- a/test/CodeGen/AArch64/neon-scalar-compare.ll
+++ /dev/null
@@ -1,343 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-;; Scalar Integer Compare
-
-define i64 @test_vceqd(i64 %a, i64 %b) {
-; CHECK: test_vceqd
-; CHECK: cmeq {{d[0-9]+}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vceq.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vceq1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceq.i, <1 x i64> %vceq1.i)
- %0 = extractelement <1 x i64> %vceq2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vceqzd(i64 %a) {
-; CHECK: test_vceqzd
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vceqz.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vceqz1.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceqz.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vceqz1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcged(i64 %a, i64 %b) {
-; CHECK: test_vcged
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcge.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcge1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i)
- %0 = extractelement <1 x i64> %vcge2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcgezd(i64 %a) {
-; CHECK: test_vcgezd
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vcgez.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcgez1.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcgez.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vcgez1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcgtd(i64 %a, i64 %b) {
-; CHECK: test_vcgtd
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcgt.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcgt1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i)
- %0 = extractelement <1 x i64> %vcgt2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcgtzd(i64 %a) {
-; CHECK: test_vcgtzd
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vcgtz.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcgtz1.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgtz.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vcgtz1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcled(i64 %a, i64 %b) {
-; CHECK: test_vcled
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcgt.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vcgt1.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i)
- %0 = extractelement <1 x i64> %vcgt2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vclezd(i64 %a) {
-; CHECK: test_vclezd
-; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vclez.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vclez1.i = call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64> %vclez.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vclez1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcltd(i64 %a, i64 %b) {
-; CHECK: test_vcltd
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcge.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vcge1.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i)
- %0 = extractelement <1 x i64> %vcge2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcltzd(i64 %a) {
-; CHECK: test_vcltzd
-; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vcltz.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcltz1.i = call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64> %vcltz.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vcltz1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vtstd(i64 %a, i64 %b) {
-; CHECK: test_vtstd
-; CHECK: cmtst {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vtst.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vtst1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vtst2.i = call <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64> %vtst.i, <1 x i64> %vtst1.i)
- %0 = extractelement <1 x i64> %vtst2.i, i32 0
- ret i64 %0
-}
-
-
-define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcage_f64
-; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2
- ret <1 x i64> %vcage2.i
-}
-
-define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcagt_f64
-; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2
- ret <1 x i64> %vcagt2.i
-}
-
-define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcale_f64
-; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2
- ret <1 x i64> %vcage2.i
-}
-
-define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcalt_f64
-; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2
- ret <1 x i64> %vcagt2.i
-}
-
-define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vceq_s64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp eq <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vceq_u64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp eq <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vceq_f64
-; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp oeq <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcge_s64
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp sge <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcge_u64
-; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp uge <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcge_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp oge <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcle_s64
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp sle <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcle_u64
-; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp ule <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcle_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp ole <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcgt_s64
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp sgt <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcgt_u64
-; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp ugt <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcgt_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp ogt <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vclt_s64
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp slt <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vclt_u64
-; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp ult <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vclt_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp olt <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 {
-; CHECK: test_vceqz_s64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vceqz.i
-}
-
-define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 {
-; CHECK: test_vceqz_u64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vceqz.i
-}
-
-define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 {
-; CHECK: test_vceqz_p64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vceqz.i
-}
-
-define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 {
-; CHECK: test_vceqzq_p64
-; CHECK: cmeq {{v[0-9]}}.2d, {{v[0-9]}}.2d, #0
- %1 = icmp eq <2 x i64> %a, zeroinitializer
- %vceqz.i = sext <2 x i1> %1 to <2 x i64>
- ret <2 x i64> %vceqz.i
-}
-
-define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 {
-; CHECK: test_vcgez_s64
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp sge <1 x i64> %a, zeroinitializer
- %vcgez.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vcgez.i
-}
-
-define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 {
-; CHECK: test_vclez_s64
-; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp sle <1 x i64> %a, zeroinitializer
- %vclez.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vclez.i
-}
-
-define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 {
-; CHECK: test_vcgtz_s64
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp sgt <1 x i64> %a, zeroinitializer
- %vcgtz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vcgtz.i
-}
-
-define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 {
-; CHECK: test_vcltz_s64
-; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0
- %1 = icmp slt <1 x i64> %a, zeroinitializer
- %vcltz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vcltz.i
-}
-
-declare <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vchs.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vchi.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
index fadd734..a01df32 100644
--- a/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -1,103 +1,101 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK
+
define float @test_dup_sv2S(<2 x float> %v) {
- ;CHECK: test_dup_sv2S
- ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+ ; CHECK-LABEL: test_dup_sv2S
+ ; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
ret float %tmp1
}
define float @test_dup_sv2S_0(<2 x float> %v) {
- ;CHECK-LABEL: test_dup_sv2S_0
- ;CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[0]
- ;CHECK: ret
+ ; CHECK-LABEL: test_dup_sv2S_0
+ ; CHECK-NOT: dup {{[vsd][0-9]+}}
+ ; CHECK-NOT: ins {{[vsd][0-9]+}}
+ ; CHECK: ret
%tmp1 = extractelement <2 x float> %v, i32 0
ret float %tmp1
}
define float @test_dup_sv4S(<4 x float> %v) {
- ;CHECK-LABEL: test_dup_sv4S
- ;CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[0]
- ;CHECK: ret
+ ; CHECK-LABEL: test_dup_sv4S
+ ; CHECK-NOT: dup {{[vsd][0-9]+}}
+ ; CHECK-NOT: ins {{[vsd][0-9]+}}
+ ; CHECK: ret
%tmp1 = extractelement <4 x float> %v, i32 0
ret float %tmp1
}
define double @test_dup_dvD(<1 x double> %v) {
- ;CHECK: test_dup_dvD
- ;CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
- ;CHECK: ret
+ ; CHECK-LABEL: test_dup_dvD
+ ; CHECK-NOT: dup {{[vsd][0-9]+}}
+ ; CHECK-NOT: ins {{[vsd][0-9]+}}
+ ; CHECK: ret
%tmp1 = extractelement <1 x double> %v, i32 0
ret double %tmp1
}
define double @test_dup_dv2D(<2 x double> %v) {
- ;CHECK: test_dup_dv2D
- ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ ; CHECK-LABEL: test_dup_dv2D
+ ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
ret double %tmp1
}
define double @test_dup_dv2D_0(<2 x double> %v) {
- ;CHECK: test_dup_dv2D_0
- ;CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
- ;CHECK: ret
+ ; CHECK-LABEL: test_dup_dv2D_0
+ ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ ; CHECK: ret
%tmp1 = extractelement <2 x double> %v, i32 1
ret double %tmp1
}
define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) {
- ;CHECK: test_vector_dup_bv16B
- ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[14]
+ ; CHECK-LABEL: test_vector_dup_bv16B
%shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> <i32 14>
ret <1 x i8> %shuffle.i
}
define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) {
- ;CHECK: test_vector_dup_bv8B
- ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[7]
+ ; CHECK-LABEL: test_vector_dup_bv8B
%shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> <i32 7>
ret <1 x i8> %shuffle.i
}
define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) {
- ;CHECK: test_vector_dup_hv8H
- ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[7]
+ ; CHECK-LABEL: test_vector_dup_hv8H
%shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> <i32 7>
ret <1 x i16> %shuffle.i
}
define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) {
- ;CHECK: test_vector_dup_hv4H
- ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[3]
+ ; CHECK-LABEL: test_vector_dup_hv4H
%shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> <i32 3>
ret <1 x i16> %shuffle.i
}
define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) {
- ;CHECK: test_vector_dup_sv4S
- ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+ ; CHECK-LABEL: test_vector_dup_sv4S
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> <i32 3>
ret <1 x i32> %shuffle
}
define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) {
- ;CHECK: test_vector_dup_sv2S
- ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+ ; CHECK-LABEL: test_vector_dup_sv2S
%shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> <i32 1>
ret <1 x i32> %shuffle
}
define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) {
- ;CHECK: test_vector_dup_dv2D
- ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ ; CHECK-LABEL: test_vector_dup_dv2D
+ ; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
%shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
}
define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) {
- ;CHECK: test_vector_copy_dup_dv2D
- ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ ; CHECK-LABEL: test_vector_copy_dup_dv2D
+ ; CHECK: {{dup|mov}} {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%vget_lane = extractelement <2 x i64> %c, i32 1
%vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0
ret <1 x i64> %vset_lane
diff --git a/test/CodeGen/AArch64/neon-scalar-cvt.ll b/test/CodeGen/AArch64/neon-scalar-cvt.ll
deleted file mode 100644
index 3a19bed..0000000
--- a/test/CodeGen/AArch64/neon-scalar-cvt.ll
+++ /dev/null
@@ -1,133 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define float @test_vcvts_f32_s32(i32 %a) {
-; CHECK: test_vcvts_f32_s32
-; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32> %vcvtf.i)
- ret float %0
-}
-
-declare float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32>)
-
-define double @test_vcvtd_f64_s64(i64 %a) {
-; CHECK: test_vcvtd_f64_s64
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64> %vcvtf.i)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64>)
-
-define float @test_vcvts_f32_u32(i32 %a) {
-; CHECK: test_vcvts_f32_u32
-; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32> %vcvtf.i)
- ret float %0
-}
-
-declare float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32>)
-
-define double @test_vcvtd_f64_u64(i64 %a) {
-; CHECK: test_vcvtd_f64_u64
-; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64> %vcvtf.i)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64>)
-
-define float @test_vcvts_n_f32_s32(i32 %a) {
-; CHECK: test_vcvts_n_f32_s32
-; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
-entry:
- %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1)
- ret float %0
-}
-
-declare float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32>, i32)
-
-define double @test_vcvtd_n_f64_s64(i64 %a) {
-; CHECK: test_vcvtd_n_f64_s64
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
-entry:
- %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64>, i32)
-
-define float @test_vcvts_n_f32_u32(i32 %a) {
-; CHECK: test_vcvts_n_f32_u32
-; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
-entry:
- %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1)
- ret float %0
-}
-
-declare float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32>, i32)
-
-define double @test_vcvtd_n_f64_u64(i64 %a) {
-; CHECK: test_vcvtd_n_f64_u64
-; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
-entry:
- %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64>, i32)
-
-define i32 @test_vcvts_n_s32_f32(float %a) {
-; CHECK: test_vcvts_n_s32_f32
-; CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1
-entry:
- %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float %a, i32 1)
- %0 = extractelement <1 x i32> %fcvtzs1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float, i32)
-
-define i64 @test_vcvtd_n_s64_f64(double %a) {
-; CHECK: test_vcvtd_n_s64_f64
-; CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1
-entry:
- %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double %a, i32 1)
- %0 = extractelement <1 x i64> %fcvtzs1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double, i32)
-
-define i32 @test_vcvts_n_u32_f32(float %a) {
-; CHECK: test_vcvts_n_u32_f32
-; CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32
-entry:
- %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float %a, i32 32)
- %0 = extractelement <1 x i32> %fcvtzu1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float, i32)
-
-define i64 @test_vcvtd_n_u64_f64(double %a) {
-; CHECK: test_vcvtd_n_u64_f64
-; CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64
-entry:
- %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double %a, i32 64)
- %0 = extractelement <1 x i64> %fcvtzu1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-ext.ll b/test/CodeGen/AArch64/neon-scalar-ext.ll
deleted file mode 100644
index 51dea06..0000000
--- a/test/CodeGen/AArch64/neon-scalar-ext.ll
+++ /dev/null
@@ -1,113 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define <1 x i64> @test_zext_v1i32_v1i64(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i32_v1i64:
-; CHECK: ushll v0.2d, v0.2s, #0
- %1 = extractelement <2 x i32> %v, i32 0
- %2 = insertelement <1 x i32> undef, i32 %1, i32 0
- %3 = zext <1 x i32> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i32> @test_zext_v1i16_v1i32(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i16_v1i32:
-; CHECK: ushll v0.4s, v0.4h, #0
- %1 = extractelement <4 x i16> %v, i32 0
- %2 = insertelement <1 x i16> undef, i16 %1, i32 0
- %3 = zext <1 x i16> %2 to <1 x i32>
- ret <1 x i32> %3
-}
-
-define <1 x i16> @test_zext_v1i8_v1i16(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i8_v1i16:
-; CHECK: ushll v0.8h, v0.8b, #0
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = zext <1 x i8> %2 to <1 x i16>
- ret <1 x i16> %3
-}
-
-define <1 x i32> @test_zext_v1i8_v1i32(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i8_v1i32:
-; CHECK: dup b0, v0.b[0]
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = zext <1 x i8> %2 to <1 x i32>
- ret <1 x i32> %3
-}
-
-define <1 x i64> @test_zext_v1i16_v1i64(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i16_v1i64:
-; CHECK: dup h0, v0.h[0]
- %1 = extractelement <4 x i16> %v, i32 0
- %2 = insertelement <1 x i16> undef, i16 %1, i32 0
- %3 = zext <1 x i16> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i64> @test_zext_v1i8_v1i64(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i8_v1i64:
-; CHECK: dup b0, v0.b[0]
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = zext <1 x i8> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i64> @test_sext_v1i32_v1i64(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i32_v1i64:
-; CHECK: sshll v0.2d, v0.2s, #0
- %1 = extractelement <2 x i32> %v, i32 0
- %2 = insertelement <1 x i32> undef, i32 %1, i32 0
- %3 = sext <1 x i32> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i32> @test_sext_v1i16_v1i32(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i16_v1i32:
-; CHECK: sshll v0.4s, v0.4h, #0
- %1 = extractelement <4 x i16> %v, i32 0
- %2 = insertelement <1 x i16> undef, i16 %1, i32 0
- %3 = sext <1 x i16> %2 to <1 x i32>
- ret <1 x i32> %3
-}
-
-define <1 x i16> @test_sext_v1i8_v1i16(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i8_v1i16:
-; CHECK: sshll v0.8h, v0.8b, #0
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = sext <1 x i8> %2 to <1 x i16>
- ret <1 x i16> %3
-}
-
-define <1 x i32> @test_sext_v1i8_v1i32(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i8_v1i32:
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK: sshll v0.4s, v0.4h, #0
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = sext <1 x i8> %2 to <1 x i32>
- ret <1 x i32> %3
-}
-
-define <1 x i64> @test_sext_v1i16_v1i64(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i16_v1i64:
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK: sshll v0.2d, v0.2s, #0
- %1 = extractelement <4 x i16> %v, i32 0
- %2 = insertelement <1 x i16> undef, i16 %1, i32 0
- %3 = sext <1 x i16> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i64> @test_sext_v1i8_v1i64(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i8_v1i64:
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK: sshll v0.2d, v0.2s, #0
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = sext <1 x i8> %2 to <1 x i64>
- ret <1 x i64> %3
-}
diff --git a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
deleted file mode 100644
index faf521b..0000000
--- a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define i8 @test_vqmovunh_s16(i16 %a) {
-; CHECK: test_vqmovunh_s16
-; CHECK: sqxtun {{b[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqmovun.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqmovun1.i = call <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16> %vqmovun.i)
- %0 = extractelement <1 x i8> %vqmovun1.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vqmovuns_s32(i32 %a) {
-; CHECK: test_vqmovuns_s32
-; CHECK: sqxtun {{h[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqmovun.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqmovun1.i = call <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32> %vqmovun.i)
- %0 = extractelement <1 x i16> %vqmovun1.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vqmovund_s64(i64 %a) {
-; CHECK: test_vqmovund_s64
-; CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqmovun.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqmovun1.i = call <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64> %vqmovun.i)
- %0 = extractelement <1 x i32> %vqmovun1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64>)
-
-define i8 @test_vqmovnh_s16(i16 %a) {
-; CHECK: test_vqmovnh_s16
-; CHECK: sqxtn {{b[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16> %vqmovn.i)
- %0 = extractelement <1 x i8> %vqmovn1.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vqmovns_s32(i32 %a) {
-; CHECK: test_vqmovns_s32
-; CHECK: sqxtn {{h[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32> %vqmovn.i)
- %0 = extractelement <1 x i16> %vqmovn1.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vqmovnd_s64(i64 %a) {
-; CHECK: test_vqmovnd_s64
-; CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64> %vqmovn.i)
- %0 = extractelement <1 x i32> %vqmovn1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64>)
-
-define i8 @test_vqmovnh_u16(i16 %a) {
-; CHECK: test_vqmovnh_u16
-; CHECK: uqxtn {{b[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16> %vqmovn.i)
- %0 = extractelement <1 x i8> %vqmovn1.i, i32 0
- ret i8 %0
-}
-
-
-define i16 @test_vqmovns_u32(i32 %a) {
-; CHECK: test_vqmovns_u32
-; CHECK: uqxtn {{h[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32> %vqmovn.i)
- %0 = extractelement <1 x i16> %vqmovn1.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vqmovnd_u64(i64 %a) {
-; CHECK: test_vqmovnd_u64
-; CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64> %vqmovn.i)
- %0 = extractelement <1 x i32> %vqmovn1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll
deleted file mode 100644
index 6343310..0000000
--- a/test/CodeGen/AArch64/neon-scalar-fabd.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define float @test_vabds_f32(float %a, float %b) {
-; CHECK-LABEL: test_vabds_f32
-; CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %0 = call float @llvm.aarch64.neon.vabd.f32(float %a, float %b)
- ret float %0
-}
-
-define double @test_vabdd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vabdd_f64
-; CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = call double @llvm.aarch64.neon.vabd.f64(double %a, double %b)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vabd.f64(double, double)
-declare float @llvm.aarch64.neon.vabd.f32(float, float)
diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll
deleted file mode 100644
index 6cf30a7..0000000
--- a/test/CodeGen/AArch64/neon-scalar-fcvt.ll
+++ /dev/null
@@ -1,233 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-;; Scalar Floating-point Convert
-
-define float @test_vcvtxn(double %a) {
-; CHECK: test_vcvtxn
-; CHECK: fcvtxn {{s[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtf = call float @llvm.aarch64.neon.fcvtxn(double %a)
- ret float %vcvtf
-}
-
-declare float @llvm.aarch64.neon.fcvtxn(double)
-
-define i32 @test_vcvtass(float %a) {
-; CHECK: test_vcvtass
-; CHECK: fcvtas {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtas1.i = call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtas1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float)
-
-define i64 @test_vcvtasd(double %a) {
-; CHECK: test_vcvtasd
-; CHECK: fcvtas {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtas1.i = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtas1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double)
-
-define i32 @test_vcvtaus(float %a) {
-; CHECK: test_vcvtaus
-; CHECK: fcvtau {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtau1.i = call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtau1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float)
-
-define i64 @test_vcvtaud(double %a) {
-; CHECK: test_vcvtaud
-; CHECK: fcvtau {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtau1.i = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtau1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double)
-
-define i32 @test_vcvtmss(float %a) {
-; CHECK: test_vcvtmss
-; CHECK: fcvtms {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtms1.i = call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtms1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float)
-
-define i64 @test_vcvtmd_s64_f64(double %a) {
-; CHECK: test_vcvtmd_s64_f64
-; CHECK: fcvtms {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtms1.i = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtms1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double)
-
-define i32 @test_vcvtmus(float %a) {
-; CHECK: test_vcvtmus
-; CHECK: fcvtmu {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtmu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtmu1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float)
-
-define i64 @test_vcvtmud(double %a) {
-; CHECK: test_vcvtmud
-; CHECK: fcvtmu {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtmu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtmu1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double)
-
-define i32 @test_vcvtnss(float %a) {
-; CHECK: test_vcvtnss
-; CHECK: fcvtns {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtns1.i = call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtns1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float)
-
-define i64 @test_vcvtnd_s64_f64(double %a) {
-; CHECK: test_vcvtnd_s64_f64
-; CHECK: fcvtns {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtns1.i = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtns1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double)
-
-define i32 @test_vcvtnus(float %a) {
-; CHECK: test_vcvtnus
-; CHECK: fcvtnu {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtnu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtnu1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float)
-
-define i64 @test_vcvtnud(double %a) {
-; CHECK: test_vcvtnud
-; CHECK: fcvtnu {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtnu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtnu1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double)
-
-define i32 @test_vcvtpss(float %a) {
-; CHECK: test_vcvtpss
-; CHECK: fcvtps {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtps1.i = call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtps1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float)
-
-define i64 @test_vcvtpd_s64_f64(double %a) {
-; CHECK: test_vcvtpd_s64_f64
-; CHECK: fcvtps {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtps1.i = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtps1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double)
-
-define i32 @test_vcvtpus(float %a) {
-; CHECK: test_vcvtpus
-; CHECK: fcvtpu {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtpu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtpu1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float)
-
-define i64 @test_vcvtpud(double %a) {
-; CHECK: test_vcvtpud
-; CHECK: fcvtpu {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtpu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtpu1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double)
-
-define i32 @test_vcvtss(float %a) {
-; CHECK: test_vcvtss
-; CHECK: fcvtzs {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtzs1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtzs1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float)
-
-define i64 @test_vcvtd_s64_f64(double %a) {
-; CHECK: test_vcvtd_s64_f64
-; CHECK: fcvtzs {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvzs1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvzs1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double)
-
-define i32 @test_vcvtus(float %a) {
-; CHECK: test_vcvtus
-; CHECK: fcvtzu {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtzu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtzu1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float)
-
-define i64 @test_vcvtud(double %a) {
-; CHECK: test_vcvtud
-; CHECK: fcvtzu {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtzu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtzu1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double)
diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
deleted file mode 100644
index e0dce13..0000000
--- a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
+++ /dev/null
@@ -1,282 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-;; Scalar Floating-point Compare
-
-define i32 @test_vceqs_f32(float %a, float %b) {
-; CHECK-LABEL: test_vceqs_f32
-; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fceq2.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fceq2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vceqd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vceqd_f64
-; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fceq2.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fceq2.i, i32 0
- ret i64 %0
-}
-
-define <1 x i64> @test_vceqz_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vceqz_f64
-; CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0.0
-entry:
- %0 = fcmp oeq <1 x double> %a, zeroinitializer
- %vceqz.i = sext <1 x i1> %0 to <1 x i64>
- ret <1 x i64> %vceqz.i
-}
-
-define i32 @test_vceqzs_f32(float %a) {
-; CHECK-LABEL: test_vceqzs_f32
-; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fceq1.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fceq1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vceqzd_f64(double %a) {
-; CHECK-LABEL: test_vceqzd_f64
-; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fceq1.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fceq1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcges_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcges_f32
-; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcge2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcged_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcged_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcge2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcgezs_f32(float %a) {
-; CHECK-LABEL: test_vcgezs_f32
-; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fcge1.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fcge1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcgezd_f64(double %a) {
-; CHECK-LABEL: test_vcgezd_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fcge1.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fcge1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcgts_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcgts_f32
-; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcgt2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcgtd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcgtd_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcgt2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcgtzs_f32(float %a) {
-; CHECK-LABEL: test_vcgtzs_f32
-; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fcgt1.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fcgt1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcgtzd_f64(double %a) {
-; CHECK-LABEL: test_vcgtzd_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fcgt1.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fcgt1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcles_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcles_f32
-; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcge2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcled_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcled_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcge2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vclezs_f32(float %a) {
-; CHECK-LABEL: test_vclezs_f32
-; CHECK: fcmle {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fcle1.i = call <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fcle1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vclezd_f64(double %a) {
-; CHECK-LABEL: test_vclezd_f64
-; CHECK: fcmle {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fcle1.i = call <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fcle1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vclts_f32(float %a, float %b) {
-; CHECK-LABEL: test_vclts_f32
-; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcgt2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcltd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcltd_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcgt2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcltzs_f32(float %a) {
-; CHECK-LABEL: test_vcltzs_f32
-; CHECK: fcmlt {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fclt1.i = call <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fclt1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcltzd_f64(double %a) {
-; CHECK-LABEL: test_vcltzd_f64
-; CHECK: fcmlt {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fclt1.i = call <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fclt1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcages_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcages_f32
-; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcage2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcaged_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcaged_f64
-; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcage2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcagts_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcagts_f32
-; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcagt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcagt2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcagtd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcagtd_f64
-; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcagt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcagt2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcales_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcales_f32
-; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcage2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcaled_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcaled_f64
-; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcage2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcalts_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcalts_f32
-; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcalt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcalt2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcaltd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcaltd_f64
-; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcalt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcalt2.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double, float)
-declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double, double)
-declare <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double, float)
-declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double, double)
-declare <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double, float)
-declare <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double, float)
-declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double, double)
-declare <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double, float)
-declare <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double, double)
-declare <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double, double)
diff --git a/test/CodeGen/AArch64/neon-scalar-mul.ll b/test/CodeGen/AArch64/neon-scalar-mul.ll
deleted file mode 100644
index 991037f..0000000
--- a/test/CodeGen/AArch64/neon-scalar-mul.ll
+++ /dev/null
@@ -1,143 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) {
-; CHECK: test_vqdmulhh_s16
-; CHECK: sqdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- %1 = insertelement <1 x i16> undef, i16 %a, i32 0
- %2 = insertelement <1 x i16> undef, i16 %b, i32 0
- %3 = call <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16> %1, <1 x i16> %2)
- %4 = extractelement <1 x i16> %3, i32 0
- ret i16 %4
-}
-
-define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) {
-; CHECK: test_vqdmulhs_s32
-; CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = insertelement <1 x i32> undef, i32 %a, i32 0
- %2 = insertelement <1 x i32> undef, i32 %b, i32 0
- %3 = call <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32> %1, <1 x i32> %2)
- %4 = extractelement <1 x i32> %3, i32 0
- ret i32 %4
-}
-
-declare <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32>, <1 x i32>)
-
-define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) {
-; CHECK: test_vqrdmulhh_s16
-; CHECK: sqrdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- %1 = insertelement <1 x i16> undef, i16 %a, i32 0
- %2 = insertelement <1 x i16> undef, i16 %b, i32 0
- %3 = call <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16> %1, <1 x i16> %2)
- %4 = extractelement <1 x i16> %3, i32 0
- ret i16 %4
-}
-
-define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) {
-; CHECK: test_vqrdmulhs_s32
-; CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = insertelement <1 x i32> undef, i32 %a, i32 0
- %2 = insertelement <1 x i32> undef, i32 %b, i32 0
- %3 = call <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32> %1, <1 x i32> %2)
- %4 = extractelement <1 x i32> %3, i32 0
- ret i32 %4
-}
-
-declare <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32>, <1 x i32>)
-
-define float @test_vmulxs_f32(float %a, float %b) {
-; CHECK: test_vmulxs_f32
-; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %b)
- ret float %1
-}
-
-define double @test_vmulxd_f64(double %a, double %b) {
-; CHECK: test_vmulxd_f64
-; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %1 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %b)
- ret double %1
-}
-
-declare float @llvm.aarch64.neon.vmulx.f32(float, float)
-declare double @llvm.aarch64.neon.vmulx.f64(double, double)
-
-define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) {
-; CHECK: test_vqdmlalh_s16
-; CHECK: sqdmlal {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqdmlal.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqdmlal1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vqdmlal2.i = insertelement <1 x i16> undef, i16 %c, i32 0
- %vqdmlal3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32> %vqdmlal.i, <1 x i16> %vqdmlal1.i, <1 x i16> %vqdmlal2.i)
- %0 = extractelement <1 x i32> %vqdmlal3.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) {
-; CHECK: test_vqdmlals_s32
-; CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqdmlal.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqdmlal1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vqdmlal2.i = insertelement <1 x i32> undef, i32 %c, i32 0
- %vqdmlal3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64> %vqdmlal.i, <1 x i32> %vqdmlal1.i, <1 x i32> %vqdmlal2.i)
- %0 = extractelement <1 x i64> %vqdmlal3.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32>, <1 x i16>, <1 x i16>)
-declare <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64>, <1 x i32>, <1 x i32>)
-
-define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) {
-; CHECK: test_vqdmlslh_s16
-; CHECK: sqdmlsl {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqdmlsl.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqdmlsl1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vqdmlsl2.i = insertelement <1 x i16> undef, i16 %c, i32 0
- %vqdmlsl3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32> %vqdmlsl.i, <1 x i16> %vqdmlsl1.i, <1 x i16> %vqdmlsl2.i)
- %0 = extractelement <1 x i32> %vqdmlsl3.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) {
-; CHECK: test_vqdmlsls_s32
-; CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqdmlsl.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqdmlsl1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vqdmlsl2.i = insertelement <1 x i32> undef, i32 %c, i32 0
- %vqdmlsl3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64> %vqdmlsl.i, <1 x i32> %vqdmlsl1.i, <1 x i32> %vqdmlsl2.i)
- %0 = extractelement <1 x i64> %vqdmlsl3.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32>, <1 x i16>, <1 x i16>)
-declare <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64>, <1 x i32>, <1 x i32>)
-
-define i32 @test_vqdmullh_s16(i16 %a, i16 %b) {
-; CHECK: test_vqdmullh_s16
-; CHECK: sqdmull {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqdmull.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqdmull1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vqdmull2.i = call <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16> %vqdmull.i, <1 x i16> %vqdmull1.i)
- %0 = extractelement <1 x i32> %vqdmull2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vqdmulls_s32(i32 %a, i32 %b) {
-; CHECK: test_vqdmulls_s32
-; CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqdmull.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqdmull1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vqdmull2.i = call <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32> %vqdmull.i, <1 x i32> %vqdmull1.i)
- %0 = extractelement <1 x i64> %vqdmull2.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16>, <1 x i16>)
-declare <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32>, <1 x i32>)
diff --git a/test/CodeGen/AArch64/neon-scalar-neg.ll b/test/CodeGen/AArch64/neon-scalar-neg.ll
deleted file mode 100644
index 4dc9d51..0000000
--- a/test/CodeGen/AArch64/neon-scalar-neg.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define i64 @test_vnegd_s64(i64 %a) {
-; CHECK: test_vnegd_s64
-; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vneg.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vneg1.i = tail call <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64> %vneg.i)
- %0 = extractelement <1 x i64> %vneg1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64>)
-
-define i8 @test_vqnegb_s8(i8 %a) {
-; CHECK: test_vqnegb_s8
-; CHECK: sqneg {{b[0-9]+}}, {{b[0-9]+}}
-entry:
- %vqneg.i = insertelement <1 x i8> undef, i8 %a, i32 0
- %vqneg1.i = call <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8> %vqneg.i)
- %0 = extractelement <1 x i8> %vqneg1.i, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8>)
-
-define i16 @test_vqnegh_s16(i16 %a) {
-; CHECK: test_vqnegh_s16
-; CHECK: sqneg {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqneg.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqneg1.i = call <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16> %vqneg.i)
- %0 = extractelement <1 x i16> %vqneg1.i, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16>)
-
-define i32 @test_vqnegs_s32(i32 %a) {
-; CHECK: test_vqnegs_s32
-; CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqneg.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqneg1.i = call <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32> %vqneg.i)
- %0 = extractelement <1 x i32> %vqneg1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32>)
-
-define i64 @test_vqnegd_s64(i64 %a) {
-; CHECK: test_vqnegd_s64
-; CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqneg.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqneg1.i = call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %vqneg.i)
- %0 = extractelement <1 x i64> %vqneg1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll
deleted file mode 100644
index 100839b..0000000
--- a/test/CodeGen/AArch64/neon-scalar-recip.ll
+++ /dev/null
@@ -1,92 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define float @test_vrecpss_f32(float %a, float %b) {
-; CHECK: test_vrecpss_f32
-; CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = call float @llvm.aarch64.neon.vrecps.f32(float %a, float %b)
- ret float %1
-}
-
-define double @test_vrecpsd_f64(double %a, double %b) {
-; CHECK: test_vrecpsd_f64
-; CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %1 = call double @llvm.aarch64.neon.vrecps.f64(double %a, double %b)
- ret double %1
-}
-
-declare float @llvm.aarch64.neon.vrecps.f32(float, float)
-declare double @llvm.aarch64.neon.vrecps.f64(double, double)
-
-define float @test_vrsqrtss_f32(float %a, float %b) {
-; CHECK: test_vrsqrtss_f32
-; CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = call float @llvm.aarch64.neon.vrsqrts.f32(float %a, float %b)
- ret float %1
-}
-
-define double @test_vrsqrtsd_f64(double %a, double %b) {
-; CHECK: test_vrsqrtsd_f64
-; CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %1 = call double @llvm.aarch64.neon.vrsqrts.f64(double %a, double %b)
- ret double %1
-}
-
-declare float @llvm.aarch64.neon.vrsqrts.f32(float, float)
-declare double @llvm.aarch64.neon.vrsqrts.f64(double, double)
-
-define float @test_vrecpes_f32(float %a) {
-; CHECK: test_vrecpes_f32
-; CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %0 = call float @llvm.aarch64.neon.vrecpe.f32(float %a)
- ret float %0
-}
-
-define double @test_vrecped_f64(double %a) {
-; CHECK: test_vrecped_f64
-; CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = call double @llvm.aarch64.neon.vrecpe.f64(double %a)
- ret double %0
-}
-
-declare float @llvm.aarch64.neon.vrecpe.f32(float)
-declare double @llvm.aarch64.neon.vrecpe.f64(double)
-
-define float @test_vrecpxs_f32(float %a) {
-; CHECK: test_vrecpxs_f32
-; CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %0 = call float @llvm.aarch64.neon.vrecpx.f32(float %a)
- ret float %0
-}
-
-define double @test_vrecpxd_f64(double %a) {
-; CHECK: test_vrecpxd_f64
-; CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = call double @llvm.aarch64.neon.vrecpx.f64(double %a)
- ret double %0
-}
-
-declare float @llvm.aarch64.neon.vrecpx.f32(float)
-declare double @llvm.aarch64.neon.vrecpx.f64(double)
-
-define float @test_vrsqrtes_f32(float %a) {
-; CHECK: test_vrsqrtes_f32
-; CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %0 = call float @llvm.aarch64.neon.vrsqrte.f32(float %a)
- ret float %0
-}
-
-define double @test_vrsqrted_f64(double %a) {
-; CHECK: test_vrsqrted_f64
-; CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = call double @llvm.aarch64.neon.vrsqrte.f64(double %a)
- ret double %0
-}
-
-declare float @llvm.aarch64.neon.vrsqrte.f32(float)
-declare double @llvm.aarch64.neon.vrsqrte.f64(double)
diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
deleted file mode 100644
index 33ce5cf..0000000
--- a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
+++ /dev/null
@@ -1,215 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>)
-
-define <1 x i64> @test_addp_v1i64(<2 x i64> %a) {
-; CHECK: test_addp_v1i64:
-; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a)
- ret <1 x i64> %val
-}
-
-declare float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float>)
-
-define float @test_faddp_f32(<2 x float> %a) {
-; CHECK: test_faddp_f32:
-; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double>)
-
-define double @test_faddp_f64(<2 x double> %a) {
-; CHECK: test_faddp_f64:
-; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-
-declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>)
-
-define float @test_fmaxp_f32(<2 x float> %a) {
-; CHECK: test_fmaxp_f32:
-; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double>)
-
-define double @test_fmaxp_f64(<2 x double> %a) {
-; CHECK: test_fmaxp_f64:
-; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-declare float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float>)
-
-define float @test_fminp_f32(<2 x float> %a) {
-; CHECK: test_fminp_f32:
-; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double>)
-
-define double @test_fminp_f64(<2 x double> %a) {
-; CHECK: test_fminp_f64:
-; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-declare float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float>)
-
-define float @test_fmaxnmp_f32(<2 x float> %a) {
-; CHECK: test_fmaxnmp_f32:
-; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double>)
-
-define double @test_fmaxnmp_f64(<2 x double> %a) {
-; CHECK: test_fmaxnmp_f64:
-; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-declare float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float>)
-
-define float @test_fminnmp_f32(<2 x float> %a) {
-; CHECK: test_fminnmp_f32:
-; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double>)
-
-define double @test_fminnmp_f64(<2 x double> %a) {
-; CHECK: test_fminnmp_f64:
-; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-define float @test_vaddv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vaddv_f32
-; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define float @test_vaddvq_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vaddvq_f32
-; CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float> %a)
- ret float %1
-}
-
-define double @test_vaddvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vaddvq_f64
-; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define float @test_vmaxv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vmaxv_f32
-; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define double @test_vmaxvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vmaxvq_f64
-; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define float @test_vminv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vminv_f32
-; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define double @test_vminvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vminvq_f64
-; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define double @test_vmaxnmvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vmaxnmvq_f64
-; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define float @test_vmaxnmv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vmaxnmv_f32
-; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define double @test_vminnmvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vminnmvq_f64
-; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define float @test_vminnmv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vminnmv_f32
-; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vpaddq_s64
-; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
- ret <2 x i64> %1
-}
-
-define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vpaddq_u64
-; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
- ret <2 x i64> %1
-}
-
-define i64 @test_vaddvq_s64(<2 x i64> %a) {
-; CHECK-LABEL: test_vaddvq_s64
-; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
- %2 = extractelement <1 x i64> %1, i32 0
- ret i64 %2
-}
-
-define i64 @test_vaddvq_u64(<2 x i64> %a) {
-; CHECK-LABEL: test_vaddvq_u64
-; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
- %2 = extractelement <1 x i64> %1, i32 0
- ret i64 %2
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64>)
-
-declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
-
-declare float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float>)
diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
deleted file mode 100644
index 7c9ffa0..0000000
--- a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-
-declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_urshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_srshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_urshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_urshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_srshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_srshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-
-
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
deleted file mode 100644
index 5c010ef..0000000
--- a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
+++ /dev/null
@@ -1,242 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8>, <1 x i8>)
-declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>)
-
-define <1 x i8> @test_uqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_uqadd_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-define <1 x i8> @test_sqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_sqadd_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-declare <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8>, <1 x i8>)
-declare <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8>, <1 x i8>)
-
-define <1 x i8> @test_uqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_uqsub_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-define <1 x i8> @test_sqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_sqsub_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-declare <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_uqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_uqadd_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-define <1 x i16> @test_sqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_sqadd_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-declare <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_uqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_uqsub_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-define <1 x i16> @test_sqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_sqsub_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-declare <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32>, <1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32>, <1 x i32>)
-
-define <1 x i32> @test_uqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_uqadd_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-define <1 x i32> @test_sqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_sqadd_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-declare <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32>, <1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32>, <1 x i32>)
-
-define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_uqsub_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-
-define <1 x i32> @test_sqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_sqsub_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqadd_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqadd_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqsub_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqsub_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define i8 @test_vuqaddb_s8(i8 %a, i8 %b) {
-; CHECK: test_vuqaddb_s8
-; CHECK: suqadd {{b[0-9]+}}, {{b[0-9]+}}
-entry:
- %vuqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0
- %vuqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0
- %vuqadd2.i = call <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8> %vuqadd.i, <1 x i8> %vuqadd1.i)
- %0 = extractelement <1 x i8> %vuqadd2.i, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>)
-
-define i16 @test_vuqaddh_s16(i16 %a, i16 %b) {
-; CHECK: test_vuqaddh_s16
-; CHECK: suqadd {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vuqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vuqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vuqadd2.i = call <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16> %vuqadd.i, <1 x i16> %vuqadd1.i)
- %0 = extractelement <1 x i16> %vuqadd2.i, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>)
-
-define i32 @test_vuqadds_s32(i32 %a, i32 %b) {
-; CHECK: test_vuqadds_s32
-; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vuqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vuqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vuqadd2.i = call <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32> %vuqadd.i, <1 x i32> %vuqadd1.i)
- %0 = extractelement <1 x i32> %vuqadd2.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32>, <1 x i32>)
-
-define i64 @test_vuqaddd_s64(i64 %a, i64 %b) {
-; CHECK: test_vuqaddd_s64
-; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vuqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vuqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vuqadd2.i = call <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64> %vuqadd.i, <1 x i64> %vuqadd1.i)
- %0 = extractelement <1 x i64> %vuqadd2.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64>, <1 x i64>)
-
-define i8 @test_vsqaddb_u8(i8 %a, i8 %b) {
-; CHECK: test_vsqaddb_u8
-; CHECK: usqadd {{b[0-9]+}}, {{b[0-9]+}}
-entry:
- %vsqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0
- %vsqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0
- %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %vsqadd.i, <1 x i8> %vsqadd1.i)
- %0 = extractelement <1 x i8> %vsqadd2.i, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8>, <1 x i8>)
-
-define i16 @test_vsqaddh_u16(i16 %a, i16 %b) {
-; CHECK: test_vsqaddh_u16
-; CHECK: usqadd {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vsqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %vsqadd.i, <1 x i16> %vsqadd1.i)
- %0 = extractelement <1 x i16> %vsqadd2.i, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16>, <1 x i16>)
-
-define i32 @test_vsqadds_u32(i32 %a, i32 %b) {
-; CHECK: test_vsqadds_u32
-; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vsqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vsqadd2.i = call <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32> %vsqadd.i, <1 x i32> %vsqadd1.i)
- %0 = extractelement <1 x i32> %vsqadd2.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32>, <1 x i32>)
-
-define i64 @test_vsqaddd_u64(i64 %a, i64 %b) {
-; CHECK: test_vsqaddd_u64
-; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vsqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsqadd2.i = call <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64> %vsqadd.i, <1 x i64> %vsqadd1.i)
- %0 = extractelement <1 x i64> %vsqadd2.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
deleted file mode 100644
index dbf9669..0000000
--- a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
+++ /dev/null
@@ -1,94 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqrshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqrshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8>, <1 x i8>)
-declare <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8>, <1 x i8>)
-
-define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_uqrshl_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
-
- ret <1 x i8> %tmp1
-}
-
-define <1 x i8> @test_sqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_sqrshl_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_uqrshl_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
-
- ret <1 x i16> %tmp1
-}
-
-define <1 x i16> @test_sqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_sqrshl_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32>, <1 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32>, <1 x i32>)
-
-define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_uqrshl_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-
- ret <1 x i32> %tmp1
-}
-
-define <1 x i32> @test_sqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_sqrshl_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqrshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqrshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-
-
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
deleted file mode 100644
index 0a1f4c9..0000000
--- a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
+++ /dev/null
@@ -1,88 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8>, <1 x i8>)
-declare <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8>, <1 x i8>)
-
-define <1 x i8> @test_uqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_uqshl_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-define <1 x i8> @test_sqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_sqshl_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_uqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_uqshl_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-define <1 x i16> @test_sqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_sqshl_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32>, <1 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32>, <1 x i32>)
-
-define <1 x i32> @test_uqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_uqshl_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-define <1 x i32> @test_sqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_sqshl_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-
diff --git a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll
deleted file mode 100644
index 6224361..0000000
--- a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll
+++ /dev/null
@@ -1,531 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define i64 @test_vshrd_n_s64(i64 %a) {
-; CHECK: test_vshrd_n_s64
-; CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsshr = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsshr1 = call <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64> %vsshr, i32 63)
- %0 = extractelement <1 x i64> %vsshr1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64>, i32)
-
-define i64 @test_vshrd_n_u64(i64 %a) {
-; CHECK: test_vshrd_n_u64
-; CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vushr = insertelement <1 x i64> undef, i64 %a, i32 0
- %vushr1 = call <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64> %vushr, i32 63)
- %0 = extractelement <1 x i64> %vushr1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64>, i32)
-
-define i64 @test_vrshrd_n_s64(i64 %a) {
-; CHECK: test_vrshrd_n_s64
-; CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsrshr = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsrshr1 = call <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64> %vsrshr, i32 63)
- %0 = extractelement <1 x i64> %vsrshr1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64>, i32)
-
-define i64 @test_vrshrd_n_u64(i64 %a) {
-; CHECK: test_vrshrd_n_u64
-; CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vurshr = insertelement <1 x i64> undef, i64 %a, i32 0
- %vurshr1 = call <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64> %vurshr, i32 63)
- %0 = extractelement <1 x i64> %vurshr1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64>, i32)
-
-define i64 @test_vsrad_n_s64(i64 %a, i64 %b) {
-; CHECK: test_vsrad_n_s64
-; CHECK: ssra {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vssra = insertelement <1 x i64> undef, i64 %a, i32 0
- %vssra1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vssra2 = call <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64> %vssra, <1 x i64> %vssra1, i32 63)
- %0 = extractelement <1 x i64> %vssra2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vsrad_n_u64(i64 %a, i64 %b) {
-; CHECK: test_vsrad_n_u64
-; CHECK: usra {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vusra = insertelement <1 x i64> undef, i64 %a, i32 0
- %vusra1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vusra2 = call <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64> %vusra, <1 x i64> %vusra1, i32 63)
- %0 = extractelement <1 x i64> %vusra2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) {
-; CHECK: test_vrsrad_n_s64
-; CHECK: srsra {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsrsra = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsrsra1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsrsra2 = call <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64> %vsrsra, <1 x i64> %vsrsra1, i32 63)
- %0 = extractelement <1 x i64> %vsrsra2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) {
-; CHECK: test_vrsrad_n_u64
-; CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vursra = insertelement <1 x i64> undef, i64 %a, i32 0
- %vursra1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vursra2 = call <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64> %vursra, <1 x i64> %vursra1, i32 63)
- %0 = extractelement <1 x i64> %vursra2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vshld_n_s64(i64 %a) {
-; CHECK: test_vshld_n_s64
-; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vshl = insertelement <1 x i64> undef, i64 %a, i32 0
- %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63)
- %0 = extractelement <1 x i64> %vshl1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64>, i32)
-
-define i64 @test_vshld_n_u64(i64 %a) {
-; CHECK: test_vshld_n_u64
-; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vshl = insertelement <1 x i64> undef, i64 %a, i32 0
- %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63)
- %0 = extractelement <1 x i64> %vshl1, i32 0
- ret i64 %0
-}
-
-define i8 @test_vqshlb_n_s8(i8 %a) {
-; CHECK: test_vqshlb_n_s8
-; CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, #7
-entry:
- %vsqshl = insertelement <1 x i8> undef, i8 %a, i32 0
- %vsqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8> %vsqshl, i32 7)
- %0 = extractelement <1 x i8> %vsqshl1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8>, i32)
-
-define i16 @test_vqshlh_n_s16(i16 %a) {
-; CHECK: test_vqshlh_n_s16
-; CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, #15
-entry:
- %vsqshl = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16> %vsqshl, i32 15)
- %0 = extractelement <1 x i16> %vsqshl1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16>, i32)
-
-define i32 @test_vqshls_n_s32(i32 %a) {
-; CHECK: test_vqshls_n_s32
-; CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
-entry:
- %vsqshl = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32> %vsqshl, i32 31)
- %0 = extractelement <1 x i32> %vsqshl1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32>, i32)
-
-define i64 @test_vqshld_n_s64(i64 %a) {
-; CHECK: test_vqshld_n_s64
-; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsqshl = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64> %vsqshl, i32 63)
- %0 = extractelement <1 x i64> %vsqshl1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64>, i32)
-
-define i8 @test_vqshlb_n_u8(i8 %a) {
-; CHECK: test_vqshlb_n_u8
-; CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, #7
-entry:
- %vuqshl = insertelement <1 x i8> undef, i8 %a, i32 0
- %vuqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8> %vuqshl, i32 7)
- %0 = extractelement <1 x i8> %vuqshl1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8>, i32)
-
-define i16 @test_vqshlh_n_u16(i16 %a) {
-; CHECK: test_vqshlh_n_u16
-; CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, #15
-entry:
- %vuqshl = insertelement <1 x i16> undef, i16 %a, i32 0
- %vuqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16> %vuqshl, i32 15)
- %0 = extractelement <1 x i16> %vuqshl1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16>, i32)
-
-define i32 @test_vqshls_n_u32(i32 %a) {
-; CHECK: test_vqshls_n_u32
-; CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
-entry:
- %vuqshl = insertelement <1 x i32> undef, i32 %a, i32 0
- %vuqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32> %vuqshl, i32 31)
- %0 = extractelement <1 x i32> %vuqshl1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32>, i32)
-
-define i64 @test_vqshld_n_u64(i64 %a) {
-; CHECK: test_vqshld_n_u64
-; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vuqshl = insertelement <1 x i64> undef, i64 %a, i32 0
- %vuqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64> %vuqshl, i32 63)
- %0 = extractelement <1 x i64> %vuqshl1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64>, i32)
-
-define i8 @test_vqshlub_n_s8(i8 %a) {
-; CHECK: test_vqshlub_n_s8
-; CHECK: sqshlu {{b[0-9]+}}, {{b[0-9]+}}, #7
-entry:
- %vsqshlu = insertelement <1 x i8> undef, i8 %a, i32 0
- %vsqshlu1 = call <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8> %vsqshlu, i32 7)
- %0 = extractelement <1 x i8> %vsqshlu1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8>, i32)
-
-define i16 @test_vqshluh_n_s16(i16 %a) {
-; CHECK: test_vqshluh_n_s16
-; CHECK: sqshlu {{h[0-9]+}}, {{h[0-9]+}}, #15
-entry:
- %vsqshlu = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqshlu1 = call <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16> %vsqshlu, i32 15)
- %0 = extractelement <1 x i16> %vsqshlu1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16>, i32)
-
-define i32 @test_vqshlus_n_s32(i32 %a) {
-; CHECK: test_vqshlus_n_s32
-; CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31
-entry:
- %vsqshlu = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqshlu1 = call <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32> %vsqshlu, i32 31)
- %0 = extractelement <1 x i32> %vsqshlu1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32>, i32)
-
-define i64 @test_vqshlud_n_s64(i64 %a) {
-; CHECK: test_vqshlud_n_s64
-; CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsqshlu = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqshlu1 = call <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64> %vsqshlu, i32 63)
- %0 = extractelement <1 x i64> %vsqshlu1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64>, i32)
-
-define i64 @test_vsrid_n_s64(i64 %a, i64 %b) {
-; CHECK: test_vsrid_n_s64
-; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsri = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63)
- %0 = extractelement <1 x i64> %vsri2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vsrid_n_u64(i64 %a, i64 %b) {
-; CHECK: test_vsrid_n_u64
-; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsri = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63)
- %0 = extractelement <1 x i64> %vsri2, i32 0
- ret i64 %0
-}
-
-define i64 @test_vslid_n_s64(i64 %a, i64 %b) {
-; CHECK: test_vslid_n_s64
-; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsli = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63)
- %0 = extractelement <1 x i64> %vsli2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vslid_n_u64(i64 %a, i64 %b) {
-; CHECK: test_vslid_n_u64
-; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsli = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63)
- %0 = extractelement <1 x i64> %vsli2, i32 0
- ret i64 %0
-}
-
-define i8 @test_vqshrnh_n_s16(i16 %a) {
-; CHECK: test_vqshrnh_n_s16
-; CHECK: sqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vsqshrn = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16> %vsqshrn, i32 8)
- %0 = extractelement <1 x i8> %vsqshrn1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqshrns_n_s32(i32 %a) {
-; CHECK: test_vqshrns_n_s32
-; CHECK: sqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vsqshrn = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32> %vsqshrn, i32 16)
- %0 = extractelement <1 x i16> %vsqshrn1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqshrnd_n_s64(i64 %a) {
-; CHECK: test_vqshrnd_n_s64
-; CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vsqshrn = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64> %vsqshrn, i32 32)
- %0 = extractelement <1 x i32> %vsqshrn1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqshrnh_n_u16(i16 %a) {
-; CHECK: test_vqshrnh_n_u16
-; CHECK: uqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vuqshrn = insertelement <1 x i16> undef, i16 %a, i32 0
- %vuqshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16> %vuqshrn, i32 8)
- %0 = extractelement <1 x i8> %vuqshrn1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqshrns_n_u32(i32 %a) {
-; CHECK: test_vqshrns_n_u32
-; CHECK: uqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vuqshrn = insertelement <1 x i32> undef, i32 %a, i32 0
- %vuqshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32> %vuqshrn, i32 16)
- %0 = extractelement <1 x i16> %vuqshrn1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqshrnd_n_u64(i64 %a) {
-; CHECK: test_vqshrnd_n_u64
-; CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vuqshrn = insertelement <1 x i64> undef, i64 %a, i32 0
- %vuqshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64> %vuqshrn, i32 32)
- %0 = extractelement <1 x i32> %vuqshrn1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqrshrnh_n_s16(i16 %a) {
-; CHECK: test_vqrshrnh_n_s16
-; CHECK: sqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vsqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16> %vsqrshrn, i32 8)
- %0 = extractelement <1 x i8> %vsqrshrn1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqrshrns_n_s32(i32 %a) {
-; CHECK: test_vqrshrns_n_s32
-; CHECK: sqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vsqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32> %vsqrshrn, i32 16)
- %0 = extractelement <1 x i16> %vsqrshrn1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqrshrnd_n_s64(i64 %a) {
-; CHECK: test_vqrshrnd_n_s64
-; CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vsqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64> %vsqrshrn, i32 32)
- %0 = extractelement <1 x i32> %vsqrshrn1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqrshrnh_n_u16(i16 %a) {
-; CHECK: test_vqrshrnh_n_u16
-; CHECK: uqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vuqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0
- %vuqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16> %vuqrshrn, i32 8)
- %0 = extractelement <1 x i8> %vuqrshrn1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqrshrns_n_u32(i32 %a) {
-; CHECK: test_vqrshrns_n_u32
-; CHECK: uqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vuqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0
- %vuqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %vuqrshrn, i32 16)
- %0 = extractelement <1 x i16> %vuqrshrn1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqrshrnd_n_u64(i64 %a) {
-; CHECK: test_vqrshrnd_n_u64
-; CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vuqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0
- %vuqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64> %vuqrshrn, i32 32)
- %0 = extractelement <1 x i32> %vuqrshrn1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqshrunh_n_s16(i16 %a) {
-; CHECK: test_vqshrunh_n_s16
-; CHECK: sqshrun {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vsqshrun = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16> %vsqshrun, i32 8)
- %0 = extractelement <1 x i8> %vsqshrun1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqshruns_n_s32(i32 %a) {
-; CHECK: test_vqshruns_n_s32
-; CHECK: sqshrun {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vsqshrun = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32> %vsqshrun, i32 16)
- %0 = extractelement <1 x i16> %vsqshrun1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqshrund_n_s64(i64 %a) {
-; CHECK: test_vqshrund_n_s64
-; CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vsqshrun = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64> %vsqshrun, i32 32)
- %0 = extractelement <1 x i32> %vsqshrun1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqrshrunh_n_s16(i16 %a) {
-; CHECK: test_vqrshrunh_n_s16
-; CHECK: sqrshrun {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vsqrshrun = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqrshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16> %vsqrshrun, i32 8)
- %0 = extractelement <1 x i8> %vsqrshrun1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqrshruns_n_s32(i32 %a) {
-; CHECK: test_vqrshruns_n_s32
-; CHECK: sqrshrun {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vsqrshrun = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqrshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32> %vsqrshrun, i32 16)
- %0 = extractelement <1 x i16> %vsqrshrun1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqrshrund_n_s64(i64 %a) {
-; CHECK: test_vqrshrund_n_s64
-; CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vsqrshrun = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqrshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64> %vsqrshrun, i32 32)
- %0 = extractelement <1 x i32> %vsqrshrun1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64>, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll
deleted file mode 100644
index b712ea4..0000000
--- a/test/CodeGen/AArch64/neon-scalar-shift.ll
+++ /dev/null
@@ -1,236 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_ushl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_ushl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_ushl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vtst_s64
-; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = and <1 x i64> %a, %b
- %1 = icmp ne <1 x i64> %0, zeroinitializer
- %vtst.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vtst.i
-}
-
-define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vtst_u64
-; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = and <1 x i64> %a, %b
- %1 = icmp ne <1 x i64> %0, zeroinitializer
- %vtst.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vtst.i
-}
-
-define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vsli_n_p64
-; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0
-entry:
- %vsli_n2 = tail call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %a, <1 x i64> %b, i32 0)
- ret <1 x i64> %vsli_n2
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32)
-
-define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vsliq_n_p64
-; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
-entry:
- %vsli_n2 = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 0)
- ret <2 x i64> %vsli_n2
-}
-
-declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32)
-
-define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vrsqrte_u32
-; CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-entry:
- %vrsqrte1.i = tail call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a)
- ret <2 x i32> %vrsqrte1.i
-}
-
-define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) {
-; CHECK-LABEL: test_vrsqrteq_u32
-; CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %vrsqrte1.i = tail call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a)
- ret <4 x i32> %vrsqrte1.i
-}
-
-define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) {
-; CHECK-LABEL: test_vqshl_n_s8
-; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-entry:
- %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
- ret <8 x i8> %vqshl_n
-}
-
-declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) {
-; CHECK-LABEL: test_vqshlq_n_s8
-; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
-entry:
- %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
- ret <16 x i8> %vqshl_n
-}
-
-declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) {
-; CHECK-LABEL: test_vqshl_n_s16
-; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
-entry:
- %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer)
- ret <4 x i16> %vqshl_n1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) {
-; CHECK-LABEL: test_vqshlq_n_s16
-; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
-entry:
- %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer)
- ret <8 x i16> %vqshl_n1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) {
-; CHECK-LABEL: test_vqshl_n_s32
-; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
-entry:
- %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer)
- ret <2 x i32> %vqshl_n1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) {
-; CHECK-LABEL: test_vqshlq_n_s32
-; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
-entry:
- %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer)
- ret <4 x i32> %vqshl_n1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) {
-; CHECK-LABEL: test_vqshlq_n_s64
-; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
-entry:
- %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer)
- ret <2 x i64> %vqshl_n1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) {
-; CHECK-LABEL: test_vqshl_n_u8
-; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-entry:
- %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
- ret <8 x i8> %vqshl_n
-}
-
-declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
-
-define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) {
-; CHECK-LABEL: test_vqshlq_n_u8
-; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
-entry:
- %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
- ret <16 x i8> %vqshl_n
-}
-
-declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
-
-define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) {
-; CHECK-LABEL: test_vqshl_n_u16
-; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
-entry:
- %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer)
- ret <4 x i16> %vqshl_n1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
-
-define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) {
-; CHECK-LABEL: test_vqshlq_n_u16
-; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
-entry:
- %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer)
- ret <8 x i16> %vqshl_n1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
-
-define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vqshl_n_u32
-; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
-entry:
- %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer)
- ret <2 x i32> %vqshl_n1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
-
-define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) {
-; CHECK-LABEL: test_vqshlq_n_u32
-; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
-entry:
- %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer)
- ret <4 x i32> %vqshl_n1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
-
-define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) {
-; CHECK-LABEL: test_vqshlq_n_u64
-; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d,
-entry:
- %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer)
- ret <2 x i64> %vqshl_n1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
-
-declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>)
-
-declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>)
diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll
deleted file mode 100644
index 33b04ce..0000000
--- a/test/CodeGen/AArch64/neon-shift.ll
+++ /dev/null
@@ -1,171 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: ushl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_ushl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_ushl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: ushl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_ushl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_ushl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: ushl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_ushl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_ushl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: ushl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_ushl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_ushl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: ushl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_ushl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_ushl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: ushl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_ushl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_ushl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: ushl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-
-define <8 x i8> @test_shl_v8i8(<8 x i8> %a) {
-; CHECK: test_shl_v8i8:
-; CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %tmp = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %tmp
-}
-
-define <4 x i16> @test_shl_v4i16(<4 x i16> %a) {
-; CHECK: test_shl_v4i16:
-; CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %tmp = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %tmp
-}
-
-define <2 x i32> @test_shl_v2i32(<2 x i32> %a) {
-; CHECK: test_shl_v2i32:
-; CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %tmp = shl <2 x i32> %a, <i32 3, i32 3>
- ret <2 x i32> %tmp
-}
-
-define <16 x i8> @test_shl_v16i8(<16 x i8> %a) {
-; CHECK: test_shl_v16i8:
-; CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %tmp = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <16 x i8> %tmp
-}
-
-define <8 x i16> @test_shl_v8i16(<8 x i16> %a) {
-; CHECK: test_shl_v8i16:
-; CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %tmp = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- ret <8 x i16> %tmp
-}
-
-define <4 x i32> @test_shl_v4i32(<4 x i32> %a) {
-; CHECK: test_shl_v4i32:
-; CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %tmp = shl <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
- ret <4 x i32> %tmp
-}
-
-define <2 x i64> @test_shl_v2i64(<2 x i64> %a) {
-; CHECK: test_shl_v2i64:
-; CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63
- %tmp = shl <2 x i64> %a, <i64 63, i64 63>
- ret <2 x i64> %tmp
-}
-
diff --git a/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll b/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
deleted file mode 100644
index 0b520d7..0000000
--- a/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
+++ /dev/null
@@ -1,333 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define <8 x i8> @shl.v8i8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: shl.v8i8:
-; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = shl <8 x i8> %a, %b
- ret <8 x i8> %c
-}
-
-define <4 x i16> @shl.v4i16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK-LABEL: shl.v4i16:
-; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = shl <4 x i16> %a, %b
- ret <4 x i16> %c
-}
-
-define <2 x i32> @shl.v2i32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: shl.v2i32:
-; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = shl <2 x i32> %a, %b
- ret <2 x i32> %c
-}
-
-define <1 x i64> @shl.v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: shl.v1i64:
-; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %c = shl <1 x i64> %a, %b
- ret <1 x i64> %c
-}
-
-define <16 x i8> @shl.v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shl.v16i8:
-; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %c = shl <16 x i8> %a, %b
- ret <16 x i8> %c
-}
-
-define <8 x i16> @shl.v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shl.v8i16:
-; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %c = shl <8 x i16> %a, %b
- ret <8 x i16> %c
-}
-
-define <4 x i32> @shl.v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shl.v4i32:
-; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %c = shl <4 x i32> %a, %b
- ret <4 x i32> %c
-}
-
-define <2 x i64> @shl.v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: shl.v2i64:
-; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %c = shl <2 x i64> %a, %b
- ret <2 x i64> %c
-}
-
-define <8 x i8> @lshr.v8i8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: lshr.v8i8:
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
-; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = lshr <8 x i8> %a, %b
- ret <8 x i8> %c
-}
-
-define <4 x i16> @lshr.v4i16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK-LABEL: lshr.v4i16:
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = lshr <4 x i16> %a, %b
- ret <4 x i16> %c
-}
-
-define <2 x i32> @lshr.v2i32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: lshr.v2i32:
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = lshr <2 x i32> %a, %b
- ret <2 x i32> %c
-}
-
-define <1 x i64> @lshr.v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: lshr.v1i64:
-; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %c = lshr <1 x i64> %a, %b
- ret <1 x i64> %c
-}
-
-define <16 x i8> @lshr.v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: lshr.v16i8:
-; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %c = lshr <16 x i8> %a, %b
- ret <16 x i8> %c
-}
-
-define <8 x i16> @lshr.v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: lshr.v8i16:
-; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %c = lshr <8 x i16> %a, %b
- ret <8 x i16> %c
-}
-
-define <4 x i32> @lshr.v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: lshr.v4i32:
-; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %c = lshr <4 x i32> %a, %b
- ret <4 x i32> %c
-}
-
-define <2 x i64> @lshr.v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: lshr.v2i64:
-; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %c = lshr <2 x i64> %a, %b
- ret <2 x i64> %c
-}
-
-define <8 x i8> @ashr.v8i8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: ashr.v8i8:
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
-; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = ashr <8 x i8> %a, %b
- ret <8 x i8> %c
-}
-
-define <4 x i16> @ashr.v4i16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK-LABEL: ashr.v4i16:
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = ashr <4 x i16> %a, %b
- ret <4 x i16> %c
-}
-
-define <2 x i32> @ashr.v2i32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: ashr.v2i32:
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = ashr <2 x i32> %a, %b
- ret <2 x i32> %c
-}
-
-define <1 x i64> @ashr.v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: ashr.v1i64:
-; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: sshl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %c = ashr <1 x i64> %a, %b
- ret <1 x i64> %c
-}
-
-define <16 x i8> @ashr.v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: ashr.v16i8:
-; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: sshl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %c = ashr <16 x i8> %a, %b
- ret <16 x i8> %c
-}
-
-define <8 x i16> @ashr.v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: ashr.v8i16:
-; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-; CHECK: sshl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %c = ashr <8 x i16> %a, %b
- ret <8 x i16> %c
-}
-
-define <4 x i32> @ashr.v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: ashr.v4i32:
-; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-; CHECK: sshl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %c = ashr <4 x i32> %a, %b
- ret <4 x i32> %c
-}
-
-define <2 x i64> @ashr.v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: ashr.v2i64:
-; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: sshl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %c = ashr <2 x i64> %a, %b
- ret <2 x i64> %c
-}
-
-define <1 x i64> @shl.v1i64.0(<1 x i64> %a) {
-; CHECK-LABEL: shl.v1i64.0:
-; CHECK-NOT: shl d{{[0-9]+}}, d{{[0-9]+}}, #0
- %c = shl <1 x i64> %a, zeroinitializer
- ret <1 x i64> %c
-}
-
-define <2 x i32> @shl.v2i32.0(<2 x i32> %a) {
-; CHECK-LABEL: shl.v2i32.0:
-; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0
- %c = shl <2 x i32> %a, zeroinitializer
- ret <2 x i32> %c
-}
-
-; The following test cases test shl/ashr/lshr with v1i8/v1i16/v1i32 types
-
-define <1 x i8> @shl.v1i8(<1 x i8> %a, <1 x i8> %b) {
-; CHECK-LABEL: shl.v1i8:
-; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = shl <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @shl.v1i16(<1 x i16> %a, <1 x i16> %b) {
-; CHECK-LABEL: shl.v1i16:
-; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = shl <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @shl.v1i32(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: shl.v1i32:
-; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = shl <1 x i32> %a, %b
- ret <1 x i32> %c
-}
-
-define <1 x i8> @ashr.v1i8(<1 x i8> %a, <1 x i8> %b) {
-; CHECK-LABEL: ashr.v1i8:
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
-; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = ashr <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @ashr.v1i16(<1 x i16> %a, <1 x i16> %b) {
-; CHECK-LABEL: ashr.v1i16:
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = ashr <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @ashr.v1i32(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: ashr.v1i32:
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = ashr <1 x i32> %a, %b
- ret <1 x i32> %c
-}
-
-define <1 x i8> @lshr.v1i8(<1 x i8> %a, <1 x i8> %b) {
-; CHECK-LABEL: lshr.v1i8:
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
-; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = lshr <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @lshr.v1i16(<1 x i16> %a, <1 x i16> %b) {
-; CHECK-LABEL: lshr.v1i16:
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = lshr <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @lshr.v1i32(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: lshr.v1i32:
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = lshr <1 x i32> %a, %b
- ret <1 x i32> %c
-}
-
-define <1 x i8> @shl.v1i8.imm(<1 x i8> %a) {
-; CHECK-LABEL: shl.v1i8.imm:
-; CHECK: shl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
- %c = shl <1 x i8> %a, <i8 3>
- ret <1 x i8> %c
-}
-
-define <1 x i16> @shl.v1i16.imm(<1 x i16> %a) {
-; CHECK-LABEL: shl.v1i16.imm:
-; CHECK: shl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #5
- %c = shl <1 x i16> %a, <i16 5>
- ret <1 x i16> %c
-}
-
-define <1 x i32> @shl.v1i32.imm(<1 x i32> %a) {
-; CHECK-LABEL: shl.v1i32.imm:
-; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0
- %c = shl <1 x i32> %a, zeroinitializer
- ret <1 x i32> %c
-}
-
-define <1 x i8> @ashr.v1i8.imm(<1 x i8> %a) {
-; CHECK-LABEL: ashr.v1i8.imm:
-; CHECK: sshr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
- %c = ashr <1 x i8> %a, <i8 3>
- ret <1 x i8> %c
-}
-
-define <1 x i16> @ashr.v1i16.imm(<1 x i16> %a) {
-; CHECK-LABEL: ashr.v1i16.imm:
-; CHECK: sshr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10
- %c = ashr <1 x i16> %a, <i16 10>
- ret <1 x i16> %c
-}
-
-define <1 x i32> @ashr.v1i32.imm(<1 x i32> %a) {
-; CHECK-LABEL: ashr.v1i32.imm:
-; CHECK: sshr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31
- %c = ashr <1 x i32> %a, <i32 31>
- ret <1 x i32> %c
-}
-
-define <1 x i8> @lshr.v1i8.imm(<1 x i8> %a) {
-; CHECK-LABEL: lshr.v1i8.imm:
-; CHECK: ushr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
- %c = lshr <1 x i8> %a, <i8 3>
- ret <1 x i8> %c
-}
-
-define <1 x i16> @lshr.v1i16.imm(<1 x i16> %a) {
-; CHECK-LABEL: lshr.v1i16.imm:
-; CHECK: ushr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10
- %c = lshr <1 x i16> %a, <i16 10>
- ret <1 x i16> %c
-}
-
-define <1 x i32> @lshr.v1i32.imm(<1 x i32> %a) {
-; CHECK-LABEL: lshr.v1i32.imm:
-; CHECK: ushr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31
- %c = lshr <1 x i32> %a, <i32 31>
- ret <1 x i32> %c
-}
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
deleted file mode 100644
index d5557c0..0000000
--- a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
+++ /dev/null
@@ -1,2314 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v16i8:
-; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %tmp = load <16 x i8>* %ptr
- store <16 x i8> %tmp, <16 x i8>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v8i16:
-; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %tmp = load <8 x i16>* %ptr
- store <8 x i16> %tmp, <8 x i16>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v4i32:
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %tmp = load <4 x i32>* %ptr
- store <4 x i32> %tmp, <4 x i32>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v2i64:
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %tmp = load <2 x i64>* %ptr
- store <2 x i64> %tmp, <2 x i64>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v8i8:
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %tmp = load <8 x i8>* %ptr
- store <8 x i8> %tmp, <8 x i8>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v4i16:
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %tmp = load <4 x i16>* %ptr
- store <4 x i16> %tmp, <4 x i16>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v2i32:
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %tmp = load <2 x i32>* %ptr
- store <2 x i32> %tmp, <2 x i32>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v1i64:
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %tmp = load <1 x i64>* %ptr
- store <1 x i64> %tmp, <1 x i64>* %ptr2
- ret void
-}
-
-%struct.int8x16x2_t = type { [2 x <16 x i8>] }
-%struct.int16x8x2_t = type { [2 x <8 x i16>] }
-%struct.int32x4x2_t = type { [2 x <4 x i32>] }
-%struct.int64x2x2_t = type { [2 x <2 x i64>] }
-%struct.float32x4x2_t = type { [2 x <4 x float>] }
-%struct.float64x2x2_t = type { [2 x <2 x double>] }
-%struct.int8x8x2_t = type { [2 x <8 x i8>] }
-%struct.int16x4x2_t = type { [2 x <4 x i16>] }
-%struct.int32x2x2_t = type { [2 x <2 x i32>] }
-%struct.int64x1x2_t = type { [2 x <1 x i64>] }
-%struct.float32x2x2_t = type { [2 x <2 x float>] }
-%struct.float64x1x2_t = type { [2 x <1 x double>] }
-%struct.int8x16x3_t = type { [3 x <16 x i8>] }
-%struct.int16x8x3_t = type { [3 x <8 x i16>] }
-%struct.int32x4x3_t = type { [3 x <4 x i32>] }
-%struct.int64x2x3_t = type { [3 x <2 x i64>] }
-%struct.float32x4x3_t = type { [3 x <4 x float>] }
-%struct.float64x2x3_t = type { [3 x <2 x double>] }
-%struct.int8x8x3_t = type { [3 x <8 x i8>] }
-%struct.int16x4x3_t = type { [3 x <4 x i16>] }
-%struct.int32x2x3_t = type { [3 x <2 x i32>] }
-%struct.int64x1x3_t = type { [3 x <1 x i64>] }
-%struct.float32x2x3_t = type { [3 x <2 x float>] }
-%struct.float64x1x3_t = type { [3 x <1 x double>] }
-%struct.int8x16x4_t = type { [4 x <16 x i8>] }
-%struct.int16x8x4_t = type { [4 x <8 x i16>] }
-%struct.int32x4x4_t = type { [4 x <4 x i32>] }
-%struct.int64x2x4_t = type { [4 x <2 x i64>] }
-%struct.float32x4x4_t = type { [4 x <4 x float>] }
-%struct.float64x2x4_t = type { [4 x <2 x double>] }
-%struct.int8x8x4_t = type { [4 x <8 x i8>] }
-%struct.int16x4x4_t = type { [4 x <4 x i16>] }
-%struct.int32x2x4_t = type { [4 x <2 x i32>] }
-%struct.int64x1x4_t = type { [4 x <1 x i64>] }
-%struct.float32x2x4_t = type { [4 x <2 x float>] }
-%struct.float64x1x4_t = type { [4 x <1 x double>] }
-
-
-define <16 x i8> @test_vld1q_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld1q_s8
-; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1)
- ret <16 x i8> %vld1
-}
-
-define <8 x i16> @test_vld1q_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld1q_s16
-; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2)
- ret <8 x i16> %vld1
-}
-
-define <4 x i32> @test_vld1q_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld1q_s32
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4)
- ret <4 x i32> %vld1
-}
-
-define <2 x i64> @test_vld1q_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld1q_s64
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8)
- ret <2 x i64> %vld1
-}
-
-define <4 x float> @test_vld1q_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld1q_f32
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4)
- ret <4 x float> %vld1
-}
-
-define <2 x double> @test_vld1q_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld1q_f64
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8)
- ret <2 x double> %vld1
-}
-
-define <8 x i8> @test_vld1_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld1_s8
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
- ret <8 x i8> %vld1
-}
-
-define <4 x i16> @test_vld1_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld1_s16
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
- ret <4 x i16> %vld1
-}
-
-define <2 x i32> @test_vld1_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld1_s32
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4)
- ret <2 x i32> %vld1
-}
-
-define <1 x i64> @test_vld1_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld1_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8)
- ret <1 x i64> %vld1
-}
-
-define <2 x float> @test_vld1_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld1_f32
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4)
- ret <2 x float> %vld1
-}
-
-define <1 x double> @test_vld1_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld1_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8)
- ret <1 x double> %vld1
-}
-
-define <8 x i8> @test_vld1_p8(i8* readonly %a) {
-; CHECK-LABEL: test_vld1_p8
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
- ret <8 x i8> %vld1
-}
-
-define <4 x i16> @test_vld1_p16(i16* readonly %a) {
-; CHECK-LABEL: test_vld1_p16
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
- ret <4 x i16> %vld1
-}
-
-define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld2q_s8
-; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1)
- %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1
- ret %struct.int8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld2q_s16
-; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2)
- %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1
- ret %struct.int16x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld2q_s32
-; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1
- ret %struct.int32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld2q_s64
-; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1
- ret %struct.int64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld2q_f32
-; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1
- ret %struct.float32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld2q_f64
-; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1
- ret %struct.float64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld2_s8
-; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1)
- %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1
- ret %struct.int8x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld2_s16
-; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2)
- %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1
- ret %struct.int16x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld2_s32
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1
- ret %struct.int32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld2_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1
- ret %struct.int64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld2_f32
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1
- ret %struct.float32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld2_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1
- ret %struct.float64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld3q_s8
-; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1)
- %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2
- ret %struct.int8x16x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld3q_s16
-; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2)
- %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2
- ret %struct.int16x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld3q_s32
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2
- ret %struct.int32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld3q_s64
-; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2
- ret %struct.int64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld3q_f32
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2
- ret %struct.float32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld3q_f64
-; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2
- ret %struct.float64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld3_s8
-; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1)
- %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2
- ret %struct.int8x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld3_s16
-; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2)
- %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2
- ret %struct.int16x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld3_s32
-; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2
- ret %struct.int32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld3_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2
- ret %struct.int64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld3_f32
-; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2
- ret %struct.float32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld3_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2
- ret %struct.float64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld4q_s8
-; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1)
- %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3
- ret %struct.int8x16x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld4q_s16
-; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2)
- %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3
- ret %struct.int16x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld4q_s32
-; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3
- ret %struct.int32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld4q_s64
-; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3
- ret %struct.int64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld4q_f32
-; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3
- ret %struct.float32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld4q_f64
-; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3
- ret %struct.float64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld4_s8
-; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1)
- %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3
- ret %struct.int8x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld4_s16
-; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2)
- %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3
- ret %struct.int16x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld4_s32
-; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3
- ret %struct.int32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld4_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3
- ret %struct.int64x1x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld4_f32
-; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3
- ret %struct.float32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld4_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3
- ret %struct.float64x1x4_t %.fca.0.3.insert
-}
-
-declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32)
-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
-declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32)
-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32)
-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32)
-declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32)
-declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32)
-declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
-declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
-declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32)
-declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32)
-declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
-
-define void @test_vst1q_s8(i8* %a, <16 x i8> %b) {
-; CHECK-LABEL: test_vst1q_s8
-; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16(i16* %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vst1q_s16
-; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32(i32* %a, <4 x i32> %b) {
-; CHECK-LABEL: test_vst1q_s32
-; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64(i64* %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vst1q_s64
-; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32(float* %a, <4 x float> %b) {
-; CHECK-LABEL: test_vst1q_f32
-; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4)
- ret void
-}
-
-define void @test_vst1q_f64(double* %a, <2 x double> %b) {
-; CHECK-LABEL: test_vst1q_f64
-; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8)
- ret void
-}
-
-define void @test_vst1_s8(i8* %a, <8 x i8> %b) {
-; CHECK-LABEL: test_vst1_s8
-; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1)
- ret void
-}
-
-define void @test_vst1_s16(i16* %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vst1_s16
-; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2)
- ret void
-}
-
-define void @test_vst1_s32(i32* %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vst1_s32
-; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4)
- ret void
-}
-
-define void @test_vst1_s64(i64* %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vst1_s64
-; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8)
- ret void
-}
-
-define void @test_vst1_f32(float* %a, <2 x float> %b) {
-; CHECK-LABEL: test_vst1_f32
-; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4)
- ret void
-}
-
-define void @test_vst1_f64(double* %a, <1 x double> %b) {
-; CHECK-LABEL: test_vst1_f64
-; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8)
- ret void
-}
-
-define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_s8
-; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1)
- ret void
-}
-
-define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_s16
-; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2)
- ret void
-}
-
-define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_s32
-; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_s64
-; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_f32
-; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_f64
-; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst2_s8
-; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1)
- ret void
-}
-
-define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst2_s16
-; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2)
- ret void
-}
-
-define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst2_s32
-; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst2_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst2_f32
-; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst2_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_s8
-; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1)
- ret void
-}
-
-define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_s16
-; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2)
- ret void
-}
-
-define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_s32
-; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_s64
-; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_f32
-; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_f64
-; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst3_s8
-; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1)
- ret void
-}
-
-define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst3_s16
-; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2)
- ret void
-}
-
-define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst3_s32
-; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst3_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst3_f32
-; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst3_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_s8
-; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
- tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1)
- ret void
-}
-
-define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_s16
-; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2)
- ret void
-}
-
-define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_s32
-; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_s64
-; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_f32
-; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_f64
-; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst4_s8
-; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
- tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1)
- ret void
-}
-
-define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst4_s16
-; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2)
- ret void
-}
-
-define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst4_s32
-; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst4_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst4_f32
-; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst4_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32)
-declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32)
-declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
-declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32)
-declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32)
-declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
-declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
-
-define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) {
-; CHECK-LABEL: test_vld1q_s8_x2
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
- %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
- %4 = insertvalue %struct.int8x16x2_t undef, <16 x i8> %2, 0, 0
- %5 = insertvalue %struct.int8x16x2_t %4, <16 x i8> %3, 0, 1
- ret %struct.int8x16x2_t %5
-}
-
-define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) {
-; CHECK-LABEL: test_vld1q_s16_x2
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
- %4 = extractvalue { <8 x i16>, <8 x i16> } %2, 1
- %5 = insertvalue %struct.int16x8x2_t undef, <8 x i16> %3, 0, 0
- %6 = insertvalue %struct.int16x8x2_t %5, <8 x i16> %4, 0, 1
- ret %struct.int16x8x2_t %6
-}
-
-define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) {
-; CHECK-LABEL: test_vld1q_s32_x2
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8* %1, i32 4)
- %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
- %4 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
- %5 = insertvalue %struct.int32x4x2_t undef, <4 x i32> %3, 0, 0
- %6 = insertvalue %struct.int32x4x2_t %5, <4 x i32> %4, 0, 1
- ret %struct.int32x4x2_t %6
-}
-
-define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) {
-; CHECK-LABEL: test_vld1q_s64_x2
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8* %1, i32 8)
- %3 = extractvalue { <2 x i64>, <2 x i64> } %2, 0
- %4 = extractvalue { <2 x i64>, <2 x i64> } %2, 1
- %5 = insertvalue %struct.int64x2x2_t undef, <2 x i64> %3, 0, 0
- %6 = insertvalue %struct.int64x2x2_t %5, <2 x i64> %4, 0, 1
- ret %struct.int64x2x2_t %6
-}
-
-define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) {
-; CHECK-LABEL: test_vld1q_f32_x2
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8* %1, i32 4)
- %3 = extractvalue { <4 x float>, <4 x float> } %2, 0
- %4 = extractvalue { <4 x float>, <4 x float> } %2, 1
- %5 = insertvalue %struct.float32x4x2_t undef, <4 x float> %3, 0, 0
- %6 = insertvalue %struct.float32x4x2_t %5, <4 x float> %4, 0, 1
- ret %struct.float32x4x2_t %6
-}
-
-
-define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) {
-; CHECK-LABEL: test_vld1q_f64_x2
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8* %1, i32 8)
- %3 = extractvalue { <2 x double>, <2 x double> } %2, 0
- %4 = extractvalue { <2 x double>, <2 x double> } %2, 1
- %5 = insertvalue %struct.float64x2x2_t undef, <2 x double> %3, 0, 0
- %6 = insertvalue %struct.float64x2x2_t %5, <2 x double> %4, 0, 1
- ret %struct.float64x2x2_t %6
-}
-
-define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) {
-; CHECK-LABEL: test_vld1_s8_x2
-; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8* %a, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8> } %1, 1
- %4 = insertvalue %struct.int8x8x2_t undef, <8 x i8> %2, 0, 0
- %5 = insertvalue %struct.int8x8x2_t %4, <8 x i8> %3, 0, 1
- ret %struct.int8x8x2_t %5
-}
-
-define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) {
-; CHECK-LABEL: test_vld1_s16_x2
-; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8* %1, i32 2)
- %3 = extractvalue { <4 x i16>, <4 x i16> } %2, 0
- %4 = extractvalue { <4 x i16>, <4 x i16> } %2, 1
- %5 = insertvalue %struct.int16x4x2_t undef, <4 x i16> %3, 0, 0
- %6 = insertvalue %struct.int16x4x2_t %5, <4 x i16> %4, 0, 1
- ret %struct.int16x4x2_t %6
-}
-
-define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) {
-; CHECK-LABEL: test_vld1_s32_x2
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8* %1, i32 4)
- %3 = extractvalue { <2 x i32>, <2 x i32> } %2, 0
- %4 = extractvalue { <2 x i32>, <2 x i32> } %2, 1
- %5 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %3, 0, 0
- %6 = insertvalue %struct.int32x2x2_t %5, <2 x i32> %4, 0, 1
- ret %struct.int32x2x2_t %6
-}
-
-define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) {
-; CHECK-LABEL: test_vld1_s64_x2
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8* %1, i32 8)
- %3 = extractvalue { <1 x i64>, <1 x i64> } %2, 0
- %4 = extractvalue { <1 x i64>, <1 x i64> } %2, 1
- %5 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %3, 0, 0
- %6 = insertvalue %struct.int64x1x2_t %5, <1 x i64> %4, 0, 1
- ret %struct.int64x1x2_t %6
-}
-
-define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) {
-; CHECK-LABEL: test_vld1_f32_x2
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8* %1, i32 4)
- %3 = extractvalue { <2 x float>, <2 x float> } %2, 0
- %4 = extractvalue { <2 x float>, <2 x float> } %2, 1
- %5 = insertvalue %struct.float32x2x2_t undef, <2 x float> %3, 0, 0
- %6 = insertvalue %struct.float32x2x2_t %5, <2 x float> %4, 0, 1
- ret %struct.float32x2x2_t %6
-}
-
-define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) {
-; CHECK-LABEL: test_vld1_f64_x2
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8* %1, i32 8)
- %3 = extractvalue { <1 x double>, <1 x double> } %2, 0
- %4 = extractvalue { <1 x double>, <1 x double> } %2, 1
- %5 = insertvalue %struct.float64x1x2_t undef, <1 x double> %3, 0, 0
- %6 = insertvalue %struct.float64x1x2_t %5, <1 x double> %4, 0, 1
- ret %struct.float64x1x2_t %6
-}
-
-define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) {
-; CHECK-LABEL: test_vld1q_s8_x3
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b},
-; [{{x[0-9]+|sp}}]
- %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8* %a, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
- %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
- %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
- %5 = insertvalue %struct.int8x16x3_t undef, <16 x i8> %2, 0, 0
- %6 = insertvalue %struct.int8x16x3_t %5, <16 x i8> %3, 0, 1
- %7 = insertvalue %struct.int8x16x3_t %6, <16 x i8> %4, 0, 2
- ret %struct.int8x16x3_t %7
-}
-
-define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) {
-; CHECK-LABEL: test_vld1q_s16_x3
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
- %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
- %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
- %6 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %3, 0, 0
- %7 = insertvalue %struct.int16x8x3_t %6, <8 x i16> %4, 0, 1
- %8 = insertvalue %struct.int16x8x3_t %7, <8 x i16> %5, 0, 2
- ret %struct.int16x8x3_t %8
-}
-
-define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) {
-; CHECK-LABEL: test_vld1q_s32_x3
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8* %1, i32 4)
- %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
- %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
- %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
- %6 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %3, 0, 0
- %7 = insertvalue %struct.int32x4x3_t %6, <4 x i32> %4, 0, 1
- %8 = insertvalue %struct.int32x4x3_t %7, <4 x i32> %5, 0, 2
- ret %struct.int32x4x3_t %8
-}
-
-define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) {
-; CHECK-LABEL: test_vld1q_s64_x3
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
- %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
- %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
- %6 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %3, 0, 0
- %7 = insertvalue %struct.int64x2x3_t %6, <2 x i64> %4, 0, 1
- %8 = insertvalue %struct.int64x2x3_t %7, <2 x i64> %5, 0, 2
- ret %struct.int64x2x3_t %8
-}
-
-define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) {
-; CHECK-LABEL: test_vld1q_f32_x3
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8* %1, i32 4)
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 0
- %4 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 1
- %5 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 2
- %6 = insertvalue %struct.float32x4x3_t undef, <4 x float> %3, 0, 0
- %7 = insertvalue %struct.float32x4x3_t %6, <4 x float> %4, 0, 1
- %8 = insertvalue %struct.float32x4x3_t %7, <4 x float> %5, 0, 2
- ret %struct.float32x4x3_t %8
-}
-
-
-define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) {
-; CHECK-LABEL: test_vld1q_f64_x3
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8* %1, i32 8)
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 0
- %4 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 1
- %5 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 2
- %6 = insertvalue %struct.float64x2x3_t undef, <2 x double> %3, 0, 0
- %7 = insertvalue %struct.float64x2x3_t %6, <2 x double> %4, 0, 1
- %8 = insertvalue %struct.float64x2x3_t %7, <2 x double> %5, 0, 2
- ret %struct.float64x2x3_t %8
-}
-
-define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) {
-; CHECK-LABEL: test_vld1_s8_x3
-; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b},
-; [{{x[0-9]+|sp}}]
- %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8* %a, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %2, 0, 0
- %6 = insertvalue %struct.int8x8x3_t %5, <8 x i8> %3, 0, 1
- %7 = insertvalue %struct.int8x8x3_t %6, <8 x i8> %4, 0, 2
- ret %struct.int8x8x3_t %7
-}
-
-define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) {
-; CHECK-LABEL: test_vld1_s16_x3
-; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8* %1, i32 2)
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
- %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
- %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
- %6 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %3, 0, 0
- %7 = insertvalue %struct.int16x4x3_t %6, <4 x i16> %4, 0, 1
- %8 = insertvalue %struct.int16x4x3_t %7, <4 x i16> %5, 0, 2
- ret %struct.int16x4x3_t %8
-}
-
-define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) {
-; CHECK-LABEL: test_vld1_s32_x3
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8* %1, i32 4)
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
- %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
- %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
- %6 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %3, 0, 0
- %7 = insertvalue %struct.int32x2x3_t %6, <2 x i32> %4, 0, 1
- %8 = insertvalue %struct.int32x2x3_t %7, <2 x i32> %5, 0, 2
- ret %struct.int32x2x3_t %8
-}
-
-define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) {
-; CHECK-LABEL: test_vld1_s64_x3
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8* %1, i32 8)
- %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
- %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
- %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
- %6 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %3, 0, 0
- %7 = insertvalue %struct.int64x1x3_t %6, <1 x i64> %4, 0, 1
- %8 = insertvalue %struct.int64x1x3_t %7, <1 x i64> %5, 0, 2
- ret %struct.int64x1x3_t %8
-}
-
-define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) {
-; CHECK-LABEL: test_vld1_f32_x3
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8* %1, i32 4)
- %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 0
- %4 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 1
- %5 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 2
- %6 = insertvalue %struct.float32x2x3_t undef, <2 x float> %3, 0, 0
- %7 = insertvalue %struct.float32x2x3_t %6, <2 x float> %4, 0, 1
- %8 = insertvalue %struct.float32x2x3_t %7, <2 x float> %5, 0, 2
- ret %struct.float32x2x3_t %8
-}
-
-
-define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) {
-; CHECK-LABEL: test_vld1_f64_x3
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d},
-; [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8* %1, i32 8)
- %3 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 0
- %4 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 1
- %5 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 2
- %6 = insertvalue %struct.float64x1x3_t undef, <1 x double> %3, 0, 0
- %7 = insertvalue %struct.float64x1x3_t %6, <1 x double> %4, 0, 1
- %8 = insertvalue %struct.float64x1x3_t %7, <1 x double> %5, 0, 2
- ret %struct.float64x1x3_t %8
-}
-
-define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) {
-; CHECK-LABEL: test_vld1q_s8_x4
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b,
-; v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8* %a, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
- %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
- %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
- %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3
- %6 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %2, 0, 0
- %7 = insertvalue %struct.int8x16x4_t %6, <16 x i8> %3, 0, 1
- %8 = insertvalue %struct.int8x16x4_t %7, <16 x i8> %4, 0, 2
- %9 = insertvalue %struct.int8x16x4_t %8, <16 x i8> %5, 0, 3
- ret %struct.int8x16x4_t %9
-}
-
-define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) {
-; CHECK-LABEL: test_vld1q_s16_x4
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h,
-; v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
- %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
- %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
- %6 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 3
- %7 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %3, 0, 0
- %8 = insertvalue %struct.int16x8x4_t %7, <8 x i16> %4, 0, 1
- %9 = insertvalue %struct.int16x8x4_t %8, <8 x i16> %5, 0, 2
- %10 = insertvalue %struct.int16x8x4_t %9, <8 x i16> %6, 0, 3
- ret %struct.int16x8x4_t %10
-}
-
-define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) {
-; CHECK-LABEL: test_vld1q_s32_x4
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s,
-; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8* %1, i32 4)
- %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
- %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
- %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
- %6 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 3
- %7 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %3, 0, 0
- %8 = insertvalue %struct.int32x4x4_t %7, <4 x i32> %4, 0, 1
- %9 = insertvalue %struct.int32x4x4_t %8, <4 x i32> %5, 0, 2
- %10 = insertvalue %struct.int32x4x4_t %9, <4 x i32> %6, 0, 3
- ret %struct.int32x4x4_t %10
-}
-
-define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) {
-; CHECK-LABEL: test_vld1q_s64_x4
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d,
-; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8* %1, i32 8)
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
- %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
- %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
- %6 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 3
- %7 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %3, 0, 0
- %8 = insertvalue %struct.int64x2x4_t %7, <2 x i64> %4, 0, 1
- %9 = insertvalue %struct.int64x2x4_t %8, <2 x i64> %5, 0, 2
- %10 = insertvalue %struct.int64x2x4_t %9, <2 x i64> %6, 0, 3
- ret %struct.int64x2x4_t %10
-}
-
-define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) {
-; CHECK-LABEL: test_vld1q_f32_x4
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s,
-; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
- %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
- %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
- %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
- %7 = insertvalue %struct.float32x4x4_t undef, <4 x float> %3, 0, 0
- %8 = insertvalue %struct.float32x4x4_t %7, <4 x float> %4, 0, 1
- %9 = insertvalue %struct.float32x4x4_t %8, <4 x float> %5, 0, 2
- %10 = insertvalue %struct.float32x4x4_t %9, <4 x float> %6, 0, 3
- ret %struct.float32x4x4_t %10
-}
-
-define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) {
-; CHECK-LABEL: test_vld1q_f64_x4
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d,
-; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8* %1, i32 8)
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
- %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
- %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
- %6 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
- %7 = insertvalue %struct.float64x2x4_t undef, <2 x double> %3, 0, 0
- %8 = insertvalue %struct.float64x2x4_t %7, <2 x double> %4, 0, 1
- %9 = insertvalue %struct.float64x2x4_t %8, <2 x double> %5, 0, 2
- %10 = insertvalue %struct.float64x2x4_t %9, <2 x double> %6, 0, 3
- ret %struct.float64x2x4_t %10
-}
-
-define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) {
-; CHECK-LABEL: test_vld1_s8_x4
-; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b,
-; v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
- %6 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %2, 0, 0
- %7 = insertvalue %struct.int8x8x4_t %6, <8 x i8> %3, 0, 1
- %8 = insertvalue %struct.int8x8x4_t %7, <8 x i8> %4, 0, 2
- %9 = insertvalue %struct.int8x8x4_t %8, <8 x i8> %5, 0, 3
- ret %struct.int8x8x4_t %9
-}
-
-define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) {
-; CHECK-LABEL: test_vld1_s16_x4
-; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h,
-; v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8* %1, i32 2)
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
- %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
- %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
- %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 3
- %7 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %3, 0, 0
- %8 = insertvalue %struct.int16x4x4_t %7, <4 x i16> %4, 0, 1
- %9 = insertvalue %struct.int16x4x4_t %8, <4 x i16> %5, 0, 2
- %10 = insertvalue %struct.int16x4x4_t %9, <4 x i16> %6, 0, 3
- ret %struct.int16x4x4_t %10
-}
-
-define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) {
-; CHECK-LABEL: test_vld1_s32_x4
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s,
-; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8* %1, i32 4)
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
- %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
- %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
- %6 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
- %7 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %3, 0, 0
- %8 = insertvalue %struct.int32x2x4_t %7, <2 x i32> %4, 0, 1
- %9 = insertvalue %struct.int32x2x4_t %8, <2 x i32> %5, 0, 2
- %10 = insertvalue %struct.int32x2x4_t %9, <2 x i32> %6, 0, 3
- ret %struct.int32x2x4_t %10
-}
-
-define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) {
-; CHECK-LABEL: test_vld1_s64_x4
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d,
-; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8* %1, i32 8)
- %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
- %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
- %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
- %6 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 3
- %7 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %3, 0, 0
- %8 = insertvalue %struct.int64x1x4_t %7, <1 x i64> %4, 0, 1
- %9 = insertvalue %struct.int64x1x4_t %8, <1 x i64> %5, 0, 2
- %10 = insertvalue %struct.int64x1x4_t %9, <1 x i64> %6, 0, 3
- ret %struct.int64x1x4_t %10
-}
-
-define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) {
-; CHECK-LABEL: test_vld1_f32_x4
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s,
-; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8* %1, i32 4)
- %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 0
- %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 1
- %5 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 2
- %6 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 3
- %7 = insertvalue %struct.float32x2x4_t undef, <2 x float> %3, 0, 0
- %8 = insertvalue %struct.float32x2x4_t %7, <2 x float> %4, 0, 1
- %9 = insertvalue %struct.float32x2x4_t %8, <2 x float> %5, 0, 2
- %10 = insertvalue %struct.float32x2x4_t %9, <2 x float> %6, 0, 3
- ret %struct.float32x2x4_t %10
-}
-
-
-define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) {
-; CHECK-LABEL: test_vld1_f64_x4
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d,
-; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8* %1, i32 8)
- %3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 0
- %4 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 1
- %5 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 2
- %6 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 3
- %7 = insertvalue %struct.float64x1x4_t undef, <1 x double> %3, 0, 0
- %8 = insertvalue %struct.float64x1x4_t %7, <1 x double> %4, 0, 1
- %9 = insertvalue %struct.float64x1x4_t %8, <1 x double> %5, 0, 2
- %10 = insertvalue %struct.float64x1x4_t %9, <1 x double> %6, 0, 3
- ret %struct.float64x1x4_t %10
-}
-
-define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b) {
-; CHECK-LABEL: test_vst1q_s8_x2
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <16 x i8>] %b, 0
- %2 = extractvalue [2 x <16 x i8>] %b, 1
- tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b) {
-; CHECK-LABEL: test_vst1q_s16_x2
-; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <8 x i16>] %b, 0
- %2 = extractvalue [2 x <8 x i16>] %b, 1
- %3 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b) {
-; CHECK-LABEL: test_vst1q_s32_x2
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <4 x i32>] %b, 0
- %2 = extractvalue [2 x <4 x i32>] %b, 1
- %3 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v4i32(i8* %3, <4 x i32> %1, <4 x i32> %2, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b) {
-; CHECK-LABEL: test_vst1q_s64_x2
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <2 x i64>] %b, 0
- %2 = extractvalue [2 x <2 x i64>] %b, 1
- %3 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v2i64(i8* %3, <2 x i64> %1, <2 x i64> %2, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b) {
-; CHECK-LABEL: test_vst1q_f32_x2
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <4 x float>] %b, 0
- %2 = extractvalue [2 x <4 x float>] %b, 1
- %3 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v4f32(i8* %3, <4 x float> %1, <4 x float> %2, i32 4)
- ret void
-}
-
-
-define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b) {
-; CHECK-LABEL: test_vst1q_f64_x2
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <2 x double>] %b, 0
- %2 = extractvalue [2 x <2 x double>] %b, 1
- %3 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v2f64(i8* %3, <2 x double> %1, <2 x double> %2, i32 8)
- ret void
-}
-
-define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b) {
-; CHECK-LABEL: test_vst1_s8_x2
-; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <8 x i8>] %b, 0
- %2 = extractvalue [2 x <8 x i8>] %b, 1
- tail call void @llvm.aarch64.neon.vst1x2.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 1)
- ret void
-}
-
-define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b) {
-; CHECK-LABEL: test_vst1_s16_x2
-; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <4 x i16>] %b, 0
- %2 = extractvalue [2 x <4 x i16>] %b, 1
- %3 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v4i16(i8* %3, <4 x i16> %1, <4 x i16> %2, i32 2)
- ret void
-}
-
-define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b) {
-; CHECK-LABEL: test_vst1_s32_x2
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <2 x i32>] %b, 0
- %2 = extractvalue [2 x <2 x i32>] %b, 1
- %3 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 4)
- ret void
-}
-
-define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b) {
-; CHECK-LABEL: test_vst1_s64_x2
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <1 x i64>] %b, 0
- %2 = extractvalue [2 x <1 x i64>] %b, 1
- %3 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v1i64(i8* %3, <1 x i64> %1, <1 x i64> %2, i32 8)
- ret void
-}
-
-define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b) {
-; CHECK-LABEL: test_vst1_f32_x2
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <2 x float>] %b, 0
- %2 = extractvalue [2 x <2 x float>] %b, 1
- %3 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v2f32(i8* %3, <2 x float> %1, <2 x float> %2, i32 4)
- ret void
-}
-
-define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b) {
-; CHECK-LABEL: test_vst1_f64_x2
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <1 x double>] %b, 0
- %2 = extractvalue [2 x <1 x double>] %b, 1
- %3 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v1f64(i8* %3, <1 x double> %1, <1 x double> %2, i32 8)
- ret void
-}
-
-define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b) {
-; CHECK-LABEL: test_vst1q_s8_x3
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <16 x i8>] %b, 0
- %2 = extractvalue [3 x <16 x i8>] %b, 1
- %3 = extractvalue [3 x <16 x i8>] %b, 2
- tail call void @llvm.aarch64.neon.vst1x3.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b) {
-; CHECK-LABEL: test_vst1q_s16_x3
-; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <8 x i16>] %b, 0
- %2 = extractvalue [3 x <8 x i16>] %b, 1
- %3 = extractvalue [3 x <8 x i16>] %b, 2
- %4 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v8i16(i8* %4, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b) {
-; CHECK-LABEL: test_vst1q_s32_x3
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <4 x i32>] %b, 0
- %2 = extractvalue [3 x <4 x i32>] %b, 1
- %3 = extractvalue [3 x <4 x i32>] %b, 2
- %4 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v4i32(i8* %4, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b) {
-; CHECK-LABEL: test_vst1q_s64_x3
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <2 x i64>] %b, 0
- %2 = extractvalue [3 x <2 x i64>] %b, 1
- %3 = extractvalue [3 x <2 x i64>] %b, 2
- %4 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2i64(i8* %4, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b) {
-; CHECK-LABEL: test_vst1q_f32_x3
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <4 x float>] %b, 0
- %2 = extractvalue [3 x <4 x float>] %b, 1
- %3 = extractvalue [3 x <4 x float>] %b, 2
- %4 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 4)
- ret void
-}
-
-define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b) {
-; CHECK-LABEL: test_vst1q_f64_x3
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <2 x double>] %b, 0
- %2 = extractvalue [3 x <2 x double>] %b, 1
- %3 = extractvalue [3 x <2 x double>] %b, 2
- %4 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2f64(i8* %4, <2 x double> %1, <2 x double> %2, <2 x double> %3, i32 8)
- ret void
-}
-
-define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b) {
-; CHECK-LABEL: test_vst1_s8_x3
-; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <8 x i8>] %b, 0
- %2 = extractvalue [3 x <8 x i8>] %b, 1
- %3 = extractvalue [3 x <8 x i8>] %b, 2
- tail call void @llvm.aarch64.neon.vst1x3.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1)
- ret void
-}
-
-define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b) {
-; CHECK-LABEL: test_vst1_s16_x3
-; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <4 x i16>] %b, 0
- %2 = extractvalue [3 x <4 x i16>] %b, 1
- %3 = extractvalue [3 x <4 x i16>] %b, 2
- %4 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2)
- ret void
-}
-
-define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b) {
-; CHECK-LABEL: test_vst1_s32_x3
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <2 x i32>] %b, 0
- %2 = extractvalue [3 x <2 x i32>] %b, 1
- %3 = extractvalue [3 x <2 x i32>] %b, 2
- %4 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
- ret void
-}
-
-define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b) {
-; CHECK-LABEL: test_vst1_s64_x3
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <1 x i64>] %b, 0
- %2 = extractvalue [3 x <1 x i64>] %b, 1
- %3 = extractvalue [3 x <1 x i64>] %b, 2
- %4 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
- ret void
-}
-
-define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b) {
-; CHECK-LABEL: test_vst1_f32_x3
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <2 x float>] %b, 0
- %2 = extractvalue [3 x <2 x float>] %b, 1
- %3 = extractvalue [3 x <2 x float>] %b, 2
- %4 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 4)
- ret void
-}
-
-define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b) {
-; CHECK-LABEL: test_vst1_f64_x3
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d},
-; [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <1 x double>] %b, 0
- %2 = extractvalue [3 x <1 x double>] %b, 1
- %3 = extractvalue [3 x <1 x double>] %b, 2
- %4 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v1f64(i8* %4, <1 x double> %1, <1 x double> %2, <1 x double> %3, i32 8)
- ret void
-}
-
-define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b) {
-; CHECK-LABEL: test_vst1q_s8_x4
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b,
-; v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <16 x i8>] %b, 0
- %2 = extractvalue [4 x <16 x i8>] %b, 1
- %3 = extractvalue [4 x <16 x i8>] %b, 2
- %4 = extractvalue [4 x <16 x i8>] %b, 3
- tail call void @llvm.aarch64.neon.vst1x4.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b) {
-; CHECK-LABEL: test_vst1q_s16_x4
-; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h,
-; v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <8 x i16>] %b, 0
- %2 = extractvalue [4 x <8 x i16>] %b, 1
- %3 = extractvalue [4 x <8 x i16>] %b, 2
- %4 = extractvalue [4 x <8 x i16>] %b, 3
- %5 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v8i16(i8* %5, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, <8 x i16> %4, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b) {
-; CHECK-LABEL: test_vst1q_s32_x4
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s,
-; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <4 x i32>] %b, 0
- %2 = extractvalue [4 x <4 x i32>] %b, 1
- %3 = extractvalue [4 x <4 x i32>] %b, 2
- %4 = extractvalue [4 x <4 x i32>] %b, 3
- %5 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4i32(i8* %5, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b) {
-; CHECK-LABEL: test_vst1q_s64_x4
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d,
-; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <2 x i64>] %b, 0
- %2 = extractvalue [4 x <2 x i64>] %b, 1
- %3 = extractvalue [4 x <2 x i64>] %b, 2
- %4 = extractvalue [4 x <2 x i64>] %b, 3
- %5 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2i64(i8* %5, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b) {
-; CHECK-LABEL: test_vst1q_f32_x4
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s,
-; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <4 x float>] %b, 0
- %2 = extractvalue [4 x <4 x float>] %b, 1
- %3 = extractvalue [4 x <4 x float>] %b, 2
- %4 = extractvalue [4 x <4 x float>] %b, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
- ret void
-}
-
-define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b) {
-; CHECK-LABEL: test_vst1q_f64_x4
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d,
-; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <2 x double>] %b, 0
- %2 = extractvalue [4 x <2 x double>] %b, 1
- %3 = extractvalue [4 x <2 x double>] %b, 2
- %4 = extractvalue [4 x <2 x double>] %b, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
- ret void
-}
-
-define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b) {
-; CHECK-LABEL: test_vst1_s8_x4
-; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b,
-; v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <8 x i8>] %b, 0
- %2 = extractvalue [4 x <8 x i8>] %b, 1
- %3 = extractvalue [4 x <8 x i8>] %b, 2
- %4 = extractvalue [4 x <8 x i8>] %b, 3
- tail call void @llvm.aarch64.neon.vst1x4.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %4, i32 1)
- ret void
-}
-
-define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b) {
-; CHECK-LABEL: test_vst1_s16_x4
-; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h,
-; v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <4 x i16>] %b, 0
- %2 = extractvalue [4 x <4 x i16>] %b, 1
- %3 = extractvalue [4 x <4 x i16>] %b, 2
- %4 = extractvalue [4 x <4 x i16>] %b, 3
- %5 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4i16(i8* %5, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, <4 x i16> %4, i32 2)
- ret void
-}
-
-define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b) {
-; CHECK-LABEL: test_vst1_s32_x4
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s,
-; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <2 x i32>] %b, 0
- %2 = extractvalue [4 x <2 x i32>] %b, 1
- %3 = extractvalue [4 x <2 x i32>] %b, 2
- %4 = extractvalue [4 x <2 x i32>] %b, 3
- %5 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 4)
- ret void
-}
-
-define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b) {
-; CHECK-LABEL: test_vst1_s64_x4
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d,
-; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <1 x i64>] %b, 0
- %2 = extractvalue [4 x <1 x i64>] %b, 1
- %3 = extractvalue [4 x <1 x i64>] %b, 2
- %4 = extractvalue [4 x <1 x i64>] %b, 3
- %5 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v1i64(i8* %5, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, <1 x i64> %4, i32 8)
- ret void
-}
-
-define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b) {
-; CHECK-LABEL: test_vst1_f32_x4
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s,
-; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <2 x float>] %b, 0
- %2 = extractvalue [4 x <2 x float>] %b, 1
- %3 = extractvalue [4 x <2 x float>] %b, 2
- %4 = extractvalue [4 x <2 x float>] %b, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 4)
- ret void
-}
-
-define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b) {
-; CHECK-LABEL: test_vst1_f64_x4
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d,
-; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <1 x double>] %b, 0
- %2 = extractvalue [4 x <1 x double>] %b, 1
- %3 = extractvalue [4 x <1 x double>] %b, 2
- %4 = extractvalue [4 x <1 x double>] %b, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v1f64(i8* %5, <1 x double> %1, <1 x double> %2, <1 x double> %3, <1 x double> %4, i32 8)
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8*, i32)
-declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v4f32(i8*, <4 x float>, <4 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v2f64(i8*, <2 x double>, <2 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v2f32(i8*, <2 x float>, <2 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v1f64(i8*, <1 x double>, <1 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
deleted file mode 100644
index 927c933..0000000
--- a/test/CodeGen/AArch64/neon-simd-ldst-one.ll
+++ /dev/null
@@ -1,2299 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-%struct.uint8x16x2_t = type { [2 x <16 x i8>] }
-%struct.poly8x16x2_t = type { [2 x <16 x i8>] }
-%struct.uint8x16x3_t = type { [3 x <16 x i8>] }
-%struct.int8x16x2_t = type { [2 x <16 x i8>] }
-%struct.int16x8x2_t = type { [2 x <8 x i16>] }
-%struct.int32x4x2_t = type { [2 x <4 x i32>] }
-%struct.int64x2x2_t = type { [2 x <2 x i64>] }
-%struct.float32x4x2_t = type { [2 x <4 x float>] }
-%struct.float64x2x2_t = type { [2 x <2 x double>] }
-%struct.int8x8x2_t = type { [2 x <8 x i8>] }
-%struct.int16x4x2_t = type { [2 x <4 x i16>] }
-%struct.int32x2x2_t = type { [2 x <2 x i32>] }
-%struct.int64x1x2_t = type { [2 x <1 x i64>] }
-%struct.float32x2x2_t = type { [2 x <2 x float>] }
-%struct.float64x1x2_t = type { [2 x <1 x double>] }
-%struct.int8x16x3_t = type { [3 x <16 x i8>] }
-%struct.int16x8x3_t = type { [3 x <8 x i16>] }
-%struct.int32x4x3_t = type { [3 x <4 x i32>] }
-%struct.int64x2x3_t = type { [3 x <2 x i64>] }
-%struct.float32x4x3_t = type { [3 x <4 x float>] }
-%struct.float64x2x3_t = type { [3 x <2 x double>] }
-%struct.int8x8x3_t = type { [3 x <8 x i8>] }
-%struct.int16x4x3_t = type { [3 x <4 x i16>] }
-%struct.int32x2x3_t = type { [3 x <2 x i32>] }
-%struct.int64x1x3_t = type { [3 x <1 x i64>] }
-%struct.float32x2x3_t = type { [3 x <2 x float>] }
-%struct.float64x1x3_t = type { [3 x <1 x double>] }
-%struct.int8x16x4_t = type { [4 x <16 x i8>] }
-%struct.int16x8x4_t = type { [4 x <8 x i16>] }
-%struct.int32x4x4_t = type { [4 x <4 x i32>] }
-%struct.int64x2x4_t = type { [4 x <2 x i64>] }
-%struct.float32x4x4_t = type { [4 x <4 x float>] }
-%struct.float64x2x4_t = type { [4 x <2 x double>] }
-%struct.int8x8x4_t = type { [4 x <8 x i8>] }
-%struct.int16x4x4_t = type { [4 x <4 x i16>] }
-%struct.int32x2x4_t = type { [4 x <2 x i32>] }
-%struct.int64x1x4_t = type { [4 x <1 x i64>] }
-%struct.float32x2x4_t = type { [4 x <2 x float>] }
-%struct.float64x1x4_t = type { [4 x <1 x double>] }
-
-define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
-; CHECK-LABEL: test_ld_from_poll_v16i8
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
- ret <16 x i8> %b
-}
-
-define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
-; CHECK-LABEL: test_ld_from_poll_v8i16
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
- ret <8 x i16> %b
-}
-
-define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
-; CHECK-LABEL: test_ld_from_poll_v4i32
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
- ret <4 x i32> %b
-}
-
-define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
-; CHECK-LABEL: test_ld_from_poll_v2i64
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <2 x i64> %a, <i64 1, i64 2>
- ret <2 x i64> %b
-}
-
-define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
-; CHECK-LABEL: test_ld_from_poll_v4f32
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
- ret <4 x float> %b
-}
-
-define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
-; CHECK-LABEL: test_ld_from_poll_v2f64
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = fadd <2 x double> %a, <double 1.0, double 2.0>
- ret <2 x double> %b
-}
-
-define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
-; CHECK-LABEL: test_ld_from_poll_v8i8
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
- ret <8 x i8> %b
-}
-
-define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
-; CHECK-LABEL: test_ld_from_poll_v4i16
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
- ret <4 x i16> %b
-}
-
-define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
-; CHECK-LABEL: test_ld_from_poll_v2i32
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <2 x i32> %a, <i32 1, i32 2>
- ret <2 x i32> %b
-}
-
-define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld1q_dup_s8
-; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
-entry:
- %0 = load i8* %a, align 1
- %1 = insertelement <16 x i8> undef, i8 %0, i32 0
- %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
- ret <16 x i8> %lane
-}
-
-define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld1q_dup_s16
-; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
-entry:
- %0 = load i16* %a, align 2
- %1 = insertelement <8 x i16> undef, i16 %0, i32 0
- %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
- ret <8 x i16> %lane
-}
-
-define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld1q_dup_s32
-; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = load i32* %a, align 4
- %1 = insertelement <4 x i32> undef, i32 %0, i32 0
- %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
- ret <4 x i32> %lane
-}
-
-define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld1q_dup_s64
-; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = load i64* %a, align 8
- %1 = insertelement <2 x i64> undef, i64 %0, i32 0
- %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
- ret <2 x i64> %lane
-}
-
-define <4 x float> @test_vld1q_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld1q_dup_f32
-; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = load float* %a, align 4
- %1 = insertelement <4 x float> undef, float %0, i32 0
- %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- ret <4 x float> %lane
-}
-
-define <2 x double> @test_vld1q_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld1q_dup_f64
-; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = load double* %a, align 8
- %1 = insertelement <2 x double> undef, double %0, i32 0
- %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
- ret <2 x double> %lane
-}
-
-define <8 x i8> @test_vld1_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld1_dup_s8
-; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
-entry:
- %0 = load i8* %a, align 1
- %1 = insertelement <8 x i8> undef, i8 %0, i32 0
- %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
- ret <8 x i8> %lane
-}
-
-define <4 x i16> @test_vld1_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld1_dup_s16
-; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
-entry:
- %0 = load i16* %a, align 2
- %1 = insertelement <4 x i16> undef, i16 %0, i32 0
- %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
- ret <4 x i16> %lane
-}
-
-define <2 x i32> @test_vld1_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld1_dup_s32
-; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = load i32* %a, align 4
- %1 = insertelement <2 x i32> undef, i32 %0, i32 0
- %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
- ret <2 x i32> %lane
-}
-
-define <1 x i64> @test_vld1_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld1_dup_s64
-; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = load i64* %a, align 8
- %1 = insertelement <1 x i64> undef, i64 %0, i32 0
- ret <1 x i64> %1
-}
-
-define <2 x float> @test_vld1_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld1_dup_f32
-; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = load float* %a, align 4
- %1 = insertelement <2 x float> undef, float %0, i32 0
- %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
- ret <2 x float> %lane
-}
-
-define <1 x double> @test_vld1_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld1_dup_f64
-; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = load double* %a, align 8
- %1 = insertelement <1 x double> undef, double %0, i32 0
- ret <1 x double> %1
-}
-
-define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
-; As there is a store operation depending on %1, LD1R pattern can't be selected.
-; So LDR and FMOV should be emitted.
-; CHECK-LABEL: testDUP.v1i64
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
- %1 = load i64* %a, align 8
- store i64 %1, i64* %b, align 8
- %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
- ret <1 x i64> %vecinit.i
-}
-
-define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
-; As there is a store operation depending on %1, LD1R pattern can't be selected.
-; So LDR and FMOV should be emitted.
-; CHECK-LABEL: testDUP.v1f64
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
-; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
- %1 = load double* %a, align 8
- store double %1, double* %b, align 8
- %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
- ret <1 x double> %vecinit.i
-}
-
-define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld2q_dup_s8
-; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
-entry:
- %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
- %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
- %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
- %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
- ret %struct.int8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld2q_dup_s16
-; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
- %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
- %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
- ret %struct.int16x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld2q_dup_s32
-; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
- %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
- %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
- ret %struct.int32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld2q_dup_s64
-; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
- %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
- %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
- ret %struct.int64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld2q_dup_f32
-; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
- %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
- %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
- ret %struct.float32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld2q_dup_f64
-; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
- %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
- %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
- ret %struct.float64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld2_dup_s8
-; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
-entry:
- %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
- %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
- %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
- %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
- ret %struct.int8x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld2_dup_s16
-; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
- %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
- %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
- ret %struct.int16x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld2_dup_s32
-; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
- %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
- %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
- ret %struct.int32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld2_dup_s64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
- %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
- ret %struct.int64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld2_dup_f32
-; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
- %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
- %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
- ret %struct.float32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld2_dup_f64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
- %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
- ret %struct.float64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld3q_dup_s8
-; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
-entry:
- %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
- %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
- %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
- %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
- %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
- %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
- ret %struct.int8x16x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld3q_dup_s16
-; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
- %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
- %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
- %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
- ret %struct.int16x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld3q_dup_s32
-; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
- %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
- %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
- %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
- ret %struct.int32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld3q_dup_s64
-; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
- %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
- %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
- %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
- ret %struct.int64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld3q_dup_f32
-; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
- %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
- %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
- %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
- ret %struct.float32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld3q_dup_f64
-; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
- %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
- %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
- %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
- ret %struct.float64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld3_dup_s8
-; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
-entry:
- %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
- %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
- %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
- %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
- %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
- ret %struct.int8x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld3_dup_s16
-; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
- %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
- %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
- %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
- ret %struct.int16x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld3_dup_s32
-; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
- %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
- %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
- %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
- ret %struct.int32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld3_dup_s64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
- %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
- %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
- ret %struct.int64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld3_dup_f32
-; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
- %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
- %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
- %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
- ret %struct.float32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld3_dup_f64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
- %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
- %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
- ret %struct.float64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld4q_dup_s8
-; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
-entry:
- %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
- %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
- %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
- %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
- %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
- %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
- %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
- %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
- ret %struct.int8x16x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld4q_dup_s16
-; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
- %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
- %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
- %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
- %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
- %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
- ret %struct.int16x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld4q_dup_s32
-; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
- %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
- %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
- %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
- %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
- %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
- ret %struct.int32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld4q_dup_s64
-; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
- %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
- %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
- %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
- %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
- %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
- ret %struct.int64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld4q_dup_f32
-; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
- %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
- %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
- %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
- %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
- %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
- ret %struct.float32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld4q_dup_f64
-; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
- %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
- %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
- %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
- %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
- %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
- ret %struct.float64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld4_dup_s8
-; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
-entry:
- %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
- %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
- %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
- %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
- %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
- %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
- ret %struct.int8x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld4_dup_s16
-; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
- %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
- %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
- %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
- %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
- %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
- ret %struct.int16x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld4_dup_s32
-; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
- %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
- %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
- %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
- %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
- %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
- ret %struct.int32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld4_dup_s64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
- %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
- %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
- %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
- ret %struct.int64x1x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld4_dup_f32
-; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
- %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
- %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
- %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
- %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
- %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
- ret %struct.float32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld4_dup_f64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
- %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
- %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
- %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
- ret %struct.float64x1x4_t %.fca.0.3.insert
-}
-
-define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
-; CHECK-LABEL: test_vld1q_lane_s8
-; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i8* %a, align 1
- %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
- ret <16 x i8> %vld1_lane
-}
-
-define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vld1q_lane_s16
-; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i16* %a, align 2
- %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
- ret <8 x i16> %vld1_lane
-}
-
-define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
-; CHECK-LABEL: test_vld1q_lane_s32
-; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i32* %a, align 4
- %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
- ret <4 x i32> %vld1_lane
-}
-
-define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vld1q_lane_s64
-; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i64* %a, align 8
- %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
- ret <2 x i64> %vld1_lane
-}
-
-define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
-; CHECK-LABEL: test_vld1q_lane_f32
-; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = load float* %a, align 4
- %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
- ret <4 x float> %vld1_lane
-}
-
-define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
-; CHECK-LABEL: test_vld1q_lane_f64
-; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = load double* %a, align 8
- %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
- ret <2 x double> %vld1_lane
-}
-
-define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
-; CHECK-LABEL: test_vld1_lane_s8
-; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i8* %a, align 1
- %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
- ret <8 x i8> %vld1_lane
-}
-
-define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vld1_lane_s16
-; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i16* %a, align 2
- %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
- ret <4 x i16> %vld1_lane
-}
-
-define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vld1_lane_s32
-; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i32* %a, align 4
- %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
- ret <2 x i32> %vld1_lane
-}
-
-define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vld1_lane_s64
-; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = load i64* %a, align 8
- %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
- ret <1 x i64> %vld1_lane
-}
-
-define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
-; CHECK-LABEL: test_vld1_lane_f32
-; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = load float* %a, align 4
- %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
- ret <2 x float> %vld1_lane
-}
-
-define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
-; CHECK-LABEL: test_vld1_lane_f64
-; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = load double* %a, align 8
- %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
- ret <1 x double> %vld1_lane
-}
-
-define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_s16
-; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %0 = bitcast i16* %a to i8*
- %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
- %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int16x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_s32
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
- %0 = bitcast i32* %a to i8*
- %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
- %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_s64
-; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
- %0 = bitcast i64* %a to i8*
- %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
- %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_f32
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
- %0 = bitcast float* %a to i8*
- %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
- %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.float32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_f64
-; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
- %0 = bitcast double* %a to i8*
- %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
- %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.float64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_s8
-; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
- %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
- %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int8x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_s16
-; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
- %0 = bitcast i16* %a to i8*
- %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
- %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int16x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_s32
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %0 = bitcast i32* %a to i8*
- %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
- %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_s64
-; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
- %0 = bitcast i64* %a to i8*
- %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
- %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_f32
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
- %0 = bitcast float* %a to i8*
- %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
- %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.float32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_f64
-; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
- %0 = bitcast double* %a to i8*
- %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
- %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.float64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_s16
-; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
- %0 = bitcast i16* %a to i8*
- %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
- %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int16x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_s32
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
- %0 = bitcast i32* %a to i8*
- %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_s64
-; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
- %0 = bitcast i64* %a to i8*
- %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_f32
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
- %0 = bitcast float* %a to i8*
- %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.float32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_f64
-; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
- %0 = bitcast double* %a to i8*
- %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.float64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_s8
-; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
- %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int8x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_s16
-; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
- %0 = bitcast i16* %a to i8*
- %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
- %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int16x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_s32
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %0 = bitcast i32* %a to i8*
- %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_s64
-; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %0 = bitcast i64* %a to i8*
- %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_f32
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
- %0 = bitcast float* %a to i8*
- %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.float32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_f64
-; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
- %0 = bitcast double* %a to i8*
- %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.float64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_s8
-; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int8x16x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_s16
-; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
- %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int16x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_s32
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
- %0 = bitcast i32* %a to i8*
- %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_s64
-; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
- %0 = bitcast i64* %a to i8*
- %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_f32
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
- %0 = bitcast float* %a to i8*
- %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.float32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_f64
-; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
- %0 = bitcast double* %a to i8*
- %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.float64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_s8
-; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
- %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int8x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_s16
-; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
- %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int16x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_s32
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
- %0 = bitcast i32* %a to i8*
- %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_s64
-; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
- %0 = bitcast i64* %a to i8*
- %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int64x1x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_f32
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
- %0 = bitcast float* %a to i8*
- %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.float32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_f64
-; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
- %0 = bitcast double* %a to i8*
- %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.float64x1x4_t %.fca.0.3.insert
-}
-
-define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
-; CHECK-LABEL: test_vst1q_lane_s8
-; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <16 x i8> %b, i32 15
- store i8 %0, i8* %a, align 1
- ret void
-}
-
-define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vst1q_lane_s16
-; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <8 x i16> %b, i32 7
- store i16 %0, i16* %a, align 2
- ret void
-}
-
-define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
-; CHECK-LABEL: test_vst1q_lane_s32
-; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <4 x i32> %b, i32 3
- store i32 %0, i32* %a, align 4
- ret void
-}
-
-define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vst1q_lane_s64
-; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <2 x i64> %b, i32 1
- store i64 %0, i64* %a, align 8
- ret void
-}
-
-define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
-; CHECK-LABEL: test_vst1q_lane_f32
-; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <4 x float> %b, i32 3
- store float %0, float* %a, align 4
- ret void
-}
-
-define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
-; CHECK-LABEL: test_vst1q_lane_f64
-; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <2 x double> %b, i32 1
- store double %0, double* %a, align 8
- ret void
-}
-
-define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
-; CHECK-LABEL: test_vst1_lane_s8
-; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <8 x i8> %b, i32 7
- store i8 %0, i8* %a, align 1
- ret void
-}
-
-define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vst1_lane_s16
-; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <4 x i16> %b, i32 3
- store i16 %0, i16* %a, align 2
- ret void
-}
-
-define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vst1_lane_s32
-; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <2 x i32> %b, i32 1
- store i32 %0, i32* %a, align 4
- ret void
-}
-
-define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vst1_lane_s64
-; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <1 x i64> %b, i32 0
- store i64 %0, i64* %a, align 8
- ret void
-}
-
-define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
-; CHECK-LABEL: test_vst1_lane_f32
-; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <2 x float> %b, i32 1
- store float %0, float* %a, align 4
- ret void
-}
-
-define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
-; CHECK-LABEL: test_vst1_lane_f64
-; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <1 x double> %b, i32 0
- store double %0, double* %a, align 8
- ret void
-}
-
-define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_s8
-; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
- ret void
-}
-
-define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_s16
-; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
- ret void
-}
-
-define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_s32
-; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_s64
-; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_f32
-; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_f64
-; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_s8
-; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
- ret void
-}
-
-define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_s16
-; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
- ret void
-}
-
-define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_s32
-; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_s64
-; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_f32
-; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_f64
-; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_s8
-; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
- ret void
-}
-
-define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_s16
-; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
- ret void
-}
-
-define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_s32
-; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_s64
-; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_f32
-; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_f64
-; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_s8
-; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
- ret void
-}
-
-define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_s16
-; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
- ret void
-}
-
-define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_s32
-; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_s64
-; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_f32
-; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_f64
-; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_s8
-; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2)
- ret void
-}
-
-define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_s16
-; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
- ret void
-}
-
-define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_s32
-; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_s64
-; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_f32
-; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_f64
-; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_s8
-; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
- tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
- ret void
-}
-
-define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_s16
-; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
- ret void
-}
-
-define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_s32
-; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_s64
-; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_f32
-; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_f64
-; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
-declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
-declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
-declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
-declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
-declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
-declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
-declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
-
-define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld2q_lane_s8
-; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
- %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
- %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld2q_lane_u8
-; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
- %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
- %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.uint8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld2q_lane_p8
-; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
- %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
- %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.poly8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld3q_lane_s8
-; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
- %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
- %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int8x16x3_t %.fca.0.2.insert
-}
-
-define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld3q_lane_u8
-; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
- %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
- %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.uint8x16x3_t %.fca.0.2.insert
-}
-
diff --git a/test/CodeGen/AArch64/neon-simd-ldst.ll b/test/CodeGen/AArch64/neon-simd-ldst.ll
deleted file mode 100644
index afc0901..0000000
--- a/test/CodeGen/AArch64/neon-simd-ldst.ll
+++ /dev/null
@@ -1,164 +0,0 @@
-; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define void @test_ldstq_4v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldstq_4v
-; CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
-; CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
-entry:
- %tobool62 = icmp eq i32 %count, 0
- br i1 %tobool62, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.063 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.063, -1
- %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %io, i32 1)
- %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
- tail call void @llvm.arm.neon.vst4.v16i8(i8* %io, <16 x i8> %vld4.fca.0.extract, <16 x i8> %vld4.fca.1.extract, <16 x i8> %vld4.fca.2.extract, <16 x i8> %vld4.fca.3.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-
-define void @test_ldstq_3v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldstq_3v
-; CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0]
-; CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0]
-entry:
- %tobool47 = icmp eq i32 %count, 0
- br i1 %tobool47, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.048 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.048, -1
- %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %io, i32 1)
- %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
- tail call void @llvm.arm.neon.vst3.v16i8(i8* %io, <16 x i8> %vld3.fca.0.extract, <16 x i8> %vld3.fca.1.extract, <16 x i8> %vld3.fca.2.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-
-define void @test_ldstq_2v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldstq_2v
-; CHECK: ld2 {v0.16b, v1.16b}, [x0]
-; CHECK: st2 {v0.16b, v1.16b}, [x0]
-entry:
- %tobool22 = icmp eq i32 %count, 0
- br i1 %tobool22, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.023, -1
- %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %io, i32 1)
- %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
- tail call void @llvm.arm.neon.vst2.v16i8(i8* %io, <16 x i8> %vld2.fca.0.extract, <16 x i8> %vld2.fca.1.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-
-define void @test_ldst_4v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldst_4v
-; CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
-; CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
-entry:
- %tobool42 = icmp eq i32 %count, 0
- br i1 %tobool42, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.043 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.043, -1
- %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %io, i32 1)
- %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
- tail call void @llvm.arm.neon.vst4.v8i8(i8* %io, <8 x i8> %vld4.fca.0.extract, <8 x i8> %vld4.fca.1.extract, <8 x i8> %vld4.fca.2.extract, <8 x i8> %vld4.fca.3.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-
-define void @test_ldst_3v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldst_3v
-; CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0]
-; CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0]
-entry:
- %tobool32 = icmp eq i32 %count, 0
- br i1 %tobool32, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.033 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.033, -1
- %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %io, i32 1)
- %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
- tail call void @llvm.arm.neon.vst3.v8i8(i8* %io, <8 x i8> %vld3.fca.0.extract, <8 x i8> %vld3.fca.1.extract, <8 x i8> %vld3.fca.2.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-
-define void @test_ldst_2v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldst_2v
-; CHECK: ld2 {v0.8b, v1.8b}, [x0]
-; CHECK: st2 {v0.8b, v1.8b}, [x0]
-entry:
- %tobool22 = icmp eq i32 %count, 0
- br i1 %tobool22, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.023, -1
- %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %io, i32 1)
- %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
- tail call void @llvm.arm.neon.vst2.v8i8(i8* %io, <8 x i8> %vld2.fca.0.extract, <8 x i8> %vld2.fca.1.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
deleted file mode 100644
index 156fe1d..0000000
--- a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
+++ /dev/null
@@ -1,354 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-;Check for a post-increment updating load.
-define <4 x i16> @test_vld1_fx_update(i16** %ptr) nounwind {
-; CHECK: test_vld1_fx_update
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #8
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 2)
- %tmp2 = getelementptr i16* %A, i32 4
- store i16* %tmp2, i16** %ptr
- ret <4 x i16> %tmp1
-}
-
-;Check for a post-increment updating load with register increment.
-define <2 x i32> @test_vld1_reg_update(i32** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld1_reg_update
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i32** %ptr
- %tmp0 = bitcast i32* %A to i8*
- %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 4)
- %tmp2 = getelementptr i32* %A, i32 %inc
- store i32* %tmp2, i32** %ptr
- ret <2 x i32> %tmp1
-}
-
-define <2 x float> @test_vld2_fx_update(float** %ptr) nounwind {
-; CHECK: test_vld2_fx_update
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
- %A = load float** %ptr
- %tmp0 = bitcast float* %A to i8*
- %tmp1 = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 4)
- %tmp2 = extractvalue { <2 x float>, <2 x float> } %tmp1, 0
- %tmp3 = getelementptr float* %A, i32 4
- store float* %tmp3, float** %ptr
- ret <2 x float> %tmp2
-}
-
-define <16 x i8> @test_vld2_reg_update(i8** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld2_reg_update
-; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- %tmp0 = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %A, i32 1)
- %tmp1 = extractvalue { <16 x i8>, <16 x i8> } %tmp0, 0
- %tmp2 = getelementptr i8* %A, i32 %inc
- store i8* %tmp2, i8** %ptr
- ret <16 x i8> %tmp1
-}
-
-define <4 x i32> @test_vld3_fx_update(i32** %ptr) nounwind {
-; CHECK: test_vld3_fx_update
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #48
- %A = load i32** %ptr
- %tmp0 = bitcast i32* %A to i8*
- %tmp1 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 4)
- %tmp2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %tmp1, 0
- %tmp3 = getelementptr i32* %A, i32 12
- store i32* %tmp3, i32** %ptr
- ret <4 x i32> %tmp2
-}
-
-define <4 x i16> @test_vld3_reg_update(i16** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld3_reg_update
-; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- %tmp1 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 2)
- %tmp2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %tmp1, 0
- %tmp3 = getelementptr i16* %A, i32 %inc
- store i16* %tmp3, i16** %ptr
- ret <4 x i16> %tmp2
-}
-
-define <8 x i16> @test_vld4_fx_update(i16** %ptr) nounwind {
-; CHECK: test_vld4_fx_update
-; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], #64
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- %tmp1 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8)
- %tmp2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %tmp1, 0
- %tmp3 = getelementptr i16* %A, i32 32
- store i16* %tmp3, i16** %ptr
- ret <8 x i16> %tmp2
-}
-
-define <8 x i8> @test_vld4_reg_update(i8** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld4_reg_update
-; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- %tmp0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %A, i32 1)
- %tmp1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %tmp0, 0
- %tmp2 = getelementptr i8* %A, i32 %inc
- store i8* %tmp2, i8** %ptr
- ret <8 x i8> %tmp1
-}
-
-define void @test_vst1_fx_update(float** %ptr, <2 x float> %B) nounwind {
-; CHECK: test_vst1_fx_update
-; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #8
- %A = load float** %ptr
- %tmp0 = bitcast float* %A to i8*
- call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %B, i32 4)
- %tmp2 = getelementptr float* %A, i32 2
- store float* %tmp2, float** %ptr
- ret void
-}
-
-define void @test_vst1_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind {
-; CHECK: test_vst1_reg_update
-; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %B, i32 2)
- %tmp1 = getelementptr i16* %A, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret void
-}
-
-define void @test_vst2_fx_update(i64** %ptr, <1 x i64> %B) nounwind {
-; CHECK: test_vst2_fx_update
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}], #16
- %A = load i64** %ptr
- %tmp0 = bitcast i64* %A to i8*
- call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %B, <1 x i64> %B, i32 8)
- %tmp1 = getelementptr i64* %A, i32 2
- store i64* %tmp1, i64** %ptr
- ret void
-}
-
-define void @test_vst2_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind {
-; CHECK: test_vst2_reg_update
-; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, i32 4)
- %tmp0 = getelementptr i8* %A, i32 %inc
- store i8* %tmp0, i8** %ptr
- ret void
-}
-
-define void @test_vst3_fx_update(i32** %ptr, <2 x i32> %B) nounwind {
-; CHECK: test_vst3_fx_update
-; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #24
- %A = load i32** %ptr
- %tmp0 = bitcast i32* %A to i8*
- call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %B, <2 x i32> %B, <2 x i32> %B, i32 4)
- %tmp1 = getelementptr i32* %A, i32 6
- store i32* %tmp1, i32** %ptr
- ret void
-}
-
-define void @test_vst3_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind {
-; CHECK: test_vst3_reg_update
-; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %B, <8 x i16> %B, <8 x i16> %B, i32 2)
- %tmp1 = getelementptr i16* %A, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret void
-}
-
-define void @test_vst4_fx_update(float** %ptr, <4 x float> %B) nounwind {
-; CHECK: test_vst4_fx_update
-; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}], #64
- %A = load float** %ptr
- %tmp0 = bitcast float* %A to i8*
- call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %B, <4 x float> %B, <4 x float> %B, <4 x float> %B, i32 4)
- %tmp1 = getelementptr float* %A, i32 16
- store float* %tmp1, float** %ptr
- ret void
-}
-
-define void @test_vst4_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind {
-; CHECK: test_vst4_reg_update
-; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, i32 1)
- %tmp0 = getelementptr i8* %A, i32 %inc
- store i8* %tmp0, i8** %ptr
- ret void
-}
-
-
-declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
-declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-
-define <16 x i8> @test_vld1x2_fx_update(i8* %a, i8** %ptr) {
-; CHECK: test_vld1x2_fx_update
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32
- %1 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
- %tmp1 = getelementptr i8* %a, i32 32
- store i8* %tmp1, i8** %ptr
- ret <16 x i8> %2
-}
-
-define <8 x i16> @test_vld1x2_reg_update(i16* %a, i16** %ptr, i32 %inc) {
-; CHECK: test_vld1x2_reg_update
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret <8 x i16> %3
-}
-
-define <2 x i64> @test_vld1x3_fx_update(i64* %a, i64** %ptr) {
-; CHECK: test_vld1x3_fx_update
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], #48
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
- %tmp1 = getelementptr i64* %a, i32 6
- store i64* %tmp1, i64** %ptr
- ret <2 x i64> %3
-}
-
-define <8 x i16> @test_vld1x3_reg_update(i16* %a, i16** %ptr, i32 %inc) {
-; CHECK: test_vld1x3_reg_update
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret <8 x i16> %3
-}
-
-define <4 x float> @test_vld1x4_fx_update(float* %a, float** %ptr) {
-; CHECK: test_vld1x4_fx_update
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64
- %1 = bitcast float* %a to i8*
- %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
- %tmp1 = getelementptr float* %a, i32 16
- store float* %tmp1, float** %ptr
- ret <4 x float> %3
-}
-
-define <8 x i8> @test_vld1x4_reg_update(i8* readonly %a, i8** %ptr, i32 %inc) #0 {
-; CHECK: test_vld1x4_reg_update
-; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %tmp1 = getelementptr i8* %a, i32 %inc
- store i8* %tmp1, i8** %ptr
- ret <8 x i8> %2
-}
-
-define void @test_vst1x2_fx_update(i8* %a, [2 x <16 x i8>] %b.coerce, i8** %ptr) #2 {
-; CHECK: test_vst1x2_fx_update
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32
- %1 = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %2 = extractvalue [2 x <16 x i8>] %b.coerce, 1
- tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1)
- %tmp1 = getelementptr i8* %a, i32 32
- store i8* %tmp1, i8** %ptr
- ret void
-}
-
-define void @test_vst1x2_reg_update(i16* %a, [2 x <8 x i16>] %b.coerce, i16** %ptr, i32 %inc) #2 {
-; CHECK: test_vst1x2_reg_update
-; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %2 = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %3 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2)
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret void
-}
-
-define void @test_vst1x3_fx_update(i32* %a, [3 x <2 x i32>] %b.coerce, i32** %ptr) #2 {
-; CHECK: test_vst1x3_fx_update
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #24
- %1 = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %2 = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %3 = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %4 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
- %tmp1 = getelementptr i32* %a, i32 6
- store i32* %tmp1, i32** %ptr
- ret void
-}
-
-define void @test_vst1x3_reg_update(i64* %a, [3 x <1 x i64>] %b.coerce, i64** %ptr, i32 %inc) #2 {
-; CHECK: test_vst1x3_reg_update
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %2 = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %3 = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %4 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
- %tmp1 = getelementptr i64* %a, i32 %inc
- store i64* %tmp1, i64** %ptr
- ret void
-}
-
-define void @test_vst1x4_fx_update(float* %a, [4 x <4 x float>] %b.coerce, float** %ptr) #2 {
-; CHECK: test_vst1x4_fx_update
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64
- %1 = extractvalue [4 x <4 x float>] %b.coerce, 0
- %2 = extractvalue [4 x <4 x float>] %b.coerce, 1
- %3 = extractvalue [4 x <4 x float>] %b.coerce, 2
- %4 = extractvalue [4 x <4 x float>] %b.coerce, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
- %tmp1 = getelementptr float* %a, i32 16
- store float* %tmp1, float** %ptr
- ret void
-}
-
-define void @test_vst1x4_reg_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr, i32 %inc) #2 {
-; CHECK: test_vst1x4_reg_update
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
- %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
- %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
- %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
- %tmp1 = getelementptr double* %a, i32 %inc
- store double* %tmp1, double** %ptr
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
-declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #3
-declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) #3
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
deleted file mode 100644
index 80a9347..0000000
--- a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
+++ /dev/null
@@ -1,319 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) {
-; CHECK-LABEL: test_vld2q_dup_fx_update
-; CHECK: ld2r {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #2
- %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
- %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
- %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
- %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
- %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0
- %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1
- %tmp1 = getelementptr i8* %a, i32 2
- store i8* %tmp1, i8** %ptr
- ret { [2 x <16 x i8>] } %7
-}
-
-define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld2q_dup_reg_update
-; CHECK: ld2r {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
- %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
- %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
- %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
- %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer
- %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0
- %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1
- %tmp1 = getelementptr i32* %a, i32 %inc
- store i32* %tmp1, i32** %ptr
- ret { [2 x <4 x i32>] } %8
-}
-
-define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) {
-; CHECK-LABEL: test_vld3_dup_fx_update
-; CHECK: ld3r {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #6
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
- %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
- %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
- %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer
- %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
- %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer
- %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %4, 0, 0
- %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %6, 0, 1
- %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
- %tmp1 = getelementptr i16* %a, i32 3
- store i16* %tmp1, i16** %ptr
- ret { [3 x <4 x i16>] } %11
-}
-
-define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld3_dup_reg_update
-; CHECK: ld3r {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer
- %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer
- %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0
- %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1
- %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2
- %tmp1 = getelementptr i8* %a, i32 %inc
- store i8* %tmp1, i8** %ptr
- ret { [3 x <8 x i8>] }%10
-}
-
-define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) #0 {
-; CHECK-LABEL: test_vld4_dup_fx_update
-; CHECK: ld4r {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
- %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
- %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
- %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer
- %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
- %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer
- %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
- %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer
- %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0
- %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1
- %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2
- %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
- %tmp1 = getelementptr i32* %a, i32 4
- store i32* %tmp1, i32** %ptr
- ret { [4 x <2 x i32>] } %14
-}
-
-define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld4_dup_reg_update
-; CHECK: ld4r {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = bitcast double* %a to i8*
- %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
- %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
- %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
- %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer
- %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
- %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer
- %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
- %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer
- %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0
- %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1
- %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2
- %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
- %tmp1 = getelementptr double* %a, i32 %inc
- store double* %tmp1, double** %ptr
- ret { [4 x <2 x double>] } %14
-}
-
-define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
-; CHECK-LABEL: test_vld2_lane_fx_update
-; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
- %1 = extractvalue [2 x <8 x i8>] %b, 0
- %2 = extractvalue [2 x <8 x i8>] %b, 1
- %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
- %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
- %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
- %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
- %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
- %tmp1 = getelementptr i8* %a, i32 2
- store i8* %tmp1, i8** %ptr
- ret { [2 x <8 x i8>] } %7
-}
-
-define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld2_lane_reg_update
-; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[6], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [2 x <8 x i8>] %b, 0
- %2 = extractvalue [2 x <8 x i8>] %b, 1
- %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1)
- %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
- %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
- %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
- %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
- %tmp1 = getelementptr i8* %a, i32 %inc
- store i8* %tmp1, i8** %ptr
- ret { [2 x <8 x i8>] } %7
-}
-
-define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) {
-; CHECK-LABEL: test_vld3_lane_fx_update
-; CHECK: ld3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #12
- %1 = extractvalue [3 x <2 x float>] %b, 0
- %2 = extractvalue [3 x <2 x float>] %b, 1
- %3 = extractvalue [3 x <2 x float>] %b, 2
- %4 = bitcast float* %a to i8*
- %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4)
- %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0
- %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1
- %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2
- %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0
- %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1
- %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2
- %tmp1 = getelementptr float* %a, i32 3
- store float* %tmp1, float** %ptr
- ret { [3 x <2 x float>] } %11
-}
-
-define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld3_lane_reg_update
-; CHECK: ld3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [3 x <4 x i16>] %b, 0
- %2 = extractvalue [3 x <4 x i16>] %b, 1
- %3 = extractvalue [3 x <4 x i16>] %b, 2
- %4 = bitcast i16* %a to i8*
- %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
- %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0
- %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1
- %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2
- %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0
- %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1
- %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret { [3 x <4 x i16>] } %11
-}
-
-define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) {
-; CHECK-LABEL: test_vld4_lane_fx_update
-; CHECK: ld4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #16
- %1 = extractvalue [4 x <2 x i32>] %b, 0
- %2 = extractvalue [4 x <2 x i32>] %b, 1
- %3 = extractvalue [4 x <2 x i32>] %b, 2
- %4 = extractvalue [4 x <2 x i32>] %b, 3
- %5 = bitcast i32* %a to i8*
- %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4)
- %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0
- %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1
- %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2
- %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3
- %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0
- %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %8, 0, 1
- %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2
- %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
- %tmp1 = getelementptr i32* %a, i32 4
- store i32* %tmp1, i32** %ptr
- ret { [4 x <2 x i32>] } %14
-}
-
-define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld4_lane_reg_update
-; CHECK: ld4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [4 x <2 x double>] %b, 0
- %2 = extractvalue [4 x <2 x double>] %b, 1
- %3 = extractvalue [4 x <2 x double>] %b, 2
- %4 = extractvalue [4 x <2 x double>] %b, 3
- %5 = bitcast double* %a to i8*
- %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
- %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0
- %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1
- %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2
- %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3
- %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0
- %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1
- %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2
- %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
- %tmp1 = getelementptr double* %a, i32 %inc
- store double* %tmp1, double** %ptr
- ret { [4 x <2 x double>] } %14
-}
-
-define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
-; CHECK-LABEL: test_vst2_lane_fx_update
-; CHECK: st2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
- %1 = extractvalue [2 x <8 x i8>] %b, 0
- %2 = extractvalue [2 x <8 x i8>] %b, 1
- call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
- %tmp1 = getelementptr i8* %a, i32 2
- store i8* %tmp1, i8** %ptr
- ret void
-}
-
-define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vst2_lane_reg_update
-; CHECK: st2 {v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %2 = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %3 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
- %tmp1 = getelementptr i32* %a, i32 %inc
- store i32* %tmp1, i32** %ptr
- ret void
-}
-
-define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) {
-; CHECK-LABEL: test_vst3_lane_fx_update
-; CHECK: st3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[3], [x{{[0-9]+|sp}}], #12
- %1 = extractvalue [3 x <4 x float>] %b, 0
- %2 = extractvalue [3 x <4 x float>] %b, 1
- %3 = extractvalue [3 x <4 x float>] %b, 2
- %4 = bitcast float* %a to i8*
- call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4)
- %tmp1 = getelementptr float* %a, i32 3
- store float* %tmp1, float** %ptr
- ret void
-}
-
-; Function Attrs: nounwind
-define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vst3_lane_reg_update
-; CHECK: st3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [3 x <4 x i16>] %b, 0
- %2 = extractvalue [3 x <4 x i16>] %b, 1
- %3 = extractvalue [3 x <4 x i16>] %b, 2
- %4 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret void
-}
-
-define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) {
-; CHECK-LABEL: test_vst4_lane_fx_update
-; CHECK: st4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], #32
- %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
- %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
- %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
- %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
- %tmp1 = getelementptr double* %a, i32 4
- store double* %tmp1, double** %ptr
- ret void
-}
-
-define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vst4_lane_reg_update
-; CHECK: st4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [4 x <2 x float>] %b.coerce, 0
- %2 = extractvalue [4 x <2 x float>] %b.coerce, 1
- %3 = extractvalue [4 x <2 x float>] %b.coerce, 2
- %4 = extractvalue [4 x <2 x float>] %b.coerce, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4)
- %tmp1 = getelementptr float* %a, i32 %inc
- store float* %tmp1, float** %ptr
- ret void
-}
-
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-shift.ll b/test/CodeGen/AArch64/neon-simd-shift.ll
deleted file mode 100644
index fd76265..0000000
--- a/test/CodeGen/AArch64/neon-simd-shift.ll
+++ /dev/null
@@ -1,1556 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) {
-; CHECK: test_vshr_n_s8
-; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vshr_n = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %vshr_n
-}
-
-define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) {
-; CHECK: test_vshr_n_s16
-; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vshr_n = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %vshr_n
-}
-
-define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) {
-; CHECK: test_vshr_n_s32
-; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vshr_n = ashr <2 x i32> %a, <i32 3, i32 3>
- ret <2 x i32> %vshr_n
-}
-
-define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) {
-; CHECK: test_vshrq_n_s8
-; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vshr_n = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <16 x i8> %vshr_n
-}
-
-define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) {
-; CHECK: test_vshrq_n_s16
-; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vshr_n = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- ret <8 x i16> %vshr_n
-}
-
-define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) {
-; CHECK: test_vshrq_n_s32
-; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vshr_n = ashr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
- ret <4 x i32> %vshr_n
-}
-
-define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) {
-; CHECK: test_vshrq_n_s64
-; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vshr_n = ashr <2 x i64> %a, <i64 3, i64 3>
- ret <2 x i64> %vshr_n
-}
-
-define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) {
-; CHECK: test_vshr_n_u8
-; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vshr_n = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %vshr_n
-}
-
-define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) {
-; CHECK: test_vshr_n_u16
-; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vshr_n = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %vshr_n
-}
-
-define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) {
-; CHECK: test_vshr_n_u32
-; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vshr_n = lshr <2 x i32> %a, <i32 3, i32 3>
- ret <2 x i32> %vshr_n
-}
-
-define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) {
-; CHECK: test_vshrq_n_u8
-; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vshr_n = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <16 x i8> %vshr_n
-}
-
-define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) {
-; CHECK: test_vshrq_n_u16
-; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vshr_n = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- ret <8 x i16> %vshr_n
-}
-
-define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) {
-; CHECK: test_vshrq_n_u32
-; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vshr_n = lshr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
- ret <4 x i32> %vshr_n
-}
-
-define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) {
-; CHECK: test_vshrq_n_u64
-; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vshr_n = lshr <2 x i64> %a, <i64 3, i64 3>
- ret <2 x i64> %vshr_n
-}
-
-define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsra_n_s8
-; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsra_n = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- %1 = add <8 x i8> %vsra_n, %a
- ret <8 x i8> %1
-}
-
-define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsra_n_s16
-; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vsra_n = ashr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
- %1 = add <4 x i16> %vsra_n, %a
- ret <4 x i16> %1
-}
-
-define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsra_n_s32
-; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vsra_n = ashr <2 x i32> %b, <i32 3, i32 3>
- %1 = add <2 x i32> %vsra_n, %a
- ret <2 x i32> %1
-}
-
-define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsraq_n_s8
-; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsra_n = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- %1 = add <16 x i8> %vsra_n, %a
- ret <16 x i8> %1
-}
-
-define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsraq_n_s16
-; CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vsra_n = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %1 = add <8 x i16> %vsra_n, %a
- ret <8 x i16> %1
-}
-
-define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsraq_n_s32
-; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vsra_n = ashr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
- %1 = add <4 x i32> %vsra_n, %a
- ret <4 x i32> %1
-}
-
-define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsraq_n_s64
-; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vsra_n = ashr <2 x i64> %b, <i64 3, i64 3>
- %1 = add <2 x i64> %vsra_n, %a
- ret <2 x i64> %1
-}
-
-define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsra_n_u8
-; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsra_n = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- %1 = add <8 x i8> %vsra_n, %a
- ret <8 x i8> %1
-}
-
-define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsra_n_u16
-; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vsra_n = lshr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
- %1 = add <4 x i16> %vsra_n, %a
- ret <4 x i16> %1
-}
-
-define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsra_n_u32
-; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vsra_n = lshr <2 x i32> %b, <i32 3, i32 3>
- %1 = add <2 x i32> %vsra_n, %a
- ret <2 x i32> %1
-}
-
-define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsraq_n_u8
-; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsra_n = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- %1 = add <16 x i8> %vsra_n, %a
- ret <16 x i8> %1
-}
-
-define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsraq_n_u16
-; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vsra_n = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %1 = add <8 x i16> %vsra_n, %a
- ret <8 x i16> %1
-}
-
-define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsraq_n_u32
-; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vsra_n = lshr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
- %1 = add <4 x i32> %vsra_n, %a
- ret <4 x i32> %1
-}
-
-define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsraq_n_u64
-; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vsra_n = lshr <2 x i64> %b, <i64 3, i64 3>
- %1 = add <2 x i64> %vsra_n, %a
- ret <2 x i64> %1
-}
-
-define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) {
-; CHECK: test_vrshr_n_s8
-; CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %a, i32 3)
- ret <8 x i8> %vrshr_n
-}
-
-
-define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) {
-; CHECK: test_vrshr_n_s16
-; CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %a, i32 3)
- ret <4 x i16> %vrshr_n
-}
-
-
-define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) {
-; CHECK: test_vrshr_n_s32
-; CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %a, i32 3)
- ret <2 x i32> %vrshr_n
-}
-
-
-define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) {
-; CHECK: test_vrshrq_n_s8
-; CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %a, i32 3)
- ret <16 x i8> %vrshr_n
-}
-
-
-define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) {
-; CHECK: test_vrshrq_n_s16
-; CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %a, i32 3)
- ret <8 x i16> %vrshr_n
-}
-
-
-define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) {
-; CHECK: test_vrshrq_n_s32
-; CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %a, i32 3)
- ret <4 x i32> %vrshr_n
-}
-
-
-define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) {
-; CHECK: test_vrshrq_n_s64
-; CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %a, i32 3)
- ret <2 x i64> %vrshr_n
-}
-
-
-define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) {
-; CHECK: test_vrshr_n_u8
-; CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %a, i32 3)
- ret <8 x i8> %vrshr_n
-}
-
-
-define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) {
-; CHECK: test_vrshr_n_u16
-; CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %a, i32 3)
- ret <4 x i16> %vrshr_n
-}
-
-
-define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) {
-; CHECK: test_vrshr_n_u32
-; CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %a, i32 3)
- ret <2 x i32> %vrshr_n
-}
-
-
-define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) {
-; CHECK: test_vrshrq_n_u8
-; CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %a, i32 3)
- ret <16 x i8> %vrshr_n
-}
-
-
-define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) {
-; CHECK: test_vrshrq_n_u16
-; CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %a, i32 3)
- ret <8 x i16> %vrshr_n
-}
-
-
-define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) {
-; CHECK: test_vrshrq_n_u32
-; CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %a, i32 3)
- ret <4 x i32> %vrshr_n
-}
-
-
-define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) {
-; CHECK: test_vrshrq_n_u64
-; CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %a, i32 3)
- ret <2 x i64> %vrshr_n
-}
-
-
-define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vrsra_n_s8
-; CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %1 = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %b, i32 3)
- %vrsra_n = add <8 x i8> %1, %a
- ret <8 x i8> %vrsra_n
-}
-
-define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vrsra_n_s16
-; CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %1 = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %b, i32 3)
- %vrsra_n = add <4 x i16> %1, %a
- ret <4 x i16> %vrsra_n
-}
-
-define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vrsra_n_s32
-; CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %1 = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %b, i32 3)
- %vrsra_n = add <2 x i32> %1, %a
- ret <2 x i32> %vrsra_n
-}
-
-define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vrsraq_n_s8
-; CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %1 = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %b, i32 3)
- %vrsra_n = add <16 x i8> %1, %a
- ret <16 x i8> %vrsra_n
-}
-
-define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsraq_n_s16
-; CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %1 = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %b, i32 3)
- %vrsra_n = add <8 x i16> %1, %a
- ret <8 x i16> %vrsra_n
-}
-
-define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsraq_n_s32
-; CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %1 = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %b, i32 3)
- %vrsra_n = add <4 x i32> %1, %a
- ret <4 x i32> %vrsra_n
-}
-
-define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsraq_n_s64
-; CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %1 = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %b, i32 3)
- %vrsra_n = add <2 x i64> %1, %a
- ret <2 x i64> %vrsra_n
-}
-
-define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vrsra_n_u8
-; CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %1 = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %b, i32 3)
- %vrsra_n = add <8 x i8> %1, %a
- ret <8 x i8> %vrsra_n
-}
-
-define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vrsra_n_u16
-; CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %1 = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %b, i32 3)
- %vrsra_n = add <4 x i16> %1, %a
- ret <4 x i16> %vrsra_n
-}
-
-define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vrsra_n_u32
-; CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %1 = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %b, i32 3)
- %vrsra_n = add <2 x i32> %1, %a
- ret <2 x i32> %vrsra_n
-}
-
-define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vrsraq_n_u8
-; CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %1 = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %b, i32 3)
- %vrsra_n = add <16 x i8> %1, %a
- ret <16 x i8> %vrsra_n
-}
-
-define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsraq_n_u16
-; CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %1 = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %b, i32 3)
- %vrsra_n = add <8 x i16> %1, %a
- ret <8 x i16> %vrsra_n
-}
-
-define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsraq_n_u32
-; CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %1 = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %b, i32 3)
- %vrsra_n = add <4 x i32> %1, %a
- ret <4 x i32> %vrsra_n
-}
-
-define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsraq_n_u64
-; CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %1 = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %b, i32 3)
- %vrsra_n = add <2 x i64> %1, %a
- ret <2 x i64> %vrsra_n
-}
-
-define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsri_n_s8
-; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
- ret <8 x i8> %vsri_n
-}
-
-
-define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsri_n_s16
-; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3)
- ret <4 x i16> %vsri
-}
-
-
-define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsri_n_s32
-; CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vsri = tail call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3)
- ret <2 x i32> %vsri
-}
-
-
-define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsriq_n_s8
-; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
- ret <16 x i8> %vsri_n
-}
-
-
-define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsriq_n_s16
-; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
- ret <8 x i16> %vsri
-}
-
-
-define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsriq_n_s32
-; CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vsri = tail call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
- ret <4 x i32> %vsri
-}
-
-
-define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsriq_n_s64
-; CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vsri = tail call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3)
- ret <2 x i64> %vsri
-}
-
-define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsri_n_p8
-; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
- ret <8 x i8> %vsri_n
-}
-
-define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsri_n_p16
-; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
- %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15)
- ret <4 x i16> %vsri
-}
-
-define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsriq_n_p8
-; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
- ret <16 x i8> %vsri_n
-}
-
-define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsriq_n_p16
-; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
- %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15)
- ret <8 x i16> %vsri
-}
-
-define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsli_n_s8
-; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
- ret <8 x i8> %vsli_n
-}
-
-define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsli_n_s16
-; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3)
- ret <4 x i16> %vsli
-}
-
-define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsli_n_s32
-; CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vsli = tail call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3)
- ret <2 x i32> %vsli
-}
-
-define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsliq_n_s8
-; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
- ret <16 x i8> %vsli_n
-}
-
-define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsliq_n_s16
-; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
- ret <8 x i16> %vsli
-}
-
-define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsliq_n_s32
-; CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vsli = tail call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
- ret <4 x i32> %vsli
-}
-
-define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsliq_n_s64
-; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vsli = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3)
- ret <2 x i64> %vsli
-}
-
-define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsli_n_p8
-; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
- ret <8 x i8> %vsli_n
-}
-
-define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsli_n_p16
-; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
- %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15)
- ret <4 x i16> %vsli
-}
-
-define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsliq_n_p8
-; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
- ret <16 x i8> %vsli_n
-}
-
-define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsliq_n_p16
-; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
- %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15)
- ret <8 x i16> %vsli
-}
-
-define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) {
-; CHECK: test_vqshl_n_s8
-; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vqshl = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
- ret <8 x i8> %vqshl
-}
-
-
-define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) {
-; CHECK: test_vqshl_n_s16
-; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
- ret <4 x i16> %vqshl
-}
-
-
-define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) {
-; CHECK: test_vqshl_n_s32
-; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>)
- ret <2 x i32> %vqshl
-}
-
-
-define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) {
-; CHECK: test_vqshlq_n_s8
-; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
- ret <16 x i8> %vqshl_n
-}
-
-
-define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) {
-; CHECK: test_vqshlq_n_s16
-; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
- ret <8 x i16> %vqshl
-}
-
-
-define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) {
-; CHECK: test_vqshlq_n_s32
-; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
- ret <4 x i32> %vqshl
-}
-
-
-define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) {
-; CHECK: test_vqshlq_n_s64
-; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>)
- ret <2 x i64> %vqshl
-}
-
-
-define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) {
-; CHECK: test_vqshl_n_u8
-; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
- ret <8 x i8> %vqshl_n
-}
-
-
-define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) {
-; CHECK: test_vqshl_n_u16
-; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
- ret <4 x i16> %vqshl
-}
-
-
-define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) {
-; CHECK: test_vqshl_n_u32
-; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>)
- ret <2 x i32> %vqshl
-}
-
-
-define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) {
-; CHECK: test_vqshlq_n_u8
-; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
- ret <16 x i8> %vqshl_n
-}
-
-
-define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) {
-; CHECK: test_vqshlq_n_u16
-; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
- ret <8 x i16> %vqshl
-}
-
-
-define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) {
-; CHECK: test_vqshlq_n_u32
-; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
- ret <4 x i32> %vqshl
-}
-
-
-define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) {
-; CHECK: test_vqshlq_n_u64
-; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>)
- ret <2 x i64> %vqshl
-}
-
-define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) {
-; CHECK: test_vqshlu_n_s8
-; CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vqshlu = tail call <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8> %a, i32 3)
- ret <8 x i8> %vqshlu
-}
-
-
-define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) {
-; CHECK: test_vqshlu_n_s16
-; CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vqshlu = tail call <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16> %a, i32 3)
- ret <4 x i16> %vqshlu
-}
-
-
-define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) {
-; CHECK: test_vqshlu_n_s32
-; CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vqshlu = tail call <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32> %a, i32 3)
- ret <2 x i32> %vqshlu
-}
-
-
-define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) {
-; CHECK: test_vqshluq_n_s8
-; CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vqshlu = tail call <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8> %a, i32 3)
- ret <16 x i8> %vqshlu
-}
-
-
-define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) {
-; CHECK: test_vqshluq_n_s16
-; CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vqshlu = tail call <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16> %a, i32 3)
- ret <8 x i16> %vqshlu
-}
-
-
-define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) {
-; CHECK: test_vqshluq_n_s32
-; CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vqshlu = tail call <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32> %a, i32 3)
- ret <4 x i32> %vqshlu
-}
-
-
-define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) {
-; CHECK: test_vqshluq_n_s64
-; CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vqshlu = tail call <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64> %a, i32 3)
- ret <2 x i64> %vqshlu
-}
-
-
-define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) {
-; CHECK: test_vshrn_n_s16
-; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %1 = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
- ret <8 x i8> %vshrn_n
-}
-
-define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) {
-; CHECK: test_vshrn_n_s32
-; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %1 = ashr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
- %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
- ret <4 x i16> %vshrn_n
-}
-
-define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) {
-; CHECK: test_vshrn_n_s64
-; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %1 = ashr <2 x i64> %a, <i64 19, i64 19>
- %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
- ret <2 x i32> %vshrn_n
-}
-
-define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) {
-; CHECK: test_vshrn_n_u16
-; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %1 = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
- ret <8 x i8> %vshrn_n
-}
-
-define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) {
-; CHECK: test_vshrn_n_u32
-; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %1 = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
- %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
- ret <4 x i16> %vshrn_n
-}
-
-define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) {
-; CHECK: test_vshrn_n_u64
-; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %1 = lshr <2 x i64> %a, <i64 19, i64 19>
- %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
- ret <2 x i32> %vshrn_n
-}
-
-define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vshrn_high_n_s16
-; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %1 = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
- %2 = bitcast <8 x i8> %a to <1 x i64>
- %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %4
-}
-
-define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vshrn_high_n_s32
-; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %1 = ashr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
- %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
- %2 = bitcast <4 x i16> %a to <1 x i64>
- %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %4
-}
-
-define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vshrn_high_n_s64
-; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %2 = ashr <2 x i64> %b, <i64 19, i64 19>
- %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
- %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %4
-}
-
-define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vshrn_high_n_u16
-; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %1 = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
- %2 = bitcast <8 x i8> %a to <1 x i64>
- %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %4
-}
-
-define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vshrn_high_n_u32
-; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %1 = lshr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
- %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
- %2 = bitcast <4 x i16> %a to <1 x i64>
- %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %4
-}
-
-define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vshrn_high_n_u64
-; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %2 = lshr <2 x i64> %b, <i64 19, i64 19>
- %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
- %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %4
-}
-
-define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) {
-; CHECK: test_vqshrun_n_s16
-; CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqshrun
-}
-
-
-define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) {
-; CHECK: test_vqshrun_n_s32
-; CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqshrun
-}
-
-define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) {
-; CHECK: test_vqshrun_n_s64
-; CHECK: sqshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqshrun
-}
-
-define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqshrun_high_n_s16
-; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqshrun_high_n_s32
-; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqshrun_high_n_s64
-; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) {
-; CHECK: test_vrshrn_n_s16
-; CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vrshrn
-}
-
-
-define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) {
-; CHECK: test_vrshrn_n_s32
-; CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vrshrn
-}
-
-
-define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) {
-; CHECK: test_vrshrn_n_s64
-; CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vrshrn
-}
-
-define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vrshrn_high_n_s16
-; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vrshrn_high_n_s32
-; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vrshrn_high_n_s64
-; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) {
-; CHECK: test_vqrshrun_n_s16
-; CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqrshrun
-}
-
-define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) {
-; CHECK: test_vqrshrun_n_s32
-; CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqrshrun
-}
-
-define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) {
-; CHECK: test_vqrshrun_n_s64
-; CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqrshrun
-}
-
-define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqrshrun_high_n_s16
-; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqrshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqrshrun_high_n_s32
-; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqrshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqrshrun_high_n_s64
-; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqrshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) {
-; CHECK: test_vqshrn_n_s16
-; CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqshrn
-}
-
-
-define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) {
-; CHECK: test_vqshrn_n_s32
-; CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqshrn
-}
-
-
-define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) {
-; CHECK: test_vqshrn_n_s64
-; CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqshrn
-}
-
-
-define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) {
-; CHECK: test_vqshrn_n_u16
-; CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqshrn
-}
-
-
-define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) {
-; CHECK: test_vqshrn_n_u32
-; CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqshrn
-}
-
-
-define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) {
-; CHECK: test_vqshrn_n_u64
-; CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqshrn
-}
-
-
-define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqshrn_high_n_s16
-; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqshrn_high_n_s32
-; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqshrn_high_n_s64
-; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqshrn_high_n_u16
-; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqshrn_high_n_u32
-; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqshrn_high_n_u64
-; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) {
-; CHECK: test_vqrshrn_n_s16
-; CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqrshrn
-}
-
-
-define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) {
-; CHECK: test_vqrshrn_n_s32
-; CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqrshrn
-}
-
-
-define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) {
-; CHECK: test_vqrshrn_n_s64
-; CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqrshrn
-}
-
-
-define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) {
-; CHECK: test_vqrshrn_n_u16
-; CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqrshrn
-}
-
-
-define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) {
-; CHECK: test_vqrshrn_n_u32
-; CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqrshrn
-}
-
-
-define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) {
-; CHECK: test_vqrshrn_n_u64
-; CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqrshrn
-}
-
-
-define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqrshrn_high_n_s16
-; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqrshrn_high_n_s32
-; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqrshrn_high_n_s64
-; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqrshrn_high_n_u16
-; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqrshrn_high_n_u32
-; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqrshrn_high_n_u64
-; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) {
-; CHECK: test_vcvt_n_f32_s32
-; CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
- %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 31)
- ret <2 x float> %vcvt
-}
-
-define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) {
-; CHECK: test_vcvtq_n_f32_s32
-; CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
- %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 31)
- ret <4 x float> %vcvt
-}
-
-define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) {
-; CHECK: test_vcvtq_n_f64_s64
-; CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
- %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 50)
- ret <2 x double> %vcvt
-}
-
-define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) {
-; CHECK: test_vcvt_n_f32_u32
-; CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
- %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 31)
- ret <2 x float> %vcvt
-}
-
-define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) {
-; CHECK: test_vcvtq_n_f32_u32
-; CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
- %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 31)
- ret <4 x float> %vcvt
-}
-
-define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) {
-; CHECK: test_vcvtq_n_f64_u64
-; CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
- %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 50)
- ret <2 x double> %vcvt
-}
-
-define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) {
-; CHECK: test_vcvt_n_s32_f32
-; CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
- %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a, i32 31)
- ret <2 x i32> %vcvt
-}
-
-define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) {
-; CHECK: test_vcvtq_n_s32_f32
-; CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
- %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %a, i32 31)
- ret <4 x i32> %vcvt
-}
-
-define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) {
-; CHECK: test_vcvtq_n_s64_f64
-; CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
- %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %a, i32 50)
- ret <2 x i64> %vcvt
-}
-
-define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) {
-; CHECK: test_vcvt_n_u32_f32
-; CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
- %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %a, i32 31)
- ret <2 x i32> %vcvt
-}
-
-define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) {
-; CHECK: test_vcvtq_n_u32_f32
-; CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
- %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %a, i32 31)
- ret <4 x i32> %vcvt
-}
-
-define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) {
-; CHECK: test_vcvtq_n_u64_f64
-; CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
- %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %a, i32 50)
- ret <2 x i64> %vcvt
-}
-
-declare <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8>, <8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16>, <4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32>, <2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8>, <16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16>, <8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32>, <4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64>, <2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
-
-declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
-
-declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
-
-declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
-
-declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
-
-declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
-
-declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
-
-declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
-
-declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
-
-declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
-
-declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
-
-declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
-
-declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
-
-declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64>, i32)
-
-declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)
-
-declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
-
-declare <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32)
-
-declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32)
-
-declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32)
-
-declare <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32)
-
-declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)
-
-declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32)
-
-declare <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32)
-
-declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32)
-
-declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
-
-declare <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32)
-
-define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvt_n_s64_f64
-; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
- %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvt_n_u64_f64
-; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
- %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64)
- ret <1 x i64> %1
-}
-
-define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) {
-; CHECK-LABEL: test_vcvt_n_f64_s64
-; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
- %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) {
-; CHECK-LABEL: test_vcvt_n_f64_u64
-; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
- %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
- ret <1 x double> %1
-}
-
-declare <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32)
-declare <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32)
-declare <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32)
-declare <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll
deleted file mode 100644
index 7a51c0f..0000000
--- a/test/CodeGen/AArch64/neon-simd-tbl.ll
+++ /dev/null
@@ -1,828 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8>, <8 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8>, <16 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtbl1_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl11.i
-}
-
-define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vqtbl1_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
- ret <8 x i8> %vtbl1.i
-}
-
-define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl2_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
- %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl17.i
-}
-
-define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl2_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl2.i
-}
-
-define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl3_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
- ret <8 x i8> %vtbl212.i
-}
-
-define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl3_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl3.i
-}
-
-define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl4_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
- ret <8 x i8> %vtbl216.i
-}
-
-define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl4_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl4.i
-}
-
-define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vqtbl1q_s8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
- ret <16 x i8> %vtbl1.i
-}
-
-define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl2q_s8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl2.i
-}
-
-define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl3q_s8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl3.i
-}
-
-define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl4q_s8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl4.i
-}
-
-define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vtbx1_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx2_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
- %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
- ret <8 x i8> %vtbx17.i
-}
-
-define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx3_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx4_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
- %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
- ret <8 x i8> %vtbx216.i
-}
-
-define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vqtbx1_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
- ret <8 x i8> %vtbx1.i
-}
-
-define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx2_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx2.i
-}
-
-define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx3_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx3.i
-}
-
-define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx4_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx4.i
-}
-
-define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vqtbx1q_s8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
- ret <16 x i8> %vtbx1.i
-}
-
-define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx2q_s8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx2.i
-}
-
-define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx3q_s8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx3.i
-}
-
-define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx4q_s8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx4.i
-}
-
-define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtbl1_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl11.i
-}
-
-define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vqtbl1_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
- ret <8 x i8> %vtbl1.i
-}
-
-define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl2_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
- %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl17.i
-}
-
-define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl2_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl2.i
-}
-
-define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl3_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
- ret <8 x i8> %vtbl212.i
-}
-
-define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl3_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl3.i
-}
-
-define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl4_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
- ret <8 x i8> %vtbl216.i
-}
-
-define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl4_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl4.i
-}
-
-define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vqtbl1q_u8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
- ret <16 x i8> %vtbl1.i
-}
-
-define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl2q_u8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl2.i
-}
-
-define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl3q_u8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl3.i
-}
-
-define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl4q_u8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl4.i
-}
-
-define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vtbx1_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx2_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
- %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
- ret <8 x i8> %vtbx17.i
-}
-
-define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx3_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx4_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
- %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
- ret <8 x i8> %vtbx216.i
-}
-
-define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vqtbx1_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
- ret <8 x i8> %vtbx1.i
-}
-
-define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx2_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx2.i
-}
-
-define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx3_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx3.i
-}
-
-define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx4_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx4.i
-}
-
-define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vqtbx1q_u8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
- ret <16 x i8> %vtbx1.i
-}
-
-define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx2q_u8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx2.i
-}
-
-define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx3q_u8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx3.i
-}
-
-define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx4q_u8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx4.i
-}
-
-define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtbl1_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl11.i
-}
-
-define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vqtbl1_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
- ret <8 x i8> %vtbl1.i
-}
-
-define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl2_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
- %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl17.i
-}
-
-define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl2_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl2.i
-}
-
-define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl3_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
- ret <8 x i8> %vtbl212.i
-}
-
-define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl3_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl3.i
-}
-
-define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl4_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
- ret <8 x i8> %vtbl216.i
-}
-
-define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl4_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl4.i
-}
-
-define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vqtbl1q_p8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
- ret <16 x i8> %vtbl1.i
-}
-
-define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl2q_p8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl2.i
-}
-
-define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl3q_p8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl3.i
-}
-
-define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl4q_p8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl4.i
-}
-
-define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vtbx1_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx2_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
- %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
- ret <8 x i8> %vtbx17.i
-}
-
-define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx3_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx4_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
- %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
- ret <8 x i8> %vtbx216.i
-}
-
-define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vqtbx1_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
- ret <8 x i8> %vtbx1.i
-}
-
-define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx2_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx2.i
-}
-
-define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx3_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx3.i
-}
-
-define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx4_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx4.i
-}
-
-define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vqtbx1q_p8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
- ret <16 x i8> %vtbx1.i
-}
-
-define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx2q_p8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx2.i
-}
-
-define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx3q_p8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx3.i
-}
-
-define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx4q_p8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx4.i
-}
-
diff --git a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
deleted file mode 100644
index bb3300e..0000000
--- a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-; This file tests the spill of FPR8/FPR16. The volatile loads/stores force the
-; allocator to keep the value live until it's needed.
-
-%bigtype_v1i8 = type [20 x <1 x i8>]
-
-define void @spill_fpr8(%bigtype_v1i8* %addr) {
-; CHECK-LABEL: spill_fpr8:
-; CHECK: 1-byte Folded Spill
-; CHECK: 1-byte Folded Reload
- %val1 = load volatile %bigtype_v1i8* %addr
- %val2 = load volatile %bigtype_v1i8* %addr
- store volatile %bigtype_v1i8 %val1, %bigtype_v1i8* %addr
- store volatile %bigtype_v1i8 %val2, %bigtype_v1i8* %addr
- ret void
-}
-
-%bigtype_v1i16 = type [20 x <1 x i16>]
-
-define void @spill_fpr16(%bigtype_v1i16* %addr) {
-; CHECK-LABEL: spill_fpr16:
-; CHECK: 2-byte Folded Spill
-; CHECK: 2-byte Folded Reload
- %val1 = load volatile %bigtype_v1i16* %addr
- %val2 = load volatile %bigtype_v1i16* %addr
- store volatile %bigtype_v1i16 %val1, %bigtype_v1i16* %addr
- store volatile %bigtype_v1i16 %val2, %bigtype_v1i16* %addr
- ret void
-}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-truncStore-extLoad.ll b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
index e5b7694..1df3719 100644
--- a/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -5,7 +5,7 @@
define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) {
; CHECK-LABEL: truncStore.v2i64:
; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
-; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+; CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
%b = trunc <2 x i64> %a to <2 x i32>
store <2 x i32> %b, <2 x i32>* %result
ret void
@@ -14,7 +14,7 @@ define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) {
define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
; CHECK-LABEL: truncStore.v4i32:
; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
-; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+; CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
%b = trunc <4 x i32> %a to <4 x i16>
store <4 x i16> %b, <4 x i16>* %result
ret void
@@ -23,7 +23,7 @@ define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
; CHECK-LABEL: truncStore.v8i16:
; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
-; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+; CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
%b = trunc <8 x i16> %a to <8 x i8>
store <8 x i8> %b, <8 x i8>* %result
ret void
@@ -54,4 +54,4 @@ define i32 @loadExt.i32(<4 x i8>* %ref) {
%vecext = extractelement <4 x i8> %a, i32 0
%conv = zext i8 %vecext to i32
ret i32 %conv
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/AArch64/nzcv-save.ll b/test/CodeGen/AArch64/nzcv-save.ll
new file mode 100644
index 0000000..32baff3
--- /dev/null
+++ b/test/CodeGen/AArch64/nzcv-save.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+; CHECK: mrs [[NZCV_SAVE:x[0-9]+]], NZCV
+; CHECK: msr NZCV, [[NZCV_SAVE]]
+
+; DAG ends up with two uses for the flags from an ADCS node, which means they
+; must be saved for later.
+define void @f(i256* nocapture %a, i256* nocapture %b, i256* nocapture %cc, i256* nocapture %dd) nounwind uwtable noinline ssp {
+entry:
+ %c = load i256* %cc
+ %d = load i256* %dd
+ %add = add nsw i256 %c, %d
+ store i256 %add, i256* %a, align 8
+ %or = or i256 %c, 1606938044258990275541962092341162602522202993782792835301376
+ %add6 = add nsw i256 %or, %d
+ store i256 %add6, i256* %b, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll
index 3404d3f..e8c7625 100644
--- a/test/CodeGen/AArch64/pic-eh-stubs.ll
+++ b/test/CodeGen/AArch64/pic-eh-stubs.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
; Make sure exception-handling PIC code can be linked correctly. An alternative
; to the sequence described below would have .gcc_except_table itself writable
@@ -11,8 +11,8 @@
; ... referring indirectly to stubs for its typeinfo ...
; CHECK: // @TType Encoding = indirect pcrel sdata8
; ... one of which is "int"'s typeinfo
-; CHECK: .Ltmp7:
-; CHECK-NEXT: .xword .L_ZTIi.DW.stub-.Ltmp7
+; CHECK: [[TYPEINFO_LBL:.Ltmp[0-9]+]]: // TypeInfo 1
+; CHECK-NEXT: .xword .L_ZTIi.DW.stub-[[TYPEINFO_LBL]]
; .. and which is properly defined (in a writable section for the dynamic loader) later.
; CHECK: .section .data.rel,"aw"
diff --git a/test/CodeGen/AArch64/ragreedy-csr.ll b/test/CodeGen/AArch64/ragreedy-csr.ll
index 18a948b..de29b1b 100644
--- a/test/CodeGen/AArch64/ragreedy-csr.ll
+++ b/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -regalloc=greedy -regalloc-csr-first-time-cost=15 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -regalloc=greedy -regalloc-csr-first-time-cost=15 | FileCheck %s
; This testing case is reduced from 197.parser prune_match function.
; We make sure that we do not use callee-saved registers (x19 to x25).
@@ -6,14 +6,14 @@
; CHECK-LABEL: prune_match:
; CHECK: entry
-; CHECK: str x30, [sp
+; CHECK: {{str x30|stp x29, x30}}, [sp
; CHECK-NOT: stp x25,
; CHECK-NOT: stp x23, x24
; CHECK-NOT: stp x21, x22
; CHECK-NOT: stp x19, x20
; CHECK: if.end
; CHECK: return
-; CHECK: ldr x30, [sp
+; CHECK: {{ldr x30|ldp x29, x30}}, [sp
; CHECK-NOT: ldp x19, x20
; CHECK-NOT: ldp x21, x22
; CHECK-NOT: ldp x23, x24
diff --git a/test/CodeGen/AArch64/regress-bitcast-formals.ll b/test/CodeGen/AArch64/regress-bitcast-formals.ll
index 9655f90..58e0542 100644
--- a/test/CodeGen/AArch64/regress-bitcast-formals.ll
+++ b/test/CodeGen/AArch64/regress-bitcast-formals.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios7.0 -verify-machineinstrs < %s | FileCheck %s
; CallingConv.td requires a bitcast for vector arguments. Make sure we're
; actually capable of that (the test was omitted from LowerFormalArguments).
diff --git a/test/CodeGen/AArch64/regress-f128csel-flags.ll b/test/CodeGen/AArch64/regress-f128csel-flags.ll
index b35185c..25b5e0c 100644
--- a/test/CodeGen/AArch64/regress-f128csel-flags.ll
+++ b/test/CodeGen/AArch64/regress-f128csel-flags.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
; We used to not mark NZCV as being used in the continuation basic-block
; when lowering a 128-bit "select" to branches. This meant a subsequent use
diff --git a/test/CodeGen/AArch64/regress-fp128-livein.ll b/test/CodeGen/AArch64/regress-fp128-livein.ll
index cb8432a..5e6ab0a 100644
--- a/test/CodeGen/AArch64/regress-fp128-livein.ll
+++ b/test/CodeGen/AArch64/regress-fp128-livein.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s
; Regression test for NZCV reg live-in not being added to fp128csel IfTrue BB,
; causing a crash during live range calc.
diff --git a/test/CodeGen/AArch64/regress-tail-livereg.ll b/test/CodeGen/AArch64/regress-tail-livereg.ll
index 053249c..e32ac84 100644
--- a/test/CodeGen/AArch64/regress-tail-livereg.ll
+++ b/test/CodeGen/AArch64/regress-tail-livereg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
@var = global void()* zeroinitializer
declare void @bar()
diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll
index ff77fb4..477d996 100644
--- a/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
; When generating DAG selection tables, TableGen used to only flag an
; instruction as needing a chain on its own account if it had a built-in pattern
@@ -17,17 +17,18 @@ define i64 @test_chains() {
%locvar = alloca i8
call void @bar(i8* %locvar)
-; CHECK: bl bar
+; CHECK: bl {{_?bar}}
%inc.1 = load i8* %locvar
%inc.2 = zext i8 %inc.1 to i64
%inc.3 = add i64 %inc.2, 1
%inc.4 = trunc i64 %inc.3 to i8
store i8 %inc.4, i8* %locvar
-; CHECK: ldrb {{w[0-9]+}}, [sp, [[LOCADDR:#[0-9]+]]]
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #1
-; CHECK: strb {{w[0-9]+}}, [sp, [[LOCADDR]]]
-; CHECK: ldrb {{w[0-9]+}}, [sp, [[LOCADDR]]]
+
+; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
+; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]]
+; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]]
%ret.1 = load i8* %locvar
%ret.2 = zext i8 %ret.1 to i64
diff --git a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
index 0ef9818..c3167e4 100644
--- a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
+++ b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
@@ -5,22 +5,7 @@ declare void @bar()
define void @test_w29_reserved() {
; CHECK-LABEL: test_w29_reserved:
-; CHECK: .cfi_startproc
-; CHECK: .cfi_def_cfa sp, 96
; CHECK: add x29, sp, #{{[0-9]+}}
-; CHECK: .cfi_def_cfa x29, 16
-; CHECK: .cfi_offset x30, -8
-; CHECK: .cfi_offset x29, -16
-; CHECK: .cfi_offset x28, -24
-; CHECK: .cfi_offset x27, -32
-; CHECK: .cfi_offset x26, -40
-; CHECK: .cfi_offset x25, -48
-; CHECK: .cfi_offset x24, -56
-; CHECK: .cfi_offset x23, -64
-; CHECK: .cfi_offset x22, -72
-; CHECK: .cfi_offset x21, -80
-; CHECK: .cfi_offset x20, -88
-; CHECK: .cfi_offset x19, -96
%val1 = load volatile i32* @var
%val2 = load volatile i32* @var
diff --git a/test/CodeGen/AArch64/regress-wzr-allocatable.ll b/test/CodeGen/AArch64/regress-wzr-allocatable.ll
deleted file mode 100644
index 764d2bc..0000000
--- a/test/CodeGen/AArch64/regress-wzr-allocatable.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0
-
-; When WZR wasn't marked as reserved, this function tried to allocate
-; it at O0 and then generated an internal fault (mostly incidentally)
-; when it discovered that it was already in use for a multiplication.
-
-; I'm not really convinced this is a good test since it could easily
-; stop testing what it does now with no-one any the wiser. However, I
-; can't think of a better way to force the allocator to use WZR
-; specifically.
-
-define void @test() nounwind {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.body, %entry
- br i1 undef, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- br label %for.cond
-
-for.end: ; preds = %for.cond
- br label %for.cond6
-
-for.cond6: ; preds = %for.body9, %for.end
- br i1 undef, label %for.body9, label %while.cond30
-
-for.body9: ; preds = %for.cond6
- store i16 0, i16* undef, align 2
- %0 = load i32* undef, align 4
- %1 = load i32* undef, align 4
- %mul15 = mul i32 %0, %1
- %add16 = add i32 %mul15, 32768
- %div = udiv i32 %add16, 65535
- %add17 = add i32 %div, 1
- store i32 %add17, i32* undef, align 4
- br label %for.cond6
-
-while.cond30: ; preds = %for.cond6
- ret void
-}
diff --git a/test/CodeGen/AArch64/returnaddr.ll b/test/CodeGen/AArch64/returnaddr.ll
index c85f9ec..b136f04 100644
--- a/test/CodeGen/AArch64/returnaddr.ll
+++ b/test/CodeGen/AArch64/returnaddr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i8* @rt0(i32 %x) nounwind readnone {
entry:
diff --git a/test/CodeGen/AArch64/setcc-takes-i32.ll b/test/CodeGen/AArch64/setcc-takes-i32.ll
index bd79685..ec86159 100644
--- a/test/CodeGen/AArch64/setcc-takes-i32.ll
+++ b/test/CodeGen/AArch64/setcc-takes-i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
; Most important point here is that the promotion of the i1 works
; correctly. Previously LLVM thought that i64 was the appropriate SetCC output,
diff --git a/test/CodeGen/AArch64/sext_inreg.ll b/test/CodeGen/AArch64/sext_inreg.ll
deleted file mode 100644
index 2f76081..0000000
--- a/test/CodeGen/AArch64/sext_inreg.ll
+++ /dev/null
@@ -1,198 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-; For formal arguments, we have the following vector type promotion,
-; v2i8 is promoted to v2i32(f64)
-; v2i16 is promoted to v2i32(f64)
-; v4i8 is promoted to v4i16(f64)
-; v8i1 is promoted to v8i16(f128)
-
-define <2 x i8> @test_sext_inreg_v2i8i16(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i8i16
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %1 = sext <2 x i8> %v1 to <2 x i16>
- %2 = sext <2 x i8> %v2 to <2 x i16>
- %3 = shufflevector <2 x i16> %1, <2 x i16> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i16> %3 to <2 x i8>
- ret <2 x i8> %4
-}
-
-define <2 x i8> @test_sext_inreg_v2i8i16_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i8i16_2
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %a1 = shl <2 x i32> %v1, <i32 24, i32 24>
- %a2 = ashr <2 x i32> %a1, <i32 24, i32 24>
- %b1 = shl <2 x i32> %v2, <i32 24, i32 24>
- %b2 = ashr <2 x i32> %b1, <i32 24, i32 24>
- %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
- %d = trunc <2 x i32> %c to <2 x i8>
- ret <2 x i8> %d
-}
-
-define <2 x i8> @test_sext_inreg_v2i8i32(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i8i32
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %1 = sext <2 x i8> %v1 to <2 x i32>
- %2 = sext <2 x i8> %v2 to <2 x i32>
- %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i32> %3 to <2 x i8>
- ret <2 x i8> %4
-}
-
-define <2 x i8> @test_sext_inreg_v2i8i64(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i8i64
-; CHECK: ushll v1.2d, v1.2s, #0
-; CHECK: ushll v0.2d, v0.2s, #0
-; CHECK: shl v0.2d, v0.2d, #56
-; CHECK: sshr v0.2d, v0.2d, #56
-; CHECK: shl v1.2d, v1.2d, #56
-; CHECK: sshr v1.2d, v1.2d, #56
- %1 = sext <2 x i8> %v1 to <2 x i64>
- %2 = sext <2 x i8> %v2 to <2 x i64>
- %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i64> %3 to <2 x i8>
- ret <2 x i8> %4
-}
-
-define <4 x i8> @test_sext_inreg_v4i8i16(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v4i8i16
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %1 = sext <4 x i8> %v1 to <4 x i16>
- %2 = sext <4 x i8> %v2 to <4 x i16>
- %3 = shufflevector <4 x i16> %1, <4 x i16> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %4 = trunc <4 x i16> %3 to <4 x i8>
- ret <4 x i8> %4
-}
-
-define <4 x i8> @test_sext_inreg_v4i8i16_2(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v4i8i16_2
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %a1 = shl <4 x i16> %v1, <i16 8, i16 8, i16 8, i16 8>
- %a2 = ashr <4 x i16> %a1, <i16 8, i16 8, i16 8, i16 8>
- %b1 = shl <4 x i16> %v2, <i16 8, i16 8, i16 8, i16 8>
- %b2 = ashr <4 x i16> %b1, <i16 8, i16 8, i16 8, i16 8>
- %c = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %d = trunc <4 x i16> %c to <4 x i8>
- ret <4 x i8> %d
-}
-
-define <4 x i8> @test_sext_inreg_v4i8i32(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v4i8i32
-; CHECK: ushll v1.4s, v1.4h, #0
-; CHECK: ushll v0.4s, v0.4h, #0
-; CHECK: shl v0.4s, v0.4s, #24
-; CHECK: sshr v0.4s, v0.4s, #24
-; CHECK: shl v1.4s, v1.4s, #24
-; CHECK: sshr v1.4s, v1.4s, #24
- %1 = sext <4 x i8> %v1 to <4 x i32>
- %2 = sext <4 x i8> %v2 to <4 x i32>
- %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %4 = trunc <4 x i32> %3 to <4 x i8>
- ret <4 x i8> %4
-}
-
-define <8 x i8> @test_sext_inreg_v8i8i16(<8 x i8> %v1, <8 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v8i8i16
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK: sshll v1.8h, v1.8b, #0
- %1 = sext <8 x i8> %v1 to <8 x i16>
- %2 = sext <8 x i8> %v2 to <8 x i16>
- %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- %4 = trunc <8 x i16> %3 to <8 x i8>
- ret <8 x i8> %4
-}
-
-define <8 x i1> @test_sext_inreg_v8i1i16(<8 x i1> %v1, <8 x i1> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v8i1i16
-; CHECK: ushll v1.8h, v1.8b, #0
-; CHECK: ushll v0.8h, v0.8b, #0
-; CHECK: shl v0.8h, v0.8h, #15
-; CHECK: sshr v0.8h, v0.8h, #15
-; CHECK: shl v1.8h, v1.8h, #15
-; CHECK: sshr v1.8h, v1.8h, #15
- %1 = sext <8 x i1> %v1 to <8 x i16>
- %2 = sext <8 x i1> %v2 to <8 x i16>
- %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- %4 = trunc <8 x i16> %3 to <8 x i1>
- ret <8 x i1> %4
-}
-
-define <2 x i16> @test_sext_inreg_v2i16i32(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i16i32
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s
- %1 = sext <2 x i16> %v1 to <2 x i32>
- %2 = sext <2 x i16> %v2 to <2 x i32>
- %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i32> %3 to <2 x i16>
- ret <2 x i16> %4
-}
-
-define <2 x i16> @test_sext_inreg_v2i16i32_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i16i32_2
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s
- %a1 = shl <2 x i32> %v1, <i32 16, i32 16>
- %a2 = ashr <2 x i32> %a1, <i32 16, i32 16>
- %b1 = shl <2 x i32> %v2, <i32 16, i32 16>
- %b2 = ashr <2 x i32> %b1, <i32 16, i32 16>
- %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
- %d = trunc <2 x i32> %c to <2 x i16>
- ret <2 x i16> %d
-}
-
-define <2 x i16> @test_sext_inreg_v2i16i64(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i16i64
-; CHECK: ushll v1.2d, v1.2s, #0
-; CHECK: ushll v0.2d, v0.2s, #0
-; CHECK: shl v0.2d, v0.2d, #48
-; CHECK: sshr v0.2d, v0.2d, #48
-; CHECK: shl v1.2d, v1.2d, #48
-; CHECK: sshr v1.2d, v1.2d, #48
- %1 = sext <2 x i16> %v1 to <2 x i64>
- %2 = sext <2 x i16> %v2 to <2 x i64>
- %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i64> %3 to <2 x i16>
- ret <2 x i16> %4
-}
-
-define <4 x i16> @test_sext_inreg_v4i16i32(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v4i16i32
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK: sshll v1.4s, v1.4h, #0
- %1 = sext <4 x i16> %v1 to <4 x i32>
- %2 = sext <4 x i16> %v2 to <4 x i32>
- %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %4 = trunc <4 x i32> %3 to <4 x i16>
- ret <4 x i16> %4
-}
-
-define <2 x i32> @test_sext_inreg_v2i32i64(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i32i64
-; CHECK: sshll v0.2d, v0.2s, #0
-; CHECK: sshll v1.2d, v1.2s, #0
- %1 = sext <2 x i32> %v1 to <2 x i64>
- %2 = sext <2 x i32> %v2 to <2 x i64>
- %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i64> %3 to <2 x i32>
- ret <2 x i32> %4
-}
-
diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll
index 20f1062..34e3bb4 100644
--- a/test/CodeGen/AArch64/sibling-call.ll
+++ b/test/CodeGen/AArch64/sibling-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -aarch64-load-store-opt=0 | FileCheck %s
declare void @callee_stack0()
declare void @callee_stack8([8 x i32], i64)
@@ -73,10 +73,10 @@ define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
tail call void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
ret void
-; CHECK: ldr x0,
-; CHECK: ldr x1,
-; CHECK: str x1,
-; CHECK: str x0,
+; CHECK: ldr [[VAL0:x[0-9]+]],
+; CHECK: ldr [[VAL1:x[0-9]+]],
+; CHECK: str [[VAL1]],
+; CHECK: str [[VAL0]],
; CHECK-NOT: add sp, sp,
; CHECK: b callee_stack16
@@ -91,7 +91,7 @@ define void @indirect_tail() {
%fptr = load void(i32)** @func
tail call void %fptr(i32 42)
ret void
-; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, #:lo12:func]
-; CHECK: movz w0, #42
+; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:func]
+; CHECK: movz w0, #{{42|0x2a}}
; CHECK: br [[FPTR]]
}
diff --git a/test/CodeGen/AArch64/sincos-expansion.ll b/test/CodeGen/AArch64/sincos-expansion.ll
index 4cd4449..c3a172d 100644
--- a/test/CodeGen/AArch64/sincos-expansion.ll
+++ b/test/CodeGen/AArch64/sincos-expansion.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
define float @test_sincos_f32(float %f) {
%sin = call float @sinf(float %f) readnone
diff --git a/test/CodeGen/AArch64/sincospow-vector-expansion.ll b/test/CodeGen/AArch64/sincospow-vector-expansion.ll
index 259a55e..22f33a8 100644
--- a/test/CodeGen/AArch64/sincospow-vector-expansion.ll
+++ b/test/CodeGen/AArch64/sincospow-vector-expansion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc -o - %s -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+neon | FileCheck %s
define <2 x float> @test_cos_v2f64(<2 x double> %v1) {
diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
index 81885f1..8aab842 100644
--- a/test/CodeGen/AArch64/tail-call.ll
+++ b/test/CodeGen/AArch64/tail-call.ll
@@ -7,8 +7,10 @@ declare fastcc void @callee_stack16([8 x i32], i64, i64)
define fastcc void @caller_to0_from0() nounwind {
; CHECK-LABEL: caller_to0_from0:
; CHECK-NEXT: // BB
+
tail call fastcc void @callee_stack0()
ret void
+
; CHECK-NEXT: b callee_stack0
}
@@ -17,6 +19,7 @@ define fastcc void @caller_to0_from8([8 x i32], i64) {
tail call fastcc void @callee_stack0()
ret void
+
; CHECK: add sp, sp, #16
; CHECK-NEXT: b callee_stack0
}
@@ -29,8 +32,8 @@ define fastcc void @caller_to8_from0() {
; pointer (we didn't have arg space to reuse).
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK-NEXT: add sp, sp, #16
+
+; CHECK: str {{x[0-9]+}}, [sp, #16]!
; CHECK-NEXT: b callee_stack8
}
@@ -41,8 +44,8 @@ define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
; Key point is that the "%a" should end up at SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK-NEXT: add sp, sp, #16
+
+; CHECK: str {{x[0-9]+}}, [sp, #16]!
; CHECK-NEXT: b callee_stack8
}
@@ -54,10 +57,10 @@ define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
; above %a on the stack. If it tries to go below incoming-SP then the
; callee will not deallocate the space, even in fastcc.
tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
-; CHECK: str {{x[0-9]+}}, [sp, #24]
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
ret void
}
@@ -69,8 +72,8 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) {
; Key point is that the "%a" should end up at #16 above SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
-; CHECK: str {{x[0-9]+}}, [sp, #32]
-; CHECK-NEXT: add sp, sp, #32
+
+; CHECK: str {{x[0-9]+}}, [sp, #32]!
; CHECK-NEXT: b callee_stack8
}
@@ -84,11 +87,8 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
ret void
-; CHECK: ldr x0,
-; CHECK: ldr x1,
-; CHECK: str x1,
-; CHECK: str x0,
-
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
}
diff --git a/test/CodeGen/AArch64/tls-dynamic-together.ll b/test/CodeGen/AArch64/tls-dynamic-together.ll
deleted file mode 100644
index b5d7d89..0000000
--- a/test/CodeGen/AArch64/tls-dynamic-together.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -O0 -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
-
-; If the .tlsdesccall and blr parts are emitted completely separately (even with
-; glue) then LLVM will separate them quite happily (with a spill at O0, hence
-; the option). This is definitely wrong, so we make sure they are emitted
-; together.
-
-@general_dynamic_var = external thread_local global i32
-
-define i32 @test_generaldynamic() {
-; CHECK-LABEL: test_generaldynamic:
-
- %val = load i32* @general_dynamic_var
- ret i32 %val
-
-; CHECK: .tlsdesccall general_dynamic_var
-; CHECK-NEXT: blr {{x[0-9]+}}
-}
diff --git a/test/CodeGen/AArch64/tls-dynamics.ll b/test/CodeGen/AArch64/tls-dynamics.ll
deleted file mode 100644
index 68c481c..0000000
--- a/test/CodeGen/AArch64/tls-dynamics.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
-
-@general_dynamic_var = external thread_local global i32
-
-define i32 @test_generaldynamic() {
-; CHECK-LABEL: test_generaldynamic:
-
- %val = load i32* @general_dynamic_var
- ret i32 %val
-
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
-; CHECK: .tlsdesccall general_dynamic_var
-; CHECK-NEXT: blr [[CALLEE]]
-
-; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0
-; CHECK: ldr w0, [x[[TP]], x0]
-
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
-
-}
-
-define i32* @test_generaldynamic_addr() {
-; CHECK-LABEL: test_generaldynamic_addr:
-
- ret i32* @general_dynamic_var
-
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
-; CHECK: .tlsdesccall general_dynamic_var
-; CHECK-NEXT: blr [[CALLEE]]
-
-; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0
-; CHECK: add x0, [[TP]], x0
-
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
-
-}
-
-@local_dynamic_var = external thread_local(localdynamic) global i32
-
-define i32 @test_localdynamic() {
-; CHECK-LABEL: test_localdynamic:
-
- %val = load i32* @local_dynamic_var
- ret i32 %val
-
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
-; CHECK: .tlsdesccall _TLS_MODULE_BASE_
-; CHECK-NEXT: blr [[CALLEE]]
-
-; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
-; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
-
-; CHECK: ldr w0, [x0, [[DTP_OFFSET]]]
-
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
-
-}
-
-define i32* @test_localdynamic_addr() {
-; CHECK-LABEL: test_localdynamic_addr:
-
- ret i32* @local_dynamic_var
-
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
-; CHECK: .tlsdesccall _TLS_MODULE_BASE_
-; CHECK-NEXT: blr [[CALLEE]]
-
-; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
-; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
-
-; CHECK: add x0, x0, [[DTP_OFFSET]]
-
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
-
-}
-
-; The entire point of the local-dynamic access model is to have a single call to
-; the expensive resolver. Make sure we achieve that goal.
-
-@local_dynamic_var2 = external thread_local(localdynamic) global i32
-
-define i32 @test_localdynamic_deduplicate() {
-; CHECK-LABEL: test_localdynamic_deduplicate:
-
- %val = load i32* @local_dynamic_var
- %val2 = load i32* @local_dynamic_var2
-
- %sum = add i32 %val, %val2
- ret i32 %sum
-
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
-; CHECK: .tlsdesccall _TLS_MODULE_BASE_
-; CHECK-NEXT: blr [[CALLEE]]
-
-; CHECK-NOT: _TLS_MODULE_BASE_
-
-; CHECK: ret
-}
diff --git a/test/CodeGen/AArch64/tls-execs.ll b/test/CodeGen/AArch64/tls-execs.ll
deleted file mode 100644
index 39ceb9a..0000000
--- a/test/CodeGen/AArch64/tls-execs.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
-
-@initial_exec_var = external thread_local(initialexec) global i32
-
-define i32 @test_initial_exec() {
-; CHECK-LABEL: test_initial_exec:
- %val = load i32* @initial_exec_var
-
-; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var
-; CHECK: ldr x[[TP_OFFSET:[0-9]+]], [x[[GOTADDR]], #:gottprel_lo12:initial_exec_var]
-; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0
-; CHECK: ldr w0, [x[[TP]], x[[TP_OFFSET]]]
-
-; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21
-; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC
-
- ret i32 %val
-}
-
-define i32* @test_initial_exec_addr() {
-; CHECK-LABEL: test_initial_exec_addr:
- ret i32* @initial_exec_var
-
-; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var
-; CHECK: ldr [[TP_OFFSET:x[0-9]+]], [x[[GOTADDR]], #:gottprel_lo12:initial_exec_var]
-; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0
-; CHECK: add x0, [[TP]], [[TP_OFFSET]]
-
-; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21
-; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC
-
-}
-
-@local_exec_var = thread_local(initialexec) global i32 0
-
-define i32 @test_local_exec() {
-; CHECK-LABEL: test_local_exec:
- %val = load i32* @local_exec_var
-
-; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var // encoding: [A,A,0xa0'A',0x92'A']
-; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
-; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0
-; CHECK: ldr w0, [x[[TP]], [[TP_OFFSET]]]
-
-; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
-; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
-
- ret i32 %val
-}
-
-define i32* @test_local_exec_addr() {
-; CHECK-LABEL: test_local_exec_addr:
- ret i32* @local_exec_var
-
-; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var
-; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
-; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0
-; CHECK: add x0, [[TP]], [[TP_OFFSET]]
-
-; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
-; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
-}
diff --git a/test/CodeGen/AArch64/tst-br.ll b/test/CodeGen/AArch64/tst-br.ll
index 154bc08..8a2fe26 100644
--- a/test/CodeGen/AArch64/tst-br.ll
+++ b/test/CodeGen/AArch64/tst-br.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
; We've got the usual issues with LLVM reordering blocks here. The
; tests are correct for the current order, but who knows when that
@@ -15,7 +15,7 @@ define i32 @test_tbz() {
%tbit0 = and i32 %val, 32768
%tst0 = icmp ne i32 %tbit0, 0
br i1 %tst0, label %test1, label %end1
-; CHECK: tbz {{w[0-9]+}}, #15, [[LBL_end1:.LBB0_[0-9]+]]
+; CHECK: tbz {{w[0-9]+}}, #15, [[LBL_end1:.?LBB0_[0-9]+]]
test1:
%tbit1 = and i32 %val, 4096
@@ -27,22 +27,22 @@ test2:
%tbit2 = and i64 %val64, 32768
%tst2 = icmp ne i64 %tbit2, 0
br i1 %tst2, label %test3, label %end1
-; CHECK: tbz {{x[0-9]+}}, #15, [[LBL_end1]]
+; CHECK: tbz {{[wx][0-9]+}}, #15, [[LBL_end1]]
test3:
%tbit3 = and i64 %val64, 4096
%tst3 = icmp ne i64 %tbit3, 0
br i1 %tst3, label %end2, label %end1
-; CHECK: tbz {{x[0-9]+}}, #12, [[LBL_end1]]
+; CHECK: tbz {{[wx][0-9]+}}, #12, [[LBL_end1]]
end2:
-; CHECK: movz x0, #1
+; CHECK: {{movz x0, #1|orr w0, wzr, #0x1}}
; CHECK-NEXT: ret
ret i32 1
end1:
; CHECK: [[LBL_end1]]:
-; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: {{mov x0, xzr|mov w0, wzr}}
; CHECK-NEXT: ret
ret i32 0
}
diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll
deleted file mode 100644
index 1c7f1e0..0000000
--- a/test/CodeGen/AArch64/variadic.ll
+++ /dev/null
@@ -1,241 +0,0 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck --check-prefix=CHECK-NOFP %s
-
-%va_list = type {i8*, i8*, i8*, i32, i32}
-
-@var = global %va_list zeroinitializer
-
-declare void @llvm.va_start(i8*)
-
-define void @test_simple(i32 %n, ...) {
-; CHECK-LABEL: test_simple:
-; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK: mov x[[FPRBASE:[0-9]+]], sp
-; CHECK: str q7, [x[[FPRBASE]], #112]
-; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
-; CHECK: str x7, [x[[GPRBASE]], #48]
-
-; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK-NOFP: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
-; CHECK-NOFP: str x7, [x[[GPRBASE]], #48]
-; CHECK-NOFP-NOT: str q7,
-; CHECK-NOFP: str x1, [sp, #[[GPRFROMSP]]]
-
-; Omit the middle ones
-
-; CHECK: str q0, [sp]
-; CHECK: str x1, [sp, #[[GPRFROMSP]]]
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-
-; CHECK-NOFP-NOT: str q0, [sp]
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #127
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
-; CHECK: movn [[GR_OFFS:w[0-9]+]], #55
-; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128
-; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56
-; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
-; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
-; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #55
-; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56
-; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8]
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
- ret void
-}
-
-define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
-; CHECK-LABEL: test_fewargs:
-; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK: mov x[[FPRBASE:[0-9]+]], sp
-; CHECK: str q7, [x[[FPRBASE]], #96]
-; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
-; CHECK: str x7, [x[[GPRBASE]], #32]
-
-; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK-NOFP-NOT: str q7,
-; CHECK-NOFP: mov x[[GPRBASE:[0-9]+]], sp
-; CHECK-NOFP: str x7, [x[[GPRBASE]], #24]
-
-; Omit the middle ones
-
-; CHECK: str q1, [sp]
-; CHECK: str x3, [sp, #[[GPRFROMSP]]]
-
-; CHECK-NOFP-NOT: str q1, [sp]
-; CHECK-NOFP: str x4, [sp]
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #111
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
-; CHECK: movn [[GR_OFFS:w[0-9]+]], #39
-; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #112
-; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #40
-; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
-; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
-; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #31
-; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #32
-; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8]
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
- ret void
-}
-
-define void @test_nospare([8 x i64], [8 x float], ...) {
-; CHECK-LABEL: test_nospare:
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK-NOT: sub sp, sp
-; CHECK: mov [[STACK:x[0-9]+]], sp
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP-NOT: sub sp, sp
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #64
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
- ret void
-}
-
-; If there are non-variadic arguments on the stack (here two i64s) then the
-; __stack field should point just past them.
-define void @test_offsetstack([10 x i64], [3 x float], ...) {
-; CHECK-LABEL: test_offsetstack:
-; CHECK: sub sp, sp, #80
-; CHECK: mov x[[FPRBASE:[0-9]+]], sp
-; CHECK: str q7, [x[[FPRBASE]], #64]
-
-; CHECK-NOT: str x{{[0-9]+}},
-
-; CHECK-NOFP-NOT: str q7,
-; CHECK-NOT: str x7,
-
-; Omit the middle ones
-
-; CHECK: str q3, [sp]
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #79
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
-; CHECK: str wzr, [x[[VA_LIST]], #24]
-; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #80
-; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: add [[STACK:x[0-9]+]], sp, #96
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #40
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24]
- ret void
-}
-
-declare void @llvm.va_end(i8*)
-
-define void @test_va_end() nounwind {
-; CHECK-LABEL: test_va_end:
-; CHECK-NEXT: BB#0
-; CHECK-NOFP: BB#0
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_end(i8* %addr)
-
- ret void
-; CHECK-NEXT: ret
-; CHECK-NOFP-NEXT: ret
-}
-
-declare void @llvm.va_copy(i8* %dest, i8* %src)
-
-@second_list = global %va_list zeroinitializer
-
-define void @test_va_copy() {
-; CHECK-LABEL: test_va_copy:
- %srcaddr = bitcast %va_list* @var to i8*
- %dstaddr = bitcast %va_list* @second_list to i8*
- call void @llvm.va_copy(i8* %dstaddr, i8* %srcaddr)
-
-; Check beginning and end again:
-
-; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
-; CHECK: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
-; CHECK: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24]
-; CHECK: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list]
-; CHECK: str [[BLOCK2]], [x[[DEST_LIST]], #24]
-
-; CHECK-NOFP: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
-; CHECK-NOFP: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
-; CHECK-NOFP: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24]
-; CHECK-NOFP: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list]
-; CHECK-NOFP: str [[BLOCK2]], [x[[DEST_LIST]], #24]
-
- ret void
-; CHECK: ret
-; CHECK-NOFP: ret
-}
-
-%struct.s_3i = type { i32, i32, i32 }
-
-; This checks that, if the last named argument is not a multiple of 8 bytes,
-; and is allocated on the stack, that __va_list.__stack is initialised to the
-; first 8-byte aligned location above it.
-define void @test_va_odd_struct_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, [1 x i64], %struct.s_3i* byval nocapture readnone align 4 %h, ...) {
-; CHECK-LABEL: test_va_odd_struct_on_stack:
-
-; CHECK: sub sp, sp, #128
-; CHECK: mov x[[FPRBASE:[0-9]+]], sp
-; CHECK: str q7, [x[[FPRBASE]], #112]
-
-; CHECK-NOT: str x{{[0-9]+}},
-
-; CHECK-NOFP-NOT: str q7,
-; CHECK-NOT: str x7,
-
-; Omit the middle ones
-
-; CHECK: str q0, [sp]
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #127
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
-; CHECK: str wzr, [x[[VA_LIST]], #24]
-; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128
-; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; This constant would be #140 if it was not 8-byte aligned
-; CHECK: add [[STACK:x[0-9]+]], sp, #144
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; This constant would be #12 if it was not 8-byte aligned
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #16
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24]
- ret void
-}
diff --git a/test/CodeGen/AArch64/zero-reg.ll b/test/CodeGen/AArch64/zero-reg.ll
index 9b1e527..bc112ab 100644
--- a/test/CodeGen/AArch64/zero-reg.ll
+++ b/test/CodeGen/AArch64/zero-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
@@ -7,9 +7,9 @@ define void @test_zr() {
; CHECK-LABEL: test_zr:
store i32 0, i32* @var32
-; CHECK: str wzr, [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: str wzr, [{{x[0-9]+}}, {{#?}}:lo12:var32]
store i64 0, i64* @var64
-; CHECK: str xzr, [{{x[0-9]+}}, #:lo12:var64]
+; CHECK: str xzr, [{{x[0-9]+}}, {{#?}}:lo12:var64]
ret void
; CHECK: ret
@@ -23,8 +23,7 @@ define void @test_sp(i32 %val) {
; instruction (0b11111 in the Rn field would mean "sp").
%addr = getelementptr i32* null, i64 0
store i32 %val, i32* %addr
-; CHECK: mov x[[NULL:[0-9]+]], xzr
-; CHECK: str {{w[0-9]+}}, [x[[NULL]]]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+|sp}}]
ret void
; CHECK: ret
diff --git a/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll b/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
index 95aa595..dabe620 100644
--- a/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
+++ b/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
@@ -14,4 +14,6 @@ bb3: ; preds = %bb1
}
; CHECK-NOT: 255
+; CHECK: .file{{.*}}SxtInRegBug.ll
+; CHECK-NOT: 255
diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index bc4cc98..48de244 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
@@ -123,7 +123,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!43 = metadata !{i32 26, i32 0, metadata !39, null}
!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
!45 = metadata !{i32 27, i32 0, metadata !39, null}
-!46 = metadata !{metadata !0, metadata !9, metadata !16, metadata !17, metadata !20}
+!46 = metadata !{metadata !16, metadata !17, metadata !20}
!47 = metadata !{}
!48 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll b/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll
index defb946..efb8202 100644
--- a/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll
+++ b/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+v7,+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mattr=+v7,+thumb2 %s -o - | FileCheck %s
define i8 @f1(i8* %call1, i8* %call3, i32 %h, i32 %w, i32 %Width) {
; CHECK: f1:
diff --git a/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll b/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll
new file mode 100644
index 0000000..1e40e4a
--- /dev/null
+++ b/test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll
@@ -0,0 +1,50 @@
+; Assertion `Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only"' failed.
+; Broken in r208166, fixed in r208715.
+
+; RUN: llc -mtriple=arm-linux-androideabi -o - -filetype=asm -relocation-model=pic %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv4t--linux-androideabi"
+
+@_ZTIi = external constant i8*
+
+define void @_Z3fn2v() #0 {
+entry:
+ invoke void @_Z3fn1v()
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 1
+ %2 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) #2
+ %matches = icmp eq i32 %1, %2
+ br i1 %matches, label %catch, label %eh.resume
+
+catch: ; preds = %lpad
+ %3 = extractvalue { i8*, i32 } %0, 0
+ %4 = tail call i8* @__cxa_begin_catch(i8* %3) #2
+ tail call void @__cxa_end_catch() #2
+ br label %try.cont
+
+try.cont: ; preds = %entry, %catch
+ ret void
+
+eh.resume: ; preds = %lpad
+ resume { i8*, i32 } %0
+}
+
+declare void @_Z3fn1v() #0
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*) #1
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll b/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
new file mode 100644
index 0000000..a82f614
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple thumbv7--windows-itanium -code-model large -filetype obj -o - %s \
+; RUN: | llvm-objdump -no-show-raw-insn -d - | FileCheck %s
+
+; ModuleID = 'reduced.c'
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7--windows-itanium"
+
+define arm_aapcs_vfpcc i8 @isel(i32 %i) {
+entry:
+ %i.addr = alloca i32, align 4
+ %buffer = alloca [4096 x i8], align 1
+ store i32 %i, i32* %i.addr, align 4
+ %0 = load i32* %i.addr, align 4
+ %rem = urem i32 %0, 4096
+ %arrayidx = getelementptr inbounds [4096 x i8]* %buffer, i32 0, i32 %rem
+ %1 = load volatile i8* %arrayidx, align 1
+ ret i8 %1
+}
+
+; CHECK-LABEL: isel
+; CHECK: push {r4, r5}
+; CHECK: movw r4, #{{\d*}}
+; CHECK: movw r12, #0
+; CHECK: movt r12, #0
+; CHECK: blx r12
+; CHECK: sub.w sp, sp, r4
+
diff --git a/test/CodeGen/ARM/Windows/chkstk.ll b/test/CodeGen/ARM/Windows/chkstk.ll
new file mode 100644
index 0000000..cb787e1
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/chkstk.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 %s -o - \
+; RUN: | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s
+
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -code-model=large %s -o - \
+; RUN: | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s
+
+define arm_aapcs_vfpcc void @check_watermark() {
+entry:
+ %buffer = alloca [4096 x i8], align 1
+ ret void
+}
+
+; CHECK-DEFAULT-CODE-MODEL: check_watermark:
+; CHECK-DEFAULT-CODE-MODEL: movw r4, #1024
+; CHECK-DEFAULT-CODE-MODEL: bl __chkstk
+; CHECK-DEFAULT-CODE-MODEL: sub.w sp, sp, r4
+
+; CHECK-LARGE-CODE-MODEL: check_watermark:
+; CHECK-LARGE-CODE-MODEL: movw r12, :lower16:__chkstk
+; CHECK-LARGE-CODE-MODEL: movt r12, :upper16:__chkstk
+; CHECK-LARGE-CODE-MODEL: movw r4, #1024
+; CHECK-LARGE-CODE-MODEL: blx r12
+; CHECK-LARGE-CODE-MODEL: sub.w sp, sp, r4
+
diff --git a/test/CodeGen/ARM/Windows/frame-register.ll b/test/CodeGen/ARM/Windows/frame-register.ll
new file mode 100644
index 0000000..31167d7
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/frame-register.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple thumbv7-windows -disable-fp-elim -filetype asm -o - %s \
+; RUN: | FileCheck %s
+
+declare void @callee(i32)
+
+define i32 @calleer(i32 %i) {
+entry:
+ %i.addr = alloca i32, align 4
+ %j = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ %0 = load i32* %i.addr, align 4
+ %add = add nsw i32 %0, 1
+ store i32 %add, i32* %j, align 4
+ %1 = load i32* %j, align 4
+ call void @callee(i32 %1)
+ %2 = load i32* %j, align 4
+ %add1 = add nsw i32 %2, 1
+ ret i32 %add1
+}
+
+; CHECK: push.w {r11, lr}
+
diff --git a/test/CodeGen/ARM/Windows/integer-floating-point-conversion.ll b/test/CodeGen/ARM/Windows/integer-floating-point-conversion.ll
new file mode 100644
index 0000000..acf21a1
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/integer-floating-point-conversion.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple thumbv7-windows -filetype asm -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc i64 @stoi64(float %f) {
+entry:
+ %conv = fptosi float %f to i64
+ ret i64 %conv
+}
+
+; CHECK-LABEL: stoi64
+; CHECK: bl __stoi64
+
+define arm_aapcs_vfpcc i64 @stou64(float %f) {
+entry:
+ %conv = fptoui float %f to i64
+ ret i64 %conv
+}
+
+; CHECK-LABEL: stou64
+; CHECK: bl __stou64
+
+define arm_aapcs_vfpcc float @i64tos(i64 %i64) {
+entry:
+ %conv = sitofp i64 %i64 to float
+ ret float %conv
+}
+
+; CHECK-LABEL: i64tos
+; CHECK: bl __i64tos
+
+define arm_aapcs_vfpcc float @u64tos(i64 %u64) {
+entry:
+ %conv = uitofp i64 %u64 to float
+ ret float %conv
+}
+
+; CHECK-LABEL: u64tos
+; CHECK: bl __u64tos
+
+define arm_aapcs_vfpcc i64 @dtoi64(double %d) {
+entry:
+ %conv = fptosi double %d to i64
+ ret i64 %conv
+}
+
+; CHECK-LABEL: dtoi64
+; CHECK: bl __dtoi64
+
+define arm_aapcs_vfpcc i64 @dtou64(double %d) {
+entry:
+ %conv = fptoui double %d to i64
+ ret i64 %conv
+}
+
+; CHECK-LABEL: dtou64
+; CHECK: bl __dtou64
+
+define arm_aapcs_vfpcc double @i64tod(i64 %i64) {
+entry:
+ %conv = sitofp i64 %i64 to double
+ ret double %conv
+}
+
+; CHECK-LABEL: i64tod
+; CHECK: bl __i64tod
+
+define arm_aapcs_vfpcc double @u64tod(i64 %i64) {
+entry:
+ %conv = uitofp i64 %i64 to double
+ ret double %conv
+}
+
+; CHECK-LABEL: u64tod
+; CHECK: bl __u64tod
+
diff --git a/test/CodeGen/ARM/Windows/memset.ll b/test/CodeGen/ARM/Windows/memset.ll
new file mode 100644
index 0000000..500e25e
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/memset.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple thumbv7--windows-itanium -filetype asm -o - %s | FileCheck %s
+
+@source = common global [512 x i8] zeroinitializer, align 4
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define void @function() {
+entry:
+ call void @llvm.memset.p0i8.i32(i8* bitcast ([512 x i8]* @source to i8*), i8 0, i32 512, i32 0, i1 false)
+ unreachable
+}
+
+; CHECK: movw r0, :lower16:source
+; CHECK: movt r0, :upper16:source
+; CHECK: movs r1, #0
+; CHECK: mov.w r2, #512
+; CHECK: memset
+
diff --git a/test/CodeGen/ARM/Windows/mov32t-bundling.ll b/test/CodeGen/ARM/Windows/mov32t-bundling.ll
new file mode 100644
index 0000000..5f83837
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/mov32t-bundling.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s
+
+@_begin = external global i8
+@_end = external global i8
+
+declare arm_aapcs_vfpcc void @force_emission()
+
+define arm_aapcs_vfpcc void @bundle() {
+entry:
+ br i1 icmp uge (i32 sub (i32 ptrtoint (i8* @_end to i32), i32 ptrtoint (i8* @_begin to i32)), i32 4), label %if.then, label %if.end
+
+if.then:
+ tail call arm_aapcs_vfpcc void @force_emission()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; CHECK-LABEL: bundle
+; CHECK-NOT: subs r0, r1, r0
+; CHECK: movw r0, :lower16:_begin
+; CHECK-NEXT: movt r0, :upper16:_begin
+; CHECK-NEXT: movw r1, :lower16:_end
+; CHECK-NEXT: movt r1, :upper16:_end
+; CHECK-NEXT: subs r0, r1, r0
+; CHECK-NEXT: cmp r0, #4
+
diff --git a/test/CodeGen/ARM/Windows/movw-movt-relocations.ll b/test/CodeGen/ARM/Windows/movw-movt-relocations.ll
new file mode 100644
index 0000000..3ae6428
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/movw-movt-relocations.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=thumbv7-windows -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-WINDOWS
+
+; RUN: llc -mtriple=thumbv7-eabi -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-EABI
+
+@i = common global i32 0, align 4
+@j = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize readonly
+define i32 @relocation(i32 %j, i32 %k) {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @j, align 4
+ %add = add nsw i32 %1, %0
+ ret i32 %add
+}
+
+; CHECK-WINDOWS: movw r[[i:[0-4]]], :lower16:i
+; CHECK-WINDOWS-NEXT: movt r[[i]], :upper16:i
+; CHECK-WINDOWS: movw r[[j:[0-4]]], :lower16:j
+; CHECK-WINDOWS-NEXT: movt r[[j]], :upper16:j
+
+; CHECK-EABI: movw r[[i:[0-4]]], :lower16:i
+; CHECK-EABI: movw r[[j:[0-4]]], :lower16:j
+; CHECK-EABI-NEXT: movt r[[i]], :upper16:i
+; CHECK-EABI-NEXT: movt r[[j]], :upper16:j
diff --git a/test/CodeGen/ARM/Windows/no-aeabi.ll b/test/CodeGen/ARM/Windows/no-aeabi.ll
index 4c6676f..3971b9c 100644
--- a/test/CodeGen/ARM/Windows/no-aeabi.ll
+++ b/test/CodeGen/ARM/Windows/no-aeabi.ll
@@ -1,5 +1,27 @@
; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s | FileCheck %s
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+@source = common global [512 x i8] zeroinitializer, align 4
+@target = common global [512 x i8] zeroinitializer, align 4
+
+define void @move() nounwind {
+entry:
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* bitcast ([512 x i8]* @target to i8*), i8* bitcast ([512 x i8]* @source to i8*), i32 512, i32 0, i1 false)
+ unreachable
+}
+
+; CHECK-NOT: __aeabi_memmove
+
+define void @copy() nounwind {
+entry:
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([512 x i8]* @target to i8*), i8* bitcast ([512 x i8]* @source to i8*), i32 512, i32 0, i1 false)
+ unreachable
+}
+
+; CHECK-NOT: __aeabi_memcpy
+
define i32 @divide(i32 %i, i32 %j) nounwind {
entry:
%quotient = sdiv i32 %i, %j
diff --git a/test/CodeGen/ARM/Windows/pic.ll b/test/CodeGen/ARM/Windows/pic.ll
new file mode 100644
index 0000000..28d371f
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/pic.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple thumbv7-windows-itanium -relocation-model pic -filetype asm -o - %s \
+; RUN: | FileCheck %s
+
+@external = external global i8
+
+define arm_aapcs_vfpcc i8 @return_external() {
+entry:
+ %0 = load i8* @external, align 1
+ ret i8 %0
+}
+
+; CHECK-LABEL: return_external
+; CHECK: movw r0, :lower16:external
+; CHECK: movt r0, :upper16:external
+; CHECK: ldrb r0, [r0]
+
diff --git a/test/CodeGen/ARM/Windows/read-only-data.ll b/test/CodeGen/ARM/Windows/read-only-data.ll
new file mode 100644
index 0000000..0ccb5ed
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/read-only-data.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple thumbv7-windows -filetype asm -o - %s | FileCheck %s
+
+@.str = private unnamed_addr constant [7 x i8] c"string\00", align 1
+
+declare arm_aapcs_vfpcc void @callee(i8*)
+
+define arm_aapcs_vfpcc void @function() {
+entry:
+ call arm_aapcs_vfpcc void @callee(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0))
+ ret void
+}
+
+; CHECK: .section .rdata,"rd"
+; CHECK-NOT: .section ".rodata.str1.1"
+
diff --git a/test/CodeGen/ARM/aapcs-hfa-code.ll b/test/CodeGen/ARM/aapcs-hfa-code.ll
new file mode 100644
index 0000000..396e838
--- /dev/null
+++ b/test/CodeGen/ARM/aapcs-hfa-code.ll
@@ -0,0 +1,111 @@
+; RUN: llc < %s -mtriple=armv7-linux-gnueabihf -o - | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7em-none-eabi -mcpu=cortex-m4 | FileCheck %s --check-prefix=CHECK-M4F
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define arm_aapcs_vfpcc void @test_1float({ float } %a) {
+ call arm_aapcs_vfpcc void @test_1float({ float } { float 1.0 })
+ ret void
+
+; CHECK-LABEL: test_1float:
+; CHECK-DAG: vmov.f32 s0, #1.{{0+}}e+00
+; CHECK: bl test_1float
+
+; CHECK-M4F-LABEL: test_1float:
+; CHECK-M4F-DAG: vmov.f32 s0, #1.{{0+}}e+00
+; CHECK-M4F: bl test_1float
+}
+
+define arm_aapcs_vfpcc void @test_2float({ float, float } %a) {
+ call arm_aapcs_vfpcc void @test_2float({ float, float } { float 1.0, float 2.0 })
+ ret void
+
+; CHECK-LABEL: test_2float:
+; CHECK-DAG: vmov.f32 s0, #1.{{0+}}e+00
+; CHECK-DAG: vmov.f32 s1, #2.{{0+}}e+00
+; CHECK: bl test_2float
+
+; CHECK-M4F-LABEL: test_2float:
+; CHECK-M4F-DAG: vmov.f32 s0, #1.{{0+}}e+00
+; CHECK-M4F-DAG: vmov.f32 s1, #2.{{0+}}e+00
+; CHECK-M4F: bl test_2float
+}
+
+define arm_aapcs_vfpcc void @test_3float({ float, float, float } %a) {
+ call arm_aapcs_vfpcc void @test_3float({ float, float, float } { float 1.0, float 2.0, float 3.0 })
+ ret void
+
+; CHECK-LABEL: test_3float:
+; CHECK-DAG: vmov.f32 s0, #1.{{0+}}e+00
+; CHECK-DAG: vmov.f32 s1, #2.{{0+}}e+00
+; CHECK-DAG: vmov.f32 s2, #3.{{0+}}e+00
+; CHECK: bl test_3float
+
+; CHECK-M4F-LABEL: test_3float:
+; CHECK-M4F-DAG: vmov.f32 s0, #1.{{0+}}e+00
+; CHECK-M4F-DAG: vmov.f32 s1, #2.{{0+}}e+00
+; CHECK-M4F-DAG: vmov.f32 s2, #3.{{0+}}e+00
+; CHECK-M4F: bl test_3float
+}
+
+define arm_aapcs_vfpcc void @test_1double({ double } %a) {
+; CHECK-LABEL: test_1double:
+; CHECK-DAG: vmov.f64 d0, #1.{{0+}}e+00
+; CHECK: bl test_1double
+
+; CHECK-M4F-LABEL: test_1double:
+; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
+; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
+; CHECK-M4F: movt [[ONEHI]], #16368
+; CHECK-M4F-DAG: vmov s0, [[ONELO]]
+; CHECK-M4F-DAG: vmov s1, [[ONEHI]]
+; CHECK-M4F: bl test_1double
+
+ call arm_aapcs_vfpcc void @test_1double({ double } { double 1.0 })
+ ret void
+}
+
+; Final double argument might be put in s15 & [sp] if we're careless. It should
+; go entirely on the stack.
+define arm_aapcs_vfpcc void @test_1double_nosplit([4 x float], [4 x double], [3 x float], double %a) {
+; CHECK-LABEL: test_1double_nosplit:
+; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
+; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0
+; CHECK-DAG: movt [[ONEHI]], #16368
+; CHECK: strd [[ONELO]], [[ONEHI]], [sp]
+; CHECK: bl test_1double_nosplit
+
+; CHECK-M4F-LABEL: test_1double_nosplit:
+; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
+; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
+; CHECK-M4F: movt [[ONEHI]], #16368
+; CHECK-M4F-DAG: str [[ONELO]], [sp]
+; CHECK-M4F-DAG: str [[ONEHI]], [sp, #4]
+; CHECK-M4F: bl test_1double_nosplit
+ call arm_aapcs_vfpcc void @test_1double_nosplit([4 x float] undef, [4 x double] undef, [3 x float] undef, double 1.0)
+ ret void
+}
+
+; Final double argument might go at [sp, #4] if we're careless. Should go at
+; [sp, #8] to preserve alignment.
+define arm_aapcs_vfpcc void @test_1double_misaligned([4 x double], [4 x double], float, double) {
+ call arm_aapcs_vfpcc void @test_1double_misaligned([4 x double] undef, [4 x double] undef, float undef, double 1.0)
+
+; CHECK-LABEL: test_1double_misaligned:
+; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
+; CHECK-DAG: mov r[[BASE:[0-9]+]], sp
+; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0
+; CHECK-DAG: movt [[ONEHI]], #16368
+; CHECK-DAG: str [[ONELO]], [r[[BASE]], #8]!
+; CHECK-DAG: str [[ONEHI]], [r[[BASE]], #4]
+
+; CHECK-M4F-LABEL: test_1double_misaligned:
+; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
+; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
+; CHECK-M4F: movt [[ONEHI]], #16368
+; CHECK-M4F-DAG: str [[ONELO]], [sp, #8]
+; CHECK-M4F-DAG: str [[ONEHI]], [sp, #12]
+; CHECK-M4F: bl test_1double_misaligned
+
+ ret void
+}
diff --git a/test/CodeGen/ARM/aapcs-hfa.ll b/test/CodeGen/ARM/aapcs-hfa.ll
new file mode 100644
index 0000000..6448e00
--- /dev/null
+++ b/test/CodeGen/ARM/aapcs-hfa.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -float-abi=hard -debug-only arm-isel 2>&1 | FileCheck %s
+; RUN: llc < %s -float-abi=soft -debug-only arm-isel 2>&1 | FileCheck %s --check-prefix=SOFT
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv7-none--eabi"
+
+; SOFT-NOT: isHA
+
+; CHECK: isHA: 1 { float }
+define void @f0b({ float } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { float, float }
+define void @f1({ float, float } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { float, float, float }
+define void @f1b({ float, float, float } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { float, float, float, float }
+define void @f1c({ float, float, float, float } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { float, float, float, float, float }
+define void @f2({ float, float, float, float, float } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { double }
+define void @f3({ double } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { double, double, double, double }
+define void @f4({ double, double, double, double } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { double, double, double, double, double }
+define void @f5({ double, double, double, double, double } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { i32, i32 }
+define void @f5b({ i32, i32 } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { [1 x float] }
+define void @f6({ [1 x float] } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { [4 x float] }
+define void @f7({ [4 x float] } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { [5 x float] }
+define void @f8({ [5 x float] } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 [1 x float]
+define void @f6b([1 x float] %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 [4 x float]
+define void @f7b([4 x float] %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 [5 x float]
+define void @f8b([5 x float] %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { [2 x float], [2 x float] }
+define void @f9({ [2 x float], [2 x float] } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { [1 x float], [3 x float] }
+define void @f9b({ [1 x float], [3 x float] } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { [3 x float], [3 x float] }
+define void @f10({ [3 x float], [3 x float] } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { <2 x float> }
+define void @f11({ <2 x float> } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { <3 x float> }
+define void @f12({ <3 x float> } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { <4 x float> }
+define void @f13({ <4 x float> } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { <2 x float>, <2 x float> }
+define void @f15({ <2 x float>, <2 x float> } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { <2 x float>, float }
+define void @f15b({ <2 x float>, float } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { <2 x float>, [2 x float] }
+define void @f15c({ <2 x float>, [2 x float] } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { <2 x float>, <4 x float> }
+define void @f16({ <2 x float>, <4 x float> } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { <2 x double> }
+define void @f17({ <2 x double> } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { <2 x i32> }
+define void @f18({ <2 x i32> } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { <2 x i64>, <4 x i32> }
+define void @f19({ <2 x i64>, <4 x i32> } %a) {
+ ret void
+}
+
+; CHECK: isHA: 1 { [4 x <4 x float>] }
+define void @f20({ [4 x <4 x float>] } %a) {
+ ret void
+}
+
+; CHECK: isHA: 0 { [5 x <4 x float>] }
+define void @f21({ [5 x <4 x float>] } %a) {
+ ret void
+}
+
+; CHECK-NOT: isHA
+define void @f22({ float } %a, ...) {
+ ret void
+}
+
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index f55ae10..4de305b 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -29,7 +29,7 @@ define i32 @foo_f() {
@bar_i = alias internal i32* @bar
-@A = alias bitcast (i32* @bar to i64*)
+@A = alias i64, i32* @bar
define i32 @test() {
entry:
diff --git a/test/CodeGen/ARM/argaddr.ll b/test/CodeGen/ARM/argaddr.ll
index 116a32f..40bc5e0 100644
--- a/test/CodeGen/ARM/argaddr.ll
+++ b/test/CodeGen/ARM/argaddr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm
+; RUN: llc -mtriple=arm-eabi %s -o /dev/null
define void @f(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index a881d5f..9913f30 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -1,12 +1,16 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-THUMB
+; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-LE
+; RUN: llc < %s -mtriple=armebv7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+; RUN: llc < %s -mtriple=thumbebv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-BE
define i64 @test1(i64* %ptr, i64 %val) {
; CHECK-LABEL: test1:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: adds [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK: adc [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-LE: adds [[REG3:(r[0-9]?[02468])]], [[REG1]]
+; CHECK-LE: adc [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE: adds [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE: adc [[REG3:(r[0-9]?[02468])]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -15,8 +19,10 @@ define i64 @test1(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test1:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: adds.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB: adc.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-LE: adds.w [[REG3:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-LE: adc.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE: adds.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE: adc.w [[REG3:[a-z0-9]+]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -30,8 +36,10 @@ define i64 @test2(i64* %ptr, i64 %val) {
; CHECK-LABEL: test2:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: subs [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK: sbc [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-LE: subs [[REG3:(r[0-9]?[02468])]], [[REG1]]
+; CHECK-LE: sbc [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE: subs [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE: sbc [[REG3:(r[0-9]?[02468])]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -40,8 +48,10 @@ define i64 @test2(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test2:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: subs.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB: sbc.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-LE: subs.w [[REG3:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-LE: sbc.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE: subs.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE: sbc.w [[REG3:[a-z0-9]+]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -55,8 +65,10 @@ define i64 @test3(i64* %ptr, i64 %val) {
; CHECK-LABEL: test3:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-DAG: and [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK-DAG: and [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-LE-DAG: and [[REG3:(r[0-9]?[02468])]], [[REG1]]
+; CHECK-LE-DAG: and [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE-DAG: and [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE-DAG: and [[REG3:(r[0-9]?[02468])]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -65,8 +77,10 @@ define i64 @test3(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test3:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-DAG: and.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB-DAG: and.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-LE-DAG: and.w [[REG3:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-LE-DAG: and.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE-DAG: and.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE-DAG: and.w [[REG3:[a-z0-9]+]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -80,8 +94,10 @@ define i64 @test4(i64* %ptr, i64 %val) {
; CHECK-LABEL: test4:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-DAG: orr [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK-DAG: orr [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-LE-DAG: orr [[REG3:(r[0-9]?[02468])]], [[REG1]]
+; CHECK-LE-DAG: orr [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE-DAG: orr [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE-DAG: orr [[REG3:(r[0-9]?[02468])]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -90,8 +106,10 @@ define i64 @test4(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test4:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-DAG: orr.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB-DAG: orr.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-LE-DAG: orr.w [[REG3:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-LE-DAG: orr.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE-DAG: orr.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE-DAG: orr.w [[REG3:[a-z0-9]+]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -105,8 +123,10 @@ define i64 @test5(i64* %ptr, i64 %val) {
; CHECK-LABEL: test5:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-DAG: eor [[REG3:(r[0-9]?[02468])]], [[REG1]]
-; CHECK-DAG: eor [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-LE-DAG: eor [[REG3:(r[0-9]?[02468])]], [[REG1]]
+; CHECK-LE-DAG: eor [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE-DAG: eor [[REG4:(r[0-9]?[13579])]], [[REG2]]
+; CHECK-BE-DAG: eor [[REG3:(r[0-9]?[02468])]], [[REG1]]
; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK: cmp
; CHECK: bne
@@ -115,8 +135,10 @@ define i64 @test5(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test5:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-DAG: eor.w [[REG3:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB-DAG: eor.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-LE-DAG: eor.w [[REG3:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-LE-DAG: eor.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE-DAG: eor.w [[REG4:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE-DAG: eor.w [[REG3:[a-z0-9]+]], [[REG1]]
; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
@@ -151,8 +173,10 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
; CHECK-LABEL: test7:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG1]], r1
-; CHECK-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG2]], r2
+; CHECK-LE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG1]], r1
+; CHECK-LE-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG2]], r2
+; CHECK-BE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG2]], r2
+; CHECK-BE-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG1]], r1
; CHECK: orrs {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
; CHECK: bne
; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
@@ -163,8 +187,10 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
; CHECK-THUMB-LABEL: test7:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2
-; CHECK-THUMB-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3
+; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2
+; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3
+; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]]
+; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]]
; CHECK-THUMB: orrs [[MISMATCH_HI]], [[MISMATCH_LO]]
; CHECK-THUMB: bne
; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
@@ -220,9 +246,11 @@ define i64 @test10(i64* %ptr, i64 %val) {
; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK: cmp [[REG1]], r1
+; CHECK-LE: cmp [[REG1]], r1
+; CHECK-BE: cmp [[REG2]], r2
; CHECK: movwls [[CARRY_LO]], #1
-; CHECK: cmp [[REG2]], r2
+; CHECK-LE: cmp [[REG2]], r2
+; CHECK-BE: cmp [[REG1]], r1
; CHECK: movwle [[CARRY_HI]], #1
; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
; CHECK: cmp [[CARRY_HI]], #0
@@ -237,11 +265,13 @@ define i64 @test10(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test10:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
-; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
-; CHECK-THUMB: cmp [[REG1]], r2
+; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+|lr]], #0
+; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+|lr]], #0
+; CHECK-THUMB-LE: cmp [[REG1]], r2
+; CHECK-THUMB-BE: cmp [[REG2]], r3
; CHECK-THUMB: movls.w [[CARRY_LO]], #1
-; CHECK-THUMB: cmp [[REG2]], r3
+; CHECK-THUMB-LE: cmp [[REG2]], r3
+; CHECK-THUMB-BE: cmp [[REG1]], r2
; CHECK-THUMB: movle [[CARRY_HI]], #1
; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
@@ -265,9 +295,11 @@ define i64 @test11(i64* %ptr, i64 %val) {
; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK: cmp [[REG1]], r1
+; CHECK-LE: cmp [[REG1]], r1
+; CHECK-BE: cmp [[REG2]], r2
; CHECK: movwls [[CARRY_LO]], #1
-; CHECK: cmp [[REG2]], r2
+; CHECK-LE: cmp [[REG2]], r2
+; CHECK-BE: cmp [[REG1]], r1
; CHECK: movwls [[CARRY_HI]], #1
; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
; CHECK: cmp [[CARRY_HI]], #0
@@ -279,15 +311,16 @@ define i64 @test11(i64* %ptr, i64 %val) {
; CHECK: bne
; CHECK: dmb {{ish$}}
-
; CHECK-THUMB-LABEL: test11:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
-; CHECK-THUMB: cmp [[REG1]], r2
+; CHECK-THUMB-LE: cmp [[REG1]], r2
+; CHECK-THUMB-BE: cmp [[REG2]], r3
; CHECK-THUMB: movls.w [[CARRY_LO]], #1
-; CHECK-THUMB: cmp [[REG2]], r3
+; CHECK-THUMB-LE: cmp [[REG2]], r3
+; CHECK-THUMB-BE: cmp [[REG1]], r2
; CHECK-THUMB: movls [[CARRY_HI]], #1
; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
@@ -311,9 +344,11 @@ define i64 @test12(i64* %ptr, i64 %val) {
; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK: cmp [[REG1]], r1
+; CHECK-LE: cmp [[REG1]], r1
+; CHECK-BE: cmp [[REG2]], r2
; CHECK: movwhi [[CARRY_LO]], #1
-; CHECK: cmp [[REG2]], r2
+; CHECK-LE: cmp [[REG2]], r2
+; CHECK-BE: cmp [[REG1]], r1
; CHECK: movwgt [[CARRY_HI]], #1
; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
; CHECK: cmp [[CARRY_HI]], #0
@@ -330,9 +365,11 @@ define i64 @test12(i64* %ptr, i64 %val) {
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
-; CHECK-THUMB: cmp [[REG1]], r2
+; CHECK-THUMB-LE: cmp [[REG1]], r2
+; CHECK-THUMB-BE: cmp [[REG2]], r3
; CHECK-THUMB: movhi.w [[CARRY_LO]], #1
-; CHECK-THUMB: cmp [[REG2]], r3
+; CHECK-THUMB-LE: cmp [[REG2]], r3
+; CHECK-THUMB-BE: cmp [[REG1]], r2
; CHECK-THUMB: movgt [[CARRY_HI]], #1
; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
@@ -356,9 +393,11 @@ define i64 @test13(i64* %ptr, i64 %val) {
; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK: cmp [[REG1]], r1
+; CHECK-LE: cmp [[REG1]], r1
+; CHECK-BE: cmp [[REG2]], r2
; CHECK: movwhi [[CARRY_LO]], #1
-; CHECK: cmp [[REG2]], r2
+; CHECK-LE: cmp [[REG2]], r2
+; CHECK-BE: cmp [[REG1]], r1
; CHECK: movwhi [[CARRY_HI]], #1
; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
; CHECK: cmp [[CARRY_HI]], #0
@@ -375,9 +414,11 @@ define i64 @test13(i64* %ptr, i64 %val) {
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
-; CHECK-THUMB: cmp [[REG1]], r2
+; CHECK-THUMB-LE: cmp [[REG1]], r2
+; CHECK-THUMB-BE: cmp [[REG2]], r3
; CHECK-THUMB: movhi.w [[CARRY_LO]], #1
-; CHECK-THUMB: cmp [[REG2]], r3
+; CHECK-THUMB-LE: cmp [[REG2]], r3
+; CHECK-THUMB-BE: cmp [[REG1]], r2
; CHECK-THUMB: movhi [[CARRY_HI]], #1
; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll
index 7922e22..a39565e 100644
--- a/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=armv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM
-; RUN: llc -mtriple=thumbv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-THUMB
+; RUN: llc -mtriple=armv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE --check-prefix=CHECK-ARM --check-prefix=CHECK-ARM-LE
+; RUN: llc -mtriple=armebv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE --check-prefix=CHECK-ARM --check-prefix=CHECK-ARM-BE
+; RUN: llc -mtriple=thumbv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-LE
+; RUN: llc -mtriple=thumbebv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-BE
@var8 = global i8 0
@var16 = global i16 0
@@ -87,8 +89,10 @@ define void @test_atomic_load_add_i64(i64 %offset) nounwind {
; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: adds{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK-NEXT: adc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-LE-NEXT: adds{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
+; CHECK-LE-NEXT: adc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-BE-NEXT: adds{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-BE-NEXT: adc{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
@@ -181,8 +185,10 @@ define void @test_atomic_load_sub_i64(i64 %offset) nounwind {
; CHECK: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: subs{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
-; CHECK-NEXT: sbc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-LE-NEXT: subs{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
+; CHECK-LE-NEXT: sbc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-BE-NEXT: subs{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-BE-NEXT: sbc{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
@@ -275,8 +281,10 @@ define void @test_atomic_load_and_i64(i64 %offset) nounwind {
; CHECK: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-DAG: and{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
-; CHECK-DAG: and{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-LE-DAG: and{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
+; CHECK-LE-DAG: and{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-BE-DAG: and{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-BE-DAG: and{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
; CHECK: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
@@ -369,8 +377,10 @@ define void @test_atomic_load_or_i64(i64 %offset) nounwind {
; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-DAG: orr{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
-; CHECK-DAG: orr{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-LE-DAG: orr{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
+; CHECK-LE-DAG: orr{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-BE-DAG: orr{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-BE-DAG: orr{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
; CHECK: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
@@ -463,8 +473,10 @@ define void @test_atomic_load_xor_i64(i64 %offset) nounwind {
; CHECK: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-DAG: eor{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
-; CHECK-DAG: eor{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-LE-DAG: eor{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
+; CHECK-LE-DAG: eor{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-BE-DAG: eor{{(\.w)?}} [[NEW2:r[0-9]+|lr]], r[[OLD2]], r1
+; CHECK-BE-DAG: eor{{(\.w)?}} [[NEW1:r[0-9]+|lr]], r[[OLD1]], r0
; CHECK: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
; CHECK-NEXT: cmp [[STATUS]], #0
; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
@@ -657,10 +669,14 @@ define void @test_atomic_load_min_i64(i64 %offset) nounwind {
; function there.
; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
-; CHECK-ARM: cmp [[OLD1]], r0
-; CHECK-ARM: movwls [[LOCARRY]], #1
-; CHECK-ARM: cmp [[OLD2]], r1
-; CHECK-ARM: movwle [[HICARRY]], #1
+; CHECK-ARM-LE: cmp [[OLD1]], r0
+; CHECK-ARM-LE: movwls [[LOCARRY]], #1
+; CHECK-ARM-LE: cmp [[OLD2]], r1
+; CHECK-ARM-LE: movwle [[HICARRY]], #1
+; CHECK-ARM-BE: cmp [[OLD2]], r1
+; CHECK-ARM-BE: movwls [[LOCARRY]], #1
+; CHECK-ARM-BE: cmp [[OLD1]], r0
+; CHECK-ARM-BE: movwle [[HICARRY]], #1
; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
; CHECK-ARM: cmp [[HICARRY]], #0
; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
@@ -771,10 +787,14 @@ define void @test_atomic_load_max_i64(i64 %offset) nounwind {
; function there.
; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
-; CHECK-ARM: cmp [[OLD1]], r0
-; CHECK-ARM: movwhi [[LOCARRY]], #1
-; CHECK-ARM: cmp [[OLD2]], r1
-; CHECK-ARM: movwgt [[HICARRY]], #1
+; CHECK-ARM-LE: cmp [[OLD1]], r0
+; CHECK-ARM-LE: movwhi [[LOCARRY]], #1
+; CHECK-ARM-LE: cmp [[OLD2]], r1
+; CHECK-ARM-LE: movwgt [[HICARRY]], #1
+; CHECK-ARM-BE: cmp [[OLD2]], r1
+; CHECK-ARM-BE: movwhi [[LOCARRY]], #1
+; CHECK-ARM-BE: cmp [[OLD1]], r0
+; CHECK-ARM-BE: movwgt [[HICARRY]], #1
; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
; CHECK-ARM: cmp [[HICARRY]], #0
; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
@@ -885,10 +905,14 @@ define void @test_atomic_load_umin_i64(i64 %offset) nounwind {
; function there.
; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
-; CHECK-ARM: cmp [[OLD1]], r0
-; CHECK-ARM: movwls [[LOCARRY]], #1
-; CHECK-ARM: cmp [[OLD2]], r1
-; CHECK-ARM: movwls [[HICARRY]], #1
+; CHECK-ARM-LE: cmp [[OLD1]], r0
+; CHECK-ARM-LE: movwls [[LOCARRY]], #1
+; CHECK-ARM-LE: cmp [[OLD2]], r1
+; CHECK-ARM-LE: movwls [[HICARRY]], #1
+; CHECK-ARM-BE: cmp [[OLD2]], r1
+; CHECK-ARM-BE: movwls [[LOCARRY]], #1
+; CHECK-ARM-BE: cmp [[OLD1]], r0
+; CHECK-ARM-BE: movwls [[HICARRY]], #1
; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
; CHECK-ARM: cmp [[HICARRY]], #0
; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
@@ -999,10 +1023,14 @@ define void @test_atomic_load_umax_i64(i64 %offset) nounwind {
; function there.
; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
-; CHECK-ARM: cmp [[OLD1]], r0
-; CHECK-ARM: movwhi [[LOCARRY]], #1
-; CHECK-ARM: cmp [[OLD2]], r1
-; CHECK-ARM: movwhi [[HICARRY]], #1
+; CHECK-ARM-LE: cmp [[OLD1]], r0
+; CHECK-ARM-LE: movwhi [[LOCARRY]], #1
+; CHECK-ARM-LE: cmp [[OLD2]], r1
+; CHECK-ARM-LE: movwhi [[HICARRY]], #1
+; CHECK-ARM-BE: cmp [[OLD2]], r1
+; CHECK-ARM-BE: movwhi [[LOCARRY]], #1
+; CHECK-ARM-BE: cmp [[OLD1]], r0
+; CHECK-ARM-BE: movwhi [[HICARRY]], #1
; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
; CHECK-ARM: cmp [[HICARRY]], #0
; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
@@ -1112,9 +1140,12 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK: ldrexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
-; CHECK-DAG: eor{{(\.w)?}} [[MISMATCH_HI:r[0-9]+|lr]], [[OLD2]], r1
-; CHECK: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
+; CHECK-LE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
+; CHECK-LE-DAG: eor{{(\.w)?}} [[MISMATCH_HI:r[0-9]+|lr]], [[OLD2]], r1
+; CHECK-LE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
+; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_HI:r[0-9]+|lr]], [[OLD2]], r1
+; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
+; CHECK-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]]
; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
; CHECK-NEXT: BB#2:
; As above, r2, r3 is a reasonable guess.
@@ -1151,7 +1182,8 @@ define i8 @test_atomic_load_monotonic_regoff_i8(i64 %base, i64 %off) nounwind {
%val = load atomic i8* %addr monotonic, align 1
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: ldrb r0, [r0, r2]
+; CHECK-LE: ldrb r0, [r0, r2]
+; CHECK-BE: ldrb r0, [r1, r3]
; CHECK-NOT: dmb
; CHECK-NOT: mcr
@@ -1218,7 +1250,8 @@ define i32 @test_atomic_load_monotonic_regoff_i32(i64 %base, i64 %off) nounwind
%val = load atomic i32* %addr monotonic, align 4
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: ldr r0, [r0, r2]
+; CHECK-LE: ldr r0, [r0, r2]
+; CHECK-BE: ldr r0, [r1, r3]
; CHECK-NOT: dmb
; CHECK-NOT: mcr
@@ -1259,8 +1292,10 @@ define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val)
%addr = inttoptr i64 %addr_int to i8*
store atomic i8 %val, i8* %addr monotonic, align 1
-; CHECK: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp]
-; CHECK: strb [[VAL]], [r0, r2]
+; CHECK-LE: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp]
+; CHECK-LE: strb [[VAL]], [r0, r2]
+; CHECK-BE: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp, #3]
+; CHECK-BE: strb [[VAL]], [r1, r3]
ret void
}
@@ -1328,7 +1363,8 @@ define void @test_atomic_store_monotonic_regoff_i32(i64 %base, i64 %off, i32 %va
; CHECK: ldr [[VAL:r[0-9]+]], [sp]
; CHECK-NOT: dmb
; CHECK-NOT: mcr
-; CHECK: str [[VAL]], [r0, r2]
+; CHECK-LE: str [[VAL]], [r0, r2]
+; CHECK-BE: str [[VAL]], [r1, r3]
; CHECK-NOT: dmb
; CHECK-NOT: mcr
diff --git a/test/CodeGen/ARM/available_externally.ll b/test/CodeGen/ARM/available_externally.ll
index 0f646d5..d925b5c 100644
--- a/test/CodeGen/ARM/available_externally.ll
+++ b/test/CodeGen/ARM/available_externally.ll
@@ -11,6 +11,8 @@ define i32 @t1() {
}
; CHECK: L_A$non_lazy_ptr:
-; CHECK-NEXT: .long _A
+; CHECK-NEXT: .indirect_symbol _A
+; CHECK-NEXT: .long 0
; CHECK: L_B$non_lazy_ptr:
-; CHECK-NEXT: .long _B
+; CHECK-NEXT: .indirect_symbol _B
+; CHECK-NEXT: .long 0
diff --git a/test/CodeGen/ARM/big-endian-eh-unwind.ll b/test/CodeGen/ARM/big-endian-eh-unwind.ll
new file mode 100644
index 0000000..630dfed
--- /dev/null
+++ b/test/CodeGen/ARM/big-endian-eh-unwind.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mtriple armeb-eabi -mattr v7 -filetype obj -o - | llvm-objdump -s - | FileCheck %s
+
+; ARM EHABI for big-endian targets
+; This test case checks whether frame unwinding instructions are laid out in big-endian format.
+;
+; This is the LLVM assembly generated from the following C++ code:
+;
+; extern void foo(int);
+; void test(int a, int b) {
+; try {
+; foo(a);
+; } catch (...) {
+; foo(b);
+; }
+; }
+
+define void @_Z4testii(i32 %a, i32 %b) #0 {
+entry:
+ invoke void @_Z3fooi(i32 %a)
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2
+ invoke void @_Z3fooi(i32 %b)
+ to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2: ; preds = %lpad
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont: ; preds = %entry, %invoke.cont2
+ ret void
+
+lpad1: ; preds = %lpad
+ %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %3
+
+terminate.lpad: ; preds = %lpad1
+ %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %5 = extractvalue { i8*, i32 } %4, 0
+ tail call void @__clang_call_terminate(i8* %5) #3
+ unreachable
+}
+
+declare void @_Z3fooi(i32) #0
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+define linkonce_odr hidden void @__clang_call_terminate(i8*) #1 {
+ %2 = tail call i8* @__cxa_begin_catch(i8* %0) #2
+ tail call void @_ZSt9terminatev() #3
+ unreachable
+}
+
+declare void @_ZSt9terminatev()
+
+; CHECK-LABEL: Contents of section .ARM.extab:
+; CHECK-NEXT: 0000 00000000 00a8b0b0
+
diff --git a/test/CodeGen/ARM/big-endian-neon-bitconv.ll b/test/CodeGen/ARM/big-endian-neon-bitconv.ll
new file mode 100644
index 0000000..427d2e7
--- /dev/null
+++ b/test/CodeGen/ARM/big-endian-neon-bitconv.ll
@@ -0,0 +1,392 @@
+; RUN: llc < %s -march armeb -mtriple arm-eabi -mattr v7,neon -float-abi soft -o - | FileCheck %s
+; RUN: llc < %s -march armeb -mtriple arm-eabi -mattr v7,neon -float-abi hard -o - | FileCheck %s -check-prefix CHECK-HARD
+
+@v2i64 = global <2 x i64> zeroinitializer
+@v2i32 = global <2 x i32> zeroinitializer
+@v4i32 = global <4 x i32> zeroinitializer
+@v4i16 = global <4 x i16> zeroinitializer
+@v8i16 = global <8 x i16> zeroinitializer
+@v8i8 = global <8 x i8> zeroinitializer
+@v16i8 = global <16 x i8> zeroinitializer
+
+@v2f32 = global <2 x float> zeroinitializer
+@v2f64 = global <2 x double> zeroinitializer
+@v4f32 = global <4 x float> zeroinitializer
+
+
+; 64-bit conversions
+define void @conv_i64_to_v8i8( i64 %val, <8 x i8>* %store ) {
+; CHECK-LABEL: conv_i64_to_v8i8:
+; CHECK: vrev64.8
+ %v = bitcast i64 %val to <8 x i8>
+ %w = load <8 x i8>* @v8i8
+ %a = add <8 x i8> %v, %w
+ store <8 x i8> %a, <8 x i8>* %store
+ ret void
+}
+
+define void @conv_v8i8_to_i64( <8 x i8>* %load, <8 x i8>* %store ) {
+; CHECK-LABEL: conv_v8i8_to_i64:
+; CHECK: vrev64.8
+ %v = load <8 x i8>* %load
+ %w = load <8 x i8>* @v8i8
+ %a = add <8 x i8> %v, %w
+ %f = bitcast <8 x i8> %a to i64
+ call void @conv_i64_to_v8i8( i64 %f, <8 x i8>* %store )
+ ret void
+}
+
+define void @conv_i64_to_v4i16( i64 %val, <4 x i16>* %store ) {
+; CHECK-LABEL: conv_i64_to_v4i16:
+; CHECK: vrev64.16
+ %v = bitcast i64 %val to <4 x i16>
+ %w = load <4 x i16>* @v4i16
+ %a = add <4 x i16> %v, %w
+ store <4 x i16> %a, <4 x i16>* %store
+ ret void
+}
+
+define void @conv_v4i16_to_i64( <4 x i16>* %load, <4 x i16>* %store ) {
+; CHECK-LABEL: conv_v4i16_to_i64:
+; CHECK: vrev64.16
+ %v = load <4 x i16>* %load
+ %w = load <4 x i16>* @v4i16
+ %a = add <4 x i16> %v, %w
+ %f = bitcast <4 x i16> %a to i64
+ call void @conv_i64_to_v4i16( i64 %f, <4 x i16>* %store )
+ ret void
+}
+
+define void @conv_i64_to_v2i32( i64 %val, <2 x i32>* %store ) {
+; CHECK-LABEL: conv_i64_to_v2i32:
+; CHECK: vrev64.32
+ %v = bitcast i64 %val to <2 x i32>
+ %w = load <2 x i32>* @v2i32
+ %a = add <2 x i32> %v, %w
+ store <2 x i32> %a, <2 x i32>* %store
+ ret void
+}
+
+define void @conv_v2i32_to_i64( <2 x i32>* %load, <2 x i32>* %store ) {
+; CHECK-LABEL: conv_v2i32_to_i64:
+; CHECK: vrev64.32
+ %v = load <2 x i32>* %load
+ %w = load <2 x i32>* @v2i32
+ %a = add <2 x i32> %v, %w
+ %f = bitcast <2 x i32> %a to i64
+ call void @conv_i64_to_v2i32( i64 %f, <2 x i32>* %store )
+ ret void
+}
+
+define void @conv_i64_to_v2f32( i64 %val, <2 x float>* %store ) {
+; CHECK-LABEL: conv_i64_to_v2f32:
+; CHECK: vrev64.32
+ %v = bitcast i64 %val to <2 x float>
+ %w = load <2 x float>* @v2f32
+ %a = fadd <2 x float> %v, %w
+ store <2 x float> %a, <2 x float>* %store
+ ret void
+}
+
+define void @conv_v2f32_to_i64( <2 x float>* %load, <2 x float>* %store ) {
+; CHECK-LABEL: conv_v2f32_to_i64:
+; CHECK: vrev64.32
+ %v = load <2 x float>* %load
+ %w = load <2 x float>* @v2f32
+ %a = fadd <2 x float> %v, %w
+ %f = bitcast <2 x float> %a to i64
+ call void @conv_i64_to_v2f32( i64 %f, <2 x float>* %store )
+ ret void
+}
+
+define void @conv_f64_to_v8i8( double %val, <8 x i8>* %store ) {
+; CHECK-LABEL: conv_f64_to_v8i8:
+; CHECK: vrev64.8
+ %v = bitcast double %val to <8 x i8>
+ %w = load <8 x i8>* @v8i8
+ %a = add <8 x i8> %v, %w
+ store <8 x i8> %a, <8 x i8>* %store
+ ret void
+}
+
+define void @conv_v8i8_to_f64( <8 x i8>* %load, <8 x i8>* %store ) {
+; CHECK-LABEL: conv_v8i8_to_f64:
+; CHECK: vrev64.8
+ %v = load <8 x i8>* %load
+ %w = load <8 x i8>* @v8i8
+ %a = add <8 x i8> %v, %w
+ %f = bitcast <8 x i8> %a to double
+ call void @conv_f64_to_v8i8( double %f, <8 x i8>* %store )
+ ret void
+}
+
+define void @conv_f64_to_v4i16( double %val, <4 x i16>* %store ) {
+; CHECK-LABEL: conv_f64_to_v4i16:
+; CHECK: vrev64.16
+ %v = bitcast double %val to <4 x i16>
+ %w = load <4 x i16>* @v4i16
+ %a = add <4 x i16> %v, %w
+ store <4 x i16> %a, <4 x i16>* %store
+ ret void
+}
+
+define void @conv_v4i16_to_f64( <4 x i16>* %load, <4 x i16>* %store ) {
+; CHECK-LABEL: conv_v4i16_to_f64:
+; CHECK: vrev64.16
+ %v = load <4 x i16>* %load
+ %w = load <4 x i16>* @v4i16
+ %a = add <4 x i16> %v, %w
+ %f = bitcast <4 x i16> %a to double
+ call void @conv_f64_to_v4i16( double %f, <4 x i16>* %store )
+ ret void
+}
+
+define void @conv_f64_to_v2i32( double %val, <2 x i32>* %store ) {
+; CHECK-LABEL: conv_f64_to_v2i32:
+; CHECK: vrev64.32
+ %v = bitcast double %val to <2 x i32>
+ %w = load <2 x i32>* @v2i32
+ %a = add <2 x i32> %v, %w
+ store <2 x i32> %a, <2 x i32>* %store
+ ret void
+}
+
+define void @conv_v2i32_to_f64( <2 x i32>* %load, <2 x i32>* %store ) {
+; CHECK-LABEL: conv_v2i32_to_f64:
+; CHECK: vrev64.32
+ %v = load <2 x i32>* %load
+ %w = load <2 x i32>* @v2i32
+ %a = add <2 x i32> %v, %w
+ %f = bitcast <2 x i32> %a to double
+ call void @conv_f64_to_v2i32( double %f, <2 x i32>* %store )
+ ret void
+}
+
+define void @conv_f64_to_v2f32( double %val, <2 x float>* %store ) {
+; CHECK-LABEL: conv_f64_to_v2f32:
+; CHECK: vrev64.32
+ %v = bitcast double %val to <2 x float>
+ %w = load <2 x float>* @v2f32
+ %a = fadd <2 x float> %v, %w
+ store <2 x float> %a, <2 x float>* %store
+ ret void
+}
+
+define void @conv_v2f32_to_f64( <2 x float>* %load, <2 x float>* %store ) {
+; CHECK-LABEL: conv_v2f32_to_f64:
+; CHECK: vrev64.32
+ %v = load <2 x float>* %load
+ %w = load <2 x float>* @v2f32
+ %a = fadd <2 x float> %v, %w
+ %f = bitcast <2 x float> %a to double
+ call void @conv_f64_to_v2f32( double %f, <2 x float>* %store )
+ ret void
+}
+
+; 128-bit conversions
+
+
+define void @conv_i128_to_v16i8( i128 %val, <16 x i8>* %store ) {
+; CHECK-LABEL: conv_i128_to_v16i8:
+; CHECK: vrev32.8
+ %v = bitcast i128 %val to <16 x i8>
+ %w = load <16 x i8>* @v16i8
+ %a = add <16 x i8> %v, %w
+ store <16 x i8> %a, <16 x i8>* %store
+ ret void
+}
+
+define void @conv_v16i8_to_i128( <16 x i8>* %load, <16 x i8>* %store ) {
+; CHECK-LABEL: conv_v16i8_to_i128:
+; CHECK: vrev32.8
+ %v = load <16 x i8>* %load
+ %w = load <16 x i8>* @v16i8
+ %a = add <16 x i8> %v, %w
+ %f = bitcast <16 x i8> %a to i128
+ call void @conv_i128_to_v16i8( i128 %f, <16 x i8>* %store )
+ ret void
+}
+
+define void @conv_i128_to_v8i16( i128 %val, <8 x i16>* %store ) {
+; CHECK-LABEL: conv_i128_to_v8i16:
+; CHECK: vrev32.16
+ %v = bitcast i128 %val to <8 x i16>
+ %w = load <8 x i16>* @v8i16
+ %a = add <8 x i16> %v, %w
+ store <8 x i16> %a, <8 x i16>* %store
+ ret void
+}
+
+define void @conv_v8i16_to_i128( <8 x i16>* %load, <8 x i16>* %store ) {
+; CHECK-LABEL: conv_v8i16_to_i128:
+; CHECK: vrev32.16
+ %v = load <8 x i16>* %load
+ %w = load <8 x i16>* @v8i16
+ %a = add <8 x i16> %v, %w
+ %f = bitcast <8 x i16> %a to i128
+ call void @conv_i128_to_v8i16( i128 %f, <8 x i16>* %store )
+ ret void
+}
+
+define void @conv_i128_to_v4i32( i128 %val, <4 x i32>* %store ) {
+; CHECK-LABEL: conv_i128_to_v4i32:
+; CHECK: vrev64.32
+ %v = bitcast i128 %val to <4 x i32>
+ %w = load <4 x i32>* @v4i32
+ %a = add <4 x i32> %v, %w
+ store <4 x i32> %a, <4 x i32>* %store
+ ret void
+}
+
+define void @conv_v4i32_to_i128( <4 x i32>* %load, <4 x i32>* %store ) {
+; CHECK-LABEL: conv_v4i32_to_i128:
+; CHECK: vrev64.32
+ %v = load <4 x i32>* %load
+ %w = load <4 x i32>* @v4i32
+ %a = add <4 x i32> %v, %w
+ %f = bitcast <4 x i32> %a to i128
+ call void @conv_i128_to_v4i32( i128 %f, <4 x i32>* %store )
+ ret void
+}
+
+define void @conv_i128_to_v4f32( i128 %val, <4 x float>* %store ) {
+; CHECK-LABEL: conv_i128_to_v4f32:
+; CHECK: vrev64.32
+ %v = bitcast i128 %val to <4 x float>
+ %w = load <4 x float>* @v4f32
+ %a = fadd <4 x float> %v, %w
+ store <4 x float> %a, <4 x float>* %store
+ ret void
+}
+
+define void @conv_v4f32_to_i128( <4 x float>* %load, <4 x float>* %store ) {
+; CHECK-LABEL: conv_v4f32_to_i128:
+; CHECK: vrev64.32
+ %v = load <4 x float>* %load
+ %w = load <4 x float>* @v4f32
+ %a = fadd <4 x float> %v, %w
+ %f = bitcast <4 x float> %a to i128
+ call void @conv_i128_to_v4f32( i128 %f, <4 x float>* %store )
+ ret void
+}
+
+define void @conv_f128_to_v2f64( fp128 %val, <2 x double>* %store ) {
+; CHECK-LABEL: conv_f128_to_v2f64:
+; CHECK: vrev64.32
+ %v = bitcast fp128 %val to <2 x double>
+ %w = load <2 x double>* @v2f64
+ %a = fadd <2 x double> %v, %w
+ store <2 x double> %a, <2 x double>* %store
+ ret void
+}
+
+define void @conv_v2f64_to_f128( <2 x double>* %load, <2 x double>* %store ) {
+; CHECK-LABEL: conv_v2f64_to_f128:
+; CHECK: vrev64.32
+ %v = load <2 x double>* %load
+ %w = load <2 x double>* @v2f64
+ %a = fadd <2 x double> %v, %w
+ %f = bitcast <2 x double> %a to fp128
+ call void @conv_f128_to_v2f64( fp128 %f, <2 x double>* %store )
+ ret void
+}
+
+define void @conv_f128_to_v16i8( fp128 %val, <16 x i8>* %store ) {
+; CHECK-LABEL: conv_f128_to_v16i8:
+; CHECK: vrev32.8
+ %v = bitcast fp128 %val to <16 x i8>
+ %w = load <16 x i8>* @v16i8
+ %a = add <16 x i8> %v, %w
+ store <16 x i8> %a, <16 x i8>* %store
+ ret void
+}
+
+define void @conv_v16i8_to_f128( <16 x i8>* %load, <16 x i8>* %store ) {
+; CHECK-LABEL: conv_v16i8_to_f128:
+; CHECK: vrev32.8
+ %v = load <16 x i8>* %load
+ %w = load <16 x i8>* @v16i8
+ %a = add <16 x i8> %v, %w
+ %f = bitcast <16 x i8> %a to fp128
+ call void @conv_f128_to_v16i8( fp128 %f, <16 x i8>* %store )
+ ret void
+}
+
+define void @conv_f128_to_v8i16( fp128 %val, <8 x i16>* %store ) {
+; CHECK-LABEL: conv_f128_to_v8i16:
+; CHECK: vrev32.16
+ %v = bitcast fp128 %val to <8 x i16>
+ %w = load <8 x i16>* @v8i16
+ %a = add <8 x i16> %v, %w
+ store <8 x i16> %a, <8 x i16>* %store
+ ret void
+}
+
+define void @conv_v8i16_to_f128( <8 x i16>* %load, <8 x i16>* %store ) {
+; CHECK-LABEL: conv_v8i16_to_f128:
+; CHECK: vrev32.16
+ %v = load <8 x i16>* %load
+ %w = load <8 x i16>* @v8i16
+ %a = add <8 x i16> %v, %w
+ %f = bitcast <8 x i16> %a to fp128
+ call void @conv_f128_to_v8i16( fp128 %f, <8 x i16>* %store )
+ ret void
+}
+
+define void @conv_f128_to_v4f32( fp128 %val, <4 x float>* %store ) {
+; CHECK-LABEL: conv_f128_to_v4f32:
+; CHECK: vrev64.32
+ %v = bitcast fp128 %val to <4 x float>
+ %w = load <4 x float>* @v4f32
+ %a = fadd <4 x float> %v, %w
+ store <4 x float> %a, <4 x float>* %store
+ ret void
+}
+
+define void @conv_v4f32_to_f128( <4 x float>* %load, <4 x float>* %store ) {
+; CHECK-LABEL: conv_v4f32_to_f128:
+; CHECK: vrev64.32
+ %v = load <4 x float>* %load
+ %w = load <4 x float>* @v4f32
+ %a = fadd <4 x float> %v, %w
+ %f = bitcast <4 x float> %a to fp128
+ call void @conv_f128_to_v4f32( fp128 %f, <4 x float>* %store )
+ ret void
+}
+
+define void @arg_v4i32( <4 x i32> %var, <4 x i32>* %store ) {
+; CHECK-LABEL: arg_v4i32:
+; CHECK: vmov [[REG2:d[0-9]+]], r3, r2
+; CHECK: vmov [[REG1:d[0-9]+]], r1, r0
+; CHECK: vst1.64 {[[REG1]], [[REG2]]},
+; CHECK-HARD-LABEL: arg_v4i32:
+; CHECK-HARD-NOT: vmov
+; CHECK-HARD: vst1.64 {d0, d1}
+ store <4 x i32> %var, <4 x i32>* %store
+ ret void
+}
+
+define void @arg_v8i16( <8 x i16> %var, <8 x i16>* %store ) {
+; CHECK-LABEL: arg_v8i16:
+; CHECK: vmov [[REG2:d[0-9]+]], r3, r2
+; CHECK: vmov [[REG1:d[0-9]+]], r1, r0
+; CHECK: vst1.64 {[[REG1]], [[REG2]]},
+; CHECK-HARD-LABEL: arg_v8i16:
+; CHECK-HARD-NOT: vmov
+; CHECK-HARD: vst1.64 {d0, d1}
+ store <8 x i16> %var, <8 x i16>* %store
+ ret void
+}
+
+define void @arg_v16i8( <16 x i8> %var, <16 x i8>* %store ) {
+; CHECK-LABEL: arg_v16i8:
+; CHECK: vmov [[REG2:d[0-9]+]], r3, r2
+; CHECK: vmov [[REG1:d[0-9]+]], r1, r0
+; CHECK: vst1.64 {[[REG1]], [[REG2]]},
+; CHECK-HARD-LABEL: arg_v16i8:
+; CHECK-HARD-NOT: vmov
+; CHECK-HARD: vst1.64 {d0, d1}
+ store <16 x i8> %var, <16 x i8>* %store
+ ret void
+}
+
diff --git a/test/CodeGen/ARM/big-endian-vector-callee.ll b/test/CodeGen/ARM/big-endian-vector-callee.ll
new file mode 100644
index 0000000..4db8bde
--- /dev/null
+++ b/test/CodeGen/ARM/big-endian-vector-callee.ll
@@ -0,0 +1,1172 @@
+; RUN: llc -mtriple armeb-eabi -mattr v7,neon -float-abi soft %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT
+; RUN: llc -mtriple armeb-eabi -mattr v7,neon -float-abi hard %s -o - | FileCheck %s -check-prefix CHECK -check-prefix HARD
+
+; CHECK-LABEL: test_i64_f64:
+define i64 @test_i64_f64(double %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v1i64:
+define i64 @test_i64_v1i64(<1 x i64> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 d{{[0-9]+}}, d0
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v2f32:
+define i64 @test_i64_v2f32(<2 x float> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v2i32:
+define i64 @test_i64_v2i32(<2 x i32> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v4i16:
+define i64 @test_i64_v4i16(<4 x i16> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 d{{[0-9]+}}, d0
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v8i8:
+define i64 @test_i64_v8i8(<8 x i8> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 d{{[0-9]+}}, d0
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_f64_i64:
+define double @test_f64_i64(i64 %p) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_f64_v1i64:
+define double @test_f64_v1i64(<1 x i64> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 d{{[0-9]+}}, d0
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_f64_v2f32:
+define double @test_f64_v2f32(<2 x float> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_f64_v2i32:
+define double @test_f64_v2i32(<2 x i32> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_f64_v4i16:
+define double @test_f64_v4i16(<4 x i16> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 d{{[0-9]+}}, d0
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_f64_v8i8:
+define double @test_f64_v8i8(<8 x i8> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 d{{[0-9]+}}, d0
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_v1i64_i64:
+define <1 x i64> @test_v1i64_i64(i64 %p) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+}
+
+; CHECK-LABEL: test_v1i64_f64:
+define <1 x i64> @test_v1i64_f64(double %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+}
+
+; CHECK-LABEL: test_v1i64_v2f32:
+define <1 x i64> @test_v1i64_v2f32(<2 x float> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+}
+
+; CHECK-LABEL: test_v1i64_v2i32:
+define <1 x i64> @test_v1i64_v2i32(<2 x i32> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+}
+
+; CHECK-LABEL: test_v1i64_v4i16:
+define <1 x i64> @test_v1i64_v4i16(<4 x i16> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 d{{[0-9]+}}, d0
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+}
+
+; CHECK-LABEL: test_v1i64_v8i8:
+define <1 x i64> @test_v1i64_v8i8(<8 x i8> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 d{{[0-9]+}}, d0
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+}
+
+; CHECK-LABEL: test_v2f32_i64:
+define <2 x float> @test_v2f32_i64(i64 %p) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2f32_f64:
+define <2 x float> @test_v2f32_f64(double %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2f32_v1i64:
+define <2 x float> @test_v2f32_v1i64(<1 x i64> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 d{{[0-9]+}}, d0
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2f32_v2i32:
+define <2 x float> @test_v2f32_v2i32(<2 x i32> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2f32_v4i16:
+define <2 x float> @test_v2f32_v4i16(<4 x i16> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 d{{[0-9]+}}, d0
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2f32_v8i8:
+define <2 x float> @test_v2f32_v8i8(<8 x i8> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 d{{[0-9]+}}, d0
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2i32_i64:
+define <2 x i32> @test_v2i32_i64(i64 %p) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2i32_f64:
+define <2 x i32> @test_v2i32_f64(double %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2i32_v1i64:
+define <2 x i32> @test_v2i32_v1i64(<1 x i64> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 d{{[0-9]+}}, d0
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2i32_v2f32:
+define <2 x i32> @test_v2i32_v2f32(<2 x float> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2i32_v4i16:
+define <2 x i32> @test_v2i32_v4i16(<4 x i16> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 d{{[0-9]+}}, d0
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v2i32_v8i8:
+define <2 x i32> @test_v2i32_v8i8(<8 x i8> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 d{{[0-9]+}}, d0
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+}
+
+; CHECK-LABEL: test_v4i16_i64:
+define <4 x i16> @test_v4i16_i64(i64 %p) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+}
+
+; CHECK-LABEL: test_v4i16_f64:
+define <4 x i16> @test_v4i16_f64(double %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+}
+
+; CHECK-LABEL: test_v4i16_v1i64:
+define <4 x i16> @test_v4i16_v1i64(<1 x i64> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 d{{[0-9]+}}, d0
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+}
+
+; CHECK-LABEL: test_v4i16_v2f32:
+define <4 x i16> @test_v4i16_v2f32(<2 x float> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+}
+
+; CHECK-LABEL: test_v4i16_v2i32:
+define <4 x i16> @test_v4i16_v2i32(<2 x i32> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+}
+
+; CHECK-LABEL: test_v4i16_v8i8:
+define <4 x i16> @test_v4i16_v8i8(<8 x i8> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 d{{[0-9]+}}, d0
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+}
+
+; CHECK-LABEL: test_v8i8_i64:
+define <8 x i8> @test_v8i8_i64(i64 %p) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+}
+
+; CHECK-LABEL: test_v8i8_f64:
+define <8 x i8> @test_v8i8_f64(double %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+}
+
+; CHECK-LABEL: test_v8i8_v1i64:
+define <8 x i8> @test_v8i8_v1i64(<1 x i64> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 d{{[0-9]+}}, d0
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+}
+
+; CHECK-LABEL: test_v8i8_v2f32:
+define <8 x i8> @test_v8i8_v2f32(<2 x float> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+}
+
+; CHECK-LABEL: test_v8i8_v2i32:
+define <8 x i8> @test_v8i8_v2i32(<2 x i32> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 d{{[0-9]+}}, d0
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+}
+
+; CHECK-LABEL: test_v8i8_v4i16:
+define <8 x i8> @test_v8i8_v4i16(<4 x i16> %p) {
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 d{{[0-9]+}}, d0
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+}
+
+; CHECK-LABEL: test_f128_v2f64:
+define fp128 @test_f128_v2f64(<2 x double> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG1]]
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG2]]
+; HARD: vadd.f64 d{{[0-9]+}}, d1
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+; CHECK: vst1.32 {d{{[0-9]+}}[1]}, [{{[a-z0-9]+}}:32]
+; CHECK: vst1.32 {d{{[0-9]+}}[0]}, [{{[a-z0-9]+}}:32]
+}
+
+; CHECK-LABEL: test_f128_v2i64:
+define fp128 @test_f128_v2i64(<2 x i64> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vadd.i64 q{{[0-9]+}}, q0
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+; CHECK: vst1.32 {d{{[0-9]+}}[1]}, [{{[a-z0-9]+}}:32]
+; CHECK: vst1.32 {d{{[0-9]+}}[0]}, [{{[a-z0-9]+}}:32]
+}
+
+; CHECK-LABEL: test_f128_v4f32:
+define fp128 @test_f128_v4f32(<4 x float> %p) {
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+; CHECK: vst1.32 {d{{[0-9]+}}[1]}, [{{[a-z0-9]+}}:32]
+; CHECK: vst1.32 {d{{[0-9]+}}[0]}, [{{[a-z0-9]+}}:32]
+}
+
+; CHECK-LABEL: test_f128_v4i32:
+define fp128 @test_f128_v4i32(<4 x i32> %p) {
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+; CHECK: vst1.32 {d{{[0-9]+}}[1]}, [{{[a-z0-9]+}}:32]
+; CHECK: vst1.32 {d{{[0-9]+}}[0]}, [{{[a-z0-9]+}}:32]
+}
+
+; CHECK-LABEL: test_f128_v8i16:
+define fp128 @test_f128_v8i16(<8 x i16> %p) {
+; HARD: vrev64.16 q{{[0-9]+}}, q0
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+; CHECK: vst1.32 {d{{[0-9]+}}[1]}, [{{[a-z0-9]+}}:32]
+; CHECK: vst1.32 {d{{[0-9]+}}[0]}, [{{[a-z0-9]+}}:32]
+}
+
+; CHECK-LABEL: test_f128_v16i8:
+define fp128 @test_f128_v16i8(<16 x i8> %p) {
+; HARD: vrev64.8 q{{[0-9]+}}, q0
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+; CHECK: vst1.32 {d{{[0-9]+}}[1]}, [{{[a-z0-9]+}}:32]
+; CHECK: vst1.32 {d{{[0-9]+}}[0]}, [{{[a-z0-9]+}}:32]
+}
+
+; CHECK-LABEL: test_v2f64_f128:
+define <2 x double> @test_v2f64_f128(fp128 %p) {
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
+; CHECK: vmov.32 [[REG1]][1], r1
+; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG2]][1], r3
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
+; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG2]]
+; SOFT: vmov r3, r2, [[REG1]]
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_v2f64_v2i64:
+define <2 x double> @test_v2f64_v2i64(<2 x i64> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vadd.i64 q{{[0-9]+}}, q0
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
+; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG2]]
+; SOFT: vmov r3, r2, [[REG1]]
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_v2f64_v4f32:
+define <2 x double> @test_v2f64_v4f32(<4 x float> %p) {
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
+; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG2]]
+; SOFT: vmov r3, r2, [[REG1]]
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_v2f64_v4i32:
+define <2 x double> @test_v2f64_v4i32(<4 x i32> %p) {
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
+; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG2]]
+; SOFT: vmov r3, r2, [[REG1]]
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_v2f64_v8i16:
+define <2 x double> @test_v2f64_v8i16(<8 x i16> %p) {
+; HARD: vrev64.16 q{{[0-9]+}}, q0
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
+; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG2]]
+; SOFT: vmov r3, r2, [[REG1]]
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_v2f64_v16i8:
+define <2 x double> @test_v2f64_v16i8(<16 x i8> %p) {
+; HARD: vrev64.8 q{{[0-9]+}}, q0
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
+; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG2]]
+; SOFT: vmov r3, r2, [[REG1]]
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+}
+
+; CHECK-LABEL: test_v2i64_f128:
+define <2 x i64> @test_v2i64_f128(fp128 %p) {
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
+; CHECK: vmov.32 [[REG1]][1], r1
+; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG2]][1], r3
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+}
+
+; CHECK-LABEL: test_v2i64_v2f64:
+define <2 x i64> @test_v2i64_v2f64(<2 x double> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG1]]
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG2]]
+; HARD: vadd.f64 d{{[0-9]+}}, d1
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+}
+
+; CHECK-LABEL: test_v2i64_v4f32:
+define <2 x i64> @test_v2i64_v4f32(<4 x float> %p) {
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+}
+
+; CHECK-LABEL: test_v2i64_v4i32:
+define <2 x i64> @test_v2i64_v4i32(<4 x i32> %p) {
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+}
+
+; CHECK-LABEL: test_v2i64_v8i16:
+define <2 x i64> @test_v2i64_v8i16(<8 x i16> %p) {
+; HARD: vrev64.16 q{{[0-9]+}}, q0
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+}
+
+; CHECK-LABEL: test_v2i64_v16i8:
+define <2 x i64> @test_v2i64_v16i8(<16 x i8> %p) {
+; HARD: vrev64.8 q{{[0-9]+}}, q0
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+}
+
+; CHECK-LABEL: test_v4f32_f128:
+define <4 x float> @test_v4f32_f128(fp128 %p) {
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
+; CHECK: vmov.32 [[REG1]][1], r1
+; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG2]][1], r3
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4f32_v2f64:
+define <4 x float> @test_v4f32_v2f64(<2 x double> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG1]]
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG2]]
+; HARD: vadd.f64 d{{[0-9]+}}, d1
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4f32_v2i64:
+define <4 x float> @test_v4f32_v2i64(<2 x i64> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vadd.i64 q{{[0-9]+}}, q0
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4f32_v4i32:
+define <4 x float> @test_v4f32_v4i32(<4 x i32> %p) {
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4f32_v8i16:
+define <4 x float> @test_v4f32_v8i16(<8 x i16> %p) {
+; HARD: vrev64.16 q{{[0-9]+}}, q0
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4f32_v16i8:
+define <4 x float> @test_v4f32_v16i8(<16 x i8> %p) {
+; HARD: vrev64.8 q{{[0-9]+}}, q0
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4i32_f128:
+define <4 x i32> @test_v4i32_f128(fp128 %p) {
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
+; CHECK: vmov.32 [[REG1]][1], r1
+; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG2]][1], r3
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4i32_v2f64:
+define <4 x i32> @test_v4i32_v2f64(<2 x double> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG1]]
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG2]]
+; HARD: vadd.f64 d{{[0-9]+}}, d1
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4i32_v2i64:
+define <4 x i32> @test_v4i32_v2i64(<2 x i64> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vadd.i64 q{{[0-9]+}}, q0
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4i32_v4f32:
+define <4 x i32> @test_v4i32_v4f32(<4 x float> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4i32_v8i16:
+define <4 x i32> @test_v4i32_v8i16(<8 x i16> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.16 q{{[0-9]+}}, q0
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v4i32_v16i8:
+define <4 x i32> @test_v4i32_v16i8(<16 x i8> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.8 q{{[0-9]+}}, q0
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+}
+
+; CHECK-LABEL: test_v8i16_f128:
+define <8 x i16> @test_v8i16_f128(fp128 %p) {
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
+; CHECK: vmov.32 [[REG1]][1], r1
+; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG2]][1], r3
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+}
+
+; CHECK-LABEL: test_v8i16_v2f64:
+define <8 x i16> @test_v8i16_v2f64(<2 x double> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG1]]
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG2]]
+; HARD: vadd.f64 d{{[0-9]+}}, d1
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+}
+
+; CHECK-LABEL: test_v8i16_v2i64:
+define <8 x i16> @test_v8i16_v2i64(<2 x i64> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vadd.i64 q{{[0-9]+}}, q0
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+}
+
+; CHECK-LABEL: test_v8i16_v4f32:
+define <8 x i16> @test_v8i16_v4f32(<4 x float> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+}
+
+; CHECK-LABEL: test_v8i16_v4i32:
+define <8 x i16> @test_v8i16_v4i32(<4 x i32> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+}
+
+; CHECK-LABEL: test_v8i16_v16i8:
+define <8 x i16> @test_v8i16_v16i8(<16 x i8> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.8 q{{[0-9]+}}, q0
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+}
+
+; CHECK-LABEL: test_v16i8_f128:
+define <16 x i8> @test_v16i8_f128(fp128 %p) {
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
+; CHECK: vmov.32 [[REG1]][1], r1
+; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG2]][1], r3
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+}
+
+; CHECK-LABEL: test_v16i8_v2f64:
+define <16 x i8> @test_v16i8_v2f64(<2 x double> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG1]]
+; SOFT: vadd.f64 d{{[0-9]+}}, [[REG2]]
+; HARD: vadd.f64 d{{[0-9]+}}, d1
+; HARD: vadd.f64 d{{[0-9]+}}, d0
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+}
+
+; CHECK-LABEL: test_v16i8_v2i64:
+define <16 x i8> @test_v16i8_v2i64(<2 x i64> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vadd.i64 q{{[0-9]+}}, q0
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+}
+
+; CHECK-LABEL: test_v16i8_v4f32:
+define <16 x i8> @test_v16i8_v4f32(<4 x float> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+}
+
+; CHECK-LABEL: test_v16i8_v4i32:
+define <16 x i8> @test_v16i8_v4i32(<4 x i32> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.32 q{{[0-9]+}}, q0
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+}
+
+; CHECK-LABEL: test_v16i8_v8i16:
+define <16 x i8> @test_v16i8_v8i16(<8 x i16> %p) {
+; SOFT: vmov [[REG1:d[0-9]+]], r3, r2
+; SOFT: vmov [[REG2:d[0-9]+]], r1, r0
+; HARD: vrev64.16 q{{[0-9]+}}, q0
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+}
diff --git a/test/CodeGen/ARM/big-endian-vector-caller.ll b/test/CodeGen/ARM/big-endian-vector-caller.ll
new file mode 100644
index 0000000..d01b0a7
--- /dev/null
+++ b/test/CodeGen/ARM/big-endian-vector-caller.ll
@@ -0,0 +1,1369 @@
+; RUN: llc -mtriple armeb-eabi -mattr v7,neon -float-abi soft %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT
+; RUN: llc -mtriple armeb-eabi -mattr v7,neon -float-abi hard %s -o - | FileCheck %s -check-prefix CHECK -check-prefix HARD
+
+; CHECK-LABEL: test_i64_f64:
+declare i64 @test_i64_f64_helper(double %p)
+define void @test_i64_f64(double* %p, i64* %q) {
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call i64 @test_i64_f64_helper(double %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v1i64:
+declare i64 @test_i64_v1i64_helper(<1 x i64> %p)
+define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) {
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v2f32:
+declare i64 @test_i64_v2f32_helper(<2 x float> %p)
+define void @test_i64_v2f32(<2 x float>* %p, i64* %q) {
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call i64 @test_i64_v2f32_helper(<2 x float> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v2i32:
+declare i64 @test_i64_v2i32_helper(<2 x i32> %p)
+define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v4i16:
+declare i64 @test_i64_v4i16_helper(<4 x i16> %p)
+define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) {
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_i64_v8i8:
+declare i64 @test_i64_v8i8_helper(<8 x i8> %p)
+define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) {
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+; CHECK: adds r1
+; CHECK: adc r0
+}
+
+; CHECK-LABEL: test_f64_i64:
+declare double @test_f64_i64_helper(i64 %p)
+define void @test_f64_i64(i64* %p, double* %q) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call double @test_f64_i64_helper(i64 %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_f64_v1i64:
+declare double @test_f64_v1i64_helper(<1 x i64> %p)
+define void @test_f64_v1i64(<1 x i64>* %p, double* %q) {
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call double @test_f64_v1i64_helper(<1 x i64> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_f64_v2f32:
+declare double @test_f64_v2f32_helper(<2 x float> %p)
+define void @test_f64_v2f32(<2 x float>* %p, double* %q) {
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call double @test_f64_v2f32_helper(<2 x float> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_f64_v2i32:
+declare double @test_f64_v2i32_helper(<2 x i32> %p)
+define void @test_f64_v2i32(<2 x i32>* %p, double* %q) {
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call double @test_f64_v2i32_helper(<2 x i32> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_f64_v4i16:
+declare double @test_f64_v4i16_helper(<4 x i16> %p)
+define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call double @test_f64_v4i16_helper(<4 x i16> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_f64_v8i8:
+declare double @test_f64_v8i8_helper(<8 x i8> %p)
+define void @test_f64_v8i8(<8 x i8>* %p, double* %q) {
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call double @test_f64_v8i8_helper(<8 x i8> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.f64 [[REG]]
+; HARD: vadd.f64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v1i64_i64:
+declare <1 x i64> @test_v1i64_i64_helper(i64 %p)
+define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v1i64_f64:
+declare <1 x i64> @test_v1i64_f64_helper(double %p)
+define void @test_v1i64_f64(double* %p, <1 x i64>* %q) {
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <1 x i64> @test_v1i64_f64_helper(double %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v1i64_v2f32:
+declare <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %p)
+define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) {
+; HARD: vrev64.32 d0
+; SOFT: vadd.f32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v1i64_v2i32:
+declare <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %p)
+define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
+; HARD: vrev64.32 d0
+; SOFT: vadd.i32 [[REG:d[0-9]+]]
+; SOFT: vrev64.32 [[REG]]
+; SOFT: vmov r1, r0, [[REG]]
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v1i64_v4i16:
+declare <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %p)
+define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) {
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v1i64_v8i8:
+declare <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %p)
+define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) {
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vadd.i64 [[REG]]
+; HARD: vadd.i64 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2f32_i64:
+declare <2 x float> @test_v2f32_i64_helper(i64 %p)
+define void @test_v2f32_i64(i64* %p, <2 x float>* %q) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <2 x float> @test_v2f32_i64_helper(i64 %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2f32_f64:
+declare <2 x float> @test_v2f32_f64_helper(double %p)
+define void @test_v2f32_f64(double* %p, <2 x float>* %q) {
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <2 x float> @test_v2f32_f64_helper(double %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2f32_v1i64:
+declare <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %p)
+define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) {
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2f32_v2i32:
+declare <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %p)
+define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) {
+; HARD: vrev64.32 d0
+; SOFT: vadd.i32 [[REG:d[0-9]+]]
+; SOFT: vrev64.32 [[REG]]
+; SOFT: vmov r1, r0, [[REG]]
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2f32_v4i16:
+declare <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %p)
+define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2f32_v8i8:
+declare <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %p)
+define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) {
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2i32_i64:
+declare <2 x i32> @test_v2i32_i64_helper(i64 %p)
+define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2i32_f64:
+declare <2 x i32> @test_v2i32_f64_helper(double %p)
+define void @test_v2i32_f64(double* %p, <2 x i32>* %q) {
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <2 x i32> @test_v2i32_f64_helper(double %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2i32_v1i64:
+declare <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %p)
+define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) {
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2i32_v2f32:
+declare <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %p)
+define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) {
+; HARD: vadd.f32 [[REG:d[0-9]+]]
+; HARD: vrev64.32 d0, [[REG]]
+; SOFT: vadd.f32 [[REG:d[0-9]+]]
+; SOFT: vrev64.32 [[REG]]
+; SOFT: vmov r1, r0, [[REG]]
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2i32_v4i16:
+declare <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %p)
+define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) {
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v2i32_v8i8:
+declare <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %p)
+define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) {
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.32 [[REG]]
+; HARD: vrev64.32 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v4i16_i64:
+declare <4 x i16> @test_v4i16_i64_helper(i64 %p)
+define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v4i16_f64:
+declare <4 x i16> @test_v4i16_f64_helper(double %p)
+define void @test_v4i16_f64(double* %p, <4 x i16>* %q) {
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <4 x i16> @test_v4i16_f64_helper(double %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v4i16_v1i64:
+declare <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %p)
+define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) {
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v4i16_v2f32:
+declare <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %p)
+define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) {
+; HARD: vadd.f32 [[REG:d[0-9]+]]
+; HARD: vrev64.32 d0, [[REG]]
+; SOFT: vadd.f32 [[REG:d[0-9]+]]
+; SOFT: vrev64.32 [[REG]]
+; SOFT: vmov r1, r0, [[REG]]
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v4i16_v2i32:
+declare <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %p)
+define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
+; HARD: vadd.i32 [[REG:d[0-9]+]]
+; HARD: vrev64.32 d0, [[REG]]
+; SOFT: vadd.i32 [[REG:d[0-9]+]]
+; SOFT: vrev64.32 [[REG]]
+; SOFT: vmov r1, r0, [[REG]]
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v4i16_v8i8:
+declare <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %p)
+define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
+; SOFT: vrev64.8 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.8 d0
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.16 [[REG]]
+; HARD: vrev64.16 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v8i8_i64:
+declare <8 x i8> @test_v8i8_i64_helper(i64 %p)
+define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) {
+; CHECK: adds r1
+; CHECK: adc r0
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v8i8_f64:
+declare <8 x i8> @test_v8i8_f64_helper(double %p)
+define void @test_v8i8_f64(double* %p, <8 x i8>* %q) {
+; SOFT: vadd.f64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.f64 d0
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <8 x i8> @test_v8i8_f64_helper(double %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v8i8_v1i64:
+declare <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %p)
+define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) {
+; SOFT: vadd.i64 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vadd.i64 d0
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v8i8_v2f32:
+declare <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %p)
+define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) {
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v8i8_v2i32:
+declare <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %p)
+define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) {
+; SOFT: vrev64.32 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.32 d0
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_v8i8_v4i16:
+declare <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %p)
+define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) {
+; SOFT: vrev64.16 [[REG:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG]]
+; HARD: vrev64.16 d0
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+; SOFT: vmov [[REG:d[0-9]+]], r1, r0
+; SOFT: vrev64.8 [[REG]]
+; HARD: vrev64.8 {{d[0-9]+}}, d0
+}
+
+; CHECK-LABEL: test_f128_v2f64:
+declare fp128 @test_f128_v2f64_helper(<2 x double> %p)
+define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
+; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
+; SOFT: vmov r1, r0, [[REG1]]
+; SOFT: vmov r3, r2, [[REG2]]
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+; CHECK: stm sp, {r0, r1, r2, r3}
+}
+
+; CHECK-LABEL: test_f128_v2i64:
+declare fp128 @test_f128_v2i64_helper(<2 x i64> %p)
+define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+; CHECK: stm sp, {r0, r1, r2, r3}
+}
+
+; CHECK-LABEL: test_f128_v4f32:
+declare fp128 @test_f128_v4f32_helper(<4 x float> %p)
+define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+; CHECK: stm sp, {r0, r1, r2, r3}
+}
+
+; CHECK-LABEL: test_f128_v4i32:
+declare fp128 @test_f128_v4i32_helper(<4 x i32> %p)
+define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+; CHECK: stm sp, {r0, r1, r2, r3}
+}
+
+; CHECK-LABEL: test_f128_v8i16:
+declare fp128 @test_f128_v8i16_helper(<8 x i16> %p)
+define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+; CHECK: stm sp, {r0, r1, r2, r3}
+}
+
+; CHECK-LABEL: test_f128_v16i8:
+declare fp128 @test_f128_v16i8_helper(<16 x i8> %p)
+define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+; CHECK: stm sp, {r0, r1, r2, r3}
+}
+
+; CHECK-LABEL: test_v2f64_f128:
+declare <2 x double> @test_v2f64_f128_helper(fp128 %p)
+define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) {
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+
+}
+
+; CHECK-LABEL: test_v2f64_v2i64:
+declare <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %p)
+define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2f64_v4f32:
+declare <2 x double> @test_v2f64_v4f32_helper(<4 x float> %p)
+define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2f64_v4i32:
+declare <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %p)
+define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2f64_v8i16:
+declare <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %p)
+define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2f64_v16i8:
+declare <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %p)
+define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2i64_f128:
+declare <2 x i64> @test_v2i64_f128_helper(fp128 %p)
+define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) {
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2i64_v2f64:
+declare <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %p)
+define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
+; SOFT: vmov r1, r0, [[REG1]]
+; SOFT: vmov r3, r2, [[REG2]]
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2i64_v4f32:
+declare <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %p)
+define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2i64_v4i32:
+declare <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %p)
+define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2i64_v8i16:
+declare <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %p)
+define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v2i64_v16i8:
+declare <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %p)
+define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4f32_f128:
+declare <4 x float> @test_v4f32_f128_helper(fp128 %p)
+define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4f32_v2f64:
+declare <4 x float> @test_v4f32_v2f64_helper(<2 x double> %p)
+define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4f32_v2i64:
+declare <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %p)
+define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4f32_v4i32:
+declare <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %p)
+define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4f32_v8i16:
+declare <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %p)
+define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4f32_v16i8:
+declare <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %p)
+define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4i32_f128:
+declare <4 x i32> @test_v4i32_f128_helper(fp128 %p)
+define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) {
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4i32_v2f64:
+declare <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %p)
+define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4i32_v2i64:
+declare <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %p)
+define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4i32_v4f32:
+declare <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %p)
+define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4i32_v8i16:
+declare <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %p)
+define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v4i32_v16i8:
+declare <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %p)
+define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v8i16_f128:
+declare <8 x i16> @test_v8i16_f128_helper(fp128 %p)
+define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) {
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v8i16_v2f64:
+declare <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %p)
+define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v8i16_v2i64:
+declare <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %p)
+define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v8i16_v4f32:
+declare <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %p)
+define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v8i16_v4i32:
+declare <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %p)
+define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v8i16_v16i8:
+declare <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %p)
+define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.8 q0
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v16i8_f128:
+declare <16 x i8> @test_v16i8_f128_helper(fp128 %p)
+define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) {
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v16i8_v2f64:
+declare <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %p)
+define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.f64 d1
+; HARD: vadd.f64 d0
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v16i8_v2i64:
+declare <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %p)
+define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vadd.i64 q0
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v16i8_v4f32:
+declare <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %p)
+define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v16i8_v4i32:
+declare <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %p)
+define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.32 q0
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
+
+; CHECK-LABEL: test_v16i8_v8i16:
+declare <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %p)
+define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
+; SOFT: vmov r1, r0
+; SOFT: vmov r3, r2
+; HARD: vrev64.16 q0
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+; SOFT: vmov {{d[0-9]+}}, r3, r2
+; SOFT: vmov {{d[0-9]+}}, r1, r0
+}
diff --git a/test/CodeGen/ARM/bswap16.ll b/test/CodeGen/ARM/bswap16.ll
new file mode 100644
index 0000000..70c62d2
--- /dev/null
+++ b/test/CodeGen/ARM/bswap16.ll
@@ -0,0 +1,42 @@
+; RUN: llc -mtriple=arm-darwin -mattr=v6 < %s | FileCheck %s
+; RUN: llc -mtriple=thumb-darwin -mattr=v6 < %s | FileCheck %s
+
+
+define void @test1(i16* nocapture %data) {
+entry:
+ %0 = load i16* %data, align 2
+ %1 = tail call i16 @llvm.bswap.i16(i16 %0)
+ store i16 %1, i16* %data, align 2
+ ret void
+
+ ; CHECK-LABEL: test1:
+ ; CHECK: ldrh r[[R1:[0-9]+]], [r0]
+ ; CHECK: rev16 r[[R1]], r[[R1]]
+ ; CHECK: strh r[[R1]], [r0]
+}
+
+
+define void @test2(i16* nocapture %data, i16 zeroext %in) {
+entry:
+ %0 = tail call i16 @llvm.bswap.i16(i16 %in)
+ store i16 %0, i16* %data, align 2
+ ret void
+
+ ; CHECK-LABEL: test2:
+ ; CHECK: rev16 r[[R1:[0-9]+]], r1
+ ; CHECK: strh r[[R1]], [r0]
+}
+
+
+define i16 @test3(i16* nocapture %data) {
+entry:
+ %0 = load i16* %data, align 2
+ %1 = tail call i16 @llvm.bswap.i16(i16 %0)
+ ret i16 %1
+
+ ; CHECK-LABEL: test3:
+ ; CHECK: ldrh r[[R0:[0-9]+]], [r0]
+ ; CHECK: rev16 r[[R0]], r0
+}
+
+declare i16 @llvm.bswap.i16(i16)
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index 3e825e8..d75d55d 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -33,6 +33,11 @@
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2,-vfp3,-vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-NOFPU
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,,+d16,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=pic | FileCheck %s --check-prefix=RELOC-PIC
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=static | FileCheck %s --check-prefix=RELOC-OTHER
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=default | FileCheck %s --check-prefix=RELOC-OTHER
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=dynamic-no-pic | FileCheck %s --check-prefix=RELOC-OTHER
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi | FileCheck %s --check-prefix=RELOC-OTHER
; XSCALE: .eabi_attribute 6, 5
; XSCALE: .eabi_attribute 8, 1
@@ -453,6 +458,11 @@
; CORTEX-A57-NOT: .eabi_attribute 44
; CORTEX-A57: .eabi_attribute 68, 3
+; RELOC-PIC: .eabi_attribute 15, 1
+; RELOC-PIC: .eabi_attribute 16, 1
+; RELOC-PIC: .eabi_attribute 17, 2
+; RELOC-OTHER: .eabi_attribute 17, 1
+
define i32 @f(i64 %z) {
ret i32 0
}
diff --git a/test/CodeGen/ARM/dagcombine-concatvector.ll b/test/CodeGen/ARM/dagcombine-concatvector.ll
index 2927ea2..62ed87f 100644
--- a/test/CodeGen/ARM/dagcombine-concatvector.ll
+++ b/test/CodeGen/ARM/dagcombine-concatvector.ll
@@ -1,11 +1,14 @@
-; RUN: llc < %s -mtriple=thumbv7s-apple-ios3.0.0 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7s-apple-ios3.0.0 -mcpu=generic | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=thumbeb -mattr=v7,neon | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
; PR15525
; CHECK-LABEL: test1:
; CHECK: ldr.w [[REG:r[0-9]+]], [sp]
-; CHECK-NEXT: vmov {{d[0-9]+}}, r1, r2
-; CHECK-NEXT: vmov {{d[0-9]+}}, r3, [[REG]]
-; CHECK-NEXT: vst1.8 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0]
+; CHECK-LE-NEXT: vmov {{d[0-9]+}}, r1, r2
+; CHECK-LE-NEXT: vmov {{d[0-9]+}}, r3, [[REG]]
+; CHECK-BE-NEXT: vmov {{d[0-9]+}}, r2, r1
+; CHECK-BE: vmov {{d[0-9]+}}, [[REG]], r3
+; CHECK: vst1.8 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0]
; CHECK-NEXT: bx lr
define void @test1(i8* %arg, [4 x i64] %vec.coerce) {
bb:
diff --git a/test/CodeGen/ARM/debug-frame-vararg.ll b/test/CodeGen/ARM/debug-frame-vararg.ll
index 9b39525..42ff82d 100644
--- a/test/CodeGen/ARM/debug-frame-vararg.ll
+++ b/test/CodeGen/ARM/debug-frame-vararg.ll
@@ -75,12 +75,13 @@
; CHECK-FP-ELIM: .cfi_startproc
; CHECK-FP-ELIM: sub sp, sp, #16
; CHECK-FP-ELIM: .cfi_def_cfa_offset 16
-; CHECK-FP-ELIM: push {r4, r11, lr}
-; CHECK-FP-ELIM: .cfi_def_cfa_offset 28
+; CHECK-FP-ELIM: push {r4, r10, r11, lr}
+; CHECK-FP-ELIM: .cfi_def_cfa_offset 32
; CHECK-FP-ELIM: .cfi_offset lr, -20
; CHECK-FP-ELIM: .cfi_offset r11, -24
-; CHECK-FP-ELIM: .cfi_offset r4, -28
-; CHECK-FP-ELIM: add r11, sp, #4
+; CHECK-FP-ELIM: .cfi_offset r10, -28
+; CHECK-FP-ELIM: .cfi_offset r4, -32
+; CHECK-FP-ELIM: add r11, sp, #8
; CHECK-FP-ELIM: .cfi_def_cfa r11, 24
; CHECK-THUMB-FP-LABEL: sum
diff --git a/test/CodeGen/ARM/debug-frame.ll b/test/CodeGen/ARM/debug-frame.ll
index cf68767..cb54aa8 100644
--- a/test/CodeGen/ARM/debug-frame.ll
+++ b/test/CodeGen/ARM/debug-frame.ll
@@ -201,12 +201,13 @@ declare void @_ZSt9terminatev()
; CHECK-V7-FP-LABEL: _Z4testiiiiiddddd:
; CHECK-V7-FP: .cfi_startproc
-; CHECK-V7-FP: push {r4, r11, lr}
-; CHECK-V7-FP: .cfi_def_cfa_offset 12
+; CHECK-V7-FP: push {r4, r10, r11, lr}
+; CHECK-V7-FP: .cfi_def_cfa_offset 16
; CHECK-V7-FP: .cfi_offset lr, -4
; CHECK-V7-FP: .cfi_offset r11, -8
-; CHECK-V7-FP: .cfi_offset r4, -12
-; CHECK-V7-FP: add r11, sp, #4
+; CHECK-V7-FP: .cfi_offset r10, -12
+; CHECK-V7-FP: .cfi_offset r4, -16
+; CHECK-V7-FP: add r11, sp, #8
; CHECK-V7-FP: .cfi_def_cfa r11, 8
; CHECK-V7-FP: vpush {d8, d9, d10, d11, d12}
; CHECK-V7-FP: .cfi_offset d12, -24
@@ -214,7 +215,7 @@ declare void @_ZSt9terminatev()
; CHECK-V7-FP: .cfi_offset d10, -40
; CHECK-V7-FP: .cfi_offset d9, -48
; CHECK-V7-FP: .cfi_offset d8, -56
-; CHECK-V7-FP: sub sp, sp, #28
+; CHECK-V7-FP: sub sp, sp, #24
; CHECK-V7-FP: .cfi_endproc
; CHECK-V7-FP-ELIM-LABEL: _Z4testiiiiiddddd:
diff --git a/test/CodeGen/ARM/debug-segmented-stacks.ll b/test/CodeGen/ARM/debug-segmented-stacks.ll
index b0dc467..e866b4e 100644
--- a/test/CodeGen/ARM/debug-segmented-stacks.ll
+++ b/test/CodeGen/ARM/debug-segmented-stacks.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs -filetype=asm | FileCheck %s -check-prefix=ARM-linux
-; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -verify-machineinstrs -filetype=asm | FileCheck %s -check-prefix=ARM-linux
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -filetype=obj
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
-define void @test_basic() {
+define void @test_basic() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -78,3 +78,5 @@ define void @test_basic() {
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/ARM/dwarf-eh.ll b/test/CodeGen/ARM/dwarf-eh.ll
new file mode 100644
index 0000000..0b8a072
--- /dev/null
+++ b/test/CodeGen/ARM/dwarf-eh.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=arm-netbsd-eabi -o - -filetype=asm %s | \
+; RUN: FileCheck %s
+; RUN: llc -mtriple=arm-netbsd-eabi -o - -filetype=asm %s \
+; RUN: -relocation-model=pic | FileCheck -check-prefix=CHECK-PIC %s
+
+; ModuleID = 'test.cc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv5e--netbsd-eabi"
+
+%struct.exception = type { i8 }
+
+@_ZTVN10__cxxabiv117__class_type_infoE = external global i8*
+@_ZTS9exception = linkonce_odr constant [11 x i8] c"9exception\00"
+@_ZTI9exception = linkonce_odr unnamed_addr constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8** @_ZTVN10__cxxabiv117__class_type_infoE, i32 2) to i8*), i8* getelementptr inbounds ([11 x i8]* @_ZTS9exception, i32 0, i32 0) }
+
+define void @f() uwtable {
+ %1 = alloca i8*
+ %2 = alloca i32
+ %e = alloca %struct.exception*, align 4
+ invoke void @g()
+ to label %3 unwind label %4
+
+ br label %16
+
+ %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast ({ i8*, i8* }* @_ZTI9exception to i8*)
+ %6 = extractvalue { i8*, i32 } %5, 0
+ store i8* %6, i8** %1
+ %7 = extractvalue { i8*, i32 } %5, 1
+ store i32 %7, i32* %2
+ br label %8
+
+ %9 = load i32* %2
+ %10 = call i32 @llvm.eh.typeid.for(i8* bitcast ({ i8*, i8* }* @_ZTI9exception to i8*)) nounwind
+ %11 = icmp eq i32 %9, %10
+ br i1 %11, label %12, label %17
+
+ %13 = load i8** %1
+ %14 = call i8* @__cxa_begin_catch(i8* %13) #3
+ %15 = bitcast i8* %14 to %struct.exception*
+ store %struct.exception* %15, %struct.exception** %e
+ call void @__cxa_end_catch()
+ br label %16
+
+ ret void
+
+ %18 = load i8** %1
+ %19 = load i32* %2
+ %20 = insertvalue { i8*, i32 } undef, i8* %18, 0
+ %21 = insertvalue { i8*, i32 } %20, i32 %19, 1
+ resume { i8*, i32 } %21
+}
+
+declare void @g()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+; CHECK: .cfi_personality 0,
+; CHECK: .cfi_lsda 0,
+; CHECK: @TType Encoding = absptr
+; CHECK: @ Call site Encoding = udata4
+; CHECK-PIC: .cfi_personality 155,
+; CHECK-PIC: .cfi_lsda 27,
+; CHECK-PIC: @TType Encoding = indirect pcrel sdata4
+; CHECK-PIC: @ Call site Encoding = udata4
diff --git a/test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll b/test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll
new file mode 100644
index 0000000..42ca988
--- /dev/null
+++ b/test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll
@@ -0,0 +1,61 @@
+; Test for handlerdata when the function has landingpad and nounwind.
+
+; This test case checks whether the handlerdata is generated for the function
+; with a landingpad instruction, even if the function has the "nounwind" attribute.
+;
+; For example, although the following function never throws any exception,
+; we still have to generate the LSDA; otherwise, we can't catch the
+; exception properly.
+;
+; void test1() noexcept {
+; try {
+; throw_exception();
+; } catch (...) {
+; }
+; }
+
+; RUN: llc -mtriple arm-unknown-linux-gnueabi -filetype=asm -o - %s \
+; RUN: | FileCheck %s
+
+declare void @throw_exception()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+define void @test1() nounwind {
+entry:
+ invoke void @throw_exception() to label %try.cont unwind label %lpad
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1)
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont:
+ ret void
+}
+
+; CHECK: .globl test1
+; CHECK: .align 2
+; CHECK: .type test1,%function
+; CHECK-LABEL: test1:
+; CHECK: .fnstart
+
+; CHECK-NOT: .cantunwind
+
+; CHECK: .personality __gxx_personality_v0
+; CHECK: .handlerdata
+; CHECK: .align 2
+; CHECK-LABEL: GCC_except_table0:
+; CHECK-LABEL: .Lexception0:
+; CHECK: .byte 255 @ @LPStart Encoding = omit
+; CHECK: .byte 0 @ @TType Encoding = absptr
+; CHECK: .asciz
+; CHECK: .byte 3 @ Call site Encoding = udata4
+; CHECK: .fnend
diff --git a/test/CodeGen/ARM/ehabi-handlerdata.ll b/test/CodeGen/ARM/ehabi-handlerdata.ll
new file mode 100644
index 0000000..7045902
--- /dev/null
+++ b/test/CodeGen/ARM/ehabi-handlerdata.ll
@@ -0,0 +1,59 @@
+; ARM EHABI test for the handlerdata.
+
+; This test case checks whether the handlerdata for exception
+; handling is generated properly.
+;
+; (1) The handlerdata must not be empty.
+; (2) LPStartEncoding == DW_EH_PE_omit
+; (3) TTypeEncoding == DW_EH_PE_absptr
+; (4) CallSiteEncoding == DW_EH_PE_udata4
+
+; RUN: llc -mtriple arm-unknown-linux-gnueabi -filetype=asm -o - %s \
+; RUN: | FileCheck %s
+
+; RUN: llc -mtriple arm-unknown-linux-gnueabi -filetype=asm -o - %s \
+; RUN: -relocation-model=pic \
+; RUN: | FileCheck %s
+
+declare void @throw_exception()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+define void @test1() {
+entry:
+ invoke void @throw_exception() to label %try.cont unwind label %lpad
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1)
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont:
+ ret void
+}
+
+; CHECK: .globl test1
+; CHECK: .align 2
+; CHECK: .type test1,%function
+; CHECK-LABEL: test1:
+; CHECK: .fnstart
+; CHECK: .personality __gxx_personality_v0
+; CHECK: .handlerdata
+; CHECK: .align 2
+; CHECK-LABEL: GCC_except_table0:
+; CHECK-LABEL: .Lexception0:
+; CHECK: .byte 255 @ @LPStart Encoding = omit
+; CHECK: .byte 0 @ @TType Encoding = absptr
+; CHECK: .asciz
+; CHECK: .byte 3 @ Call site Encoding = udata4
+; CHECK: .long
+; CHECK: .long
+; CHECK: .long
+; CHECK: .fnend
diff --git a/test/CodeGen/ARM/ehabi.ll b/test/CodeGen/ARM/ehabi.ll
index 720cc3c..ebf0c2a 100644
--- a/test/CodeGen/ARM/ehabi.ll
+++ b/test/CodeGen/ARM/ehabi.ll
@@ -50,6 +50,22 @@
; RUN: -filetype=asm -o - %s \
; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP-ELIM
+; RUN: llc -mtriple arm-unknown-netbsd-eabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=DWARF-FP
+
+; RUN: llc -mtriple arm-unknown-netbsd-eabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=DWARF-FP-ELIM
+
+; RUN: llc -mtriple armv7-unknown-netbsd-eabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=DWARF-V7-FP
+
+; RUN: llc -mtriple armv7-unknown-netbsd-eabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=DWARF-V7-FP-ELIM
+
;-------------------------------------------------------------------------------
; Test 1
;-------------------------------------------------------------------------------
@@ -148,14 +164,14 @@ declare void @_ZSt9terminatev()
; CHECK-V7-FP-LABEL: _Z4testiiiiiddddd:
; CHECK-V7-FP: .fnstart
-; CHECK-V7-FP: .save {r4, r11, lr}
-; CHECK-V7-FP: push {r4, r11, lr}
-; CHECK-V7-FP: .setfp r11, sp, #4
-; CHECK-V7-FP: add r11, sp, #4
+; CHECK-V7-FP: .save {r4, r10, r11, lr}
+; CHECK-V7-FP: push {r4, r10, r11, lr}
+; CHECK-V7-FP: .setfp r11, sp, #8
+; CHECK-V7-FP: add r11, sp, #8
; CHECK-V7-FP: .vsave {d8, d9, d10, d11, d12}
; CHECK-V7-FP: vpush {d8, d9, d10, d11, d12}
-; CHECK-V7-FP: .pad #28
-; CHECK-V7-FP: sub sp, sp, #28
+; CHECK-V7-FP: .pad #24
+; CHECK-V7-FP: sub sp, sp, #24
; CHECK-V7-FP: .personality __gxx_personality_v0
; CHECK-V7-FP: .handlerdata
; CHECK-V7-FP: .fnend
@@ -172,6 +188,93 @@ declare void @_ZSt9terminatev()
; CHECK-V7-FP-ELIM: .handlerdata
; CHECK-V7-FP-ELIM: .fnend
+; DWARF-FP-LABEL: _Z4testiiiiiddddd:
+; DWARF-FP: .cfi_startproc
+; DWARF-FP: .cfi_personality 0, __gxx_personality_v0
+; DWARF-FP: .cfi_lsda 0, .Lexception0
+; DWARF-FP: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; DWARF-FP: .cfi_def_cfa_offset 36
+; DWARF-FP: .cfi_offset lr, -4
+; DWARF-FP: .cfi_offset r11, -8
+; DWARF-FP: .cfi_offset r10, -12
+; DWARF-FP: .cfi_offset r9, -16
+; DWARF-FP: .cfi_offset r8, -20
+; DWARF-FP: .cfi_offset r7, -24
+; DWARF-FP: .cfi_offset r6, -28
+; DWARF-FP: .cfi_offset r5, -32
+; DWARF-FP: .cfi_offset r4, -36
+; DWARF-FP: add r11, sp, #28
+; DWARF-FP: .cfi_def_cfa r11, 8
+; DWARF-FP: sub sp, sp, #28
+; DWARF-FP: sub sp, r11, #28
+; DWARF-FP: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; DWARF-FP: mov pc, lr
+; DWARF-FP: .cfi_endproc
+
+; DWARF-FP-ELIM-LABEL: _Z4testiiiiiddddd:
+; DWARF-FP-ELIM: .cfi_startproc
+; DWARF-FP-ELIM: .cfi_personality 0, __gxx_personality_v0
+; DWARF-FP-ELIM: .cfi_lsda 0, .Lexception0
+; DWARF-FP-ELIM: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; DWARF-FP-ELIM: .cfi_def_cfa_offset 36
+; DWARF-FP-ELIM: .cfi_offset lr, -4
+; DWARF-FP-ELIM: .cfi_offset r11, -8
+; DWARF-FP-ELIM: .cfi_offset r10, -12
+; DWARF-FP-ELIM: .cfi_offset r9, -16
+; DWARF-FP-ELIM: .cfi_offset r8, -20
+; DWARF-FP-ELIM: .cfi_offset r7, -24
+; DWARF-FP-ELIM: .cfi_offset r6, -28
+; DWARF-FP-ELIM: .cfi_offset r5, -32
+; DWARF-FP-ELIM: .cfi_offset r4, -36
+; DWARF-FP-ELIM: sub sp, sp, #28
+; DWARF-FP-ELIM: .cfi_def_cfa_offset 64
+; DWARF-FP-ELIM: add sp, sp, #28
+; DWARF-FP-ELIM: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; DWARF-FP-ELIM: mov pc, lr
+; DWARF-FP-ELIM: .cfi_endproc
+
+; DWARF-V7-FP-LABEL: _Z4testiiiiiddddd:
+; DWARF-V7-FP: .cfi_startproc
+; DWARF-V7-FP: .cfi_personality 0, __gxx_personality_v0
+; DWARF-V7-FP: .cfi_lsda 0, .Lexception0
+; DWARF-V7-FP: push {r4, r10, r11, lr}
+; DWARF-V7-FP: .cfi_def_cfa_offset 16
+; DWARF-V7-FP: .cfi_offset lr, -4
+; DWARF-V7-FP: .cfi_offset r11, -8
+; DWARF-V7-FP: .cfi_offset r10, -12
+; DWARF-V7-FP: .cfi_offset r4, -16
+; DWARF-V7-FP: add r11, sp, #8
+; DWARF-V7-FP: .cfi_def_cfa r11, 8
+; DWARF-V7-FP: vpush {d8, d9, d10, d11, d12}
+; DWARF-V7-FP: .cfi_offset d12, -24
+; DWARF-V7-FP: .cfi_offset d11, -32
+; DWARF-V7-FP: .cfi_offset d10, -40
+; DWARF-V7-FP: .cfi_offset d9, -48
+; DWARF-V7-FP: sub sp, sp, #24
+; DWARF-V7-FP: sub sp, r11, #48
+; DWARF-V7-FP: vpop {d8, d9, d10, d11, d12}
+; DWARF-V7-FP: pop {r4, r10, r11, pc}
+; DWARF-V7-FP: .cfi_endproc
+
+; DWARF-V7-FP-ELIM-LABEL: _Z4testiiiiiddddd:
+; DWARF-V7-FP-ELIM: .cfi_startproc
+; DWARF-V7-FP-ELIM: .cfi_personality 0, __gxx_personality_v0
+; DWARF-V7-FP-ELIM: .cfi_lsda 0, .Lexception0
+; DWARF-V7-FP-ELIM: push {r4, lr}
+; DWARF-V7-FP-ELIM: .cfi_def_cfa_offset 8
+; DWARF-V7-FP-ELIM: .cfi_offset lr, -4
+; DWARF-V7-FP-ELIM: .cfi_offset r4, -8
+; DWARF-V7-FP-ELIM: vpush {d8, d9, d10, d11, d12}
+; DWARF-V7-FP-ELIM: .cfi_offset d12, -16
+; DWARF-V7-FP-ELIM: .cfi_offset d11, -24
+; DWARF-V7-FP-ELIM: .cfi_offset d10, -32
+; DWARF-V7-FP-ELIM: .cfi_offset d9, -40
+; DWARF-V7-FP-ELIM: sub sp, sp, #24
+; DWARF-V7-FP-ELIM: .cfi_def_cfa_offset 72
+; DWARF-V7-FP-ELIM: add sp, sp, #24
+; DWARF-V7-FP-ELIM: vpop {d8, d9, d10, d11, d12}
+; DWARF-V7-FP-ELIM: pop {r4, pc}
+; DWARF-V7-FP-ELIM: .cfi_endproc
;-------------------------------------------------------------------------------
; Test 2
@@ -219,6 +322,48 @@ entry:
; CHECK-V7-FP-ELIM: pop {r11, pc}
; CHECK-V7-FP-ELIM: .fnend
+; DWARF-FP-LABEL: test2:
+; DWARF-FP: .cfi_startproc
+; DWARF-FP: push {r11, lr}
+; DWARF-FP: .cfi_def_cfa_offset 8
+; DWARF-FP: .cfi_offset lr, -4
+; DWARF-FP: .cfi_offset r11, -8
+; DWARF-FP: mov r11, sp
+; DWARF-FP: .cfi_def_cfa_register r11
+; DWARF-FP: pop {r11, lr}
+; DWARF-FP: mov pc, lr
+; DWARF-FP: .cfi_endproc
+
+; DWARF-FP-ELIM-LABEL: test2:
+; DWARF-FP-ELIM: .cfi_startproc
+; DWARF-FP-ELIM: push {r11, lr}
+; DWARF-FP-ELIM: .cfi_def_cfa_offset 8
+; DWARF-FP-ELIM: .cfi_offset lr, -4
+; DWARF-FP-ELIM: .cfi_offset r11, -8
+; DWARF-FP-ELIM: pop {r11, lr}
+; DWARF-FP-ELIM: mov pc, lr
+; DWARF-FP-ELIM: .cfi_endproc
+
+; DWARF-V7-FP-LABEL: test2:
+; DWARF-V7-FP: .cfi_startproc
+; DWARF-V7-FP: push {r11, lr}
+; DWARF-V7-FP: .cfi_def_cfa_offset 8
+; DWARF-V7-FP: .cfi_offset lr, -4
+; DWARF-V7-FP: .cfi_offset r11, -8
+; DWARF-V7-FP: mov r11, sp
+; DWARF-V7-FP: .cfi_def_cfa_register r11
+; DWARF-V7-FP: pop {r11, pc}
+; DWARF-V7-FP: .cfi_endproc
+
+; DWARF-V7-FP-ELIM-LABEL: test2:
+; DWARF-V7-FP-ELIM: .cfi_startproc
+; DWARF-V7-FP-ELIM: push {r11, lr}
+; DWARF-V7-FP-ELIM: .cfi_def_cfa_offset 8
+; DWARF-V7-FP-ELIM: .cfi_offset lr, -4
+; DWARF-V7-FP-ELIM: .cfi_offset r11, -8
+; DWARF-V7-FP-ELIM: pop {r11, pc}
+; DWARF-V7-FP-ELIM: .cfi_endproc
+
;-------------------------------------------------------------------------------
; Test 3
@@ -275,6 +420,56 @@ entry:
; CHECK-V7-FP-ELIM: pop {r4, r5, r11, pc}
; CHECK-V7-FP-ELIM: .fnend
+; DWARF-FP-LABEL: test3:
+; DWARF-FP: .cfi_startproc
+; DWARF-FP: push {r4, r5, r11, lr}
+; DWARF-FP: .cfi_def_cfa_offset 16
+; DWARF-FP: .cfi_offset lr, -4
+; DWARF-FP: .cfi_offset r11, -8
+; DWARF-FP: .cfi_offset r5, -12
+; DWARF-FP: .cfi_offset r4, -16
+; DWARF-FP: add r11, sp, #8
+; DWARF-FP: .cfi_def_cfa r11, 8
+; DWARF-FP: pop {r4, r5, r11, lr}
+; DWARF-FP: mov pc, lr
+; DWARF-FP: .cfi_endproc
+
+; DWARF-FP-ELIM-LABEL: test3:
+; DWARF-FP-ELIM: .cfi_startproc
+; DWARF-FP-ELIM: push {r4, r5, r11, lr}
+; DWARF-FP-ELIM: .cfi_def_cfa_offset 16
+; DWARF-FP-ELIM: .cfi_offset lr, -4
+; DWARF-FP-ELIM: .cfi_offset r11, -8
+; DWARF-FP-ELIM: .cfi_offset r5, -12
+; DWARF-FP-ELIM: .cfi_offset r4, -16
+; DWARF-FP-ELIM: pop {r4, r5, r11, lr}
+; DWARF-FP-ELIM: mov pc, lr
+; DWARF-FP-ELIM: .cfi_endproc
+
+; DWARF-V7-FP-LABEL: test3:
+; DWARF-V7-FP: .cfi_startproc
+; DWARF-V7-FP: push {r4, r5, r11, lr}
+; DWARF-V7-FP: .cfi_def_cfa_offset 16
+; DWARF-V7-FP: .cfi_offset lr, -4
+; DWARF-V7-FP: .cfi_offset r11, -8
+; DWARF-V7-FP: .cfi_offset r5, -12
+; DWARF-V7-FP: .cfi_offset r4, -16
+; DWARF-V7-FP: add r11, sp, #8
+; DWARF-V7-FP: .cfi_def_cfa r11, 8
+; DWARF-V7-FP: pop {r4, r5, r11, pc}
+; DWARF-V7-FP: .cfi_endproc
+
+; DWARF-V7-FP-ELIM-LABEL: test3:
+; DWARF-V7-FP-ELIM: .cfi_startproc
+; DWARF-V7-FP-ELIM: push {r4, r5, r11, lr}
+; DWARF-V7-FP-ELIM: .cfi_def_cfa_offset 16
+; DWARF-V7-FP-ELIM: .cfi_offset lr, -4
+; DWARF-V7-FP-ELIM: .cfi_offset r11, -8
+; DWARF-V7-FP-ELIM: .cfi_offset r5, -12
+; DWARF-V7-FP-ELIM: .cfi_offset r4, -16
+; DWARF-V7-FP-ELIM: pop {r4, r5, r11, pc}
+; DWARF-V7-FP-ELIM: .cfi_endproc
+
;-------------------------------------------------------------------------------
; Test 4
@@ -308,3 +503,27 @@ entry:
; CHECK-V7-FP-ELIM: bx lr
; CHECK-V7-FP-ELIM: .cantunwind
; CHECK-V7-FP-ELIM: .fnend
+
+; DWARF-FP-LABEL: test4:
+; DWARF-FP-NOT: .cfi_startproc
+; DWARF-FP: mov pc, lr
+; DWARF-FP-NOT: .cfi_endproc
+; DWARF-FP: .size test4,
+
+; DWARF-FP-ELIM-LABEL: test4:
+; DWARF-FP-ELIM-NOT: .cfi_startproc
+; DWARF-FP-ELIM: mov pc, lr
+; DWARF-FP-ELIM-NOT: .cfi_endproc
+; DWARF-FP-ELIM: .size test4,
+
+; DWARF-V7-FP-LABEL: test4:
+; DWARF-V7-FP-NOT: .cfi_startproc
+; DWARF-V7-FP: bx lr
+; DWARF-V7-FP-NOT: .cfi_endproc
+; DWARF-V7-FP: .size test4,
+
+; DWARF-V7-FP-ELIM-LABEL: test4:
+; DWARF-V7-FP-ELIM-NOT: .cfi_startproc
+; DWARF-V7-FP-ELIM: bx lr
+; DWARF-V7-FP-ELIM-NOT: .cfi_endproc
+; DWARF-V7-FP-ELIM: .size test4,
diff --git a/test/CodeGen/ARM/frame-register.ll b/test/CodeGen/ARM/frame-register.ll
new file mode 100644
index 0000000..e6a55bd
--- /dev/null
+++ b/test/CodeGen/ARM/frame-register.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple arm-eabi -disable-fp-elim -filetype asm -o - %s \
+; RUN: | FileCheck -check-prefix CHECK-ARM %s
+
+; RUN: llc -mtriple thumb-eabi -disable-fp-elim -filetype asm -o - %s \
+; RUN: | FileCheck -check-prefix CHECK-THUMB %s
+
+; RUN: llc -mtriple arm-darwin -disable-fp-elim -filetype asm -o - %s \
+; RUN: | FileCheck -check-prefix CHECK-DARWIN-ARM %s
+
+; RUN: llc -mtriple thumb-darwin -disable-fp-elim -filetype asm -o - %s \
+; RUN: | FileCheck -check-prefix CHECK-DARWIN-THUMB %s
+
+declare void @callee(i32)
+
+define i32 @calleer(i32 %i) {
+entry:
+ %i.addr = alloca i32, align 4
+ %j = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ %0 = load i32* %i.addr, align 4
+ %add = add nsw i32 %0, 1
+ store i32 %add, i32* %j, align 4
+ %1 = load i32* %j, align 4
+ call void @callee(i32 %1)
+ %2 = load i32* %j, align 4
+ %add1 = add nsw i32 %2, 1
+ ret i32 %add1
+}
+
+; CHECK-ARM: push {r11, lr}
+; CHECK-ARM: mov r11, sp
+
+; CHECK-THUMB: push {r4, r6, r7, lr}
+; CHECK-THUMB: add r7, sp, #8
+
+; CHECK-DARWIN-ARM: push {r7, lr}
+; CHECK-DARWIN-THUMB: push {r4, r7, lr}
+
diff --git a/test/CodeGen/ARM/func-argpassing-endian.ll b/test/CodeGen/ARM/func-argpassing-endian.ll
new file mode 100644
index 0000000..26f0597
--- /dev/null
+++ b/test/CodeGen/ARM/func-argpassing-endian.ll
@@ -0,0 +1,122 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm-eabi -mattr=v7,neon | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=armeb-eabi -mattr=v7,neon | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+
+@var32 = global i32 0
+@vardouble = global double 0.0
+
+define void @arg_longint( i64 %val ) {
+; CHECK-LABEL: arg_longint:
+; CHECK-LE: str r0, [r1]
+; CHECK-BE: str r1, [r0]
+ %tmp = trunc i64 %val to i32
+ store i32 %tmp, i32* @var32
+ ret void
+}
+
+define void @arg_double( double %val ) {
+; CHECK-LABEL: arg_double:
+; CHECK: strd r0, r1, [r2]
+ store double %val, double* @vardouble
+ ret void
+}
+
+define void @arg_v4i32(<4 x i32> %vec ) {
+; CHECK-LABEL: arg_v4i32:
+; CHECK-LE: vmov {{d[0-9]+}}, r2, r3
+; CHECK-LE: vmov [[ARG_V4I32_REG:d[0-9]+]], r0, r1
+; CHECK-BE: vmov {{d[0-9]+}}, r3, r2
+; CHECK-BE: vmov [[ARG_V4I32_REG:d[0-9]+]], r1, r0
+; CHECK: vst1.32 {[[ARG_V4I32_REG]][0]}, [r0:32]
+ %tmp = extractelement <4 x i32> %vec, i32 0
+ store i32 %tmp, i32* @var32
+ ret void
+}
+
+define void @arg_v2f64(<2 x double> %vec ) {
+; CHECK-LABEL: arg_v2f64:
+; CHECK: strd r0, r1, [r2]
+ %tmp = extractelement <2 x double> %vec, i32 0
+ store double %tmp, double* @vardouble
+ ret void
+}
+
+define i64 @return_longint() {
+; CHECK-LABEL: return_longint:
+; CHECK-LE: mov r0, #42
+; CHECK-LE: mov r1, #0
+; CHECK-BE: mov r0, #0
+; CHECK-BE: mov r1, #42
+ ret i64 42
+}
+
+define double @return_double() {
+; CHECK-LABEL: return_double:
+; CHECK-LE: vmov r0, r1, {{d[0-9]+}}
+; CHECK-BE: vmov r1, r0, {{d[0-9]+}}
+ ret double 1.0
+}
+
+define <4 x i32> @return_v4i32() {
+; CHECK-LABEL: return_v4i32:
+; CHECK-LE: vmov r0, r1, {{d[0-9]+}}
+; CHECK-LE: vmov r2, r3, {{d[0-9]+}}
+; CHECK-BE: vmov r1, r0, {{d[0-9]+}}
+; CHECK-BE: vmov r3, r2, {{d[0-9]+}}
+ ret < 4 x i32> < i32 42, i32 43, i32 44, i32 45 >
+}
+
+define <2 x double> @return_v2f64() {
+; CHECK-LABEL: return_v2f64:
+; CHECK-LE: vmov r0, r1, {{d[0-9]+}}
+; CHECK-LE: vmov r2, r3, {{d[0-9]+}}
+; CHECK-BE: vmov r1, r0, {{d[0-9]+}}
+; CHECK-BE: vmov r3, r2, {{d[0-9]+}}
+ ret <2 x double> < double 3.14, double 6.28 >
+}
+
+define void @caller_arg_longint() {
+; CHECK-LABEL: caller_arg_longint:
+; CHECK-LE: mov r0, #42
+; CHECK-LE: mov r1, #0
+; CHECK-BE: mov r0, #0
+; CHECK-BE: mov r1, #42
+ call void @arg_longint( i64 42 )
+ ret void
+}
+
+define void @caller_arg_double() {
+; CHECK-LABEL: caller_arg_double:
+; CHECK-LE: vmov r0, r1, {{d[0-9]+}}
+; CHECK-BE: vmov r1, r0, {{d[0-9]+}}
+ call void @arg_double( double 1.0 )
+ ret void
+}
+
+define void @caller_return_longint() {
+; CHECK-LABEL: caller_return_longint:
+; CHECK-LE: str r0, [r1]
+; CHECK-BE: str r1, [r0]
+ %val = call i64 @return_longint()
+ %tmp = trunc i64 %val to i32
+ store i32 %tmp, i32* @var32
+ ret void
+}
+
+define void @caller_return_double() {
+; CHECK-LABEL: caller_return_double:
+; CHECK-LE: vmov {{d[0-9]+}}, r0, r1
+; CHECK-BE: vmov {{d[0-9]+}}, r1, r0
+ %val = call double @return_double( )
+ %tmp = fadd double %val, 3.14
+ store double %tmp, double* @vardouble
+ ret void
+}
+
+define void @caller_return_v2f64() {
+; CHECK-LABEL: caller_return_v2f64:
+; CHECK: strd r0, r1, [r2]
+ %val = call <2 x double> @return_v2f64( )
+ %tmp = extractelement <2 x double> %val, i32 0
+ store double %tmp, double* @vardouble
+ ret void
+}
diff --git a/test/CodeGen/ARM/hfa-in-contiguous-registers.ll b/test/CodeGen/ARM/hfa-in-contiguous-registers.ll
new file mode 100644
index 0000000..f9ec6e0
--- /dev/null
+++ b/test/CodeGen/ARM/hfa-in-contiguous-registers.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv7-none--gnueabihf"
+
+%struct.s = type { float, float }
+%union.t = type { [4 x float] }
+
+; Equivalent C code:
+; struct s { float a; float b; };
+; float foo(float a, double b, struct s c) { return c.a; }
+; Argument allocation:
+; a -> s0
+; b -> d1
+; c -> s4, s5
+; s1 is unused
+; return in s0
+define float @test1(float %a, double %b, %struct.s %c) {
+entry:
+; CHECK-LABEL: test1
+; CHECK: vmov.f32 s0, s4
+; CHECK-NOT: vmov.f32 s0, s1
+
+ %result = extractvalue %struct.s %c, 0
+ ret float %result
+}
+
+; Equivalent C code:
+; union t { float a[4]; };
+; float foo(float a, double b, union t c) { return c.a[0]; }
+; Argument allocation:
+; a -> s0
+; b -> d1
+; c -> s4..s7
+define float @test2(float %a, double %b, %union.t %c) #0 {
+entry:
+; CHECK-LABEL: test2
+; CHECK: vmov.f32 s0, s4
+; CHECK-NOT: vmov.f32 s0, s1
+
+ %result = extractvalue %union.t %c, 0, 0
+ ret float %result
+}
+
+; Equivalent C code:
+; struct s { float a; float b; };
+; float foo(float a, double b, struct s c, float d) { return d; }
+; Argument allocation:
+; a -> s0
+; b -> d1
+; c -> s4, s5
+; d -> s1
+; return in s0
+define float @test3(float %a, double %b, %struct.s %c, float %d) {
+entry:
+; CHECK-LABEL: test3
+; CHECK: vmov.f32 s0, s1
+; CHECK-NOT: vmov.f32 s0, s5
+
+ ret float %d
+}
+
+; Equivalent C code:
+; struct s { float a; float b; };
+; float foo(struct s a, struct s b) { return b.b; }
+; Argument allocation:
+; a -> s0, s1
+; b -> s2, s3
+; return in s0
+define float @test4(%struct.s %a, %struct.s %b) {
+entry:
+; CHECK-LABEL: test4
+; CHECK: vmov.f32 s0, s3
+
+ %result = extractvalue %struct.s %b, 1
+ ret float %result
+}
+
+; Equivalent C code:
+; struct s { float a; float b; };
+; float foo(struct s a, float b, struct s c) { return c.a; }
+; Argument allocation:
+; a -> s0, s1
+; b -> s2
+; c -> s3, s4
+; return in s0
+define float @test5(%struct.s %a, float %b, %struct.s %c) {
+entry:
+; CHECK-LABEL: test5
+; CHECK: vmov.f32 s0, s3
+
+ %result = extractvalue %struct.s %c, 0
+ ret float %result
+}
diff --git a/test/CodeGen/ARM/hints.ll b/test/CodeGen/ARM/hints.ll
new file mode 100644
index 0000000..18abbbe
--- /dev/null
+++ b/test/CodeGen/ARM/hints.ll
@@ -0,0 +1,69 @@
+; RUN: llc -mtriple armv7-eabi -o - %s | FileCheck %s
+; RUN: llc -mtriple thumbv6m-eabi -o - %s | FileCheck %s
+; RUN: llc -mtriple thumbv7-eabi -o - %s | FileCheck %s
+
+declare void @llvm.arm.hint(i32) nounwind
+
+define void @hint_nop() {
+entry:
+ tail call void @llvm.arm.hint(i32 0) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_nop
+; CHECK: nop
+
+define void @hint_yield() {
+entry:
+ tail call void @llvm.arm.hint(i32 1) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_yield
+; CHECK: yield
+
+define void @hint_wfe() {
+entry:
+ tail call void @llvm.arm.hint(i32 2) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_wfe
+; CHECK: wfe
+
+define void @hint_wfi() {
+entry:
+ tail call void @llvm.arm.hint(i32 3) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_wfi
+; CHECK: wfi
+
+define void @hint_sev() {
+entry:
+ tail call void @llvm.arm.hint(i32 4) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_sev
+; CHECK: sev
+
+define void @hint_sevl() {
+entry:
+ tail call void @llvm.arm.hint(i32 5) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_sevl
+; CHECK: hint #5
+
+define void @hint_undefined() {
+entry:
+ tail call void @llvm.arm.hint(i32 8) nounwind
+ ret void
+}
+
+; CHECK-LABEL: hint_undefined
+; CHECK: hint #8
+
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index 86ed5b2..5d8e477 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -24,7 +24,7 @@ entry:
; CHECK: BB#1: derived from LLVM BB %for.body
; CHECK: Successors according to CFG: BB#2(130023362) BB#4(62)
for.body:
- br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i
+ br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1
for.cond.backedge:
%tobool = icmp eq %classL* undef, null
@@ -60,3 +60,4 @@ declare void @_ZN1F10handleMoveEb(%classF*, i1 zeroext)
declare void @_Z3fn1v()
!0 = metadata !{metadata !"clang version 3.5"}
+!1 = metadata !{metadata !"branch_weights", i32 62, i32 62}
diff --git a/test/CodeGen/ARM/indirect-hidden.ll b/test/CodeGen/ARM/indirect-hidden.ll
new file mode 100644
index 0000000..ae1c505
--- /dev/null
+++ b/test/CodeGen/ARM/indirect-hidden.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -o - %s | FileCheck %s
+
+@var = external global i32
+@var_hidden = external hidden global i32
+
+define i32* @get_var() {
+ ret i32* @var
+}
+
+define i32* @get_var_hidden() {
+ ret i32* @var_hidden
+}
+
+; CHECK: .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
+
+; CHECK: .indirect_symbol _var
+; CHECK-NEXT: .long 0
+
+; CHECK-NOT: __DATA,__data
+
+; CHECK: .indirect_symbol _var_hidden
+; CHECK-NEXT: .long 0 \ No newline at end of file
diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
index 9b7b41b..c5be667 100644
--- a/test/CodeGen/ARM/interrupt-attr.ll
+++ b/test/CodeGen/ARM/interrupt-attr.ll
@@ -12,13 +12,13 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
; Also need special function return setting pc and CPSR simultaneously.
; CHECK-A-LABEL: irq_fn:
-; CHECK-A: push {r0, r1, r2, r3, r11, r12, lr}
-; CHECK-A: add r11, sp, #16
-; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
+; CHECK-A: add r11, sp, #20
+; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
; CHECK-A: bic sp, sp, #7
; CHECK-A: bl bar
-; CHECK-A: sub sp, r11, #16
-; CHECK-A: pop {r0, r1, r2, r3, r11, r12, lr}
+; CHECK-A: sub sp, r11, #20
+; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
; CHECK-A: subs pc, lr, #4
; CHECK-A-THUMB-LABEL: irq_fn:
@@ -35,15 +35,15 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
; Normal AAPCS function (r0-r3 pushed onto stack by hardware, lr set to
; appropriate sentinel so no special return needed).
; CHECK-M-LABEL: irq_fn:
-; CHECK-M: push {r4, r7, lr}
-; CHECK-M: add r7, sp, #4
+; CHECK-M: push {r4, r6, r7, lr}
+; CHECK-M: add r7, sp, #8
; CHECK-M: mov r4, sp
; CHECK-M: bic r4, r4, #7
; CHECK-M: mov sp, r4
; CHECK-M: blx _bar
-; CHECK-M: subs r4, r7, #4
+; CHECK-M: sub.w r4, r7, #8
; CHECK-M: mov sp, r4
-; CHECK-M: pop {r4, r7, pc}
+; CHECK-M: pop {r4, r6, r7, pc}
call arm_aapcscc void @bar()
ret void
@@ -88,13 +88,13 @@ define arm_aapcscc void @swi_fn() alignstack(8) "interrupt"="SWI" {
define arm_aapcscc void @undef_fn() alignstack(8) "interrupt"="UNDEF" {
; CHECK-A-LABEL: undef_fn:
-; CHECK-A: push {r0, r1, r2, r3, r11, r12, lr}
-; CHECK-A: add r11, sp, #16
-; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
+; CHECK-A: add r11, sp, #20
+; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
; CHECK-A: bic sp, sp, #7
; [...]
-; CHECK-A: sub sp, r11, #16
-; CHECK-A: pop {r0, r1, r2, r3, r11, r12, lr}
+; CHECK-A: sub sp, r11, #20
+; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
; CHECK-A: subs pc, lr, #0
call void @bar()
@@ -103,13 +103,13 @@ define arm_aapcscc void @undef_fn() alignstack(8) "interrupt"="UNDEF" {
define arm_aapcscc void @abort_fn() alignstack(8) "interrupt"="ABORT" {
; CHECK-A-LABEL: abort_fn:
-; CHECK-A: push {r0, r1, r2, r3, r11, r12, lr}
-; CHECK-A: add r11, sp, #16
-; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
+; CHECK-A: add r11, sp, #20
+; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
; CHECK-A: bic sp, sp, #7
; [...]
-; CHECK-A: sub sp, r11, #16
-; CHECK-A: pop {r0, r1, r2, r3, r11, r12, lr}
+; CHECK-A: sub sp, r11, #20
+; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
; CHECK-A: subs pc, lr, #4
call void @bar()
diff --git a/test/CodeGen/ARM/intrinsics-overflow.ll b/test/CodeGen/ARM/intrinsics-overflow.ll
new file mode 100644
index 0000000..af3dd9d
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics-overflow.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=arm-linux -mcpu=generic | FileCheck %s
+
+define i32 @uadd_overflow(i32 %a, i32 %b) #0 {
+ %sadd = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %sadd, 1
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+
+ ; CHECK-LABEL: uadd_overflow:
+ ; CHECK: add r[[R2:[0-9]+]], r[[R0:[0-9]+]], r[[R1:[0-9]+]]
+ ; CHECK: mov r[[R1]], #1
+ ; CHECK: cmp r[[R2]], r[[R0]]
+ ; CHECK: movhs r[[R1]], #0
+}
+
+
+define i32 @sadd_overflow(i32 %a, i32 %b) #0 {
+ %sadd = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %sadd, 1
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+
+ ; CHECK-LABEL: sadd_overflow:
+ ; CHECK: add r[[R2:[0-9]+]], r[[R0:[0-9]+]], r[[R1:[0-9]+]]
+ ; CHECK: mov r[[R1]], #1
+ ; CHECK: cmp r[[R2]], r[[R0]]
+ ; CHECK: movvc r[[R1]], #0
+}
+
+define i32 @usub_overflow(i32 %a, i32 %b) #0 {
+ %sadd = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %sadd, 1
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+
+ ; CHECK-LABEL: usub_overflow:
+ ; CHECK: mov r[[R2]], #1
+ ; CHECK: cmp r[[R0]], r[[R1]]
+ ; CHECK: movhs r[[R2]], #0
+}
+
+define i32 @ssub_overflow(i32 %a, i32 %b) #0 {
+ %sadd = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %sadd, 1
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+
+ ; CHECK-LABEL: ssub_overflow:
+ ; CHECK: mov r[[R2]], #1
+ ; CHECK: cmp r[[R0]], r[[R1]]
+ ; CHECK: movvc r[[R2]], #0
+}
+
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #2
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #3
+declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) #4
diff --git a/test/CodeGen/ARM/intrinsics-v8.ll b/test/CodeGen/ARM/intrinsics-v8.ll
index 247bfc1..ab1c3c0 100644
--- a/test/CodeGen/ARM/intrinsics-v8.ll
+++ b/test/CodeGen/ARM/intrinsics-v8.ll
@@ -10,10 +10,10 @@ define void @test() {
; CHECK: dsb ishld
call void @llvm.arm.dsb(i32 9)
; CHECK: sevl
- tail call void @llvm.arm.sevl() nounwind
+ tail call void @llvm.arm.hint(i32 5) nounwind
ret void
}
declare void @llvm.arm.dmb(i32)
declare void @llvm.arm.dsb(i32)
-declare void @llvm.arm.sevl() nounwind
+declare void @llvm.arm.hint(i32) nounwind
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
index 5636a12..fed6ec0 100644
--- a/test/CodeGen/ARM/longMAC.ll
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
-; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s --check-prefix=CHECK-V7
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s --check-prefix=CHECK-V7-LE
+; RUN: llc -mtriple=armeb-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+; RUN: llc -mtriple=armebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-BE
; Check generated signed and unsigned multiply accumulate long.
define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
@@ -53,13 +55,18 @@ define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
; function, both after the umlal. With it, *some* move has to happen
; before the umlal.
define i64 @MACLongTest5(i64 %c, i32 %a, i32 %b) {
-; CHECK-V7-LABEL: MACLongTest5:
-; CHECK-V7-LABEL: umlal r0, r1, r0, r0
+; CHECK-V7-LE-LABEL: MACLongTest5:
+; CHECK-V7-LE-LABEL: umlal r0, r1, r0, r0
+; CHECK-V7-BE-LABEL: MACLongTest5:
+; CHECK-V7-BE-LABEL: umlal r1, r0, r1, r1
; CHECK-LABEL: MACLongTest5:
-; CHECK: mov [[RDLO:r[0-9]+]], r0
-; CHECK: umlal [[RDLO]], r1, r0, r0
-; CHECK: mov r0, [[RDLO]]
+; CHECK-LE: mov [[RDLO:r[0-9]+]], r0
+; CHECK-LE: umlal [[RDLO]], r1, r0, r0
+; CHECK-LE: mov r0, [[RDLO]]
+; CHECK-BE: mov [[RDLO:r[0-9]+]], r1
+; CHECK-BE: umlal [[RDLO]], r0, r1, r1
+; CHECK-BE: mov r1, [[RDLO]]
%conv.trunc = trunc i64 %c to i32
%conv = zext i32 %conv.trunc to i64
diff --git a/test/CodeGen/ARM/long_shift.ll b/test/CodeGen/ARM/long_shift.ll
index 48b0ba7..3ec5fa4 100644
--- a/test/CodeGen/ARM/long_shift.ll
+++ b/test/CodeGen/ARM/long_shift.ll
@@ -1,11 +1,16 @@
-; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
+; RUN: llc -mtriple=armeb-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
define i64 @f0(i64 %A, i64 %B) {
; CHECK-LABEL: f0:
-; CHECK: lsrs r3, r3, #1
-; CHECK-NEXT: rrx r2, r2
-; CHECK-NEXT: subs r0, r0, r2
-; CHECK-NEXT: sbc r1, r1, r3
+; CHECK-LE: lsrs r3, r3, #1
+; CHECK-LE-NEXT: rrx r2, r2
+; CHECK-LE-NEXT: subs r0, r0, r2
+; CHECK-LE-NEXT: sbc r1, r1, r3
+; CHECK-BE: lsrs r2, r2, #1
+; CHECK-BE-NEXT: rrx r3, r3
+; CHECK-BE-NEXT: subs r1, r1, r3
+; CHECK-BE-NEXT: sbc r0, r0, r2
%tmp = bitcast i64 %A to i64
%tmp2 = lshr i64 %B, 1
%tmp3 = sub i64 %tmp, %tmp2
@@ -14,7 +19,8 @@ define i64 @f0(i64 %A, i64 %B) {
define i32 @f1(i64 %x, i64 %y) {
; CHECK-LABEL: f1:
-; CHECK: lsl{{.*}}r2
+; CHECK-LE: lsl{{.*}}r2
+; CHECK-BE: lsl{{.*}}r3
%a = shl i64 %x, %y
%b = trunc i64 %a to i32
ret i32 %b
@@ -22,12 +28,20 @@ define i32 @f1(i64 %x, i64 %y) {
define i32 @f2(i64 %x, i64 %y) {
; CHECK-LABEL: f2:
-; CHECK: lsr{{.*}}r2
-; CHECK-NEXT: rsb r3, r2, #32
-; CHECK-NEXT: sub r2, r2, #32
-; CHECK-NEXT: orr r0, r0, r1, lsl r3
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: asrge r0, r1, r2
+; CHECK-LE: lsr{{.*}}r2
+; CHECK-LE-NEXT: rsb r3, r2, #32
+; CHECK-LE-NEXT: sub r2, r2, #32
+; CHECK-LE-NEXT: orr r0, r0, r1, lsl r3
+; CHECK-LE-NEXT: cmp r2, #0
+; CHECK-LE-NEXT: asrge r0, r1, r2
+
+; CHECK-BE: lsr{{.*}}r3
+; CHECK-BE-NEXT: rsb r2, r3, #32
+; CHECK-BE-NEXT: orr r1, r1, r0, lsl r2
+; CHECK-BE-NEXT: sub r2, r3, #32
+; CHECK-BE-NEXT: cmp r2, #0
+; CHECK-BE-NEXT: asrge r1, r0, r2
+
%a = ashr i64 %x, %y
%b = trunc i64 %a to i32
ret i32 %b
@@ -35,12 +49,20 @@ define i32 @f2(i64 %x, i64 %y) {
define i32 @f3(i64 %x, i64 %y) {
; CHECK-LABEL: f3:
-; CHECK: lsr{{.*}}r2
-; CHECK-NEXT: rsb r3, r2, #32
-; CHECK-NEXT: sub r2, r2, #32
-; CHECK-NEXT: orr r0, r0, r1, lsl r3
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: lsrge r0, r1, r2
+; CHECK-LE: lsr{{.*}}r2
+; CHECK-LE-NEXT: rsb r3, r2, #32
+; CHECK-LE-NEXT: sub r2, r2, #32
+; CHECK-LE-NEXT: orr r0, r0, r1, lsl r3
+; CHECK-LE-NEXT: cmp r2, #0
+; CHECK-LE-NEXT: lsrge r0, r1, r2
+
+; CHECK-BE: lsr{{.*}}r3
+; CHECK-BE-NEXT: rsb r2, r3, #32
+; CHECK-BE-NEXT: orr r1, r1, r0, lsl r2
+; CHECK-BE-NEXT: sub r2, r3, #32
+; CHECK-BE-NEXT: cmp r2, #0
+; CHECK-BE-NEXT: lsrge r1, r0, r2
+
%a = lshr i64 %x, %y
%b = trunc i64 %a to i32
ret i32 %b
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index 14d84de..84ce4a7 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s
-
+; RUN: llc < %s -mtriple=thumbv6m-apple-ios -mcpu=cortex-m0 -pre-RA-sched=source -disable-post-ra | FileCheck %s -check-prefix=CHECK-T1
%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
@src = external global %struct.x
@@ -17,7 +17,12 @@ define i32 @t0() {
entry:
; CHECK-LABEL: t0:
; CHECK: vldr [[REG1:d[0-9]+]],
-; CHECK: vstr [[REG1]],
+; CHECK: vstr [[REG1]],
+; CHECK-T1-LABEL: t0:
+; CHECK-T1: ldrb [[TREG1:r[0-9]]],
+; CHECK-T1: strb [[TREG1]],
+; CHECK-T1: ldrh [[TREG2:r[0-9]]],
+; CHECK-T1: strh [[TREG2]]
call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false)
ret i32 0
}
@@ -83,6 +88,11 @@ entry:
; CHECK: movw [[REG7:r[0-9]+]], #18500
; CHECK: movt [[REG7:r[0-9]+]], #22866
; CHECK: str [[REG7]]
+; CHECK-T1-LABEL: t5:
+; CHECK-T1: movs [[TREG3:r[0-9]]],
+; CHECK-T1: strb [[TREG3]],
+; CHECK-T1: movs [[TREG4:r[0-9]]],
+; CHECK-T1: strb [[TREG4]],
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
ret void
}
@@ -90,12 +100,17 @@ entry:
define void @t6() nounwind {
entry:
; CHECK-LABEL: t6:
-; CHECK: vld1.8 {[[REG8:d[0-9]+]]}, [r0]
-; CHECK: vstr [[REG8]], [r1]
+; CHECK: vld1.8 {[[REG9:d[0-9]+]]}, [r0]
+; CHECK: vstr [[REG9]], [r1]
; CHECK: adds r1, #6
; CHECK: adds r0, #6
; CHECK: vld1.8
; CHECK: vst1.16
+; CHECK-T1-LABEL: t6:
+; CHECK-T1: movs [[TREG5:r[0-9]]],
+; CHECK-T1: strh [[TREG5]],
+; CHECK-T1: ldr [[TREG6:r[0-9]]],
+; CHECK-T1: str [[TREG6]]
call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false)
ret void
}
@@ -104,9 +119,12 @@ entry:
define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
entry:
-; CHECK: t7
+; CHECK-LABEL: t7:
; CHECK: vld1.32
; CHECK: vst1.32
+; CHECK-T1-LABEL: t7:
+; CHECK-T1: ldr
+; CHECK-T1: str
%0 = bitcast %struct.Foo* %a to i8*
%1 = bitcast %struct.Foo* %b to i8*
tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll
index 5da335f..26adf0c 100644
--- a/test/CodeGen/ARM/misched-copy-arm.ll
+++ b/test/CodeGen/ARM/misched-copy-arm.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=thumb -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched %s -o - 2>&1 | FileCheck %s
;
; Loop counter copies should be eliminated.
; There is also a MUL here, but we don't care where it is scheduled.
diff --git a/test/CodeGen/ARM/movt.ll b/test/CodeGen/ARM/movt.ll
index 735d949..94c022e 100644
--- a/test/CodeGen/ARM/movt.ll
+++ b/test/CodeGen/ARM/movt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; rdar://7317664
define i32 @t(i32 %X) nounwind {
diff --git a/test/CodeGen/ARM/mul.ll b/test/CodeGen/ARM/mul.ll
index 466a802..5e150b0 100644
--- a/test/CodeGen/ARM/mul.ll
+++ b/test/CodeGen/ARM/mul.ll
@@ -1,11 +1,12 @@
-; RUN: llc < %s -march=arm | grep mul | count 2
-; RUN: llc < %s -march=arm | grep lsl | count 2
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
define i32 @f1(i32 %u) {
%tmp = mul i32 %u, %u
ret i32 %tmp
}
+; CHECK: mul
+
define i32 @f2(i32 %u, i32 %v) {
%tmp = mul i32 %u, %v
ret i32 %tmp
@@ -16,7 +17,16 @@ define i32 @f3(i32 %u) {
ret i32 %tmp
}
+; CHECK: mul
+; CHECK: lsl
+
define i32 @f4(i32 %u) {
%tmp = mul i32 %u, 4
ret i32 %tmp
}
+
+; CHECK-NOT: mul
+
+; CHECK: lsl
+; CHECK-NOT: lsl
+
diff --git a/test/CodeGen/ARM/mvn.ll b/test/CodeGen/ARM/mvn.ll
index 489f247..e40ab1e 100644
--- a/test/CodeGen/ARM/mvn.ll
+++ b/test/CodeGen/ARM/mvn.ll
@@ -73,7 +73,8 @@ entry:
ret i1 %tmp102
}
-; CHECK-LABEL: f1
+; CHECK-LABEL: mvn.ll
+; CHECK-LABEL: @f1
; CHECK: mvn
; CHECK: mvn
; CHECK: mvn
diff --git a/test/CodeGen/ARM/named-reg-alloc.ll b/test/CodeGen/ARM/named-reg-alloc.ll
new file mode 100644
index 0000000..3c27d22
--- /dev/null
+++ b/test/CodeGen/ARM/named-reg-alloc.ll
@@ -0,0 +1,14 @@
+; RUN: not llc < %s -mtriple=arm-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"r5\00"}
diff --git a/test/CodeGen/ARM/named-reg-notareg.ll b/test/CodeGen/ARM/named-reg-notareg.ll
new file mode 100644
index 0000000..af38b60
--- /dev/null
+++ b/test/CodeGen/ARM/named-reg-notareg.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -mtriple=arm-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"notareg\00"}
diff --git a/test/CodeGen/ARM/phi.ll b/test/CodeGen/ARM/phi.ll
index 94bced5..5a8f623 100644
--- a/test/CodeGen/ARM/phi.ll
+++ b/test/CodeGen/ARM/phi.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+v4t -addr-sink-using-gep=1 %s -o - | FileCheck %s
; <rdar://problem/8686347>
diff --git a/test/CodeGen/ARM/ret_i64_arg2.ll b/test/CodeGen/ARM/ret_i64_arg2.ll
index c51d2b8..5313600 100644
--- a/test/CodeGen/ARM/ret_i64_arg2.ll
+++ b/test/CodeGen/ARM/ret_i64_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm -mattr=+vfp2 %s -o /dev/null
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define i64 @test_i64(i64 %a1, i64 %a2) {
ret i64 %a2
diff --git a/test/CodeGen/ARM/ret_i64_arg3.ll b/test/CodeGen/ARM/ret_i64_arg3.ll
index 602997e..ce8da0a 100644
--- a/test/CodeGen/ARM/ret_i64_arg3.ll
+++ b/test/CodeGen/ARM/ret_i64_arg3.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm -mattr=+vfp2 %s -o /dev/null
+; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o /dev/null
define i64 @test_i64_arg3(i64 %a1, i64 %a2, i64 %a3) {
ret i64 %a3
diff --git a/test/CodeGen/ARM/segmented-stacks-dynamic.ll b/test/CodeGen/ARM/segmented-stacks-dynamic.ll
index 13b5bcf..86f8ff8 100644
--- a/test/CodeGen/ARM/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/ARM/segmented-stacks-dynamic.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mtriple=arm-linux-androideabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=ARM-android
-; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=ARM-linux
-; RUN: llc < %s -mtriple=arm-linux-androideabi -segmented-stacks -filetype=obj
-; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=arm-linux-androideabi -verify-machineinstrs | FileCheck %s -check-prefix=ARM-android
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=ARM-linux
+; RUN: llc < %s -mtriple=arm-linux-androideabi -filetype=obj
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -filetype=obj
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define i32 @test_basic(i32 %l) {
+define i32 @test_basic(i32 %l) #0 {
%mem = alloca i32, i32 %l
call void @dummy_use (i32* %mem, i32 %l)
%terminate = icmp eq i32 %l, 0
@@ -29,7 +29,7 @@ false:
; ARM-linux-NEXT: cmp r4, r5
; ARM-linux-NEXT: blo .LBB0_2
-; ARM-linux: mov r4, #24
+; ARM-linux: mov r4, #16
; ARM-linux-NEXT: mov r5, #0
; ARM-linux-NEXT: stmdb sp!, {lr}
; ARM-linux-NEXT: bl __morestack
@@ -49,7 +49,7 @@ false:
; ARM-android-NEXT: cmp r4, r5
; ARM-android-NEXT: blo .LBB0_2
-; ARM-android: mov r4, #24
+; ARM-android: mov r4, #16
; ARM-android-NEXT: mov r5, #0
; ARM-android-NEXT: stmdb sp!, {lr}
; ARM-android-NEXT: bl __morestack
@@ -60,3 +60,5 @@ false:
; ARM-android: pop {r4, r5}
}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/ARM/segmented-stacks.ll b/test/CodeGen/ARM/segmented-stacks.ll
index 5eff633..9873bf3 100644
--- a/test/CodeGen/ARM/segmented-stacks.ll
+++ b/test/CodeGen/ARM/segmented-stacks.ll
@@ -1,15 +1,15 @@
-; RUN: llc < %s -mtriple=arm-linux-androideabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=ARM-android
-; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=ARM-linux
+; RUN: llc < %s -mtriple=arm-linux-androideabi -verify-machineinstrs | FileCheck %s -check-prefix=ARM-android
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=ARM-linux
; We used to crash with filetype=obj
-; RUN: llc < %s -mtriple=arm-linux-androideabi -segmented-stacks -filetype=obj
-; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=arm-linux-androideabi -filetype=obj
+; RUN: llc < %s -mtriple=arm-linux-unknown-gnueabi -filetype=obj
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define void @test_basic() {
+define void @test_basic() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -54,9 +54,11 @@ define void @test_basic() {
}
-define i32 @test_nested(i32 * nest %closure, i32 %other) {
+define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
%addend = load i32 * %closure
%result = add i32 %other, %addend
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
ret i32 %result
; ARM-linux: test_nested:
@@ -68,7 +70,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; ARM-linux-NEXT: cmp r4, r5
; ARM-linux-NEXT: blo .LBB1_2
-; ARM-linux: mov r4, #0
+; ARM-linux: mov r4, #56
; ARM-linux-NEXT: mov r5, #0
; ARM-linux-NEXT: stmdb sp!, {lr}
; ARM-linux-NEXT: bl __morestack
@@ -87,7 +89,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; ARM-android-NEXT: cmp r4, r5
; ARM-android-NEXT: blo .LBB1_2
-; ARM-android: mov r4, #0
+; ARM-android: mov r4, #56
; ARM-android-NEXT: mov r5, #0
; ARM-android-NEXT: stmdb sp!, {lr}
; ARM-android-NEXT: bl __morestack
@@ -99,7 +101,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
}
-define void @test_large() {
+define void @test_large() #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 0)
ret void
@@ -144,7 +146,7 @@ define void @test_large() {
}
-define fastcc void @test_fastcc() {
+define fastcc void @test_fastcc() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -189,7 +191,7 @@ define fastcc void @test_fastcc() {
}
-define fastcc void @test_fastcc_large() {
+define fastcc void @test_fastcc_large() #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 0)
ret void
@@ -233,3 +235,15 @@ define fastcc void @test_fastcc_large() {
; ARM-android: pop {r4, r5}
}
+
+define void @test_nostack() #0 {
+ ret void
+
+; ARM-linux-LABEL: test_nostack:
+; ARM-linux-NOT: bl __morestack
+
+; ARM-android-LABEL: test_nostack:
+; ARM-android-NOT: bl __morestack
+}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll
index 99df0d4..fc73eb7 100644
--- a/test/CodeGen/ARM/smml.ll
+++ b/test/CodeGen/ARM/smml.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+
define i32 @f(i32 %a, i32 %b, i32 %c) nounwind readnone ssp {
entry:
; CHECK-NOT: smmls
diff --git a/test/CodeGen/ARM/stack-frame.ll b/test/CodeGen/ARM/stack-frame.ll
index a419074..a3b0b66 100644
--- a/test/CodeGen/ARM/stack-frame.ll
+++ b/test/CodeGen/ARM/stack-frame.ll
@@ -1,14 +1,14 @@
-; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi < %s -o - | FileCheck %s
define void @f1() {
%c = alloca i8, align 1
ret void
}
+; CHECK-LABEL: f1:
+; CHECK: add
define i32 @f2() {
ret i32 1
}
-
-; CHECK: add
+; CHECK-LABEL: f2:
; CHECK-NOT: add
-
diff --git a/test/CodeGen/ARM/stackpointer.ll b/test/CodeGen/ARM/stackpointer.ll
new file mode 100644
index 0000000..420a916
--- /dev/null
+++ b/test/CodeGen/ARM/stackpointer.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; CHECK-LABEL: get_stack:
+; CHECK: mov r0, sp
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+define void @set_stack(i32 %val) nounwind {
+entry:
+; CHECK-LABEL: set_stack:
+; CHECK: mov sp, r0
+ call void @llvm.write_register.i32(metadata !0, i32 %val)
+ ret void
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+declare void @llvm.write_register.i32(metadata, i32) nounwind
+
+; register unsigned long current_stack_pointer asm("sp");
+; CHECK-NOT: .asciz "sp"
+!0 = metadata !{metadata !"sp\00"}
diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll
index 67bde2a..9ac314d 100644
--- a/test/CodeGen/ARM/sub.ll
+++ b/test/CodeGen/ARM/sub.ll
@@ -1,10 +1,13 @@
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
+; RUN: llc -mtriple=armeb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
; 171 = 0x000000ab
define i64 @f1(i64 %a) {
; CHECK: f1
-; CHECK: subs r0, r0, #171
-; CHECK: sbc r1, r1, #0
+; CHECK-LE: subs r0, r0, #171
+; CHECK-LE: sbc r1, r1, #0
+; CHECK-BE: subs r1, r1, #171
+; CHECK-BE: sbc r0, r0, #0
%tmp = sub i64 %a, 171
ret i64 %tmp
}
@@ -12,8 +15,10 @@ define i64 @f1(i64 %a) {
; 66846720 = 0x03fc0000
define i64 @f2(i64 %a) {
; CHECK: f2
-; CHECK: subs r0, r0, #66846720
-; CHECK: sbc r1, r1, #0
+; CHECK-LE: subs r0, r0, #66846720
+; CHECK-LE: sbc r1, r1, #0
+; CHECK-BE: subs r1, r1, #66846720
+; CHECK-BE: sbc r0, r0, #0
%tmp = sub i64 %a, 66846720
ret i64 %tmp
}
@@ -21,8 +26,10 @@ define i64 @f2(i64 %a) {
; 734439407618 = 0x000000ab00000002
define i64 @f3(i64 %a) {
; CHECK: f3
-; CHECK: subs r0, r0, #2
-; CHECK: sbc r1, r1, #171
+; CHECK-LE: subs r0, r0, #2
+; CHECK-LE: sbc r1, r1, #171
+; CHECK-BE: subs r1, r1, #2
+; CHECK-BE: sbc r0, r0, #171
%tmp = sub i64 %a, 734439407618
ret i64 %tmp
}
diff --git a/test/CodeGen/ARM/t2-imm.ll b/test/CodeGen/ARM/t2-imm.ll
index dd75cd1..23463b8 100644
--- a/test/CodeGen/ARM/t2-imm.ll
+++ b/test/CodeGen/ARM/t2-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f6(i32 %a) {
; CHECK:f6
diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll
index d954760..c5e699c 100644
--- a/test/CodeGen/ARM/thumb2-it-block.ll
+++ b/test/CodeGen/ARM/thumb2-it-block.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s
; PR11107
define i32 @test(i32 %a, i32 %b) {
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index 6cb26e3..0baf50b 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=INSTR
; RUN: llc < %s -mtriple=arm-apple-darwin -trap-func=_trap | FileCheck %s -check-prefix=FUNC
+; RUN: llc < %s -mtriple=arm-apple-darwin -trap-func=_trap -O0 | FileCheck %s -check-prefix=FUNC
; RUN: llc -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \
; RUN: | llvm-objdump -disassemble -triple armv7-unknown-nacl - \
; RUN: | FileCheck %s -check-prefix=ENCODING-NACL
diff --git a/test/CodeGen/ARM/undefined.ll b/test/CodeGen/ARM/undefined.ll
new file mode 100644
index 0000000..86422fb
--- /dev/null
+++ b/test/CodeGen/ARM/undefined.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple armv7-eabi -o - %s | FileCheck %s
+; RUN: llc -mtriple thumbv6m-eabi -o - %s | FileCheck %s
+; RUN: llc -mtriple thumbv7-eabi -o - %s | FileCheck %s
+
+declare void @llvm.arm.undefined(i32) nounwind
+
+define void @undefined_trap() {
+entry:
+ tail call void @llvm.arm.undefined(i32 254)
+ ret void
+}
+
+; CHECK-LABEL: undefined_trap
+; CHECK: udf #254
diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll
index d611267..33aa71d 100644
--- a/test/CodeGen/ARM/vcombine.ll
+++ b/test/CodeGen/ARM/vcombine.ll
@@ -1,9 +1,12 @@
-; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
+; RUN: llc -mtriple=armeb-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK: vcombine8
-; CHECK: vmov r0, r1, d16
-; CHECK: vmov r2, r3, d17
+; CHECK-LE: vmov r0, r1, d16
+; CHECK-LE: vmov r2, r3, d17
+; CHECK-BE: vmov r1, r0, d16
+; CHECK-BE: vmov r3, r2, d17
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -12,8 +15,10 @@ define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK: vcombine16
-; CHECK: vmov r0, r1, d16
-; CHECK: vmov r2, r3, d17
+; CHECK-LE: vmov r0, r1, d16
+; CHECK-LE: vmov r2, r3, d17
+; CHECK-BE: vmov r1, r0, d16
+; CHECK-BE: vmov r3, r2, d17
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -22,8 +27,10 @@ define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK: vcombine32
-; CHECK: vmov r0, r1, d16
-; CHECK: vmov r2, r3, d17
+; CHECK-LE: vmov r0, r1, d16
+; CHECK-LE: vmov r2, r3, d17
+; CHECK-BE: vmov r1, r0, d16
+; CHECK-BE: vmov r3, r2, d17
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -32,8 +39,10 @@ define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK: vcombinefloat
-; CHECK: vmov r0, r1, d16
-; CHECK: vmov r2, r3, d17
+; CHECK-LE: vmov r0, r1, d16
+; CHECK-LE: vmov r2, r3, d17
+; CHECK-BE: vmov r1, r0, d16
+; CHECK-BE: vmov r3, r2, d17
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -42,8 +51,10 @@ define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
; CHECK: vcombine64
-; CHECK: vmov r0, r1, d16
-; CHECK: vmov r2, r3, d17
+; CHECK-LE: vmov r0, r1, d16
+; CHECK-LE: vmov r2, r3, d17
+; CHECK-BE: vmov r1, r0, d16
+; CHECK-BE: vmov r3, r2, d17
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32> <i32 0, i32 1>
@@ -56,7 +67,8 @@ define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
define <4 x i16> @vget_low16(<8 x i16>* %A) nounwind {
; CHECK: vget_low16
; CHECK-NOT: vst
-; CHECK: vmov r0, r1, d16
+; CHECK-LE: vmov r0, r1, d16
+; CHECK-BE: vmov r1, r0, d16
%tmp1 = load <8 x i16>* %A
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i16> %tmp2
@@ -65,7 +77,8 @@ define <4 x i16> @vget_low16(<8 x i16>* %A) nounwind {
define <8 x i8> @vget_high8(<16 x i8>* %A) nounwind {
; CHECK: vget_high8
; CHECK-NOT: vst
-; CHECK: vmov r0, r1, d17
+; CHECK-LE: vmov r0, r1, d17
+; CHECK-BE: vmov r1, r0, d16
%tmp1 = load <16 x i8>* %A
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %tmp2
diff --git a/test/CodeGen/ARM/vfp-libcalls.ll b/test/CodeGen/ARM/vfp-libcalls.ll
new file mode 100644
index 0000000..9d4e194
--- /dev/null
+++ b/test/CodeGen/ARM/vfp-libcalls.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=armv6-apple-ios -mcpu=arm1136jf-s -o - %s | FileCheck %s --check-prefix=CHECK-HARD
+; RUN: llc -mtriple=thumbv6-apple-ios -mcpu=arm1136jf-s -o - %s | FileCheck %s --check-prefix=CHECK-SOFTISH
+; RUN: llc -mtriple=armv7s-apple-ios -soft-float -mcpu=arm1136jf-s -o - %s | FileCheck %s --check-prefix=CHECK-SOFT
+
+define float @test_call(float %a, float %b) {
+; CHECK-HARD: vadd.f32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-SOFTISH: blx ___addsf3vfp
+; CHECK-SOFT: bl ___addsf3{{$}}
+ %sum = fadd float %a, %b
+ ret float %sum
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll
index eb76ba6..7215ad6 100644
--- a/test/CodeGen/ARM/vrev.ll
+++ b/test/CodeGen/ARM/vrev.ll
@@ -178,3 +178,11 @@ entry:
ret void
}
+define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
+; CHECK-LABEL: test_vrev32_bswap:
+; CHECK: vrev32.8
+ %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
+ ret <4 x i32> %bswap
+}
+
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/zextload_demandedbits.ll b/test/CodeGen/ARM/zextload_demandedbits.ll
index 3d3269c..6b6ce97 100644
--- a/test/CodeGen/ARM/zextload_demandedbits.ll
+++ b/test/CodeGen/ARM/zextload_demandedbits.ll
@@ -6,7 +6,7 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
%struct.spam = type { [3 x i32] }
%struct.barney = type { [2 x i32], [2 x i32] }
-; Make sure that the sext op does not get lost due to ComputeMaskedBits.
+; Make sure that the sext op does not get lost due to computeKnownBits.
; CHECK: quux
; CHECK: lsl
; CHECK: asr
diff --git a/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S b/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S
deleted file mode 100644
index 250732d..0000000
--- a/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llvm-mc -triple arm64-apple-darwin -filetype=obj -o /dev/null %s
-
- .text
- .globl _foo
- .cfi_startproc
-_foo:
- stp x29, x30, [sp, #-16]!
- .cfi_adjust_cfa_offset 16
-
- ldp x29, x30, [sp], #16
- .cfi_adjust_cfa_offset -16
- .cfi_restore x29
- .cfi_restore x30
-
- ret
-
- .cfi_endproc
diff --git a/test/CodeGen/ARM64/fast-isel-rem.ll b/test/CodeGen/ARM64/fast-isel-rem.ll
deleted file mode 100644
index 0c68401..0000000
--- a/test/CodeGen/ARM64/fast-isel-rem.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
-
-define i32 @t1(i32 %a, i32 %b) {
-; CHECK: @t1
-; CHECK: sdiv w2, w0, w1
-; CHECK: msub w2, w2, w1, w0
- %1 = srem i32 %a, %b
- ret i32 %1
-}
-
-define i64 @t2(i64 %a, i64 %b) {
-; CHECK: @t2
-; CHECK: sdiv x2, x0, x1
-; CHECK: msub x2, x2, x1, x0
- %1 = srem i64 %a, %b
- ret i64 %1
-}
-
-define i32 @t3(i32 %a, i32 %b) {
-; CHECK: @t3
-; CHECK: udiv w2, w0, w1
-; CHECK: msub w2, w2, w1, w0
- %1 = urem i32 %a, %b
- ret i32 %1
-}
-
-define i64 @t4(i64 %a, i64 %b) {
-; CHECK: @t4
-; CHECK: udiv x2, x0, x1
-; CHECK: msub x2, x2, x1, x0
- %1 = urem i64 %a, %b
- ret i64 %1
-}
diff --git a/test/CodeGen/ARM64/fmax.ll b/test/CodeGen/ARM64/fmax.ll
deleted file mode 100644
index 53ecf86..0000000
--- a/test/CodeGen/ARM64/fmax.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc -march=arm64 -enable-no-nans-fp-math < %s | FileCheck %s
-
-define double @test_direct(float %in) #1 {
-entry:
- %cmp = fcmp olt float %in, 0.000000e+00
- %longer = fpext float %in to double
- %val = select i1 %cmp, double 0.000000e+00, double %longer
- ret double %val
-
-; CHECK: fmax
-}
-
-define double @test_cross(float %in) #1 {
-entry:
- %cmp = fcmp olt float %in, 0.000000e+00
- %longer = fpext float %in to double
- %val = select i1 %cmp, double %longer, double 0.000000e+00
- ret double %val
-
-; CHECK: fmin
-}
diff --git a/test/CodeGen/ARM64/fminv.ll b/test/CodeGen/ARM64/fminv.ll
deleted file mode 100644
index ca706d8..0000000
--- a/test/CodeGen/ARM64/fminv.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
-
-define float @test_fminv_v2f32(<2 x float> %in) {
-; CHECK: test_fminv_v2f32:
-; CHECK: fminp s0, v0.2s
- %min = call float @llvm.arm64.neon.fminv.f32.v2f32(<2 x float> %in)
- ret float %min
-}
-
-define float @test_fminv_v4f32(<4 x float> %in) {
-; CHECK: test_fminv_v4f32:
-; CHECK: fminv s0, v0.4s
- %min = call float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float> %in)
- ret float %min
-}
-
-define double @test_fminv_v2f64(<2 x double> %in) {
-; CHECK: test_fminv_v2f64:
-; CHECK: fminp d0, v0.2d
- %min = call double @llvm.arm64.neon.fminv.f64.v2f64(<2 x double> %in)
- ret double %min
-}
-
-declare float @llvm.arm64.neon.fminv.f32.v2f32(<2 x float>)
-declare float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float>)
-declare double @llvm.arm64.neon.fminv.f64.v2f64(<2 x double>)
-
-define float @test_fmaxv_v2f32(<2 x float> %in) {
-; CHECK: test_fmaxv_v2f32:
-; CHECK: fmaxp s0, v0.2s
- %max = call float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float> %in)
- ret float %max
-}
-
-define float @test_fmaxv_v4f32(<4 x float> %in) {
-; CHECK: test_fmaxv_v4f32:
-; CHECK: fmaxv s0, v0.4s
- %max = call float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float> %in)
- ret float %max
-}
-
-define double @test_fmaxv_v2f64(<2 x double> %in) {
-; CHECK: test_fmaxv_v2f64:
-; CHECK: fmaxp d0, v0.2d
- %max = call double @llvm.arm64.neon.fmaxv.f64.v2f64(<2 x double> %in)
- ret double %max
-}
-
-declare float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float>)
-declare float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float>)
-declare double @llvm.arm64.neon.fmaxv.f64.v2f64(<2 x double>)
-
-define float @test_fminnmv_v2f32(<2 x float> %in) {
-; CHECK: test_fminnmv_v2f32:
-; CHECK: fminnmp s0, v0.2s
- %minnm = call float @llvm.arm64.neon.fminnmv.f32.v2f32(<2 x float> %in)
- ret float %minnm
-}
-
-define float @test_fminnmv_v4f32(<4 x float> %in) {
-; CHECK: test_fminnmv_v4f32:
-; CHECK: fminnmv s0, v0.4s
- %minnm = call float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float> %in)
- ret float %minnm
-}
-
-define double @test_fminnmv_v2f64(<2 x double> %in) {
-; CHECK: test_fminnmv_v2f64:
-; CHECK: fminnmp d0, v0.2d
- %minnm = call double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double> %in)
- ret double %minnm
-}
-
-declare float @llvm.arm64.neon.fminnmv.f32.v2f32(<2 x float>)
-declare float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float>)
-declare double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double>)
-
-define float @test_fmaxnmv_v2f32(<2 x float> %in) {
-; CHECK: test_fmaxnmv_v2f32:
-; CHECK: fmaxnmp s0, v0.2s
- %maxnm = call float @llvm.arm64.neon.fmaxnmv.f32.v2f32(<2 x float> %in)
- ret float %maxnm
-}
-
-define float @test_fmaxnmv_v4f32(<4 x float> %in) {
-; CHECK: test_fmaxnmv_v4f32:
-; CHECK: fmaxnmv s0, v0.4s
- %maxnm = call float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float> %in)
- ret float %maxnm
-}
-
-define double @test_fmaxnmv_v2f64(<2 x double> %in) {
-; CHECK: test_fmaxnmv_v2f64:
-; CHECK: fmaxnmp d0, v0.2d
- %maxnm = call double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
- ret double %maxnm
-}
-
-declare float @llvm.arm64.neon.fmaxnmv.f32.v2f32(<2 x float>)
-declare float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float>)
-declare double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/ARM64/ldxr-stxr.ll b/test/CodeGen/ARM64/ldxr-stxr.ll
deleted file mode 100644
index d50ba94..0000000
--- a/test/CodeGen/ARM64/ldxr-stxr.ll
+++ /dev/null
@@ -1,143 +0,0 @@
-; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s
-
-%0 = type { i64, i64 }
-
-define i128 @f0(i8* %p) nounwind readonly {
-; CHECK-LABEL: f0:
-; CHECK: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
-entry:
- %ldrexd = tail call %0 @llvm.arm64.ldxp(i8* %p)
- %0 = extractvalue %0 %ldrexd, 1
- %1 = extractvalue %0 %ldrexd, 0
- %2 = zext i64 %0 to i128
- %3 = zext i64 %1 to i128
- %shl = shl nuw i128 %2, 64
- %4 = or i128 %shl, %3
- ret i128 %4
-}
-
-define i32 @f1(i8* %ptr, i128 %val) nounwind {
-; CHECK-LABEL: f1:
-; CHECK: stxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
-entry:
- %tmp4 = trunc i128 %val to i64
- %tmp6 = lshr i128 %val, 64
- %tmp7 = trunc i128 %tmp6 to i64
- %strexd = tail call i32 @llvm.arm64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
- ret i32 %strexd
-}
-
-declare %0 @llvm.arm64.ldxp(i8*) nounwind
-declare i32 @llvm.arm64.stxp(i64, i64, i8*) nounwind
-
-@var = global i64 0, align 8
-
-define void @test_load_i8(i8* %addr) {
-; CHECK-LABEL: test_load_i8:
-; CHECK: ldxrb w[[LOADVAL:[0-9]+]], [x0]
-; CHECK-NOT: uxtb
-; CHECK-NOT: and
-; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
-
- %val = call i64 @llvm.arm64.ldxr.p0i8(i8* %addr)
- %shortval = trunc i64 %val to i8
- %extval = zext i8 %shortval to i64
- store i64 %extval, i64* @var, align 8
- ret void
-}
-
-define void @test_load_i16(i16* %addr) {
-; CHECK-LABEL: test_load_i16:
-; CHECK: ldxrh w[[LOADVAL:[0-9]+]], [x0]
-; CHECK-NOT: uxth
-; CHECK-NOT: and
-; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
-
- %val = call i64 @llvm.arm64.ldxr.p0i16(i16* %addr)
- %shortval = trunc i64 %val to i16
- %extval = zext i16 %shortval to i64
- store i64 %extval, i64* @var, align 8
- ret void
-}
-
-define void @test_load_i32(i32* %addr) {
-; CHECK-LABEL: test_load_i32:
-; CHECK: ldxr w[[LOADVAL:[0-9]+]], [x0]
-; CHECK-NOT: uxtw
-; CHECK-NOT: and
-; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
-
- %val = call i64 @llvm.arm64.ldxr.p0i32(i32* %addr)
- %shortval = trunc i64 %val to i32
- %extval = zext i32 %shortval to i64
- store i64 %extval, i64* @var, align 8
- ret void
-}
-
-define void @test_load_i64(i64* %addr) {
-; CHECK-LABEL: test_load_i64:
-; CHECK: ldxr x[[LOADVAL:[0-9]+]], [x0]
-; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
-
- %val = call i64 @llvm.arm64.ldxr.p0i64(i64* %addr)
- store i64 %val, i64* @var, align 8
- ret void
-}
-
-
-declare i64 @llvm.arm64.ldxr.p0i8(i8*) nounwind
-declare i64 @llvm.arm64.ldxr.p0i16(i16*) nounwind
-declare i64 @llvm.arm64.ldxr.p0i32(i32*) nounwind
-declare i64 @llvm.arm64.ldxr.p0i64(i64*) nounwind
-
-define i32 @test_store_i8(i32, i8 %val, i8* %addr) {
-; CHECK-LABEL: test_store_i8:
-; CHECK-NOT: uxtb
-; CHECK-NOT: and
-; CHECK: stxrb w0, w1, [x2]
- %extval = zext i8 %val to i64
- %res = call i32 @llvm.arm64.stxr.p0i8(i64 %extval, i8* %addr)
- ret i32 %res
-}
-
-define i32 @test_store_i16(i32, i16 %val, i16* %addr) {
-; CHECK-LABEL: test_store_i16:
-; CHECK-NOT: uxth
-; CHECK-NOT: and
-; CHECK: stxrh w0, w1, [x2]
- %extval = zext i16 %val to i64
- %res = call i32 @llvm.arm64.stxr.p0i16(i64 %extval, i16* %addr)
- ret i32 %res
-}
-
-define i32 @test_store_i32(i32, i32 %val, i32* %addr) {
-; CHECK-LABEL: test_store_i32:
-; CHECK-NOT: uxtw
-; CHECK-NOT: and
-; CHECK: stxr w0, w1, [x2]
- %extval = zext i32 %val to i64
- %res = call i32 @llvm.arm64.stxr.p0i32(i64 %extval, i32* %addr)
- ret i32 %res
-}
-
-define i32 @test_store_i64(i32, i64 %val, i64* %addr) {
-; CHECK-LABEL: test_store_i64:
-; CHECK: stxr w0, x1, [x2]
- %res = call i32 @llvm.arm64.stxr.p0i64(i64 %val, i64* %addr)
- ret i32 %res
-}
-
-declare i32 @llvm.arm64.stxr.p0i8(i64, i8*) nounwind
-declare i32 @llvm.arm64.stxr.p0i16(i64, i16*) nounwind
-declare i32 @llvm.arm64.stxr.p0i32(i64, i32*) nounwind
-declare i32 @llvm.arm64.stxr.p0i64(i64, i64*) nounwind
-
-; CHECK: test_clear:
-; CHECK: clrex
-define void @test_clear() {
- call void @llvm.arm64.clrex()
- ret void
-}
-
-declare void @llvm.arm64.clrex() nounwind
-
diff --git a/test/CodeGen/ARM64/leaf-compact-unwind.ll b/test/CodeGen/ARM64/leaf-compact-unwind.ll
deleted file mode 100644
index 0a58717..0000000
--- a/test/CodeGen/ARM64/leaf-compact-unwind.ll
+++ /dev/null
@@ -1,161 +0,0 @@
-; Use the -disable-cfi flag so that we get the compact unwind info in the
-; emitted assembly. Compact unwind info is omitted when CFI directives
-; are emitted.
-;
-; RUN: llc -march=arm64 -mtriple=arm64-apple-ios -disable-cfi < %s | FileCheck %s
-;
-; rdar://13070556
-
-@bar = common global i32 0, align 4
-
-; Leaf function with no stack allocation and no saving/restoring
-; of non-volatile registers.
-define i32 @foo1(i32 %a) #0 {
-entry:
- %add = add nsw i32 %a, 42
- ret i32 %add
-}
-
-; Leaf function with stack allocation but no saving/restoring
-; of non-volatile registers.
-define i32 @foo2(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #0 {
-entry:
- %stack = alloca [36 x i32], align 4
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.body ]
- %arrayidx = getelementptr inbounds [36 x i32]* %stack, i64 0, i64 %indvars.iv19
- %0 = trunc i64 %indvars.iv19 to i32
- store i32 %0, i32* %arrayidx, align 4, !tbaa !0
- %indvars.iv.next20 = add i64 %indvars.iv19, 1
- %lftr.wideiv21 = trunc i64 %indvars.iv.next20 to i32
- %exitcond22 = icmp eq i32 %lftr.wideiv21, 36
- br i1 %exitcond22, label %for.body4, label %for.body
-
-for.body4: ; preds = %for.body, %for.body4
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body ]
- %z1.016 = phi i32 [ %add, %for.body4 ], [ 0, %for.body ]
- %arrayidx6 = getelementptr inbounds [36 x i32]* %stack, i64 0, i64 %indvars.iv
- %1 = load i32* %arrayidx6, align 4, !tbaa !0
- %add = add nsw i32 %1, %z1.016
- %indvars.iv.next = add i64 %indvars.iv, 1
- %lftr.wideiv = trunc i64 %indvars.iv.next to i32
- %exitcond = icmp eq i32 %lftr.wideiv, 36
- br i1 %exitcond, label %for.end9, label %for.body4
-
-for.end9: ; preds = %for.body4
- ret i32 %add
-}
-
-; Leaf function with no stack allocation but with saving restoring of
-; non-volatile registers.
-define i32 @foo3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #1 {
-entry:
- %0 = load volatile i32* @bar, align 4, !tbaa !0
- %1 = load volatile i32* @bar, align 4, !tbaa !0
- %2 = load volatile i32* @bar, align 4, !tbaa !0
- %3 = load volatile i32* @bar, align 4, !tbaa !0
- %4 = load volatile i32* @bar, align 4, !tbaa !0
- %5 = load volatile i32* @bar, align 4, !tbaa !0
- %6 = load volatile i32* @bar, align 4, !tbaa !0
- %7 = load volatile i32* @bar, align 4, !tbaa !0
- %8 = load volatile i32* @bar, align 4, !tbaa !0
- %9 = load volatile i32* @bar, align 4, !tbaa !0
- %10 = load volatile i32* @bar, align 4, !tbaa !0
- %11 = load volatile i32* @bar, align 4, !tbaa !0
- %12 = load volatile i32* @bar, align 4, !tbaa !0
- %13 = load volatile i32* @bar, align 4, !tbaa !0
- %14 = load volatile i32* @bar, align 4, !tbaa !0
- %15 = load volatile i32* @bar, align 4, !tbaa !0
- %16 = load volatile i32* @bar, align 4, !tbaa !0
- %17 = load volatile i32* @bar, align 4, !tbaa !0
- %factor = mul i32 %h, -2
- %factor56 = mul i32 %g, -2
- %factor57 = mul i32 %f, -2
- %factor58 = mul i32 %e, -2
- %factor59 = mul i32 %d, -2
- %factor60 = mul i32 %c, -2
- %factor61 = mul i32 %b, -2
- %sum = add i32 %1, %0
- %sum62 = add i32 %sum, %2
- %sum63 = add i32 %sum62, %3
- %sum64 = add i32 %sum63, %4
- %sum65 = add i32 %sum64, %5
- %sum66 = add i32 %sum65, %6
- %sum67 = add i32 %sum66, %7
- %sum68 = add i32 %sum67, %8
- %sum69 = add i32 %sum68, %9
- %sum70 = add i32 %sum69, %10
- %sum71 = add i32 %sum70, %11
- %sum72 = add i32 %sum71, %12
- %sum73 = add i32 %sum72, %13
- %sum74 = add i32 %sum73, %14
- %sum75 = add i32 %sum74, %15
- %sum76 = add i32 %sum75, %16
- %sub10 = sub i32 %17, %sum76
- %sub11 = add i32 %sub10, %factor
- %sub12 = add i32 %sub11, %factor56
- %sub13 = add i32 %sub12, %factor57
- %sub14 = add i32 %sub13, %factor58
- %sub15 = add i32 %sub14, %factor59
- %sub16 = add i32 %sub15, %factor60
- %add = add i32 %sub16, %factor61
- ret i32 %add
-}
-
-; Leaf function with stack allocation and saving/restoring of non-volatile
-; registers.
-define i32 @foo4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) #0 {
-entry:
- %stack = alloca [128 x i32], align 4
- %0 = zext i32 %a to i64
- br label %for.body
-
-for.cond2.preheader: ; preds = %for.body
- %1 = sext i32 %f to i64
- br label %for.body4
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %for.body ]
- %2 = add nsw i64 %indvars.iv22, %0
- %arrayidx = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %indvars.iv22
- %3 = trunc i64 %2 to i32
- store i32 %3, i32* %arrayidx, align 4, !tbaa !0
- %indvars.iv.next23 = add i64 %indvars.iv22, 1
- %lftr.wideiv25 = trunc i64 %indvars.iv.next23 to i32
- %exitcond26 = icmp eq i32 %lftr.wideiv25, 128
- br i1 %exitcond26, label %for.cond2.preheader, label %for.body
-
-for.body4: ; preds = %for.body4, %for.cond2.preheader
- %indvars.iv = phi i64 [ 0, %for.cond2.preheader ], [ %indvars.iv.next, %for.body4 ]
- %z1.018 = phi i32 [ 0, %for.cond2.preheader ], [ %add8, %for.body4 ]
- %4 = add nsw i64 %indvars.iv, %1
- %arrayidx7 = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %4
- %5 = load i32* %arrayidx7, align 4, !tbaa !0
- %add8 = add nsw i32 %5, %z1.018
- %indvars.iv.next = add i64 %indvars.iv, 1
- %lftr.wideiv = trunc i64 %indvars.iv.next to i32
- %exitcond = icmp eq i32 %lftr.wideiv, 128
- br i1 %exitcond, label %for.end11, label %for.body4
-
-for.end11: ; preds = %for.body4
- ret i32 %add8
-}
-
-attributes #0 = { readnone "target-cpu"="cyclone" }
-attributes #1 = { "target-cpu"="cyclone" }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-
-; CHECK: .section __LD,__compact_unwind,regular,debug
-; CHECK: .quad _foo1 ; Range Start
-; CHECK: .long 33554432 ; Compact Unwind Encoding: 0x2000000
-; CHECK: .quad _foo2 ; Range Start
-; CHECK: .long 33591296 ; Compact Unwind Encoding: 0x2009000
-; CHECK: .quad _foo3 ; Range Start
-; CHECK: .long 33570831 ; Compact Unwind Encoding: 0x200400f
-; CHECK: .quad _foo4 ; Range Start
-; CHECK: .long 33689616 ; Compact Unwind Encoding: 0x2021010
diff --git a/test/CodeGen/ARM64/lit.local.cfg b/test/CodeGen/ARM64/lit.local.cfg
deleted file mode 100644
index 48af100..0000000
--- a/test/CodeGen/ARM64/lit.local.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-import re
-
-config.suffixes = ['.ll', '.c', '.cpp']
-
-targets = set(config.root.targets_to_build.split())
-if not 'ARM64' in targets:
- config.unsupported = True
-
-# For now we don't test arm64-win32.
-if re.search(r'cygwin|mingw32|win32', config.target_triple):
- config.unsupported = True
diff --git a/test/CodeGen/ARM64/register-offset-addressing.ll b/test/CodeGen/ARM64/register-offset-addressing.ll
deleted file mode 100644
index c273602..0000000
--- a/test/CodeGen/ARM64/register-offset-addressing.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
-
-define i8 @t1(i16* %a, i64 %b) {
-; CHECK: t1
-; CHECK: lsl [[REG:x[0-9]+]], x1, #1
-; CHECK: ldrb w0, [x0, [[REG]]]
-; CHECK: ret
- %tmp1 = getelementptr inbounds i16* %a, i64 %b
- %tmp2 = load i16* %tmp1
- %tmp3 = trunc i16 %tmp2 to i8
- ret i8 %tmp3
-}
diff --git a/test/CodeGen/ARM64/simd-scalar-to-vector.ll b/test/CodeGen/ARM64/simd-scalar-to-vector.ll
deleted file mode 100644
index 6c0b840..0000000
--- a/test/CodeGen/ARM64/simd-scalar-to-vector.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -O0 | FileCheck %s --check-prefix=CHECK-FAST
-
-define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp {
-; CHECK: uaddlv.16b h0, v0
-; CHECK: rshrn.8b v0, v0, #4
-; CHECK: dup.16b v0, v0[0]
-; CHECK: ret
-
-; CHECK-FAST: uaddlv.16b
-; CHECK-FAST: rshrn.8b
-; CHECK-FAST: dup.16b
- %tmp = tail call i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind
- %tmp1 = trunc i32 %tmp to i16
- %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
- %tmp3 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4)
- %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer
- ret <16 x i8> %tmp4
-}
-
-declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM64/tbl.ll b/test/CodeGen/ARM64/tbl.ll
deleted file mode 100644
index e1edd21..0000000
--- a/test/CodeGen/ARM64/tbl.ll
+++ /dev/null
@@ -1,132 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
-; CHECK: tbl1_8b
-; CHECK: tbl.8b
- %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
-; CHECK: tbl1_16b
-; CHECK: tbl.16b
- %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
- ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
-; CHECK: tbl2_8b
-; CHECK: tbl.8b
- %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
-; CHECK: tbl2_16b
-; CHECK: tbl.16b
- %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
- ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK: tbl3_8b
-; CHECK: tbl.8b
- %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK: tbl3_16b
-; CHECK: tbl.16b
- %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
- ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK: tbl4_8b
-; CHECK: tbl.8b
- %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK: tbl4_16b
-; CHECK: tbl.16b
- %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
- ret <16 x i8> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
-; CHECK: tbx1_8b
-; CHECK: tbx.8b
- %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
-; CHECK: tbx1_16b
-; CHECK: tbx.16b
- %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
- ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK: tbx2_8b
-; CHECK: tbx.8b
- %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK: tbx2_16b
-; CHECK: tbx.16b
- %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
- ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK: tbx3_8b
-; CHECK: tbx.8b
- %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK: tbx3_16b
-; CHECK: tbx.16b
- %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
- ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
-; CHECK: tbx4_8b
-; CHECK: tbx.8b
- %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
-; CHECK: tbx4_16b
-; CHECK: tbx.16b
- %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
- ret <16 x i8> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-
diff --git a/test/CodeGen/ARM64/vcnt.ll b/test/CodeGen/ARM64/vcnt.ll
deleted file mode 100644
index e00658a..0000000
--- a/test/CodeGen/ARM64/vcnt.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: cls_8b:
-;CHECK: cls.8b
- %tmp1 = load <8 x i8>* %A
- %tmp3 = call <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8> %tmp1)
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind {
-;CHECK-LABEL: cls_16b:
-;CHECK: cls.16b
- %tmp1 = load <16 x i8>* %A
- %tmp3 = call <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8> %tmp1)
- ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: cls_4h:
-;CHECK: cls.4h
- %tmp1 = load <4 x i16>* %A
- %tmp3 = call <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16> %tmp1)
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind {
-;CHECK-LABEL: cls_8h:
-;CHECK: cls.8h
- %tmp1 = load <8 x i16>* %A
- %tmp3 = call <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16> %tmp1)
- ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: cls_2s:
-;CHECK: cls.2s
- %tmp1 = load <2 x i32>* %A
- %tmp3 = call <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32> %tmp1)
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind {
-;CHECK-LABEL: cls_4s:
-;CHECK: cls.4s
- %tmp1 = load <4 x i32>* %A
- %tmp3 = call <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32> %tmp1)
- ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vcvt_n.ll b/test/CodeGen/ARM64/vcvt_n.ll
deleted file mode 100644
index 46de557..0000000
--- a/test/CodeGen/ARM64/vcvt_n.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp {
-; CHECK-LABEL: cvtf32fxpu:
-; CHECK: ucvtf.2s v0, v0, #9
-; CHECK: ret
- %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9)
- ret <2 x float> %vcvt_n1
-}
-
-define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp {
-; CHECK-LABEL: cvtf32fxps:
-; CHECK: scvtf.2s v0, v0, #12
-; CHECK: ret
- %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12)
- ret <2 x float> %vcvt_n1
-}
-
-define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp {
-; CHECK-LABEL: cvtqf32fxpu:
-; CHECK: ucvtf.4s v0, v0, #18
-; CHECK: ret
- %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18)
- ret <4 x float> %vcvt_n1
-}
-
-define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp {
-; CHECK-LABEL: cvtqf32fxps:
-; CHECK: scvtf.4s v0, v0, #30
-; CHECK: ret
- %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30)
- ret <4 x float> %vcvt_n1
-}
-define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp {
- %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12)
- ret <2 x double> %vcvt_n1
-}
-
-define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp {
- %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9)
- ret <2 x double> %vcvt_n1
-}
-
-declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/vminmaxnm.ll b/test/CodeGen/ARM64/vminmaxnm.ll
deleted file mode 100644
index 6286407..0000000
--- a/test/CodeGen/ARM64/vminmaxnm.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.2s v0, v0, v1
-; CHECK: ret
- %vmaxnm2.i = tail call <2 x float> @llvm.arm64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
- ret <2 x float> %vmaxnm2.i
-}
-
-define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.4s v0, v0, v1
-; CHECK: ret
- %vmaxnm2.i = tail call <4 x float> @llvm.arm64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
- ret <4 x float> %vmaxnm2.i
-}
-
-define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.2d v0, v0, v1
-; CHECK: ret
- %vmaxnm2.i = tail call <2 x double> @llvm.arm64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
- ret <2 x double> %vmaxnm2.i
-}
-
-define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
-; CHECK: fminnm.2s v0, v0, v1
-; CHECK: ret
- %vminnm2.i = tail call <2 x float> @llvm.arm64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
- ret <2 x float> %vminnm2.i
-}
-
-define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
-; CHECK: fminnm.4s v0, v0, v1
-; CHECK: ret
- %vminnm2.i = tail call <4 x float> @llvm.arm64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
- ret <4 x float> %vminnm2.i
-}
-
-define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
-; CHECK: fminnm.2d v0, v0, v1
-; CHECK: ret
- %vminnm2.i = tail call <2 x double> @llvm.arm64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
- ret <2 x double> %vminnm2.i
-}
-
-declare <2 x double> @llvm.arm64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
-
-
-define double @test_fmaxnmv(<2 x double> %in) {
-; CHECK-LABEL: test_fmaxnmv:
-; CHECK: fmaxnmp.2d d0, v0
- %max = call double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
- ret double %max
-}
-
-define double @test_fminnmv(<2 x double> %in) {
-; CHECK-LABEL: test_fminnmv:
-; CHECK: fminnmp.2d d0, v0
- %min = call double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double> %in)
- ret double %min
-}
-
-declare double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double>)
-declare double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/ARM64/vqsub.ll b/test/CodeGen/ARM64/vqsub.ll
deleted file mode 100644
index 0afeb68..0000000
--- a/test/CodeGen/ARM64/vqsub.ll
+++ /dev/null
@@ -1,147 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @sqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: sqsub8b:
-;CHECK: sqsub.8b
- %tmp1 = load <8 x i8>* %A
- %tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.sqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
- ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @sqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: sqsub4h:
-;CHECK: sqsub.4h
- %tmp1 = load <4 x i16>* %A
- %tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.sqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
- ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @sqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: sqsub2s:
-;CHECK: sqsub.2s
- %tmp1 = load <2 x i32>* %A
- %tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
- ret <2 x i32> %tmp3
-}
-
-define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: uqsub8b:
-;CHECK: uqsub.8b
- %tmp1 = load <8 x i8>* %A
- %tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
- ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @uqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: uqsub4h:
-;CHECK: uqsub.4h
- %tmp1 = load <4 x i16>* %A
- %tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm64.neon.uqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
- ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @uqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: uqsub2s:
-;CHECK: uqsub.2s
- %tmp1 = load <2 x i32>* %A
- %tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm64.neon.uqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
- ret <2 x i32> %tmp3
-}
-
-define <16 x i8> @sqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: sqsub16b:
-;CHECK: sqsub.16b
- %tmp1 = load <16 x i8>* %A
- %tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.sqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
- ret <16 x i8> %tmp3
-}
-
-define <8 x i16> @sqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: sqsub8h:
-;CHECK: sqsub.8h
- %tmp1 = load <8 x i16>* %A
- %tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.sqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
- ret <8 x i16> %tmp3
-}
-
-define <4 x i32> @sqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: sqsub4s:
-;CHECK: sqsub.4s
- %tmp1 = load <4 x i32>* %A
- %tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
- ret <4 x i32> %tmp3
-}
-
-define <2 x i64> @sqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: sqsub2d:
-;CHECK: sqsub.2d
- %tmp1 = load <2 x i64>* %A
- %tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
- ret <2 x i64> %tmp3
-}
-
-define <16 x i8> @uqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: uqsub16b:
-;CHECK: uqsub.16b
- %tmp1 = load <16 x i8>* %A
- %tmp2 = load <16 x i8>* %B
- %tmp3 = call <16 x i8> @llvm.arm64.neon.uqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
- ret <16 x i8> %tmp3
-}
-
-define <8 x i16> @uqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: uqsub8h:
-;CHECK: uqsub.8h
- %tmp1 = load <8 x i16>* %A
- %tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i16> @llvm.arm64.neon.uqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
- ret <8 x i16> %tmp3
-}
-
-define <4 x i32> @uqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: uqsub4s:
-;CHECK: uqsub.4s
- %tmp1 = load <4 x i32>* %A
- %tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i32> @llvm.arm64.neon.uqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
- ret <4 x i32> %tmp3
-}
-
-define <2 x i64> @uqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: uqsub2d:
-;CHECK: uqsub.2d
- %tmp1 = load <2 x i64>* %A
- %tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i64> @llvm.arm64.neon.uqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
- ret <2 x i64> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.sqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-
-declare <8 x i8> @llvm.arm64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.sqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.uqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index 4e858f7..9537489 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -46,8 +46,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!8 = metadata !{null, metadata !9, metadata !9}
!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!11 = metadata !{metadata !12}
-!12 = metadata !{metadata !13, metadata !14, metadata !15}
+!11 = metadata !{metadata !13, metadata !14, metadata !15}
!13 = metadata !{i32 786689, metadata !5, metadata !"a", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 1]
!14 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554433, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 1]
!15 = metadata !{i32 786688, metadata !16, metadata !"i", metadata !6, i32 2, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2]
diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll
index b6ba22e..2559e23 100644
--- a/test/CodeGen/MSP430/fp.ll
+++ b/test/CodeGen/MSP430/fp.ll
@@ -21,7 +21,7 @@ entry:
; does not happen anymore. Note that the only reason an ISR is used here is that
; the register allocator selects r4 first instead of fifth in a normal function.
define msp430_intrcc void @fpb_alloced() #0 {
-; CHECK_LABEL: fpb_alloced:
+; CHECK-LABEL: fpb_alloced:
; CHECK-NOT: mov.b #0, r4
; CHECK: nop
call void asm sideeffect "nop", "r"(i8 0)
diff --git a/test/CodeGen/Mips/2010-07-20-Switch.ll b/test/CodeGen/Mips/2010-07-20-Switch.ll
index 38d7b7e..5c84077 100644
--- a/test/CodeGen/Mips/2010-07-20-Switch.ll
+++ b/test/CodeGen/Mips/2010-07-20-Switch.ll
@@ -2,10 +2,14 @@
; RUN: FileCheck %s -check-prefix=STATIC-O32
; RUN: llc < %s -march=mips -relocation-model=pic | \
; RUN: FileCheck %s -check-prefix=PIC-O32
+; RUN: llc < %s -march=mips64 -relocation-model=pic -mcpu=mips4 | \
+; RUN: FileCheck %s -check-prefix=N64
+; RUN: llc < %s -march=mips64 -relocation-model=static -mcpu=mips4 | \
+; RUN: FileCheck %s -check-prefix=N64
; RUN: llc < %s -march=mips64 -relocation-model=pic -mcpu=mips64 | \
-; RUN: FileCheck %s -check-prefix=N64
+; RUN: FileCheck %s -check-prefix=N64
; RUN: llc < %s -march=mips64 -relocation-model=static -mcpu=mips64 | \
-; RUN: FileCheck %s -check-prefix=N64
+; RUN: FileCheck %s -check-prefix=N64
define i32 @main() nounwind readnone {
entry:
diff --git a/test/CodeGen/Mips/Fast-ISel/nullvoid.ll b/test/CodeGen/Mips/Fast-ISel/nullvoid.ll
new file mode 100644
index 0000000..eeaff87
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/nullvoid.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN: < %s | FileCheck %s
+
+; Function Attrs: nounwind
+define void @foo() {
+entry:
+ ret void
+; CHECK: jr $ra
+}
diff --git a/test/CodeGen/Mips/Fast-ISel/simplestore.ll b/test/CodeGen/Mips/Fast-ISel/simplestore.ll
new file mode 100644
index 0000000..5d52481
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/simplestore.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN: < %s | FileCheck %s
+
+@abcd = external global i32
+
+; Function Attrs: nounwind
+define void @foo() {
+entry:
+ store i32 12345, i32* @abcd, align 4
+; CHECK: addiu $[[REG1:[0-9]+]], $zero, 12345
+; CHECK: lw $[[REG2:[0-9]+]], %got(abcd)(${{[0-9]+}})
+; CHECK: sw $[[REG1]], 0($[[REG2]])
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorei.ll b/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
new file mode 100644
index 0000000..7d2c8e7
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN: < %s | FileCheck %s
+
+@ijk = external global i32
+
+; Function Attrs: nounwind
+define void @si2_1() #0 {
+entry:
+ store i32 32767, i32* @ijk, align 4
+; CHECK: .ent si2_1
+; CHECK: addiu $[[REG1:[0-9]+]], $zero, 32767
+; CHECK: lw $[[REG2:[0-9]+]], %got(ijk)(${{[0-9]+}})
+; CHECK: sw $[[REG1]], 0($[[REG2]])
+
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @si2_2() #0 {
+entry:
+ store i32 -32768, i32* @ijk, align 4
+; CHECK: .ent si2_2
+; CHECK: addiu $[[REG1:[0-9]+]], $zero, -32768
+; CHECK: lw $[[REG2:[0-9]+]], %got(ijk)(${{[0-9]+}})
+; CHECK: sw $[[REG1]], 0($[[REG2]])
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @ui2_1() #0 {
+entry:
+ store i32 65535, i32* @ijk, align 4
+; CHECK: .ent ui2_1
+; CHECK: ori $[[REG1:[0-9]+]], $zero, 65535
+; CHECK: lw $[[REG2:[0-9]+]], %got(ijk)(${{[0-9]+}})
+; CHECK: sw $[[REG1]], 0($[[REG2]])
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @ui4_1() #0 {
+entry:
+ store i32 983040, i32* @ijk, align 4
+; CHECK: .ent ui4_1
+; CHECK: lui $[[REG1:[0-9]+]], 15
+; CHECK: lw $[[REG2:[0-9]+]], %got(ijk)(${{[0-9]+}})
+; CHECK: sw $[[REG1]], 0($[[REG2]])
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @ui4_2() #0 {
+entry:
+ store i32 719566, i32* @ijk, align 4
+; CHECK: .ent ui4_2
+; CHECK: lui $[[REG1:[0-9]+]], 10
+; CHECK: ori $[[REG1]], $[[REG1]], 64206
+; CHECK: lw $[[REG2:[0-9]+]], %got(ijk)(${{[0-9]+}})
+; CHECK: sw $[[REG1]], 0($[[REG2]])
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
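To make the expected code above easier to follow, here is the arithmetic behind each constant, read off the CHECK lines and the usual 16-bit MIPS immediate fields: 32767 and -32768 fit the signed 16-bit immediate of addiu; 65535 fits the unsigned 16-bit immediate of ori; 983040 is exactly 15 << 16, so a single lui 15 suffices; and 719566 = (10 << 16) + 64206 = 655360 + 64206, hence the expected lui 10 followed by ori 64206.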
diff --git a/test/CodeGen/Mips/abicalls.ll b/test/CodeGen/Mips/abicalls.ll
index 7b98b02..6fa33aa 100644
--- a/test/CodeGen/Mips/abicalls.ll
+++ b/test/CodeGen/Mips/abicalls.ll
@@ -7,6 +7,7 @@
; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-STATIC %s
; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck -check-prefix=CHECK-PIC %s
+; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-PIC %s
; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-PIC %s
; CHECK-STATIC: .abicalls
diff --git a/test/CodeGen/Mips/cconv/arguments-float.ll b/test/CodeGen/Mips/cconv/arguments-float.ll
new file mode 100644
index 0000000..e2119ec
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-float.ll
@@ -0,0 +1,222 @@
+; RUN: llc -march=mips -relocation-model=static -soft-float < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
+; RUN: llc -march=mipsel -relocation-model=static -soft-float < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+
+; RUN-TODO: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+
+; RUN: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+
+; RUN: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+
+; Test the floating point arguments for all ABI's and byte orders as specified
+; by section 5 of MD00305 (MIPS ABIs Described).
+;
+; N32/N64 are identical in this area so their checks have been combined into
+; the 'NEW' prefix (the N stands for New).
+
+@bytes = global [11 x i8] zeroinitializer
+@dwords = global [11 x i64] zeroinitializer
+@floats = global [11 x float] zeroinitializer
+@doubles = global [11 x double] zeroinitializer
+
+define void @double_args(double %a, double %b, double %c, double %d, double %e,
+ double %f, double %g, double %h, double %i) nounwind {
+entry:
+ %0 = getelementptr [11 x double]* @doubles, i32 0, i32 1
+ store volatile double %a, double* %0
+ %1 = getelementptr [11 x double]* @doubles, i32 0, i32 2
+ store volatile double %b, double* %1
+ %2 = getelementptr [11 x double]* @doubles, i32 0, i32 3
+ store volatile double %c, double* %2
+ %3 = getelementptr [11 x double]* @doubles, i32 0, i32 4
+ store volatile double %d, double* %3
+ %4 = getelementptr [11 x double]* @doubles, i32 0, i32 5
+ store volatile double %e, double* %4
+ %5 = getelementptr [11 x double]* @doubles, i32 0, i32 6
+ store volatile double %f, double* %5
+ %6 = getelementptr [11 x double]* @doubles, i32 0, i32 7
+ store volatile double %g, double* %6
+ %7 = getelementptr [11 x double]* @doubles, i32 0, i32 8
+ store volatile double %h, double* %7
+ %8 = getelementptr [11 x double]* @doubles, i32 0, i32 9
+ store volatile double %i, double* %8
+ ret void
+}
+
+; ALL-LABEL: double_args:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(doubles)(
+
+; The first four arguments are the same in O32/N32/N64.
+; The first argument is floating point but soft-float is enabled so floating
+; point registers are not used.
+; O32-DAG: sw $4, 8([[R2]])
+; O32-DAG: sw $5, 12([[R2]])
+; NEW-DAG: sd $4, 8([[R2]])
+
+; O32-DAG: sw $6, 16([[R2]])
+; O32-DAG: sw $7, 20([[R2]])
+; NEW-DAG: sd $5, 16([[R2]])
+
+; O32 has run out of argument registers and starts using the stack
+; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 24($sp)
+; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 28($sp)
+; O32-DAG: sw [[R3]], 24([[R2]])
+; O32-DAG: sw [[R4]], 28([[R2]])
+; NEW-DAG: sd $6, 24([[R2]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 32($sp)
+; O32-DAG: lw [[R4:\$[0-9]+]], 36($sp)
+; O32-DAG: sw [[R3]], 32([[R2]])
+; O32-DAG: sw [[R4]], 36([[R2]])
+; NEW-DAG: sd $7, 32([[R2]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 40($sp)
+; O32-DAG: lw [[R4:\$[0-9]+]], 44($sp)
+; O32-DAG: sw [[R3]], 40([[R2]])
+; O32-DAG: sw [[R4]], 44([[R2]])
+; NEW-DAG: sd $8, 40([[R2]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 48($sp)
+; O32-DAG: lw [[R4:\$[0-9]+]], 52($sp)
+; O32-DAG: sw [[R3]], 48([[R2]])
+; O32-DAG: sw [[R4]], 52([[R2]])
+; NEW-DAG: sd $9, 48([[R2]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 56($sp)
+; O32-DAG: lw [[R4:\$[0-9]+]], 60($sp)
+; O32-DAG: sw [[R3]], 56([[R2]])
+; O32-DAG: sw [[R4]], 60([[R2]])
+; NEW-DAG: sd $10, 56([[R2]])
+
+; N32/N64 have run out of registers and start using the stack
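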
+; O32-DAG: lw [[R3:\$[0-9]+]], 64($sp)
+; O32-DAG: lw [[R4:\$[0-9]+]], 68($sp)
+; O32-DAG: sw [[R3]], 64([[R2]])
+; O32-DAG: sw [[R4]], 68([[R2]])
+; NEW-DAG: ld [[R3:\$[0-9]+]], 0($sp)
+; NEW-DAG: sd $11, 64([[R2]])
+
+define void @float_args(float %a, float %b, float %c, float %d, float %e,
+ float %f, float %g, float %h, float %i, float %j)
+ nounwind {
+entry:
+ %0 = getelementptr [11 x float]* @floats, i32 0, i32 1
+ store volatile float %a, float* %0
+ %1 = getelementptr [11 x float]* @floats, i32 0, i32 2
+ store volatile float %b, float* %1
+ %2 = getelementptr [11 x float]* @floats, i32 0, i32 3
+ store volatile float %c, float* %2
+ %3 = getelementptr [11 x float]* @floats, i32 0, i32 4
+ store volatile float %d, float* %3
+ %4 = getelementptr [11 x float]* @floats, i32 0, i32 5
+ store volatile float %e, float* %4
+ %5 = getelementptr [11 x float]* @floats, i32 0, i32 6
+ store volatile float %f, float* %5
+ %6 = getelementptr [11 x float]* @floats, i32 0, i32 7
+ store volatile float %g, float* %6
+ %7 = getelementptr [11 x float]* @floats, i32 0, i32 8
+ store volatile float %h, float* %7
+ %8 = getelementptr [11 x float]* @floats, i32 0, i32 9
+ store volatile float %i, float* %8
+ %9 = getelementptr [11 x float]* @floats, i32 0, i32 10
+ store volatile float %j, float* %9
+ ret void
+}
+
+; ALL-LABEL: float_args:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(floats)(
+
+; The first four arguments are the same in O32/N32/N64.
+; The first argument isn't floating point so floating point registers are not
+; used.
+; MD00305 and GCC disagree on this one. MD00305 says that floats are treated
+; as 8-byte aligned and occupy two slots on O32. GCC is treating them as 4-byte
+; aligned and occupying one slot. We'll use GCC's definition.
+; ALL-DAG: sw $4, 4([[R2]])
+; ALL-DAG: sw $5, 8([[R2]])
+; ALL-DAG: sw $6, 12([[R2]])
+; ALL-DAG: sw $7, 16([[R2]])
+
+; O32 has run out of argument registers and starts using the stack
+; O32-DAG: lw [[R3:\$[0-9]+]], 16($sp)
+; O32-DAG: sw [[R3]], 20([[R2]])
+; NEW-DAG: sw $8, 20([[R2]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 20($sp)
+; O32-DAG: sw [[R3]], 24([[R2]])
+; NEW-DAG: sw $9, 24([[R2]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 24($sp)
+; O32-DAG: sw [[R3]], 28([[R2]])
+; NEW-DAG: sw $10, 28([[R2]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 28($sp)
+; O32-DAG: sw [[R3]], 32([[R2]])
+; NEW-DAG: sw $11, 32([[R2]])
+
+; N32/N64 have run out of registers and start using the stack too
+; O32-DAG: lw [[R3:\$[0-9]+]], 32($sp)
+; O32-DAG: sw [[R3]], 36([[R2]])
+; NEW-DAG: lw [[R3:\$[0-9]+]], 0($sp)
+; NEW-DAG: sw [[R3]], 36([[R2]])
+
+define void @double_arg2(i8 %a, double %b) nounwind {
+entry:
+ %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
+ store volatile i8 %a, i8* %0
+ %1 = getelementptr [11 x double]* @doubles, i32 0, i32 1
+ store volatile double %b, double* %1
+ ret void
+}
+
+; ALL-LABEL: double_arg2:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
+; SYM64-DAG: ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(doubles)(
+
+; The first four arguments are the same in O32/N32/N64.
+; The first argument isn't floating point so floating point registers are not
+; used.
+; The second slot is insufficiently aligned for double on O32 so it is skipped.
+; Also, double occupies two slots on O32 and only one for N32/N64.
+; ALL-DAG: sb $4, 1([[R1]])
+; O32-DAG: sw $6, 8([[R2]])
+; O32-DAG: sw $7, 12([[R2]])
+; NEW-DAG: sd $5, 8([[R2]])
+
+define void @float_arg2(i8 %a, float %b) nounwind {
+entry:
+ %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
+ store volatile i8 %a, i8* %0
+ %1 = getelementptr [11 x float]* @floats, i32 0, i32 1
+ store volatile float %b, float* %1
+ ret void
+}
+
+; ALL-LABEL: float_arg2:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
+; SYM64-DAG: ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(floats)(
+
+; The first four arguments are the same in O32/N32/N64.
+; The first argument isn't floating point so floating point registers are not
+; used.
+; MD00305 and GCC disagree on this one. MD00305 says that floats are treated
+; as 8-byte aligned and occupy two slots on O32. GCC is treating them as 4-byte
+; aligned and occupying one slot. We'll use GCC's definition.
+; ALL-DAG: sb $4, 1([[R1]])
+; ALL-DAG: sw $5, 4([[R2]])
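A rough worked reading of the soft-float checks above, with registers and offsets taken from the CHECK lines: the first double is split across $4/$5 on O32 but occupies $4 alone on N32/N64; O32 exhausts $4-$7 after two doubles and loads the rest from 24($sp) onwards, while N32/N64 continue through $11 before touching 0($sp). Floats take one 4-byte slot each (the GCC interpretation noted in the comments), so the first four go in $4-$7 on every ABI, the next four come from 16($sp)-28($sp) on O32 or $8-$11 on N32/N64, and the ninth is the first to reach the N32/N64 stack at 0($sp).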
diff --git a/test/CodeGen/Mips/cconv/arguments-fp128.ll b/test/CodeGen/Mips/cconv/arguments-fp128.ll
new file mode 100644
index 0000000..c8cd8fd
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-fp128.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+; RUN: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+
+; RUN: llc -march=mips64 -relocation-model=static -soft-float -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+; RUN: llc -march=mips64el -relocation-model=static -soft-float -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+
+; Test the fp128 arguments for all ABI's and byte orders as specified
+; by section 2 of the MIPSpro N32 Handbook.
+;
+; O32 is not tested because long double is the same as double on O32.
+
+@ldoubles = global [11 x fp128] zeroinitializer
+
+define void @ldouble_args(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e) nounwind {
+entry:
+ %0 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 1
+ store volatile fp128 %a, fp128* %0
+ %1 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 2
+ store volatile fp128 %b, fp128* %1
+ %2 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 3
+ store volatile fp128 %c, fp128* %2
+ %3 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 4
+ store volatile fp128 %d, fp128* %3
+ %4 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 5
+ store volatile fp128 %e, fp128* %4
+ ret void
+}
+
+; ALL-LABEL: ldouble_args:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(ldoubles)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(ldoubles)(
+
+; The first four arguments are the same in N32/N64.
+; The first argument is floating point but soft-float is enabled so floating
+; point registers are not used.
+; ALL-DAG: sd $4, 16([[R2]])
+; ALL-DAG: sd $5, 24([[R2]])
+; ALL-DAG: sd $6, 32([[R2]])
+; ALL-DAG: sd $7, 40([[R2]])
+; ALL-DAG: sd $8, 48([[R2]])
+; ALL-DAG: sd $9, 56([[R2]])
+; ALL-DAG: sd $10, 64([[R2]])
+; ALL-DAG: sd $11, 72([[R2]])
+
+; N32/N64 have run out of registers and start using the stack too
+; ALL-DAG: ld [[R3:\$[0-9]+]], 0($sp)
+; ALL-DAG: ld [[R4:\$[0-9]+]], 8($sp)
+; ALL-DAG: sd [[R3]], 80([[R2]])
+; ALL-DAG: sd [[R4]], 88([[R2]])
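Under soft-float each fp128 argument occupies two 64-bit GPRs, so the checks above expect %a through %d to fill $4-$11 in order and %e to be the first argument taken from the stack, its two halves read from 0($sp) and 8($sp).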
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
new file mode 100644
index 0000000..aadf7d1
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
@@ -0,0 +1,157 @@
+; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+
+; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW %s
+
+; Test the effect of varargs on floating point types in the non-variable part
+; of the argument list as specified by section 2 of the MIPSpro N32 Handbook.
+;
+; N32/N64 are almost identical in this area so many of their checks have been
+; combined into the 'NEW' prefix (the N stands for New).
+;
+; On O32, varargs prevents all FPU argument register usage. This contradicts
+; the N32 handbook, but agrees with the SYSV ABI and GCC's behaviour.
+
+@floats = global [11 x float] zeroinitializer
+@doubles = global [11 x double] zeroinitializer
+
+define void @double_args(double %a, ...)
+ nounwind {
+entry:
+ %0 = getelementptr [11 x double]* @doubles, i32 0, i32 1
+ store volatile double %a, double* %0
+
+ %ap = alloca i8*
+ %ap2 = bitcast i8** %ap to i8*
+ call void @llvm.va_start(i8* %ap2)
+ %b = va_arg i8** %ap, double
+ %1 = getelementptr [11 x double]* @doubles, i32 0, i32 2
+ store volatile double %b, double* %1
+ ret void
+}
+
+; ALL-LABEL: double_args:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(doubles)(
+
+; O32 forbids using floating point registers for the non-variable portion.
+; N32/N64 allow it.
+; O32BE-DAG: mtc1 $5, [[FTMP1:\$f[0-9]*[02468]+]]
+; O32BE-DAG: mtc1 $4, [[FTMP2:\$f[0-9]*[13579]+]]
+; O32LE-DAG: mtc1 $4, [[FTMP1:\$f[0-9]*[02468]+]]
+; O32LE-DAG: mtc1 $5, [[FTMP2:\$f[0-9]*[13579]+]]
+; O32-DAG: sdc1 [[FTMP1]], 8([[R2]])
+; NEW-DAG: sdc1 $f12, 8([[R2]])
+
+; The varargs portion is dumped to stack
+; O32-DAG: sw $6, 16($sp)
+; O32-DAG: sw $7, 20($sp)
+; NEW-DAG: sd $5, 8($sp)
+; NEW-DAG: sd $6, 16($sp)
+; NEW-DAG: sd $7, 24($sp)
+; NEW-DAG: sd $8, 32($sp)
+; NEW-DAG: sd $9, 40($sp)
+; NEW-DAG: sd $10, 48($sp)
+; NEW-DAG: sd $11, 56($sp)
+
+; Get the varargs pointer
+; O32 has 4 bytes padding, 4 bytes for the varargs pointer, and 8 bytes reserved
+; for arguments 1 and 2.
+; N32/N64 has 8 bytes for the varargs pointer, and no reserved area.
+; O32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 16
+; O32-DAG: sw [[VAPTR]], 4($sp)
+; N32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 8
+; N32-DAG: sw [[VAPTR]], 4($sp)
+; N64-DAG: daddiu [[VAPTR:\$[0-9]+]], $sp, 8
+; N64-DAG: sd [[VAPTR]], 0($sp)
+
+; Increment the pointer then get the varargs arg
+; LLVM will rebind the load to the stack pointer instead of the varargs pointer
+; during lowering. This is fine and doesn't change the behaviour.
+; O32-DAG: addiu [[VAPTR]], [[VAPTR]], 8
+; O32-DAG: sw [[VAPTR]], 4($sp)
+; N32-DAG: addiu [[VAPTR]], [[VAPTR]], 8
+; N32-DAG: sw [[VAPTR]], 4($sp)
+; N64-DAG: daddiu [[VAPTR]], [[VAPTR]], 8
+; N64-DAG: sd [[VAPTR]], 0($sp)
+; O32-DAG: ldc1 [[FTMP1:\$f[0-9]+]], 16($sp)
+; NEW-DAG: ldc1 [[FTMP1:\$f[0-9]+]], 8($sp)
+; ALL-DAG: sdc1 [[FTMP1]], 16([[R2]])
+
+define void @float_args(float %a, ...) nounwind {
+entry:
+ %0 = getelementptr [11 x float]* @floats, i32 0, i32 1
+ store volatile float %a, float* %0
+
+ %ap = alloca i8*
+ %ap2 = bitcast i8** %ap to i8*
+ call void @llvm.va_start(i8* %ap2)
+ %b = va_arg i8** %ap, float
+ %1 = getelementptr [11 x float]* @floats, i32 0, i32 2
+ store volatile float %b, float* %1
+ ret void
+}
+
+; ALL-LABEL: float_args:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(floats)(
+
+; The first four arguments are the same in O32/N32/N64.
+; The non-variable portion should be unaffected.
+; O32-DAG: sw $4, 4([[R2]])
+; NEW-DAG: swc1 $f12, 4([[R2]])
+
+; The varargs portion is dumped to stack
+; O32-DAG: sw $5, 12($sp)
+; O32-DAG: sw $6, 16($sp)
+; O32-DAG: sw $7, 20($sp)
+; NEW-DAG: sd $5, 8($sp)
+; NEW-DAG: sd $6, 16($sp)
+; NEW-DAG: sd $7, 24($sp)
+; NEW-DAG: sd $8, 32($sp)
+; NEW-DAG: sd $9, 40($sp)
+; NEW-DAG: sd $10, 48($sp)
+; NEW-DAG: sd $11, 56($sp)
+
+; Get the varargs pointer
+; O32 has 4 bytes padding, 4 bytes for the varargs pointer, and should have 8
+; bytes reserved for arguments 1 and 2 (the first float arg) but as discussed in
+; arguments-float.ll, GCC doesn't agree with MD00305 and treats floats as 4
+; bytes so we only have 12 bytes total.
+; N32/N64 has 8 bytes for the varargs pointer, and no reserved area.
+; O32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 12
+; O32-DAG: sw [[VAPTR]], 4($sp)
+; N32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 8
+; N32-DAG: sw [[VAPTR]], 4($sp)
+; N64-DAG: daddiu [[VAPTR:\$[0-9]+]], $sp, 8
+; N64-DAG: sd [[VAPTR]], 0($sp)
+
+; Increment the pointer then get the varargs arg
+; LLVM will rebind the load to the stack pointer instead of the varargs pointer
+; during lowering. This is fine and doesn't change the behaviour.
+; N32/N64 use ori instead of addiu/daddiu which, although odd, is fine
+; since the stack is always aligned.
+; O32-DAG: addiu [[VAPTR]], [[VAPTR]], 4
+; O32-DAG: sw [[VAPTR]], 4($sp)
+; N32-DAG: ori [[VAPTR]], [[VAPTR]], 4
+; N32-DAG: sw [[VAPTR]], 4($sp)
+; N64-DAG: ori [[VAPTR]], [[VAPTR]], 4
+; N64-DAG: sd [[VAPTR]], 0($sp)
+; O32-DAG: lwc1 [[FTMP1:\$f[0-9]+]], 12($sp)
+; NEW-DAG: lwc1 [[FTMP1:\$f[0-9]+]], 8($sp)
+; ALL-DAG: swc1 [[FTMP1]], 8([[R2]])
+
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_copy(i8*, i8*)
+declare void @llvm.va_end(i8*)
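A short walk through the va_start/va_arg expectations above: on O32 the variadic registers $6/$7 are dumped to 16($sp)/20($sp), va_start stores that address into the 4-byte va_list slot at 4($sp), va_arg advances it by 8, and the variadic double is reloaded from 16($sp). On N32/N64 the incoming $5-$11 are spilled to 8($sp) through 56($sp), the va_list (held at 4($sp) on N32 and 0($sp) on N64) starts at 8($sp), and a double advances it by 8 while a float advances it by 4, which is where the odd-looking ori comes from.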
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-float.ll b/test/CodeGen/Mips/cconv/arguments-hard-float.ll
new file mode 100644
index 0000000..9837f7e
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-hard-float.ll
@@ -0,0 +1,211 @@
+; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+
+; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+
+; Test the floating point arguments for all ABI's and byte orders as specified
+; by section 5 of MD00305 (MIPS ABIs Described).
+;
+; N32/N64 are identical in this area so their checks have been combined into
+; the 'NEW' prefix (the N stands for New).
+
+@bytes = global [11 x i8] zeroinitializer
+@dwords = global [11 x i64] zeroinitializer
+@floats = global [11 x float] zeroinitializer
+@doubles = global [11 x double] zeroinitializer
+
+define void @double_args(double %a, double %b, double %c, double %d, double %e,
+ double %f, double %g, double %h, double %i) nounwind {
+entry:
+ %0 = getelementptr [11 x double]* @doubles, i32 0, i32 1
+ store volatile double %a, double* %0
+ %1 = getelementptr [11 x double]* @doubles, i32 0, i32 2
+ store volatile double %b, double* %1
+ %2 = getelementptr [11 x double]* @doubles, i32 0, i32 3
+ store volatile double %c, double* %2
+ %3 = getelementptr [11 x double]* @doubles, i32 0, i32 4
+ store volatile double %d, double* %3
+ %4 = getelementptr [11 x double]* @doubles, i32 0, i32 5
+ store volatile double %e, double* %4
+ %5 = getelementptr [11 x double]* @doubles, i32 0, i32 6
+ store volatile double %f, double* %5
+ %6 = getelementptr [11 x double]* @doubles, i32 0, i32 7
+ store volatile double %g, double* %6
+ %7 = getelementptr [11 x double]* @doubles, i32 0, i32 8
+ store volatile double %h, double* %7
+ %8 = getelementptr [11 x double]* @doubles, i32 0, i32 9
+ store volatile double %i, double* %8
+ ret void
+}
+
+; ALL-LABEL: double_args:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(doubles)(
+
+; The first argument is floating point so floating point registers are used.
+; The first argument is the same for O32/N32/N64 but the second argument differs
+; by register
+; ALL-DAG: sdc1 $f12, 8([[R2]])
+; O32-DAG: sdc1 $f14, 16([[R2]])
+; NEW-DAG: sdc1 $f13, 16([[R2]])
+
+; O32 has run out of argument registers and starts using the stack
+; O32-DAG: ldc1 [[F1:\$f[0-9]+]], 16($sp)
+; O32-DAG: sdc1 [[F1]], 24([[R2]])
+; NEW-DAG: sdc1 $f14, 24([[R2]])
+; O32-DAG: ldc1 [[F1:\$f[0-9]+]], 24($sp)
+; O32-DAG: sdc1 [[F1]], 32([[R2]])
+; NEW-DAG: sdc1 $f15, 32([[R2]])
+; O32-DAG: ldc1 [[F1:\$f[0-9]+]], 32($sp)
+; O32-DAG: sdc1 [[F1]], 40([[R2]])
+; NEW-DAG: sdc1 $f16, 40([[R2]])
+; O32-DAG: ldc1 [[F1:\$f[0-9]+]], 40($sp)
+; O32-DAG: sdc1 [[F1]], 48([[R2]])
+; NEW-DAG: sdc1 $f17, 48([[R2]])
+; O32-DAG: ldc1 [[F1:\$f[0-9]+]], 48($sp)
+; O32-DAG: sdc1 [[F1]], 56([[R2]])
+; NEW-DAG: sdc1 $f18, 56([[R2]])
+; O32-DAG: ldc1 [[F1:\$f[0-9]+]], 56($sp)
+; O32-DAG: sdc1 [[F1]], 64([[R2]])
+; NEW-DAG: sdc1 $f19, 64([[R2]])
+
+; N32/N64 have run out of registers and start using the stack too
+; O32-DAG: ldc1 [[F1:\$f[0-9]+]], 64($sp)
+; O32-DAG: sdc1 [[F1]], 72([[R2]])
+; NEW-DAG: ldc1 [[F1:\$f[0-9]+]], 0($sp)
+; NEW-DAG: sdc1 [[F1]], 72([[R2]])
+
+define void @float_args(float %a, float %b, float %c, float %d, float %e,
+ float %f, float %g, float %h, float %i) nounwind {
+entry:
+ %0 = getelementptr [11 x float]* @floats, i32 0, i32 1
+ store volatile float %a, float* %0
+ %1 = getelementptr [11 x float]* @floats, i32 0, i32 2
+ store volatile float %b, float* %1
+ %2 = getelementptr [11 x float]* @floats, i32 0, i32 3
+ store volatile float %c, float* %2
+ %3 = getelementptr [11 x float]* @floats, i32 0, i32 4
+ store volatile float %d, float* %3
+ %4 = getelementptr [11 x float]* @floats, i32 0, i32 5
+ store volatile float %e, float* %4
+ %5 = getelementptr [11 x float]* @floats, i32 0, i32 6
+ store volatile float %f, float* %5
+ %6 = getelementptr [11 x float]* @floats, i32 0, i32 7
+ store volatile float %g, float* %6
+ %7 = getelementptr [11 x float]* @floats, i32 0, i32 8
+ store volatile float %h, float* %7
+ %8 = getelementptr [11 x float]* @floats, i32 0, i32 9
+ store volatile float %i, float* %8
+ ret void
+}
+
+; ALL-LABEL: float_args:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(floats)
+; SYM64-DAG: ld [[R1:\$[0-9]]], %got_disp(floats)(
+
+; The first argument is floating point so floating point registers are used.
+; The first argument is the same for O32/N32/N64 but the second argument differs
+; by register
+; ALL-DAG: swc1 $f12, 4([[R1]])
+; O32-DAG: swc1 $f14, 8([[R1]])
+; NEW-DAG: swc1 $f13, 8([[R1]])
+
+; O32 has run out of argument registers and (in theory) starts using the stack
+; I've yet to find a reference in the documentation about this but GCC uses up
+; the remaining two argument slots in the GPRs first. We'll do the same for
+; compatibility.
+; O32-DAG: sw $6, 12([[R1]])
+; NEW-DAG: swc1 $f14, 12([[R1]])
+; O32-DAG: sw $7, 16([[R1]])
+; NEW-DAG: swc1 $f15, 16([[R1]])
+
+; O32 is definitely out of registers now and switches to the stack.
+; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 16($sp)
+; O32-DAG: swc1 [[F1]], 20([[R1]])
+; NEW-DAG: swc1 $f16, 20([[R1]])
+; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 20($sp)
+; O32-DAG: swc1 [[F1]], 24([[R1]])
+; NEW-DAG: swc1 $f17, 24([[R1]])
+; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 24($sp)
+; O32-DAG: swc1 [[F1]], 28([[R1]])
+; NEW-DAG: swc1 $f18, 28([[R1]])
+; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 28($sp)
+; O32-DAG: swc1 [[F1]], 32([[R1]])
+; NEW-DAG: swc1 $f19, 32([[R1]])
+
+; N32/N64 have run out of registers and start using the stack too
+; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 32($sp)
+; O32-DAG: swc1 [[F1]], 36([[R1]])
+; NEW-DAG: lwc1 [[F1:\$f[0-9]+]], 0($sp)
+; NEW-DAG: swc1 [[F1]], 36([[R1]])
+
+
+define void @double_arg2(i8 %a, double %b) nounwind {
+entry:
+ %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
+ store volatile i8 %a, i8* %0
+ %1 = getelementptr [11 x double]* @doubles, i32 0, i32 1
+ store volatile double %b, double* %1
+ ret void
+}
+
+; ALL-LABEL: double_arg2:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
+; SYM64-DAG: ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(doubles)(
+
+; The first argument is the same in O32/N32/N64.
+; ALL-DAG: sb $4, 1([[R1]])
+
+; The first argument isn't floating point so floating point registers are not
+; used in O32, but N32/N64 will still use them.
+; The second slot is insufficiently aligned for double on O32 so it is skipped.
+; Also, double occupies two slots on O32 and only one for N32/N64.
+; O32LE-DAG: mtc1 $6, [[F1:\$f[0-9]*[02468]+]]
+; O32LE-DAG: mtc1 $7, [[F2:\$f[0-9]*[13579]+]]
+; O32BE-DAG: mtc1 $6, [[F2:\$f[0-9]*[13579]+]]
+; O32BE-DAG: mtc1 $7, [[F1:\$f[0-9]*[02468]+]]
+; O32-DAG: sdc1 [[F1]], 8([[R2]])
+; NEW-DAG: sdc1 $f13, 8([[R2]])
+
+define void @float_arg2(i8 %a, float %b) nounwind {
+entry:
+ %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
+ store volatile i8 %a, i8* %0
+ %1 = getelementptr [11 x float]* @floats, i32 0, i32 1
+ store volatile float %b, float* %1
+ ret void
+}
+
+; ALL-LABEL: float_arg2:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
+; SYM64-DAG: ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(floats)(
+
+; The first argument is the same in O32/N32/N64.
+; ALL-DAG: sb $4, 1([[R1]])
+
+; The first argument isn't floating point so floating point registers are not
+; used in O32, but N32/N64 will still use them.
+; MD00305 and GCC disagree on this one. MD00305 says that floats are treated
+; as 8-byte aligned and occupy two slots on O32. GCC is treating them as 4-byte
+; aligned and occupying one slot. We'll use GCC's definition.
+; O32-DAG: sw $5, 4([[R2]])
+; NEW-DAG: swc1 $f13, 4([[R2]])
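A rough summary of the hard-float expectations above, taken from the CHECK lines rather than the ABI documents: the first double goes in $f12 under every ABI; the second is $f14 on O32 but $f13 on N32/N64, which then continue through $f19 before reading the ninth double from 0($sp), while O32 reads everything past the second from 16($sp) onwards. For floats, O32 passes the third and fourth arguments in $6/$7 (following GCC in using the remaining GPR slots first) and the rest via the stack, whereas N32/N64 simply keep going through $f13-$f19.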
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll b/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll
new file mode 100644
index 0000000..5e3f403
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+
+; Test the fp128 arguments for all ABI's and byte orders as specified
+; by section 2 of the MIPSpro N32 Handbook.
+;
+; O32 is not tested because long double is the same as double on O32.
+
+@ldoubles = global [11 x fp128] zeroinitializer
+
+define void @ldouble_args(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e) nounwind {
+entry:
+ %0 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 1
+ store volatile fp128 %a, fp128* %0
+ %1 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 2
+ store volatile fp128 %b, fp128* %1
+ %2 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 3
+ store volatile fp128 %c, fp128* %2
+ %3 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 4
+ store volatile fp128 %d, fp128* %3
+ %4 = getelementptr [11 x fp128]* @ldoubles, i32 0, i32 5
+ store volatile fp128 %e, fp128* %4
+ ret void
+}
+
+; ALL-LABEL: ldouble_args:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(ldoubles)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(ldoubles)(
+
+; The first four arguments are the same in N32/N64.
+; ALL-DAG: sdc1 $f12, 16([[R2]])
+; ALL-DAG: sdc1 $f13, 24([[R2]])
+; ALL-DAG: sdc1 $f14, 32([[R2]])
+; ALL-DAG: sdc1 $f15, 40([[R2]])
+; ALL-DAG: sdc1 $f16, 48([[R2]])
+; ALL-DAG: sdc1 $f17, 56([[R2]])
+; ALL-DAG: sdc1 $f18, 64([[R2]])
+; ALL-DAG: sdc1 $f19, 72([[R2]])
+
+; N32/N64 have run out of registers and start using the stack too
+; ALL-DAG: ld [[R3:\$[0-9]+]], 0($sp)
+; ALL-DAG: ld [[R4:\$[0-9]+]], 8($sp)
+; ALL-DAG: sd [[R3]], 80([[R2]])
+; ALL-DAG: sd [[R4]], 88([[R2]])
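With hard-float, each fp128 travels in a pair of 8-byte FPU registers, so the checks above expect %a-%d in $f12/$f13 through $f18/$f19 and %e to be the first argument read from the stack at 0($sp)/8($sp).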
diff --git a/test/CodeGen/Mips/cconv/arguments.ll b/test/CodeGen/Mips/cconv/arguments.ll
new file mode 100644
index 0000000..8fe29f3
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments.ll
@@ -0,0 +1,170 @@
+; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+
+; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+
+; Test the integer arguments for all ABI's and byte orders as specified by
+; section 5 of MD00305 (MIPS ABIs Described).
+;
+; N32/N64 are identical in this area so their checks have been combined into
+; the 'NEW' prefix (the N stands for New).
+;
+; Varargs are covered in arguments-hard-float-varargs.ll.
+
+@bytes = global [11 x i8] zeroinitializer
+@dwords = global [11 x i64] zeroinitializer
+@floats = global [11 x float] zeroinitializer
+@doubles = global [11 x double] zeroinitializer
+
+define void @align_to_arg_slots(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g,
+ i8 %h, i8 %i, i8 %j) nounwind {
+entry:
+ %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
+ store volatile i8 %a, i8* %0
+ %1 = getelementptr [11 x i8]* @bytes, i32 0, i32 2
+ store volatile i8 %b, i8* %1
+ %2 = getelementptr [11 x i8]* @bytes, i32 0, i32 3
+ store volatile i8 %c, i8* %2
+ %3 = getelementptr [11 x i8]* @bytes, i32 0, i32 4
+ store volatile i8 %d, i8* %3
+ %4 = getelementptr [11 x i8]* @bytes, i32 0, i32 5
+ store volatile i8 %e, i8* %4
+ %5 = getelementptr [11 x i8]* @bytes, i32 0, i32 6
+ store volatile i8 %f, i8* %5
+ %6 = getelementptr [11 x i8]* @bytes, i32 0, i32 7
+ store volatile i8 %g, i8* %6
+ %7 = getelementptr [11 x i8]* @bytes, i32 0, i32 8
+ store volatile i8 %h, i8* %7
+ %8 = getelementptr [11 x i8]* @bytes, i32 0, i32 9
+ store volatile i8 %i, i8* %8
+ %9 = getelementptr [11 x i8]* @bytes, i32 0, i32 10
+ store volatile i8 %j, i8* %9
+ ret void
+}
+
+; ALL-LABEL: align_to_arg_slots:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
+; SYM64-DAG: ld [[R1:\$[0-9]]], %got_disp(bytes)(
+
+; The first four arguments are the same in O32/N32/N64
+; ALL-DAG: sb $4, 1([[R1]])
+; ALL-DAG: sb $5, 2([[R1]])
+; ALL-DAG: sb $6, 3([[R1]])
+; ALL-DAG: sb $7, 4([[R1]])
+
+; N32/N64 get an extra four arguments in registers
+; O32 starts loading from the stack. The addresses start at 16 because space is
+; always reserved for the first four arguments.
+; O32-DAG: lw [[R3:\$[0-9]+]], 16($sp)
+; O32-DAG: sb [[R3]], 5([[R1]])
+; NEW-DAG: sb $8, 5([[R1]])
+; O32-DAG: lw [[R3:\$[0-9]+]], 20($sp)
+; O32-DAG: sb [[R3]], 6([[R1]])
+; NEW-DAG: sb $9, 6([[R1]])
+; O32-DAG: lw [[R3:\$[0-9]+]], 24($sp)
+; O32-DAG: sb [[R3]], 7([[R1]])
+; NEW-DAG: sb $10, 7([[R1]])
+; O32-DAG: lw [[R3:\$[0-9]+]], 28($sp)
+; O32-DAG: sb [[R3]], 8([[R1]])
+; NEW-DAG: sb $11, 8([[R1]])
+
+; O32/N32/N64 are accessing the stack at this point.
+; Unlike O32, N32/N64 do not reserve space for the arguments.
+; Stack offsets increase by 4 for O32 and 8 for N32/N64.
+; O32-DAG: lw [[R3:\$[0-9]+]], 32($sp)
+; O32-DAG: sb [[R3]], 9([[R1]])
+; NEW-DAG: lw [[R3:\$[0-9]+]], 0($sp)
+; NEW-DAG: sb [[R3]], 9([[R1]])
+; O32-DAG: lw [[R3:\$[0-9]+]], 36($sp)
+; O32-DAG: sb [[R3]], 10([[R1]])
+; NEW-DAG: lw [[R3:\$[0-9]+]], 8($sp)
+; NEW-DAG: sb [[R3]], 10([[R1]])
+
+define void @slot_skipping(i8 %a, i64 %b, i8 %c, i8 %d,
+ i8 %e, i8 %f, i8 %g, i64 %i, i8 %j) nounwind {
+entry:
+ %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
+ store volatile i8 %a, i8* %0
+ %1 = getelementptr [11 x i64]* @dwords, i32 0, i32 1
+ store volatile i64 %b, i64* %1
+ %2 = getelementptr [11 x i8]* @bytes, i32 0, i32 2
+ store volatile i8 %c, i8* %2
+ %3 = getelementptr [11 x i8]* @bytes, i32 0, i32 3
+ store volatile i8 %d, i8* %3
+ %4 = getelementptr [11 x i8]* @bytes, i32 0, i32 4
+ store volatile i8 %e, i8* %4
+ %5 = getelementptr [11 x i8]* @bytes, i32 0, i32 5
+ store volatile i8 %f, i8* %5
+ %6 = getelementptr [11 x i8]* @bytes, i32 0, i32 6
+ store volatile i8 %g, i8* %6
+ %7 = getelementptr [11 x i64]* @dwords, i32 0, i32 2
+ store volatile i64 %i, i64* %7
+ %8 = getelementptr [11 x i8]* @bytes, i32 0, i32 7
+ store volatile i8 %j, i8* %8
+ ret void
+}
+
+; ALL-LABEL: slot_skipping:
+; We won't test the way the global address is calculated in this test. This is
+; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
+; SYM64-DAG: ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; SYM64-DAG: ld [[R2:\$[0-9]]], %got_disp(dwords)(
+
+; The first argument is the same in O32/N32/N64.
+; ALL-DAG: sb $4, 1([[R1]])
+
+; The second slot is insufficiently aligned for i64 on O32 so it is skipped.
+; Also, i64 occupies two slots on O32 and only one for N32/N64.
+; O32-DAG: sw $6, 8([[R2]])
+; O32-DAG: sw $7, 12([[R2]])
+; NEW-DAG: sd $5, 8([[R2]])
+
+; N32/N64 get an extra four arguments in registers and still have two left from
+; the first four.
+; O32 starts loading from the stack. The addresses start at 16 because space is
+; always reserved for the first four arguments.
+; It's not clear why O32 uses lbu for this argument, but it's not wrong so we'll
+; accept it for now. The only IR difference is that this argument has
+; anyext from i8 and align 8 on it.
+; O32LE-DAG: lbu [[R3:\$[0-9]+]], 16($sp)
+; O32BE-DAG: lbu [[R3:\$[0-9]+]], 19($sp)
+; O32-DAG: sb [[R3]], 2([[R1]])
+; NEW-DAG: sb $6, 2([[R1]])
+; O32-DAG: lw [[R3:\$[0-9]+]], 20($sp)
+; O32-DAG: sb [[R3]], 3([[R1]])
+; NEW-DAG: sb $7, 3([[R1]])
+; O32-DAG: lw [[R3:\$[0-9]+]], 24($sp)
+; O32-DAG: sb [[R3]], 4([[R1]])
+; NEW-DAG: sb $8, 4([[R1]])
+; O32-DAG: lw [[R3:\$[0-9]+]], 28($sp)
+; O32-DAG: sb [[R3]], 5([[R1]])
+; NEW-DAG: sb $9, 5([[R1]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 32($sp)
+; O32-DAG: sb [[R3]], 6([[R1]])
+; NEW-DAG: sb $10, 6([[R1]])
+
+; O32-DAG: lw [[R3:\$[0-9]+]], 40($sp)
+; O32-DAG: sw [[R3]], 16([[R2]])
+; O32-DAG: lw [[R3:\$[0-9]+]], 44($sp)
+; O32-DAG: sw [[R3]], 20([[R2]])
+; NEW-DAG: sd $11, 16([[R2]])
+
+; O32/N32/N64 are accessing the stack at this point.
+; Unlike O32, N32/N64 do not reserve space for the arguments.
+; Stack offsets increase by 4 for O32 and 8 for N32/N64.
+; O32-DAG: lw [[R3:\$[0-9]+]], 48($sp)
+; O32-DAG: sb [[R3]], 7([[R1]])
+; NEW-DAG: lw [[R3:\$[0-9]+]], 0($sp)
+; NEW-DAG: sb [[R3]], 7([[R1]])
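As a worked layout for align_to_arg_slots above, read off the CHECK lines: arguments 1-4 arrive in $4-$7 under every ABI; arguments 5-8 are loaded from 16($sp)-28($sp) on O32, which always reserves 16 bytes for the register arguments, but arrive in $8-$11 on N32/N64; arguments 9-10 sit at 32($sp)/36($sp) on O32 and at 0($sp)/8($sp) on N32/N64, whose slots are 8 bytes wide and start at the bottom of the caller's outgoing area.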
diff --git a/test/CodeGen/Mips/cconv/callee-saved-float.ll b/test/CodeGen/Mips/cconv/callee-saved-float.ll
new file mode 100644
index 0000000..de4d917
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/callee-saved-float.ll
@@ -0,0 +1,111 @@
+; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+
+; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=O32-INV %s
+; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=O32-INV %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N32-INV %s
+; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N32-INV %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N64-INV %s
+; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N64-INV %s
+
+; Test that the callee-saved registers are callee-saved as specified by section
+; 2 of the MIPSpro N32 Handbook and section 3 of the SYSV ABI spec.
+
+define void @fpu_clobber() nounwind {
+entry:
+ call void asm "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f12},~{$f13},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"()
+ ret void
+}
+
+; ALL-LABEL: fpu_clobber:
+; ALL-INV-NOT: sdc1 $f0,
+; ALL-INV-NOT: sdc1 $f1,
+; ALL-INV-NOT: sdc1 $f2,
+; ALL-INV-NOT: sdc1 $f3,
+; ALL-INV-NOT: sdc1 $f4,
+; ALL-INV-NOT: sdc1 $f5,
+; ALL-INV-NOT: sdc1 $f6,
+; ALL-INV-NOT: sdc1 $f7,
+; ALL-INV-NOT: sdc1 $f8,
+; ALL-INV-NOT: sdc1 $f9,
+; ALL-INV-NOT: sdc1 $f10,
+; ALL-INV-NOT: sdc1 $f11,
+; ALL-INV-NOT: sdc1 $f12,
+; ALL-INV-NOT: sdc1 $f13,
+; ALL-INV-NOT: sdc1 $f14,
+; ALL-INV-NOT: sdc1 $f15,
+; ALL-INV-NOT: sdc1 $f16,
+; ALL-INV-NOT: sdc1 $f17,
+; ALL-INV-NOT: sdc1 $f18,
+; ALL-INV-NOT: sdc1 $f19,
+; ALL-INV-NOT: sdc1 $f21,
+; ALL-INV-NOT: sdc1 $f23,
+
+; O32: addiu $sp, $sp, -48
+; O32-DAG: sdc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp)
+; O32-DAG: sdc1 [[F22:\$f22]], [[OFF22:[0-9]+]]($sp)
+; O32-DAG: sdc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp)
+; O32-DAG: sdc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp)
+; O32-DAG: sdc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp)
+; O32-DAG: sdc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp)
+; O32-DAG: ldc1 [[F20]], [[OFF20]]($sp)
+; O32-DAG: ldc1 [[F22]], [[OFF22]]($sp)
+; O32-DAG: ldc1 [[F24]], [[OFF24]]($sp)
+; O32-INV-NOT: sdc1 $f25,
+; O32-DAG: ldc1 [[F26]], [[OFF26]]($sp)
+; O32-INV-NOT: sdc1 $f27,
+; O32-DAG: ldc1 [[F28]], [[OFF28]]($sp)
+; O32-INV-NOT: sdc1 $f29,
+; O32-DAG: ldc1 [[F30]], [[OFF30]]($sp)
+; O32-INV-NOT: sdc1 $f31,
+; O32: addiu $sp, $sp, 48
+
+; N32: addiu $sp, $sp, -48
+; N32-DAG: sdc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp)
+; N32-DAG: sdc1 [[F22:\$f22]], [[OFF22:[0-9]+]]($sp)
+; N32-DAG: sdc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp)
+; N32-DAG: sdc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp)
+; N32-DAG: sdc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp)
+; N32-DAG: sdc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp)
+; N32-DAG: ldc1 [[F20]], [[OFF20]]($sp)
+; N32-DAG: ldc1 [[F22]], [[OFF22]]($sp)
+; N32-DAG: ldc1 [[F24]], [[OFF24]]($sp)
+; N32-INV-NOT: sdc1 $f25,
+; N32-DAG: ldc1 [[F26]], [[OFF26]]($sp)
+; N32-INV-NOT: sdc1 $f27,
+; N32-DAG: ldc1 [[F28]], [[OFF28]]($sp)
+; N32-INV-NOT: sdc1 $f29,
+; N32-DAG: ldc1 [[F30]], [[OFF30]]($sp)
+; N32-INV-NOT: sdc1 $f31,
+; N32: addiu $sp, $sp, 48
+
+; N64: addiu $sp, $sp, -64
+; N64-INV-NOT: sdc1 $f20,
+; N64-INV-NOT: sdc1 $f22,
+; N64-DAG: sdc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp)
+; N64-DAG: sdc1 [[F25:\$f25]], [[OFF25:[0-9]+]]($sp)
+; N64-DAG: sdc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp)
+; N64-DAG: sdc1 [[F27:\$f27]], [[OFF27:[0-9]+]]($sp)
+; N64-DAG: sdc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp)
+; N64-DAG: sdc1 [[F29:\$f29]], [[OFF29:[0-9]+]]($sp)
+; N64-DAG: sdc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp)
+; N64-DAG: sdc1 [[F31:\$f31]], [[OFF31:[0-9]+]]($sp)
+; N64-DAG: ldc1 [[F24]], [[OFF24]]($sp)
+; N64-DAG: ldc1 [[F25]], [[OFF25]]($sp)
+; N64-DAG: ldc1 [[F26]], [[OFF26]]($sp)
+; N64-DAG: ldc1 [[F27]], [[OFF27]]($sp)
+; N64-DAG: ldc1 [[F28]], [[OFF28]]($sp)
+; N64-DAG: ldc1 [[F29]], [[OFF29]]($sp)
+; N64-DAG: ldc1 [[F30]], [[OFF30]]($sp)
+; N64-DAG: ldc1 [[F31]], [[OFF31]]($sp)
+; N64: addiu $sp, $sp, 64
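The stack adjustments in the checks above follow from which FPU registers each ABI treats as callee-saved: O32 and N32 save the six even-numbered registers $f20, $f22, $f24, $f26, $f28 and $f30 (6 x 8 = 48 bytes), while N64 saves the eight registers $f24-$f31 (8 x 8 = 64 bytes), which is why the prologues adjust $sp by 48 and 64 respectively.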
diff --git a/test/CodeGen/Mips/cconv/callee-saved.ll b/test/CodeGen/Mips/cconv/callee-saved.ll
new file mode 100644
index 0000000..293e99f
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/callee-saved.ll
@@ -0,0 +1,167 @@
+; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+
+; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32-INV %s
+; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32-INV %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64-INV %s
+; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64-INV %s
+
+; Test that the callee-saved registers are callee-saved as specified by section
+; 2 of the MIPSpro N32 Handbook and section 3 of the SYSV ABI spec.
+
+define void @gpr_clobber() nounwind {
+entry:
+ ; Clobbering the stack pointer is a bad idea so we'll skip that one
+ call void asm "# Clobber", "~{$0},~{$1},~{$2},~{$3},~{$4},~{$5},~{$6},~{$7},~{$8},~{$9},~{$10},~{$11},~{$12},~{$13},~{$14},~{$15},~{$16},~{$17},~{$18},~{$19},~{$20},~{$21},~{$22},~{$23},~{$24},~{$25},~{$26},~{$27},~{$28},~{$30},~{$31}"()
+ ret void
+}
+
+; ALL-LABEL: gpr_clobber:
+; O32: addiu $sp, $sp, -40
+; O32-INV-NOT: sw $0,
+; O32-INV-NOT: sw $1,
+; O32-INV-NOT: sw $2,
+; O32-INV-NOT: sw $3,
+; O32-INV-NOT: sw $4,
+; O32-INV-NOT: sw $5,
+; O32-INV-NOT: sw $6,
+; O32-INV-NOT: sw $7,
+; O32-INV-NOT: sw $8,
+; O32-INV-NOT: sw $9,
+; O32-INV-NOT: sw $10,
+; O32-INV-NOT: sw $11,
+; O32-INV-NOT: sw $12,
+; O32-INV-NOT: sw $13,
+; O32-INV-NOT: sw $14,
+; O32-INV-NOT: sw $15,
+; O32-DAG: sw [[G16:\$16]], [[OFF16:[0-9]+]]($sp)
+; O32-DAG: sw [[G17:\$17]], [[OFF17:[0-9]+]]($sp)
+; O32-DAG: sw [[G18:\$18]], [[OFF18:[0-9]+]]($sp)
+; O32-DAG: sw [[G19:\$19]], [[OFF19:[0-9]+]]($sp)
+; O32-DAG: sw [[G20:\$20]], [[OFF20:[0-9]+]]($sp)
+; O32-DAG: sw [[G21:\$21]], [[OFF21:[0-9]+]]($sp)
+; O32-DAG: sw [[G22:\$22]], [[OFF22:[0-9]+]]($sp)
+; O32-DAG: sw [[G23:\$23]], [[OFF23:[0-9]+]]($sp)
+; O32-INV-NOT: sw $24,
+; O32-INV-NOT: sw $25,
+; O32-INV-NOT: sw $26,
+; O32-INV-NOT: sw $27,
+; O32-INV-NOT: sw $28,
+; O32-INV-NOT: sw $29,
+; O32-DAG: sw [[G30:\$fp]], [[OFF30:[0-9]+]]($sp)
+; O32-DAG: sw [[G31:\$fp]], [[OFF31:[0-9]+]]($sp)
+; O32-DAG: lw [[G16]], [[OFF16]]($sp)
+; O32-DAG: lw [[G17]], [[OFF17]]($sp)
+; O32-DAG: lw [[G18]], [[OFF18]]($sp)
+; O32-DAG: lw [[G19]], [[OFF19]]($sp)
+; O32-DAG: lw [[G20]], [[OFF20]]($sp)
+; O32-DAG: lw [[G21]], [[OFF21]]($sp)
+; O32-DAG: lw [[G22]], [[OFF22]]($sp)
+; O32-DAG: lw [[G23]], [[OFF23]]($sp)
+; O32-DAG: lw [[G30]], [[OFF30]]($sp)
+; O32-DAG: lw [[G31]], [[OFF31]]($sp)
+; O32: addiu $sp, $sp, 40
+
+; N32: addiu $sp, $sp, -96
+; N32-INV-NOT: sd $0,
+; N32-INV-NOT: sd $1,
+; N32-INV-NOT: sd $2,
+; N32-INV-NOT: sd $3,
+; N32-INV-NOT: sd $4,
+; N32-INV-NOT: sd $5,
+; N32-INV-NOT: sd $6,
+; N32-INV-NOT: sd $7,
+; N32-INV-NOT: sd $8,
+; N32-INV-NOT: sd $9,
+; N32-INV-NOT: sd $10,
+; N32-INV-NOT: sd $11,
+; N32-INV-NOT: sd $12,
+; N32-INV-NOT: sd $13,
+; N32-INV-NOT: sd $14,
+; N32-INV-NOT: sd $15,
+; N32-DAG: sd [[G16:\$16]], [[OFF16:[0-9]+]]($sp)
+; N32-DAG: sd [[G17:\$17]], [[OFF17:[0-9]+]]($sp)
+; N32-DAG: sd [[G18:\$18]], [[OFF18:[0-9]+]]($sp)
+; N32-DAG: sd [[G19:\$19]], [[OFF19:[0-9]+]]($sp)
+; N32-DAG: sd [[G20:\$20]], [[OFF20:[0-9]+]]($sp)
+; N32-DAG: sd [[G21:\$21]], [[OFF21:[0-9]+]]($sp)
+; N32-DAG: sd [[G22:\$22]], [[OFF22:[0-9]+]]($sp)
+; N32-DAG: sd [[G23:\$23]], [[OFF23:[0-9]+]]($sp)
+; N32-INV-NOT: sd $24,
+; N32-INV-NOT: sd $25,
+; N32-INV-NOT: sd $26,
+; N32-INV-NOT: sd $27,
+; N32-DAG: sd [[G28:\$gp]], [[OFF28:[0-9]+]]($sp)
+; N32-INV-NOT: sd $29,
+; N32-DAG: sd [[G30:\$fp]], [[OFF30:[0-9]+]]($sp)
+; N32-DAG: sd [[G31:\$ra]], [[OFF31:[0-9]+]]($sp)
+; N32-DAG: ld [[G16]], [[OFF16]]($sp)
+; N32-DAG: ld [[G17]], [[OFF17]]($sp)
+; N32-DAG: ld [[G18]], [[OFF18]]($sp)
+; N32-DAG: ld [[G19]], [[OFF19]]($sp)
+; N32-DAG: ld [[G20]], [[OFF20]]($sp)
+; N32-DAG: ld [[G21]], [[OFF21]]($sp)
+; N32-DAG: ld [[G22]], [[OFF22]]($sp)
+; N32-DAG: ld [[G23]], [[OFF23]]($sp)
+; N32-DAG: ld [[G28]], [[OFF28]]($sp)
+; N32-DAG: ld [[G30]], [[OFF30]]($sp)
+; N32-DAG: ld [[G31]], [[OFF31]]($sp)
+; N32: addiu $sp, $sp, 96
+
+; N64: daddiu $sp, $sp, -96
+; N64-INV-NOT: sd $0,
+; N64-INV-NOT: sd $1,
+; N64-INV-NOT: sd $2,
+; N64-INV-NOT: sd $3,
+; N64-INV-NOT: sd $4,
+; N64-INV-NOT: sd $5,
+; N64-INV-NOT: sd $6,
+; N64-INV-NOT: sd $7,
+; N64-INV-NOT: sd $8,
+; N64-INV-NOT: sd $9,
+; N64-INV-NOT: sd $10,
+; N64-INV-NOT: sd $11,
+; N64-INV-NOT: sd $12,
+; N64-INV-NOT: sd $13,
+; N64-INV-NOT: sd $14,
+; N64-INV-NOT: sd $15,
+; N64-DAG: sd [[G16:\$16]], [[OFF16:[0-9]+]]($sp)
+; N64-DAG: sd [[G17:\$17]], [[OFF17:[0-9]+]]($sp)
+; N64-DAG: sd [[G18:\$18]], [[OFF18:[0-9]+]]($sp)
+; N64-DAG: sd [[G19:\$19]], [[OFF19:[0-9]+]]($sp)
+; N64-DAG: sd [[G20:\$20]], [[OFF20:[0-9]+]]($sp)
+; N64-DAG: sd [[G21:\$21]], [[OFF21:[0-9]+]]($sp)
+; N64-DAG: sd [[G22:\$22]], [[OFF22:[0-9]+]]($sp)
+; N64-DAG: sd [[G23:\$23]], [[OFF23:[0-9]+]]($sp)
+; N64-DAG: sd [[G30:\$fp]], [[OFF30:[0-9]+]]($sp)
+; N64-DAG: sd [[G31:\$ra]], [[OFF31:[0-9]+]]($sp)
+; N64-INV-NOT: sd $24,
+; N64-INV-NOT: sd $25,
+; N64-INV-NOT: sd $26,
+; N64-INV-NOT: sd $27,
+; N64-DAG: sd [[G28:\$gp]], [[OFF28:[0-9]+]]($sp)
+; N64-INV-NOT: sd $29,
+; N64-DAG: ld [[G16]], [[OFF16]]($sp)
+; N64-DAG: ld [[G17]], [[OFF17]]($sp)
+; N64-DAG: ld [[G18]], [[OFF18]]($sp)
+; N64-DAG: ld [[G19]], [[OFF19]]($sp)
+; N64-DAG: ld [[G20]], [[OFF20]]($sp)
+; N64-DAG: ld [[G21]], [[OFF21]]($sp)
+; N64-DAG: ld [[G22]], [[OFF22]]($sp)
+; N64-DAG: ld [[G23]], [[OFF23]]($sp)
+; N64-DAG: ld [[G28]], [[OFF28]]($sp)
+; N64-DAG: ld [[G30]], [[OFF30]]($sp)
+; N64-DAG: ld [[G31]], [[OFF31]]($sp)
+; N64: daddiu $sp, $sp, 96
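As a rough cross-check on the addiu/daddiu immediates checked above (assuming the usual 8-byte O32 and 16-byte N32/N64 stack alignment): O32 spills ten 4-byte GPRs ($16-$23, $fp, $ra), giving the 40-byte frame, while N32 and N64 spill eleven 8-byte GPRs ($16-$23, $gp, $fp, $ra), giving 88 bytes rounded up to the 96-byte frame.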
diff --git a/test/CodeGen/Mips/cconv/memory-layout.ll b/test/CodeGen/Mips/cconv/memory-layout.ll
new file mode 100644
index 0000000..0c3cc9e
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/memory-layout.ll
@@ -0,0 +1,140 @@
+; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test the memory layout for all ABIs and byte orders as specified by section
+; 4 of MD00305 (MIPS ABIs Described).
+; Bitfields are not covered since they are not available as a type in LLVM IR.
+;
+; The assembly directives deal with endianness so we don't need to account for
+; that.
+
+; Deliberately request alignments that are too small for the target so we get
+; the minimum alignment instead of the preferred alignment.
+@byte = global i8 1, align 1
+@halfword = global i16 258, align 1
+@word = global i32 16909060, align 1
+@float = global float 1.0, align 1
+@dword = global i64 283686952306183, align 1
+@double = global double 1.0, align 1
+@pointer = global i8* @byte
+
+; ALL-NOT: .align
+; ALL-LABEL: byte:
+; ALL: .byte 1
+; ALL: .size byte, 1
+
+; ALL: .align 1
+; ALL-LABEL: halfword:
+; ALL: .2byte 258
+; ALL: .size halfword, 2
+
+; ALL: .align 2
+; ALL-LABEL: word:
+; ALL: .4byte 16909060
+; ALL: .size word, 4
+
+; ALL: .align 2
+; ALL-LABEL: float:
+; ALL: .4byte 1065353216
+; ALL: .size float, 4
+
+; ALL: .align 3
+; ALL-LABEL: dword:
+; ALL: .8byte 283686952306183
+; ALL: .size dword, 8
+
+; ALL: .align 3
+; ALL-LABEL: double:
+; ALL: .8byte 4607182418800017408
+; ALL: .size double, 8
+
+; O32: .align 2
+; N32: .align 2
+; N64: .align 3
+; ALL-LABEL: pointer:
+; O32: .4byte byte
+; O32: .size pointer, 4
+; N32: .4byte byte
+; N32: .size pointer, 4
+; N64: .8byte byte
+; N64: .size pointer, 8
+
+@byte_array = global [2 x i8] [i8 1, i8 2], align 1
+@halfword_array = global [2 x i16] [i16 1, i16 2], align 1
+@word_array = global [2 x i32] [i32 1, i32 2], align 1
+@float_array = global [2 x float] [float 1.0, float 2.0], align 1
+@dword_array = global [2 x i64] [i64 1, i64 2], align 1
+@double_array = global [2 x double] [double 1.0, double 2.0], align 1
+@pointer_array = global [2 x i8*] [i8* @byte, i8* @byte]
+
+; ALL-NOT: .align
+; ALL-LABEL: byte_array:
+; ALL: .ascii "\001\002"
+; ALL: .size byte_array, 2
+
+; ALL: .align 1
+; ALL-LABEL: halfword_array:
+; ALL: .2byte 1
+; ALL: .2byte 2
+; ALL: .size halfword_array, 4
+
+; ALL: .align 2
+; ALL-LABEL: word_array:
+; ALL: .4byte 1
+; ALL: .4byte 2
+; ALL: .size word_array, 8
+
+; ALL: .align 2
+; ALL-LABEL: float_array:
+; ALL: .4byte 1065353216
+; ALL: .4byte 1073741824
+; ALL: .size float_array, 8
+
+; ALL: .align 3
+; ALL-LABEL: dword_array:
+; ALL: .8byte 1
+; ALL: .8byte 2
+; ALL: .size dword_array, 16
+
+; ALL: .align 3
+; ALL-LABEL: double_array:
+; ALL: .8byte 4607182418800017408
+; ALL: .8byte 4611686018427387904
+; ALL: .size double_array, 16
+
+; O32: .align 2
+; N32: .align 2
+; N64: .align 3
+; ALL-LABEL: pointer_array:
+; O32: .4byte byte
+; O32: .4byte byte
+; O32: .size pointer_array, 8
+; N32: .4byte byte
+; N32: .4byte byte
+; N32: .size pointer_array, 8
+; N64: .8byte byte
+; N64: .8byte byte
+; N64: .size pointer_array, 16
+
+%mixed = type { i8, double, i16 }
+@mixed = global %mixed { i8 1, double 1.0, i16 515 }, align 1
+
+; ALL: .align 3
+; ALL-LABEL: mixed:
+; ALL: .byte 1
+; ALL: .space 7
+; ALL: .8byte 4607182418800017408
+; ALL: .2byte 515
+; ALL: .space 6
+; ALL: .size mixed, 24
+
+; Bitfields are not available in LLVM IR so we can't test them here.
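For readers mapping the %mixed checks back to source-level layout, a hypothetical C mirror of that struct (an illustration only; the field names are invented) makes the padding arithmetic behind the .space and .size directives explicit. Note that the MIPS assembler's .align argument is a power-of-two exponent, so .align 3 requests 8-byte alignment.

/* Hypothetical C counterpart of %mixed; offsets assume the natural
 * alignment rules the test checks (double aligned to 8 bytes). */
struct mixed {
    char   c;  /* offset 0, then 7 bytes of padding   -> .byte 1, .space 7    */
    double d;  /* offset 8                            -> .8byte ...           */
    short  s;  /* offset 16, 6 bytes of tail padding  -> .2byte 515, .space 6 */
};             /* sizeof(struct mixed) == 24          -> .size mixed, 24      */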
diff --git a/test/CodeGen/Mips/cconv/reserved-space.ll b/test/CodeGen/Mips/cconv/reserved-space.ll
new file mode 100644
index 0000000..b36f89e
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/reserved-space.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test that O32 correctly reserves space for the four arguments, even when
+; there aren't any, as per section 5 of MD00305 (MIPS ABIs Described).
+
+declare void @foo() nounwind;
+
+define void @reserved_space() nounwind {
+entry:
+ tail call void @foo()
+ ret void
+}
+
+; ALL-LABEL: reserved_space:
+; O32: addiu $sp, $sp, -24
+; O32: sw $ra, 20($sp)
+; O32: lw $ra, 20($sp)
+; O32: addiu $sp, $sp, 24
+; Despite pointers being 32-bit wide on N32, the return address is saved as a
+; 64-bit value. I've yet to find a documentation reference for this quirk, but
+; the behaviour matches GCC so I have considered it to be correct.
+; N32: addiu $sp, $sp, -16
+; N32: sd $ra, 8($sp)
+; N32: ld $ra, 8($sp)
+; N32: addiu $sp, $sp, 16
+; N64: daddiu $sp, $sp, -16
+; N64: sd $ra, 8($sp)
+; N64: ld $ra, 8($sp)
+; N64: daddiu $sp, $sp, 16
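As a rough cross-check on the immediates above: O32 reserves 16 bytes of argument save area for $4-$7 plus 4 bytes for $ra, giving 20 bytes rounded up to a 24-byte frame for 8-byte stack alignment (hence $ra at 20($sp)); N32 and N64 have no argument save area, so only the 8 bytes for $ra are needed, rounded up to a 16-byte frame for 16-byte alignment (hence $ra at 8($sp)).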
diff --git a/test/CodeGen/Mips/cconv/return-float.ll b/test/CodeGen/Mips/cconv/return-float.ll
new file mode 100644
index 0000000..28cf83d
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/return-float.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=mips-linux-gnu -soft-float -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -soft-float -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -soft-float -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test the float returns for all ABIs and byte orders as specified by
+; section 5 of MD00305 (MIPS ABIs Described).
+
+; We only test Linux because other OSes use different relocations and I don't
+; know if this is correct.
+
+@float = global float zeroinitializer
+@double = global double zeroinitializer
+
+define float @retfloat() nounwind {
+entry:
+ %0 = load volatile float* @float
+ ret float %0
+}
+
+; ALL-LABEL: retfloat:
+; O32-DAG: lui [[R1:\$[0-9]+]], %hi(float)
+; O32-DAG: lw $2, %lo(float)([[R1]])
+; N32-DAG: lui [[R1:\$[0-9]+]], %hi(float)
+; N32-DAG: lw $2, %lo(float)([[R1]])
+; N64-DAG: ld [[R1:\$[0-9]+]], %got_disp(float)($1)
+; N64-DAG: lw $2, 0([[R1]])
+
+define double @retdouble() nounwind {
+entry:
+ %0 = load volatile double* @double
+ ret double %0
+}
+
+; ALL-LABEL: retdouble:
+; O32-DAG: lw $2, %lo(double)([[R1:\$[0-9]+]])
+; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], %lo(double)
+; O32-DAG: lw $3, 4([[R2]])
+; N32-DAG: ld $2, %lo(double)([[R1:\$[0-9]+]])
+; N64-DAG: ld [[R1:\$[0-9]+]], %got_disp(double)($1)
+; N64-DAG: ld $2, 0([[R1]])
diff --git a/test/CodeGen/Mips/cconv/return-hard-float.ll b/test/CodeGen/Mips/cconv/return-hard-float.ll
new file mode 100644
index 0000000..371b3a5
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/return-hard-float.ll
@@ -0,0 +1,46 @@
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test the float returns for all ABIs and byte orders as specified by
+; section 5 of MD00305 (MIPS ABIs Described).
+
+; We only test Linux because other OSes use different relocations and I don't
+; know if this is correct.
+
+@float = global float zeroinitializer
+@double = global double zeroinitializer
+
+define float @retfloat() nounwind {
+entry:
+ %0 = load volatile float* @float
+ ret float %0
+}
+
+; ALL-LABEL: retfloat:
+; O32-DAG: lui [[R1:\$[0-9]+]], %hi(float)
+; O32-DAG: lwc1 $f0, %lo(float)([[R1]])
+; N32-DAG: lui [[R1:\$[0-9]+]], %hi(float)
+; N32-DAG: lwc1 $f0, %lo(float)([[R1]])
+; N64-DAG: ld [[R1:\$[0-9]+]], %got_disp(float)($1)
+; N64-DAG: lwc1 $f0, 0([[R1]])
+
+define double @retdouble() nounwind {
+entry:
+ %0 = load volatile double* @double
+ ret double %0
+}
+
+; ALL-LABEL: retdouble:
+; O32-DAG: ldc1 $f0, %lo(double)([[R1:\$[0-9]+]])
+; N32-DAG: ldc1 $f0, %lo(double)([[R1:\$[0-9]+]])
+; N64-DAG: ld [[R1:\$[0-9]+]], %got_disp(double)($1)
+; N64-DAG: ldc1 $f0, 0([[R1]])
diff --git a/test/CodeGen/Mips/cconv/return-hard-fp128.ll b/test/CodeGen/Mips/cconv/return-hard-fp128.ll
new file mode 100644
index 0000000..0da59ef
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/return-hard-fp128.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test the fp128 returns for N32/N64 and all byte orders as specified by
+; section 5 of MD00305 (MIPS ABIs Described).
+;
+; O32 is not tested because long double is the same as double on O32.
+;
+@fp128 = global fp128 zeroinitializer
+
+define fp128 @retldouble() nounwind {
+entry:
+ %0 = load volatile fp128* @fp128
+ ret fp128 %0
+}
+
+; ALL-LABEL: retldouble:
+; N32-DAG: ld [[R2:\$[0-9]+]], %lo(fp128)([[R1:\$[0-9]+]])
+; N32-DAG: addiu [[R3:\$[0-9]+]], [[R1]], %lo(fp128)
+; N32-DAG: ld [[R4:\$[0-9]+]], 8([[R3]])
+; N32-DAG: dmtc1 [[R2]], $f0
+; N32-DAG: dmtc1 [[R4]], $f2
+
+; N64-DAG: ld [[R2:\$[0-9]+]], %got_disp(fp128)([[R1:\$[0-9]+]])
+; N64-DAG: ld [[R3:\$[0-9]+]], 0([[R2]])
+; N64-DAG: ld [[R4:\$[0-9]+]], 8([[R2]])
+; N64-DAG: dmtc1 [[R3]], $f0
+; N64-DAG: dmtc1 [[R4]], $f2
diff --git a/test/CodeGen/Mips/cconv/return.ll b/test/CodeGen/Mips/cconv/return.ll
new file mode 100644
index 0000000..76ce5e4
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/return.ll
@@ -0,0 +1,66 @@
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test the integer returns for all ABIs and byte orders as specified by
+; section 5 of MD00305 (MIPS ABIs Described).
+
+; We only test Linux because other OSes use different relocations and I don't
+; know if this is correct.
+
+@byte = global i8 zeroinitializer
+@word = global i32 zeroinitializer
+@dword = global i64 zeroinitializer
+@float = global float zeroinitializer
+@double = global double zeroinitializer
+
+define i8 @reti8() nounwind {
+entry:
+ %0 = load volatile i8* @byte
+ ret i8 %0
+}
+
+; ALL-LABEL: reti8:
+; O32-DAG: lui [[R1:\$[0-9]+]], %hi(byte)
+; O32-DAG: lbu $2, %lo(byte)([[R1]])
+; N32-DAG: lui [[R1:\$[0-9]+]], %hi(byte)
+; N32-DAG: lbu $2, %lo(byte)([[R1]])
+; N64-DAG: ld [[R1:\$[0-9]+]], %got_disp(byte)($1)
+; N64-DAG: lbu $2, 0([[R1]])
+
+define i32 @reti32() nounwind {
+entry:
+ %0 = load volatile i32* @word
+ ret i32 %0
+}
+
+; ALL-LABEL: reti32:
+; O32-DAG: lui [[R1:\$[0-9]+]], %hi(word)
+; O32-DAG: lw $2, %lo(word)([[R1]])
+; N32-DAG: lui [[R1:\$[0-9]+]], %hi(word)
+; N32-DAG: lw $2, %lo(word)([[R1]])
+; N64-DAG: ld [[R1:\$[0-9]+]], %got_disp(word)($1)
+; N64-DAG: lw $2, 0([[R1]])
+
+define i64 @reti64() nounwind {
+entry:
+ %0 = load volatile i64* @dword
+ ret i64 %0
+}
+
+; ALL-LABEL: reti64:
+; On O32, we must use v0 and v1 for the return value
+; O32-DAG: lw $2, %lo(dword)([[R1:\$[0-9]+]])
+; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], %lo(dword)
+; O32-DAG: lw $3, 4([[R2]])
+; N32-DAG: ld $2, %lo(dword)([[R1:\$[0-9]+]])
+; N64-DAG: ld [[R1:\$[0-9]+]], %got_disp(dword)([[R1:\$[0-9]+]])
+; N64-DAG: ld $2, 0([[R1]])
diff --git a/test/CodeGen/Mips/cconv/stack-alignment.ll b/test/CodeGen/Mips/cconv/stack-alignment.ll
new file mode 100644
index 0000000..834033b
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/stack-alignment.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN-TODO: llc -march=mips64 -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -march=mips64 -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64el -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test the stack alignment for all ABIs and byte orders as specified by
+; section 5 of MD00305 (MIPS ABIs Described).
+
+define void @local_bytes_1() nounwind {
+entry:
+ %0 = alloca i8
+ ret void
+}
+
+; ALL-LABEL: local_bytes_1:
+; O32: addiu $sp, $sp, -8
+; O32: addiu $sp, $sp, 8
+; N32: addiu $sp, $sp, -16
+; N32: addiu $sp, $sp, 16
+; N64: addiu $sp, $sp, -16
+; N64: addiu $sp, $sp, 16
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index f2009fa..b9732eb 100644
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -1,5 +1,6 @@
; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=O32
; RUN: llc -march=mips -regalloc=basic < %s | FileCheck %s -check-prefix=O32
+; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck %s -check-prefix=N64
; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=N64
@i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4
@@ -238,4 +239,4 @@ define i32 @slti6(i32 %a) nounwind readnone {
; O32-DAG: xori [[R1]], [[R1]], 1
; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
; O32-NOT: movn
-; O32:.size slti6
\ No newline at end of file
+; O32:.size slti6
diff --git a/test/CodeGen/Mips/eh-dwarf-cfa.ll b/test/CodeGen/Mips/eh-dwarf-cfa.ll
index 3a21332..6554974 100644
--- a/test/CodeGen/Mips/eh-dwarf-cfa.ll
+++ b/test/CodeGen/Mips/eh-dwarf-cfa.ll
@@ -1,4 +1,6 @@
; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-MIPS64
; RUN: llc -march=mips64el -mcpu=mips64 < %s | \
; RUN: FileCheck %s -check-prefix=CHECK-MIPS64
diff --git a/test/CodeGen/Mips/eh-return64.ll b/test/CodeGen/Mips/eh-return64.ll
index 32fc5e6..8c5af50 100644
--- a/test/CodeGen/Mips/eh-return64.ll
+++ b/test/CodeGen/Mips/eh-return64.ll
@@ -1,3 +1,4 @@
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s
; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
declare void @llvm.eh.return.i64(i64, i8*)
diff --git a/test/CodeGen/Mips/elf_eflags.ll b/test/CodeGen/Mips/elf_eflags.ll
index 336ed7b..00d8584 100644
--- a/test/CodeGen/Mips/elf_eflags.ll
+++ b/test/CodeGen/Mips/elf_eflags.ll
@@ -23,6 +23,9 @@
; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MICROMIPS %s
; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MICROMIPS_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips4 %s -o - | FileCheck -check-prefix=CHECK-LE64_PIC %s
+
; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64 %s
; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 %s -o - | FileCheck -check-prefix=CHECK-LE64_PIC %s
; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64R2 %s
diff --git a/test/CodeGen/Mips/elf_st_other.ll b/test/CodeGen/Mips/elf_st_other.ll
deleted file mode 100644
index 8a5f20d..0000000
--- a/test/CodeGen/Mips/elf_st_other.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; This tests value of ELF st_other field for function symbol table entries.
-; For microMIPS value should be equal to STO_MIPS_MICROMIPS.
-
-; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | FileCheck %s
-
-define i32 @main() nounwind {
-entry:
- ret i32 0
-}
-
-; CHECK: .set micromips
-; CHECK: main:
diff --git a/test/CodeGen/Mips/fabs.ll b/test/CodeGen/Mips/fabs.ll
index 49d8a72..ce1a9a6 100644
--- a/test/CodeGen/Mips/fabs.ll
+++ b/test/CodeGen/Mips/fabs.ll
@@ -1,21 +1,23 @@
-; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32 | FileCheck %s -check-prefix=32
-; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2
-; RUN: llc < %s -mtriple=mips64el-linux-gnu -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=64
-; RUN: llc < %s -mtriple=mips64el-linux-gnu -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2
-; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s -check-prefix=NO-NAN
+; Check that abs.[ds] is selected and does not depend on -enable-no-nans-fp-math.
+; These instructions obey the Has2008 and ABS2008 configuration bits, which
+; govern conformance to IEEE 754 (1985) and IEEE 754 (2008). When these bits are
+; not present, they conform to 1985.
+; In 1985 mode, abs.[ds] are arithmetic (i.e. they raise invalid operation
+; exceptions when given NaNs). In 2008 mode, they are non-arithmetic (i.e.
+; they are copies and don't raise any exceptions).
-define float @foo0(float %a) nounwind readnone {
-entry:
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32 | FileCheck %s
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32r2 | FileCheck %s
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s
-; 32: lui $[[T0:[0-9]+]], 32767
-; 32: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
-; 32: and $[[AND:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 32: mtc1 $[[AND]], $f0
+; RUN: llc < %s -mtriple=mips64el-linux-gnu -mcpu=mips64 | FileCheck %s
+; RUN: llc < %s -mtriple=mips64el-linux-gnu -mcpu=mips64 -enable-no-nans-fp-math | FileCheck %s
-; 32R2: ins $[[INS:[0-9]+]], $zero, 31, 1
-; 32R2: mtc1 $[[INS]], $f0
+define float @foo0(float %a) nounwind readnone {
+entry:
-; NO-NAN: abs.s
+; CHECK-LABEL: foo0
+; CHECK: abs.s
%call = tail call float @fabsf(float %a) nounwind readnone
ret float %call
@@ -26,24 +28,8 @@ declare float @fabsf(float) nounwind readnone
define double @foo1(double %a) nounwind readnone {
entry:
-; 32: lui $[[T0:[0-9]+]], 32767
-; 32: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
-; 32: and $[[AND:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 32: mtc1 $[[AND]], $f1
-
-; 32R2: ins $[[INS:[0-9]+]], $zero, 31, 1
-; 32R2: mtc1 $[[INS]], $f1
-
-; 64: daddiu $[[T0:[0-9]+]], $zero, 1
-; 64: dsll $[[T1:[0-9]+]], ${{[0-9]+}}, 63
-; 64: daddiu $[[MSK0:[0-9]+]], $[[T1]], -1
-; 64: and $[[AND:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 64: dmtc1 $[[AND]], $f0
-
-; 64R2: dins $[[INS:[0-9]+]], $zero, 63, 1
-; 64R2: dmtc1 $[[INS]], $f0
-
-; NO-NAN: abs.d
+; CHECK-LABEL: foo1:
+; CHECK: abs.d
%call = tail call double @fabs(double %a) nounwind readnone
ret double %call
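At the source level the two functions exercised here are plain fabs calls; a hypothetical C equivalent (an illustration, not taken from the patch) for reading the abs.s/abs.d checks:

#include <math.h>

/* Both are expected to select abs.[ds] regardless of -enable-no-nans-fp-math,
 * per the comment at the top of the test. */
float  foo0(float a)  { return fabsf(a); }  /* abs.s */
double foo1(double a) { return fabs(a);  }  /* abs.d */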
diff --git a/test/CodeGen/Mips/fcopysign-f32-f64.ll b/test/CodeGen/Mips/fcopysign-f32-f64.ll
index 9f88d0c..148a780 100644
--- a/test/CodeGen/Mips/fcopysign-f32-f64.ll
+++ b/test/CodeGen/Mips/fcopysign-f32-f64.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s -check-prefix=64
; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=64
; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2
diff --git a/test/CodeGen/Mips/fcopysign.ll b/test/CodeGen/Mips/fcopysign.ll
index 1c57eca..44c4117 100644
--- a/test/CodeGen/Mips/fcopysign.ll
+++ b/test/CodeGen/Mips/fcopysign.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=32
; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s -check-prefix=64
; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=64
; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2
diff --git a/test/CodeGen/Mips/fmadd1.ll b/test/CodeGen/Mips/fmadd1.ll
index 6768ed6..a9a8e21 100644
--- a/test/CodeGen/Mips/fmadd1.ll
+++ b/test/CodeGen/Mips/fmadd1.ll
@@ -1,3 +1,10 @@
+; Check that madd.[ds], msub.[ds], nmadd.[ds], and nmsub.[ds] are supported
+; correctly.
+; The spec for nmadd.[ds] and nmsub.[ds] does not state that they obey the
+; Has2008 and ABS2008 configuration bits which govern conformance to
+; IEEE 754 (1985) and IEEE 754 (2008). These instructions are therefore only
+; available when -enable-no-nans-fp-math is given.
+
; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefix=32R2 -check-prefix=CHECK
; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=64R2 -check-prefix=CHECK
; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2NAN -check-prefix=CHECK
@@ -5,6 +12,7 @@
define float @FOO0float(float %a, float %b, float %c) nounwind readnone {
entry:
+; CHECK-LABEL: FOO0float:
; CHECK: madd.s
%mul = fmul float %a, %b
%add = fadd float %mul, %c
@@ -14,6 +22,7 @@ entry:
define float @FOO1float(float %a, float %b, float %c) nounwind readnone {
entry:
+; CHECK-LABEL: FOO1float:
; CHECK: msub.s
%mul = fmul float %a, %b
%sub = fsub float %mul, %c
@@ -23,6 +32,7 @@ entry:
define float @FOO2float(float %a, float %b, float %c) nounwind readnone {
entry:
+; CHECK-LABEL: FOO2float:
; 32R2: nmadd.s
; 64R2: nmadd.s
; 32R2NAN: madd.s
@@ -35,6 +45,7 @@ entry:
define float @FOO3float(float %a, float %b, float %c) nounwind readnone {
entry:
+; CHECK-LABEL: FOO3float:
; 32R2: nmsub.s
; 64R2: nmsub.s
; 32R2NAN: msub.s
@@ -47,6 +58,7 @@ entry:
define double @FOO10double(double %a, double %b, double %c) nounwind readnone {
entry:
+; CHECK-LABEL: FOO10double:
; CHECK: madd.d
%mul = fmul double %a, %b
%add = fadd double %mul, %c
@@ -56,6 +68,7 @@ entry:
define double @FOO11double(double %a, double %b, double %c) nounwind readnone {
entry:
+; CHECK-LABEL: FOO11double:
; CHECK: msub.d
%mul = fmul double %a, %b
%sub = fsub double %mul, %c
@@ -65,6 +78,7 @@ entry:
define double @FOO12double(double %a, double %b, double %c) nounwind readnone {
entry:
+; CHECK-LABEL: FOO12double:
; 32R2: nmadd.d
; 64R2: nmadd.d
; 32R2NAN: madd.d
@@ -77,6 +91,7 @@ entry:
define double @FOO13double(double %a, double %b, double %c) nounwind readnone {
entry:
+; CHECK-LABEL: FOO13double:
; 32R2: nmsub.d
; 64R2: nmsub.d
; 32R2NAN: msub.d
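The IR patterns these prefixes check come from ordinary multiply-add expressions; a hypothetical C rendering of the four float cases (the double cases have the same shape):

/* Without -enable-no-nans-fp-math only madd.s/msub.s are selected (the
 * *NAN prefixes); with it, the negated forms may become nmadd.s/nmsub.s. */
float FOO0float(float a, float b, float c) { return  (a * b) + c;   }  /* madd.s  */
float FOO1float(float a, float b, float c) { return  (a * b) - c;   }  /* msub.s  */
float FOO2float(float a, float b, float c) { return -((a * b) + c); }  /* nmadd.s */
float FOO3float(float a, float b, float c) { return -((a * b) - c); }  /* nmsub.s */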
diff --git a/test/CodeGen/Mips/fneg.ll b/test/CodeGen/Mips/fneg.ll
index b322abd..4fb80fd 100644
--- a/test/CodeGen/Mips/fneg.ll
+++ b/test/CodeGen/Mips/fneg.ll
@@ -1,17 +1,30 @@
-; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s
+; Check that neg.[ds] is selected and does not depend on -enable-no-nans-fp-math.
+; These instructions obey the Has2008 and ABS2008 configuration bits, which
+; govern conformance to IEEE 754 (1985) and IEEE 754 (2008). When these bits are
+; not present, they conform to 1985.
+; In 1985 mode, neg.[ds] are arithmetic (i.e. they raise invalid operation
+; exceptions when given NaNs). In 2008 mode, they are non-arithmetic (i.e.
+; they only change the sign bit and don't raise any exceptions).
-define float @foo0(i32 %a, float %d) nounwind readnone {
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32 | FileCheck %s
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32r2 | FileCheck %s
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s
+
+; RUN: llc < %s -mtriple=mips64el-linux-gnu -mcpu=mips64 | FileCheck %s
+; RUN: llc < %s -mtriple=mips64el-linux-gnu -mcpu=mips64 -enable-no-nans-fp-math | FileCheck %s
+
+define float @foo0(float %d) nounwind readnone {
entry:
-; CHECK-NOT: neg.s
+; CHECK-LABEL: foo0:
+; CHECK: neg.s
%sub = fsub float -0.000000e+00, %d
ret float %sub
}
-define double @foo1(i32 %a, double %d) nounwind readnone {
+define double @foo1(double %d) nounwind readnone {
entry:
-; CHECK: foo1
-; CHECK-NOT: neg.d
-; CHECK: jr
+; CHECK-LABEL: foo1:
+; CHECK: neg.d
%sub = fsub double -0.000000e+00, %d
ret double %sub
}
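As with fabs.ll above, a hypothetical C equivalent of the two functions, for reading the neg.s/neg.d checks (the IR writes the negation as fsub -0.0, %d):

float  foo0(float d)  { return -d; }  /* neg.s */
double foo1(double d) { return -d; }  /* neg.d */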
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll
index f9e53cb..c09108d 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll
@@ -9,7 +9,7 @@ define i32 @main() nounwind {
entry:
;CHECK-ERRORS: error: invalid operand for inline asm constraint 'I'
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i32 7, i32 1048576) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,I"(i32 7, i32 1048576) nounwind
ret i32 0
}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll
index 1fdf672..2b24b0f 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll
@@ -10,7 +10,7 @@ entry:
;CHECK-ERRORS: error: invalid operand for inline asm constraint 'J'
- tail call i32 asm "addi $0,$1,$2", "=r,r,J"(i32 1024, i32 3) nounwind
+ tail call i32 asm "addiu $0,$1,$2", "=r,r,J"(i32 1024, i32 3) nounwind
ret i32 0
}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll
index 49dcc87..5edb3e2 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll
@@ -10,7 +10,7 @@ entry:
;CHECK-ERRORS: error: invalid operand for inline asm constraint 'L'
- tail call i32 asm "addi $0,$1,$2", "=r,r,L"(i32 7, i32 1048579) nounwind
+ tail call i32 asm "addiu $0,$1,$2", "=r,r,L"(i32 7, i32 1048579) nounwind
ret i32 0
}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll
index 770669d..eaa540a 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll
@@ -11,7 +11,7 @@ entry:
;CHECK-ERRORS: error: invalid operand for inline asm constraint 'N'
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,N"(i32 7, i32 3) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,N"(i32 7, i32 3) nounwind
ret i32 0
}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll
index cd4431a..56afbaa 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll
@@ -11,6 +11,6 @@ entry:
;CHECK-ERRORS: error: invalid operand for inline asm constraint 'O'
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,O"(i32 undef, i32 16384) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,O"(i32 undef, i32 16384) nounwind
ret i32 0
}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll
index 0a4739e..0a55cb5 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll
@@ -11,6 +11,6 @@ entry:
;CHECK-ERRORS: error: invalid operand for inline asm constraint 'P'
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,P"(i32 undef, i32 655536) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,P"(i32 undef, i32 655536) nounwind
ret i32 0
}
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
index 94ded30..9464918 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
@@ -7,27 +7,27 @@ entry:
; r with char
;CHECK: #APP
-;CHECK: addi ${{[0-9]+}},${{[0-9]+}},23
+;CHECK: addiu ${{[0-9]+}},${{[0-9]+}},23
;CHECK: #NO_APP
- tail call i8 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i8 27, i8 23) nounwind
+ tail call i8 asm sideeffect "addiu $0,$1,$2", "=r,r,n"(i8 27, i8 23) nounwind
; r with short
;CHECK: #APP
-;CHECK: addi ${{[0-9]+}},${{[0-9]+}},13
+;CHECK: addiu ${{[0-9]+}},${{[0-9]+}},13
;CHECK: #NO_APP
- tail call i16 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i16 17, i16 13) nounwind
+ tail call i16 asm sideeffect "addiu $0,$1,$2", "=r,r,n"(i16 17, i16 13) nounwind
; r with int
;CHECK: #APP
-;CHECK: addi ${{[0-9]+}},${{[0-9]+}},3
+;CHECK: addiu ${{[0-9]+}},${{[0-9]+}},3
;CHECK: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i32 7, i32 3) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,n"(i32 7, i32 3) nounwind
; Now c with 1024: make sure register $25 is picked
; CHECK: #APP
-; CHECK: addi $25,${{[0-9]+}},1024
+; CHECK: addiu $25,${{[0-9]+}},1024
; CHECK: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,$2", "=c,c,I"(i32 4194304, i32 1024) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=c,c,I"(i32 4194304, i32 1024) nounwind
; Now l with 1024: make sure register lo is picked. We do this by checking the instruction
; after the inline expression for a mflo to pull the value out of lo.
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll
index 7870666..a7ba762 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll
@@ -12,9 +12,9 @@ entry:
; r with long long
;CHECK: #APP
-;CHECK: addi ${{[0-9]+}},${{[0-9]+}},3
+;CHECK: addiu ${{[0-9]+}},${{[0-9]+}},3
;CHECK: #NO_APP
- tail call i64 asm sideeffect "addi $0,$1,$2", "=r,r,i"(i64 7, i64 3) nounwind
+ tail call i64 asm sideeffect "addiu $0,$1,$2", "=r,r,i"(i64 7, i64 3) nounwind
ret i32 0
}
diff --git a/test/CodeGen/Mips/inlineasm-operand-code.ll b/test/CodeGen/Mips/inlineasm-operand-code.ll
index 7bb4adc..6512851 100644
--- a/test/CodeGen/Mips/inlineasm-operand-code.ll
+++ b/test/CodeGen/Mips/inlineasm-operand-code.ll
@@ -12,9 +12,9 @@ define i32 @constraint_X() nounwind {
entry:
;CHECK_LITTLE_32-LABEL: constraint_X:
;CHECK_LITTLE_32: #APP
-;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},0xfffffffffffffffd
+;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},0xfffffffffffffffd
;CHECK_LITTLE_32: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,${2:X}", "=r,r,I"(i32 7, i32 -3) ;
+ tail call i32 asm sideeffect "addiu $0,$1,${2:X}", "=r,r,I"(i32 7, i32 -3) ;
ret i32 0
}
@@ -23,9 +23,9 @@ define i32 @constraint_x() nounwind {
entry:
;CHECK_LITTLE_32-LABEL: constraint_x:
;CHECK_LITTLE_32: #APP
-;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},0xfffd
+;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},0xfffd
;CHECK_LITTLE_32: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,${2:x}", "=r,r,I"(i32 7, i32 -3) ;
+ tail call i32 asm sideeffect "addiu $0,$1,${2:x}", "=r,r,I"(i32 7, i32 -3) ;
ret i32 0
}
@@ -34,9 +34,9 @@ define i32 @constraint_d() nounwind {
entry:
;CHECK_LITTLE_32-LABEL: constraint_d:
;CHECK_LITTLE_32: #APP
-;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-3
+;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},-3
;CHECK_LITTLE_32: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,${2:d}", "=r,r,I"(i32 7, i32 -3) ;
+ tail call i32 asm sideeffect "addiu $0,$1,${2:d}", "=r,r,I"(i32 7, i32 -3) ;
ret i32 0
}
@@ -45,9 +45,9 @@ define i32 @constraint_m() nounwind {
entry:
;CHECK_LITTLE_32-LABEL: constraint_m:
;CHECK_LITTLE_32: #APP
-;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-4
+;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},-4
;CHECK_LITTLE_32: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,${2:m}", "=r,r,I"(i32 7, i32 -3) ;
+ tail call i32 asm sideeffect "addiu $0,$1,${2:m}", "=r,r,I"(i32 7, i32 -3) ;
ret i32 0
}
@@ -56,15 +56,15 @@ define i32 @constraint_z() nounwind {
entry:
;CHECK_LITTLE_32-LABEL: constraint_z:
;CHECK_LITTLE_32: #APP
-;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-3
+;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},-3
;CHECK_LITTLE_32: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,${2:z}", "=r,r,I"(i32 7, i32 -3) ;
+ tail call i32 asm sideeffect "addiu $0,$1,${2:z}", "=r,r,I"(i32 7, i32 -3) ;
; z with 0
;CHECK_LITTLE_32: #APP
-;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},$0
+;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},$0
;CHECK_LITTLE_32: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,${2:z}", "=r,r,I"(i32 7, i32 0) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,${2:z}", "=r,r,I"(i32 7, i32 0) nounwind
ret i32 0
}
@@ -73,9 +73,9 @@ define i32 @constraint_longlong() nounwind {
entry:
;CHECK_LITTLE_32-LABEL: constraint_longlong:
;CHECK_LITTLE_32: #APP
-;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},3
+;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},3
;CHECK_LITTLE_32: #NO_APP
- tail call i64 asm sideeffect "addi $0,$1,$2 \0A\09", "=r,r,X"(i64 1229801703532086340, i64 3) nounwind
+ tail call i64 asm sideeffect "addiu $0,$1,$2 \0A\09", "=r,r,X"(i64 1229801703532086340, i64 3) nounwind
ret i32 0
}
diff --git a/test/CodeGen/Mips/inlineasm_constraint.ll b/test/CodeGen/Mips/inlineasm_constraint.ll
index 8d30f45..76b73dc 100644
--- a/test/CodeGen/Mips/inlineasm_constraint.ll
+++ b/test/CodeGen/Mips/inlineasm_constraint.ll
@@ -5,21 +5,21 @@ entry:
; First I with short
; CHECK: #APP
-; CHECK: addi ${{[0-9]+}},${{[0-9]+}},4096
+; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},4096
; CHECK: #NO_APP
- tail call i16 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i16 7, i16 4096) nounwind
+ tail call i16 asm sideeffect "addiu $0,$1,$2", "=r,r,I"(i16 7, i16 4096) nounwind
; Then I with int
; CHECK: #APP
-; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3
+; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},-3
; CHECK: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i32 7, i32 -3) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,I"(i32 7, i32 -3) nounwind
; Now J with 0
; CHECK: #APP
-; CHECK: addi ${{[0-9]+}},${{[0-9]+}},0
+; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},0
; CHECK: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,$2\0A\09 ", "=r,r,J"(i32 7, i16 0) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2\0A\09 ", "=r,r,J"(i32 7, i16 0) nounwind
; Now K with 64
; CHECK: #APP
@@ -35,29 +35,29 @@ entry:
; Now N with -3
; CHECK: #APP
-; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3
+; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},-3
; CHECK: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,N"(i32 7, i32 -3) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,N"(i32 7, i32 -3) nounwind
; Now O with -3
; CHECK: #APP
-; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3
+; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},-3
; CHECK: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,O"(i32 7, i16 -3) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,O"(i32 7, i16 -3) nounwind
; Now P with 65535
; CHECK: #APP
-; CHECK: addi ${{[0-9]+}},${{[0-9]+}},65535
+; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},65535
; CHECK: #NO_APP
- tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,P"(i32 7, i32 65535) nounwind
+ tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,P"(i32 7, i32 65535) nounwind
; Now R Which takes the address of c
%c = alloca i32, align 4
store i32 -4469539, i32* %c, align 4
- %8 = call i32 asm sideeffect "lwl $0, 1 + $1\0A\09lwr $0, 2 + $1\0A\09", "=r,*R"(i32* %c) #1
+ %8 = call i32 asm sideeffect "lw $0, 1 + $1\0A\09lw $0, 2 + $1\0A\09", "=r,*R"(i32* %c) #1
; CHECK: #APP
-; CHECK: lwl ${{[0-9]+}}, 1 + 0(${{[0-9]+}})
-; CHECK: lwr ${{[0-9]+}}, 2 + 0(${{[0-9]+}})
+; CHECK: lw ${{[0-9]+}}, 1 + 0(${{[0-9]+}})
+; CHECK: lw ${{[0-9]+}}, 2 + 0(${{[0-9]+}})
; CHECK: #NO_APP
ret i32 0
diff --git a/test/CodeGen/Mips/int-to-float-conversion.ll b/test/CodeGen/Mips/int-to-float-conversion.ll
index c2baf44..d226b48 100644
--- a/test/CodeGen/Mips/int-to-float-conversion.ll
+++ b/test/CodeGen/Mips/int-to-float-conversion.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=64
; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=64
@i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index 09fee3d..0e9c91f 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -1,6 +1,8 @@
; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | \
+; RUN: FileCheck %s -check-prefix=64
; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | \
-; RUN: FileCheck %s -check-prefix=64
+; RUN: FileCheck %s -check-prefix=64
%struct.S1 = type { [65536 x i8] }
diff --git a/test/CodeGen/Mips/load-store-left-right.ll b/test/CodeGen/Mips/load-store-left-right.ll
index d0928ee..a3f5ebf 100644
--- a/test/CodeGen/Mips/load-store-left-right.ll
+++ b/test/CodeGen/Mips/load-store-left-right.ll
@@ -1,29 +1,439 @@
-; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=EL %s
-; RUN: llc -march=mips < %s | FileCheck -check-prefix=EB %s
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EL %s
+; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EB %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EL %s
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EB %s
+; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32R6 -check-prefix=MIPS32R6-EL %s
+; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32R6 -check-prefix=MIPS32R6-EB %s
+; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
+; RUN: llc -march=mips64 -mcpu=mips4 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
+; RUN: llc -march=mips64 -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EL %s
+; RUN: llc -march=mips64 -mcpu=mips64r6 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EB %s
+%struct.SLL = type { i64 }
%struct.SI = type { i32 }
+%struct.SUI = type { i32 }
+@sll = common global %struct.SLL zeroinitializer, align 1
@si = common global %struct.SI zeroinitializer, align 1
+@sui = common global %struct.SUI zeroinitializer, align 1
-define i32 @foo_load_i() nounwind readonly {
+define i32 @load_SI() nounwind readonly {
entry:
-; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
-; EL: lwr $[[R0]], 0($[[R1]])
-; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: lwr $[[R0]], 3($[[R1]])
+; ALL-LABEL: load_SI:
+
+; MIPS32-EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS32-EL: lwr $[[R0]], 0($[[R1]])
+
+; MIPS32-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS32-EB: lwr $[[R0]], 3($[[R1]])
+
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(si)(
+; MIPS32R6: lw $2, 0($[[PTR]])
+
+; MIPS64-EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64-EL: lwr $[[R0]], 0($[[R1]])
+
+; MIPS64-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64-EB: lwr $[[R0]], 3($[[R1]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(si)(
+; MIPS64R6: lw $2, 0($[[PTR]])
%0 = load i32* getelementptr inbounds (%struct.SI* @si, i32 0, i32 0), align 1
ret i32 %0
}
-define void @foo_store_i(i32 %a) nounwind {
+define void @store_SI(i32 %a) nounwind {
entry:
-; EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
-; EL: swr $[[R0]], 0($[[R1]])
-; EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: swr $[[R0]], 3($[[R1]])
+; ALL-LABEL: store_SI:
+
+; MIPS32-EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS32-EL: swr $[[R0]], 0($[[R1]])
+
+; MIPS32-EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS32-EB: swr $[[R0]], 3($[[R1]])
+
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(si)(
+; MIPS32R6: sw $4, 0($[[PTR]])
+
+; MIPS64-EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64-EL: swr $[[R0]], 0($[[R1]])
+
+; MIPS64-EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64-EB: swr $[[R0]], 3($[[R1]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(si)(
+; MIPS64R6: sw $4, 0($[[PTR]])
store i32 %a, i32* getelementptr inbounds (%struct.SI* @si, i32 0, i32 0), align 1
ret void
}
+define i64 @load_SLL() nounwind readonly {
+entry:
+; ALL-LABEL: load_SLL:
+
+; MIPS32-EL: lwl $2, 3($[[R1:[0-9]+]])
+; MIPS32-EL: lwr $2, 0($[[R1]])
+; MIPS32-EL: lwl $3, 7($[[R1:[0-9]+]])
+; MIPS32-EL: lwr $3, 4($[[R1]])
+
+; MIPS32-EB: lwl $2, 0($[[R1:[0-9]+]])
+; MIPS32-EB: lwr $2, 3($[[R1]])
+; MIPS32-EB: lwl $3, 4($[[R1:[0-9]+]])
+; MIPS32-EB: lwr $3, 7($[[R1]])
+
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(sll)(
+; MIPS32R6-DAG: lw $2, 0($[[PTR]])
+; MIPS32R6-DAG: lw $3, 4($[[PTR]])
+
+; MIPS64-EL: ldl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
+; MIPS64-EL: ldr $[[R0]], 0($[[R1]])
+
+; MIPS64-EB: ldl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64-EB: ldr $[[R0]], 7($[[R1]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(sll)(
+; MIPS64R6: ld $2, 0($[[PTR]])
+
+ %0 = load i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1
+ ret i64 %0
+}
+
+define i64 @load_SI_sext_to_i64() nounwind readonly {
+entry:
+; ALL-LABEL: load_SI_sext_to_i64:
+
+; MIPS32-EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS32-EL: lwr $[[R0]], 0($[[R1]])
+
+; MIPS32-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS32-EB: lwr $[[R0]], 3($[[R1]])
+
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(si)(
+; MIPS32R6-EL: lw $2, 0($[[PTR]])
+; MIPS32R6-EL: sra $3, $2, 31
+; MIPS32R6-EB: lw $3, 0($[[PTR]])
+; MIPS32R6-EB: sra $2, $3, 31
+
+; MIPS64-EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64-EL: lwr $[[R0]], 0($[[R1]])
+
+; MIPS64-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64-EB: lwr $[[R0]], 3($[[R1]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(si)(
+; MIPS64R6: lw $2, 0($[[PTR]])
+
+ %0 = load i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1
+ %conv = sext i32 %0 to i64
+ ret i64 %conv
+}
+
+define i64 @load_UI() nounwind readonly {
+entry:
+; ALL-LABEL: load_UI:
+
+; MIPS32-EL-DAG: lwl $[[R2:2]], 3($[[R1:[0-9]+]])
+; MIPS32-EL-DAG: lwr $[[R2]], 0($[[R1]])
+; MIPS32-EL-DAG: addiu $3, $zero, 0
+
+; MIPS32-EB-DAG: lwl $[[R2:3]], 0($[[R1:[0-9]+]])
+; MIPS32-EB-DAG: lwr $[[R2]], 3($[[R1]])
+; MIPS32-EB-DAG: addiu $2, $zero, 0
+
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(sui)(
+; MIPS32R6-EL-DAG: lw $2, 0($[[PTR]])
+; MIPS32R6-EL-DAG: addiu $3, $zero, 0
+; MIPS32R6-EB-DAG: lw $3, 0($[[PTR]])
+; MIPS32R6-EB-DAG: addiu $2, $zero, 0
+
+; MIPS64-EL-DAG: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64-EL-DAG: lwr $[[R0]], 0($[[R1]])
+; MIPS64-EL-DAG: daddiu $[[R2:[0-9]+]], $zero, 1
+; MIPS64-EL-DAG: dsll $[[R3:[0-9]+]], $[[R2]], 32
+; MIPS64-EL-DAG: daddiu $[[R4:[0-9]+]], $[[R3]], -1
+; MIPS64-EL-DAG: and ${{[0-9]+}}, $[[R0]], $[[R4]]
+
+; MIPS64-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64-EB: lwr $[[R0]], 3($[[R1]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(sui)(
+; MIPS64R6: lwu $2, 0($[[PTR]])
+
+ %0 = load i32* getelementptr inbounds (%struct.SUI* @sui, i64 0, i32 0), align 1
+ %conv = zext i32 %0 to i64
+ ret i64 %conv
+}
+
+define void @store_SLL(i64 %a) nounwind {
+entry:
+; ALL-LABEL: store_SLL:
+
+; MIPS32-EL-DAG: swl $[[A1:4]], 3($[[R1:[0-9]+]])
+; MIPS32-EL-DAG: swr $[[A1]], 0($[[R1]])
+; MIPS32-EL-DAG: swl $[[A2:5]], 7($[[R1:[0-9]+]])
+; MIPS32-EL-DAG: swr $[[A2]], 4($[[R1]])
+
+; MIPS32-EB-DAG: swl $[[A1:4]], 0($[[R1:[0-9]+]])
+; MIPS32-EB-DAG: swr $[[A1]], 3($[[R1]])
+; MIPS32-EB-DAG: swl $[[A1:5]], 4($[[R1:[0-9]+]])
+; MIPS32-EB-DAG: swr $[[A1]], 7($[[R1]])
+
+; MIPS32R6-DAG: lw $[[PTR:[0-9]+]], %got(sll)(
+; MIPS32R6-DAG: sw $4, 0($[[PTR]])
+; MIPS32R6-DAG: sw $5, 4($[[PTR]])
+
+; MIPS64-EL: sdl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
+; MIPS64-EL: sdr $[[R0]], 0($[[R1]])
+
+; MIPS64-EB: sdl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64-EB: sdr $[[R0]], 7($[[R1]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(sll)(
+; MIPS64R6: sd $4, 0($[[PTR]])
+
+ store i64 %a, i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1
+ ret void
+}
+
+define void @store_SI_trunc_from_i64(i32 %a) nounwind {
+entry:
+; ALL-LABEL: store_SI_trunc_from_i64:
+
+; MIPS32-EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS32-EL: swr $[[R0]], 0($[[R1]])
+
+; MIPS32-EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS32-EB: swr $[[R0]], 3($[[R1]])
+
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(si)(
+; MIPS32R6: sw $4, 0($[[PTR]])
+
+; MIPS64-EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64-EL: swr $[[R0]], 0($[[R1]])
+
+; MIPS64-EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64-EB: swr $[[R0]], 3($[[R1]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(si)(
+; MIPS64R6: sw $4, 0($[[PTR]])
+
+ store i32 %a, i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1
+ ret void
+}
+
+;
+; Structures are simply concatenations of the members. They are unaffected by
+; endianness.
+;
+
+%struct.S0 = type { i8, i8 }
+@struct_s0 = common global %struct.S0 zeroinitializer, align 1
+%struct.S1 = type { i16, i16 }
+@struct_s1 = common global %struct.S1 zeroinitializer, align 1
+%struct.S2 = type { i32, i32 }
+@struct_s2 = common global %struct.S2 zeroinitializer, align 1
+
+define void @copy_struct_S0() nounwind {
+entry:
+; ALL-LABEL: copy_struct_S0:
+
+; MIPS32-EL: lw $[[PTR:[0-9]+]], %got(struct_s0)(
+; MIPS32-EB: lw $[[PTR:[0-9]+]], %got(struct_s0)(
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(struct_s0)(
+; MIPS64-EL: ld $[[PTR:[0-9]+]], %got_disp(struct_s0)(
+; MIPS64-EB: ld $[[PTR:[0-9]+]], %got_disp(struct_s0)(
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(struct_s0)(
+
+; FIXME: We should be able to do better than this on MIPS32r6/MIPS64r6 since
+; we have unaligned halfword load/store available
+; ALL-DAG: lbu $[[R1:[0-9]+]], 0($[[PTR]])
+; ALL-DAG: sb $[[R1]], 2($[[PTR]])
+; ALL-DAG: lbu $[[R1:[0-9]+]], 1($[[PTR]])
+; ALL-DAG: sb $[[R1]], 3($[[PTR]])
+
+ %0 = load %struct.S0* getelementptr inbounds (%struct.S0* @struct_s0, i32 0), align 1
+ store %struct.S0 %0, %struct.S0* getelementptr inbounds (%struct.S0* @struct_s0, i32 1), align 1
+ ret void
+}
+
+define void @copy_struct_S1() nounwind {
+entry:
+; ALL-LABEL: copy_struct_S1:
+
+; MIPS32-EL: lw $[[PTR:[0-9]+]], %got(struct_s1)(
+; MIPS32-EB: lw $[[PTR:[0-9]+]], %got(struct_s1)(
+; MIPS32-DAG: lbu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32-DAG: sb $[[R1]], 4($[[PTR]])
+; MIPS32-DAG: lbu $[[R1:[0-9]+]], 1($[[PTR]])
+; MIPS32-DAG: sb $[[R1]], 5($[[PTR]])
+; MIPS32-DAG: lbu $[[R1:[0-9]+]], 2($[[PTR]])
+; MIPS32-DAG: sb $[[R1]], 6($[[PTR]])
+; MIPS32-DAG: lbu $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS32-DAG: sb $[[R1]], 7($[[PTR]])
+
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(struct_s1)(
+; MIPS32R6-DAG: lhu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32R6-DAG: sh $[[R1]], 4($[[PTR]])
+; MIPS32R6-DAG: lhu $[[R1:[0-9]+]], 2($[[PTR]])
+; MIPS32R6-DAG: sh $[[R1]], 6($[[PTR]])
+
+; MIPS64-EL: ld $[[PTR:[0-9]+]], %got_disp(struct_s1)(
+; MIPS64-EB: ld $[[PTR:[0-9]+]], %got_disp(struct_s1)(
+; MIPS64-DAG: lbu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64-DAG: sb $[[R1]], 4($[[PTR]])
+; MIPS64-DAG: lbu $[[R1:[0-9]+]], 1($[[PTR]])
+; MIPS64-DAG: sb $[[R1]], 5($[[PTR]])
+; MIPS64-DAG: lbu $[[R1:[0-9]+]], 2($[[PTR]])
+; MIPS64-DAG: sb $[[R1]], 6($[[PTR]])
+; MIPS64-DAG: lbu $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS64-DAG: sb $[[R1]], 7($[[PTR]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(struct_s1)(
+; MIPS64R6-DAG: lhu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64R6-DAG: sh $[[R1]], 4($[[PTR]])
+; MIPS64R6-DAG: lhu $[[R1:[0-9]+]], 2($[[PTR]])
+; MIPS64R6-DAG: sh $[[R1]], 6($[[PTR]])
+
+ %0 = load %struct.S1* getelementptr inbounds (%struct.S1* @struct_s1, i32 0), align 1
+ store %struct.S1 %0, %struct.S1* getelementptr inbounds (%struct.S1* @struct_s1, i32 1), align 1
+ ret void
+}
+
+define void @copy_struct_S2() nounwind {
+entry:
+; ALL-LABEL: copy_struct_S2:
+
+; MIPS32-EL: lw $[[PTR:[0-9]+]], %got(struct_s2)(
+; MIPS32-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS32-EL-DAG: lwr $[[R1]], 0($[[PTR]])
+; MIPS32-EL-DAG: swl $[[R1]], 11($[[PTR]])
+; MIPS32-EL-DAG: swr $[[R1]], 8($[[PTR]])
+; MIPS32-EL-DAG: lwl $[[R1:[0-9]+]], 7($[[PTR]])
+; MIPS32-EL-DAG: lwr $[[R1]], 4($[[PTR]])
+; MIPS32-EL-DAG: swl $[[R1]], 15($[[PTR]])
+; MIPS32-EL-DAG: swr $[[R1]], 12($[[PTR]])
+
+; MIPS32-EB: lw $[[PTR:[0-9]+]], %got(struct_s2)(
+; MIPS32-EB-DAG: lwl $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32-EB-DAG: lwr $[[R1]], 3($[[PTR]])
+; MIPS32-EB-DAG: swl $[[R1]], 8($[[PTR]])
+; MIPS32-EB-DAG: swr $[[R1]], 11($[[PTR]])
+; MIPS32-EB-DAG: lwl $[[R1:[0-9]+]], 4($[[PTR]])
+; MIPS32-EB-DAG: lwr $[[R1]], 7($[[PTR]])
+; MIPS32-EB-DAG: swl $[[R1]], 12($[[PTR]])
+; MIPS32-EB-DAG: swr $[[R1]], 15($[[PTR]])
+
+; MIPS32R6: lw $[[PTR:[0-9]+]], %got(struct_s2)(
+; MIPS32R6-DAG: lw $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32R6-DAG: sw $[[R1]], 8($[[PTR]])
+; MIPS32R6-DAG: lw $[[R1:[0-9]+]], 4($[[PTR]])
+; MIPS32R6-DAG: sw $[[R1]], 12($[[PTR]])
+
+; MIPS64-EL: ld $[[PTR:[0-9]+]], %got_disp(struct_s2)(
+; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS64-EL-DAG: lwr $[[R1]], 0($[[PTR]])
+; MIPS64-EL-DAG: swl $[[R1]], 11($[[PTR]])
+; MIPS64-EL-DAG: swr $[[R1]], 8($[[PTR]])
+; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 7($[[PTR]])
+; MIPS64-EL-DAG: lwr $[[R1]], 4($[[PTR]])
+; MIPS64-EL-DAG: swl $[[R1]], 15($[[PTR]])
+; MIPS64-EL-DAG: swr $[[R1]], 12($[[PTR]])
+
+; MIPS64-EB: ld $[[PTR:[0-9]+]], %got_disp(struct_s2)(
+; MIPS64-EB-DAG: lwl $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64-EB-DAG: lwr $[[R1]], 3($[[PTR]])
+; MIPS64-EB-DAG: swl $[[R1]], 8($[[PTR]])
+; MIPS64-EB-DAG: swr $[[R1]], 11($[[PTR]])
+; MIPS64-EB-DAG: lwl $[[R1:[0-9]+]], 4($[[PTR]])
+; MIPS64-EB-DAG: lwr $[[R1]], 7($[[PTR]])
+; MIPS64-EB-DAG: swl $[[R1]], 12($[[PTR]])
+; MIPS64-EB-DAG: swr $[[R1]], 15($[[PTR]])
+
+; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(struct_s2)(
+; MIPS64R6-DAG: lw $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64R6-DAG: sw $[[R1]], 8($[[PTR]])
+; MIPS64R6-DAG: lw $[[R1:[0-9]+]], 4($[[PTR]])
+; MIPS64R6-DAG: sw $[[R1]], 12($[[PTR]])
+
+ %0 = load %struct.S2* getelementptr inbounds (%struct.S2* @struct_s2, i32 0), align 1
+ store %struct.S2 %0, %struct.S2* getelementptr inbounds (%struct.S2* @struct_s2, i32 1), align 1
+ ret void
+}
+
+;
+; Arrays are simply concatenations of their members. They are unaffected by
+; endianness.
+;
+
+@arr = common global [7 x i8] zeroinitializer, align 1
+
+define void @pass_array_byval() nounwind {
+entry:
+; ALL-LABEL: pass_array_byval:
+
+; MIPS32-EL: lw $[[SPTR:[0-9]+]], %got(arr)(
+; MIPS32-EL-DAG: lwl $[[R1:4]], 3($[[PTR]])
+; MIPS32-EL-DAG: lwr $[[R1]], 0($[[PTR]])
+; MIPS32-EL-DAG: lbu $[[R2:[0-9]+]], 4($[[PTR]])
+; MIPS32-EL-DAG: lbu $[[R3:[0-9]+]], 5($[[PTR]])
+; MIPS32-EL-DAG: sll $[[T0:[0-9]+]], $[[R3]], 8
+; MIPS32-EL-DAG: or $[[T1:[0-9]+]], $[[T0]], $[[R2]]
+; MIPS32-EL-DAG: lbu $[[R4:[0-9]+]], 6($[[PTR]])
+; MIPS32-EL-DAG: sll $[[T2:[0-9]+]], $[[R4]], 16
+; MIPS32-EL-DAG: or $5, $[[T1]], $[[T2]]
+
+; MIPS32-EB: lw $[[SPTR:[0-9]+]], %got(arr)(
+; MIPS32-EB-DAG: lwl $[[R1:4]], 0($[[PTR]])
+; MIPS32-EB-DAG: lwr $[[R1]], 3($[[PTR]])
+; MIPS32-EB-DAG: lbu $[[R2:[0-9]+]], 5($[[PTR]])
+; MIPS32-EB-DAG: lbu $[[R3:[0-9]+]], 4($[[PTR]])
+; MIPS32-EB-DAG: sll $[[T0:[0-9]+]], $[[R3]], 8
+; MIPS32-EB-DAG: or $[[T1:[0-9]+]], $[[T0]], $[[R2]]
+; MIPS32-EB-DAG: sll $[[T1]], $[[T1]], 16
+; MIPS32-EB-DAG: lbu $[[R4:[0-9]+]], 6($[[PTR]])
+; MIPS32-EB-DAG: sll $[[T2:[0-9]+]], $[[R4]], 8
+; MIPS32-EB-DAG: or $5, $[[T1]], $[[T2]]
+
+; MIPS32R6: lw $[[SPTR:[0-9]+]], %got(arr)(
+; MIPS32R6-DAG: lw $4, 0($[[PTR]])
+; MIPS32R6-EL-DAG: lhu $[[R2:[0-9]+]], 4($[[PTR]])
+; MIPS32R6-EL-DAG: lbu $[[R3:[0-9]+]], 6($[[PTR]])
+; MIPS32R6-EL-DAG: sll $[[T0:[0-9]+]], $[[R3]], 16
+; MIPS32R6-EL-DAG: or $5, $[[R2]], $[[T0]]
+
+; MIPS32R6-EB-DAG: lhu $[[R2:[0-9]+]], 4($[[PTR]])
+; MIPS32R6-EB-DAG: lbu $[[R3:[0-9]+]], 6($[[PTR]])
+; MIPS32R6-EB-DAG: sll $[[T0:[0-9]+]], $[[R2]], 16
+; MIPS32R6-EB-DAG: or $5, $[[T0]], $[[R3]]
+
+; MIPS64-EL: ld $[[SPTR:[0-9]+]], %got_disp(arr)(
+; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS64-EL-DAG: lwr $[[R1]], 0($[[PTR]])
+
+; MIPS64-EB: ld $[[SPTR:[0-9]+]], %got_disp(arr)(
+; MIPS64-EB-DAG: lwl $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64-EB-DAG: lwr $[[R1]], 3($[[PTR]])
+; MIPS64-EB-DAG: dsll $[[R1]], $[[R1]], 32
+; MIPS64-EB-DAG: lbu $[[R2:[0-9]+]], 5($[[PTR]])
+; MIPS64-EB-DAG: lbu $[[R3:[0-9]+]], 4($[[PTR]])
+; MIPS64-EB-DAG: dsll $[[T0:[0-9]+]], $[[R3]], 8
+; MIPS64-EB-DAG: or $[[T1:[0-9]+]], $[[T0]], $[[R2]]
+; MIPS64-EB-DAG: dsll $[[T1]], $[[T1]], 16
+; MIPS64-EB-DAG: or $[[T3:[0-9]+]], $[[R1]], $[[T1]]
+; MIPS64-EB-DAG: lbu $[[R4:[0-9]+]], 6($[[PTR]])
+; MIPS64-EB-DAG: dsll $[[T4:[0-9]+]], $[[R4]], 8
+; MIPS64-EB-DAG: or $4, $[[T3]], $[[T4]]
+
+; MIPS64R6: ld $[[SPTR:[0-9]+]], %got_disp(arr)(
+
+ tail call void @extern_func([7 x i8]* byval @arr) nounwind
+ ret void
+}
+
+declare void @extern_func([7 x i8]* byval)
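
For orientation, the struct copies checked above behave roughly like the C
below. The struct layouts are assumptions inferred from the byte offsets in the
checks (S0 spans 2 bytes, S1 spans 4, S2 spans 8, all 1-byte aligned); only the
global names mirror the test, and the array-of-two framing is just a way to
model the "copy element 0 to element 1" pattern in the IR.

    /* Hypothetical C shapes for the copy_struct_* checks above. */
    struct S0 { char a[2]; };   /* copied byte-by-byte (lbu/sb)            */
    struct S1 { char a[4]; };   /* lbu/sb pre-R6, lhu/sh on the R6 cores   */
    struct S2 { char a[8]; };   /* lwl/lwr pairs pre-R6, plain lw/sw on R6 */

    extern struct S0 struct_s0[2];
    extern struct S1 struct_s1[2];
    extern struct S2 struct_s2[2];

    void copy_struct_S0(void) { struct_s0[1] = struct_s0[0]; }
    void copy_struct_S1(void) { struct_s1[1] = struct_s1[0]; }
    void copy_struct_S2(void) { struct_s2[1] = struct_s2[0]; }

The lwl/lwr pairs versus plain lw/sw are exactly what the pre-R6 and R6 check
prefixes above distinguish.
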
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index af192d0..c7fe6fd 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -1,35 +1,129 @@
-; RUN: llc -march=mipsel -force-mips-long-branch -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=O32
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -force-mips-long-branch -O3 < %s \
+; RUN: | FileCheck %s -check-prefix=O32
+; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 -force-mips-long-branch -O3 \
+; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -O3 \
+; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=micromips \
+; RUN: -force-mips-long-branch -O3 < %s | FileCheck %s -check-prefix=MICROMIPS
-@g0 = external global i32
-define void @foo1(i32 %s) nounwind {
+@x = external global i32
+
+define void @test1(i32 %s) {
entry:
-; O32: nop
-; O32: addiu $sp, $sp, -8
-; O32: bal
-; O32: lui $1, 0
-; O32: addiu $1, $1, {{[0-9]+}}
-; N64: nop
-; N64: daddiu $sp, $sp, -16
-; N64: lui $1, 0
-; N64: daddiu $1, $1, 0
-; N64: dsll $1, $1, 16
-; N64: daddiu $1, $1, 0
-; N64: bal
-; N64: dsll $1, $1, 16
-; N64: daddiu $1, $1, {{[0-9]+}}
-
- %tobool = icmp eq i32 %s, 0
- br i1 %tobool, label %if.end, label %if.then
-
-if.then: ; preds = %entry
- %0 = load i32* @g0, align 4
- %add = add nsw i32 %0, 12
- store i32 %add, i32* @g0, align 4
- br label %if.end
-
-if.end: ; preds = %entry, %if.then
+ %cmp = icmp eq i32 %s, 0
+ br i1 %cmp, label %end, label %then
+
+then:
+ store i32 1, i32* @x, align 4
+ br label %end
+
+end:
ret void
-}
+
+; First check the normal version (without long branch). beqz jumps to the
+; return block, and the fallthrough block stores 1 to the global variable.
+
+; CHECK: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; CHECK: addiu $[[R0]], $[[R0]], %lo(_gp_disp)
+; CHECK: beqz $4, $[[BB0:BB[0-9_]+]]
+; CHECK: addu $[[GP:[0-9]+]], $[[R0]], $25
+; CHECK: lw $[[R1:[0-9]+]], %got(x)($[[GP]])
+; CHECK: addiu $[[R2:[0-9]+]], $zero, 1
+; CHECK: sw $[[R2]], 0($[[R1]])
+; CHECK: $[[BB0]]:
+; CHECK: jr $ra
+; CHECK: nop
+
+
+; Check the MIPS32 version. Check that the branch logic is inverted, so that
+; the target of the new branch (bnez) is the fallthrough block of the original
+; branch. Check that the fallthrough block of the new branch contains the long
+; branch expansion, which ends with an indirect jump to the target of the
+; original branch.
+
+; O32: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; O32: addiu $[[R0]], $[[R0]], %lo(_gp_disp)
+; O32: bnez $4, $[[BB0:BB[0-9_]+]]
+; O32: addu $[[GP:[0-9]+]], $[[R0]], $25
+
+; Check for long branch expansion:
+; O32: addiu $sp, $sp, -8
+; O32-NEXT: sw $ra, 0($sp)
+; O32-NEXT: lui $1, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]]))
+; O32-NEXT: bal $[[BB1]]
+; O32-NEXT: addiu $1, $1, %lo(($[[BB2]])-($[[BB1]]))
+; O32-NEXT: $[[BB1]]:
+; O32-NEXT: addu $1, $ra, $1
+; O32-NEXT: lw $ra, 0($sp)
+; O32-NEXT: jr $1
+; O32-NEXT: addiu $sp, $sp, 8
+
+; O32: $[[BB0]]:
+; O32: lw $[[R1:[0-9]+]], %got(x)($[[GP]])
+; O32: addiu $[[R2:[0-9]+]], $zero, 1
+; O32: sw $[[R2]], 0($[[R1]])
+; O32: $[[BB2]]:
+; O32: jr $ra
+; O32: nop
+
+
+; Check the MIPS64 version.
+
+; N64: lui $[[R0:[0-9]+]], %hi(%neg(%gp_rel(test1)))
+; N64: bnez $4, $[[BB0:BB[0-9_]+]]
+; N64: daddu $[[R1:[0-9]+]], $[[R0]], $25
+
+; Check for long branch expansion:
+; N64: daddiu $sp, $sp, -16
+; N64-NEXT: sd $ra, 0($sp)
+; N64-NEXT: daddiu $1, $zero, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]]))
+; N64-NEXT: dsll $1, $1, 16
+; N64-NEXT: bal $[[BB1]]
+; N64-NEXT: daddiu $1, $1, %lo(($[[BB2]])-($[[BB1]]))
+; N64-NEXT: $[[BB1]]:
+; N64-NEXT: daddu $1, $ra, $1
+; N64-NEXT: ld $ra, 0($sp)
+; N64-NEXT: jr $1
+; N64-NEXT: daddiu $sp, $sp, 16
+
+; N64: $[[BB0]]:
+; N64: daddiu $[[GP:[0-9]+]], $[[R1]], %lo(%neg(%gp_rel(test1)))
+; N64: ld $[[R2:[0-9]+]], %got_disp(x)($[[GP]])
+; N64: addiu $[[R3:[0-9]+]], $zero, 1
+; N64: sw $[[R3]], 0($[[R2]])
+; N64: $[[BB2]]:
+; N64: jr $ra
+; N64: nop
+
+
+; Check the microMIPS version.
+
+; MICROMIPS: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; MICROMIPS: addiu $[[R0]], $[[R0]], %lo(_gp_disp)
+; MICROMIPS: bnez $4, $[[BB0:BB[0-9_]+]]
+; MICROMIPS: addu $[[GP:[0-9]+]], $[[R0]], $25
+
+; Check for long branch expansion:
+; MICROMIPS: addiu $sp, $sp, -8
+; MICROMIPS-NEXT: sw $ra, 0($sp)
+; MICROMIPS-NEXT: lui $1, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]]))
+; MICROMIPS-NEXT: bal $[[BB1]]
+; MICROMIPS-NEXT: addiu $1, $1, %lo(($[[BB2]])-($[[BB1]]))
+; MICROMIPS-NEXT: $[[BB1]]:
+; MICROMIPS-NEXT: addu $1, $ra, $1
+; MICROMIPS-NEXT: lw $ra, 0($sp)
+; MICROMIPS-NEXT: jr $1
+; MICROMIPS-NEXT: addiu $sp, $sp, 8
+
+; MICROMIPS: $[[BB0]]:
+; MICROMIPS: lw $[[R1:[0-9]+]], %got(x)($[[GP]])
+; MICROMIPS: addiu $[[R2:[0-9]+]], $zero, 1
+; MICROMIPS: sw $[[R2]], 0($[[R1]])
+; MICROMIPS: $[[BB2]]:
+; MICROMIPS: jr $ra
+; MICROMIPS: nop
+}
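
For reference, @test1 above corresponds roughly to the C below; the only
assumption is treating @x as an external int, which matches its declaration in
the test. With -force-mips-long-branch, the conditional branch that skips the
store is the one rewritten into the bal-based expansion checked above.

    /* Rough C equivalent of @test1 (names taken from the test). */
    extern int x;

    void test1(int s) {
      if (s != 0)   /* compiled to the beqz/bnez the checks key on          */
        x = 1;      /* store through the %got(x) / %got_disp(x) GOT address */
    }
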
diff --git a/test/CodeGen/Mips/micromips-directives.ll b/test/CodeGen/Mips/micromips-directives.ll
new file mode 100644
index 0000000..dd0bd58
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-directives.ll
@@ -0,0 +1,16 @@
+; This test checks if the '.set [no]micromips' directives
+; are emitted before a function's entry label.
+
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | \
+; RUN: FileCheck %s -check-prefix=CHECK-MM
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=-micromips %s -o - | \
+; RUN: FileCheck %s -check-prefix=CHECK-NO-MM
+
+define i32 @main() nounwind {
+entry:
+ ret i32 0
+}
+
+; CHECK-MM: .set micromips
+; CHECK-NO-MM: .set nomicromips
+; CHECK: main:
diff --git a/test/CodeGen/Mips/micromips-long-branch.ll b/test/CodeGen/Mips/micromips-long-branch.ll
deleted file mode 100644
index 3267f4a..0000000
--- a/test/CodeGen/Mips/micromips-long-branch.ll
+++ /dev/null
@@ -1,16437 +0,0 @@
-; RUN: llc %s -march=mipsel -mcpu=mips32r2 -mattr=micromips -filetype=asm \
-; RUN: -relocation-model=pic -O3 -o - | FileCheck %s
-
-@a = common global [10 x i32] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define i32 @main() #0 {
-entry:
- %retval = alloca i32, align 4
- %i = alloca i32, align 4
- store i32 0, i32* %retval
- store i32 0, i32* %i, align 4
- br label %for.cond
-
-for.cond:
- %0 = load i32* %i, align 4
- %cmp = icmp slt i32 %0, 10
- br i1 %cmp, label %for.body, label %for.end
-
-; CHECK: addiu $sp, $sp, -8
-; CHECK: sw $ra, 0($sp)
-; CHECK: lui $[[REG1:[0-9]+]], 2
-; CHECK: addiu $[[REG1]], $[[REG1]], 0
-; CHECK: addu $[[REG1]], $ra, $[[REG1]]
-; CHECK: lw $ra, 0($sp)
-; CHECK: jr $[[REG1]]
-; CHECK: addiu $sp, $sp, 8
-
-for.body:
- %1 = load i32* %i, align 4
- %2 = load i32* %i, align 4
- %idxprom = sext i32 %2 to i64
- %arrayidx = getelementptr inbounds [10 x i32]* @a, i32 0, i64 %idxprom
- store i32 %1, i32* %arrayidx, align 4
- %nop0 = alloca i1, i1 0
- %nop1 = alloca i1, i1 0
- %nop2 = alloca i1, i1 0
- %nop3 = alloca i1, i1 0
- %nop4 = alloca i1, i1 0
- %nop5 = alloca i1, i1 0
- %nop6 = alloca i1, i1 0
- %nop7 = alloca i1, i1 0
- %nop8 = alloca i1, i1 0
- %nop9 = alloca i1, i1 0
- %nop10 = alloca i1, i1 0
- %nop11 = alloca i1, i1 0
- %nop12 = alloca i1, i1 0
- %nop13 = alloca i1, i1 0
- %nop14 = alloca i1, i1 0
- %nop15 = alloca i1, i1 0
- %nop16 = alloca i1, i1 0
- %nop17 = alloca i1, i1 0
- %nop18 = alloca i1, i1 0
- %nop19 = alloca i1, i1 0
- %nop20 = alloca i1, i1 0
- %nop21 = alloca i1, i1 0
- %nop22 = alloca i1, i1 0
- %nop23 = alloca i1, i1 0
- %nop24 = alloca i1, i1 0
- %nop25 = alloca i1, i1 0
- %nop26 = alloca i1, i1 0
- %nop27 = alloca i1, i1 0
- %nop28 = alloca i1, i1 0
- %nop29 = alloca i1, i1 0
- %nop30 = alloca i1, i1 0
- %nop31 = alloca i1, i1 0
- %nop32 = alloca i1, i1 0
- %nop33 = alloca i1, i1 0
- %nop34 = alloca i1, i1 0
- %nop35 = alloca i1, i1 0
- %nop36 = alloca i1, i1 0
- %nop37 = alloca i1, i1 0
- %nop38 = alloca i1, i1 0
- %nop39 = alloca i1, i1 0
- %nop40 = alloca i1, i1 0
- %nop41 = alloca i1, i1 0
- %nop42 = alloca i1, i1 0
- %nop43 = alloca i1, i1 0
- %nop44 = alloca i1, i1 0
- %nop45 = alloca i1, i1 0
- %nop46 = alloca i1, i1 0
- %nop47 = alloca i1, i1 0
- %nop48 = alloca i1, i1 0
- %nop49 = alloca i1, i1 0
- %nop50 = alloca i1, i1 0
- %nop51 = alloca i1, i1 0
- %nop52 = alloca i1, i1 0
- %nop53 = alloca i1, i1 0
- %nop54 = alloca i1, i1 0
- %nop55 = alloca i1, i1 0
- %nop56 = alloca i1, i1 0
- %nop57 = alloca i1, i1 0
- %nop58 = alloca i1, i1 0
- %nop59 = alloca i1, i1 0
- %nop60 = alloca i1, i1 0
- %nop61 = alloca i1, i1 0
- %nop62 = alloca i1, i1 0
- %nop63 = alloca i1, i1 0
- %nop64 = alloca i1, i1 0
- %nop65 = alloca i1, i1 0
- %nop66 = alloca i1, i1 0
- %nop67 = alloca i1, i1 0
- %nop68 = alloca i1, i1 0
- %nop69 = alloca i1, i1 0
- %nop70 = alloca i1, i1 0
- %nop71 = alloca i1, i1 0
- %nop72 = alloca i1, i1 0
- %nop73 = alloca i1, i1 0
- %nop74 = alloca i1, i1 0
- %nop75 = alloca i1, i1 0
- %nop76 = alloca i1, i1 0
- %nop77 = alloca i1, i1 0
- %nop78 = alloca i1, i1 0
- %nop79 = alloca i1, i1 0
- %nop80 = alloca i1, i1 0
- %nop81 = alloca i1, i1 0
- %nop82 = alloca i1, i1 0
- %nop83 = alloca i1, i1 0
- %nop84 = alloca i1, i1 0
- %nop85 = alloca i1, i1 0
- %nop86 = alloca i1, i1 0
- %nop87 = alloca i1, i1 0
- %nop88 = alloca i1, i1 0
- %nop89 = alloca i1, i1 0
- %nop90 = alloca i1, i1 0
- %nop91 = alloca i1, i1 0
- %nop92 = alloca i1, i1 0
- %nop93 = alloca i1, i1 0
- %nop94 = alloca i1, i1 0
- %nop95 = alloca i1, i1 0
- %nop96 = alloca i1, i1 0
- %nop97 = alloca i1, i1 0
- %nop98 = alloca i1, i1 0
- %nop99 = alloca i1, i1 0
- %nop100 = alloca i1, i1 0
- %nop101 = alloca i1, i1 0
- %nop102 = alloca i1, i1 0
- %nop103 = alloca i1, i1 0
- %nop104 = alloca i1, i1 0
- %nop105 = alloca i1, i1 0
- %nop106 = alloca i1, i1 0
- %nop107 = alloca i1, i1 0
- %nop108 = alloca i1, i1 0
- %nop109 = alloca i1, i1 0
- %nop110 = alloca i1, i1 0
- %nop111 = alloca i1, i1 0
- %nop112 = alloca i1, i1 0
- %nop113 = alloca i1, i1 0
- %nop114 = alloca i1, i1 0
- %nop115 = alloca i1, i1 0
- %nop116 = alloca i1, i1 0
- %nop117 = alloca i1, i1 0
- %nop118 = alloca i1, i1 0
- %nop119 = alloca i1, i1 0
- %nop120 = alloca i1, i1 0
- %nop121 = alloca i1, i1 0
- %nop122 = alloca i1, i1 0
- %nop123 = alloca i1, i1 0
- %nop124 = alloca i1, i1 0
- %nop125 = alloca i1, i1 0
- %nop126 = alloca i1, i1 0
- %nop127 = alloca i1, i1 0
- %nop128 = alloca i1, i1 0
- %nop129 = alloca i1, i1 0
- %nop130 = alloca i1, i1 0
- %nop131 = alloca i1, i1 0
- %nop132 = alloca i1, i1 0
- %nop133 = alloca i1, i1 0
- %nop134 = alloca i1, i1 0
- %nop135 = alloca i1, i1 0
- %nop136 = alloca i1, i1 0
- %nop137 = alloca i1, i1 0
- %nop138 = alloca i1, i1 0
- %nop139 = alloca i1, i1 0
- %nop140 = alloca i1, i1 0
- %nop141 = alloca i1, i1 0
- %nop142 = alloca i1, i1 0
- %nop143 = alloca i1, i1 0
- %nop144 = alloca i1, i1 0
- %nop145 = alloca i1, i1 0
- %nop146 = alloca i1, i1 0
- %nop147 = alloca i1, i1 0
- %nop148 = alloca i1, i1 0
- %nop149 = alloca i1, i1 0
- %nop150 = alloca i1, i1 0
- %nop151 = alloca i1, i1 0
- %nop152 = alloca i1, i1 0
- %nop153 = alloca i1, i1 0
- %nop154 = alloca i1, i1 0
- %nop155 = alloca i1, i1 0
- %nop156 = alloca i1, i1 0
- %nop157 = alloca i1, i1 0
- %nop158 = alloca i1, i1 0
- %nop159 = alloca i1, i1 0
- %nop160 = alloca i1, i1 0
- %nop161 = alloca i1, i1 0
- %nop162 = alloca i1, i1 0
- %nop163 = alloca i1, i1 0
- %nop164 = alloca i1, i1 0
- %nop165 = alloca i1, i1 0
- %nop166 = alloca i1, i1 0
- %nop167 = alloca i1, i1 0
- %nop168 = alloca i1, i1 0
- %nop169 = alloca i1, i1 0
- %nop170 = alloca i1, i1 0
- %nop171 = alloca i1, i1 0
- %nop172 = alloca i1, i1 0
- %nop173 = alloca i1, i1 0
- %nop174 = alloca i1, i1 0
- %nop175 = alloca i1, i1 0
- %nop176 = alloca i1, i1 0
- %nop177 = alloca i1, i1 0
- %nop178 = alloca i1, i1 0
- %nop179 = alloca i1, i1 0
- %nop180 = alloca i1, i1 0
- %nop181 = alloca i1, i1 0
- %nop182 = alloca i1, i1 0
- %nop183 = alloca i1, i1 0
- %nop184 = alloca i1, i1 0
- %nop185 = alloca i1, i1 0
- %nop186 = alloca i1, i1 0
- %nop187 = alloca i1, i1 0
- %nop188 = alloca i1, i1 0
- %nop189 = alloca i1, i1 0
- %nop190 = alloca i1, i1 0
- %nop191 = alloca i1, i1 0
- %nop192 = alloca i1, i1 0
- %nop193 = alloca i1, i1 0
- %nop194 = alloca i1, i1 0
- %nop195 = alloca i1, i1 0
- %nop196 = alloca i1, i1 0
- %nop197 = alloca i1, i1 0
- %nop198 = alloca i1, i1 0
- %nop199 = alloca i1, i1 0
- %nop200 = alloca i1, i1 0
- %nop201 = alloca i1, i1 0
- %nop202 = alloca i1, i1 0
- %nop203 = alloca i1, i1 0
- %nop204 = alloca i1, i1 0
- %nop205 = alloca i1, i1 0
- %nop206 = alloca i1, i1 0
- %nop207 = alloca i1, i1 0
- %nop208 = alloca i1, i1 0
- %nop209 = alloca i1, i1 0
- %nop210 = alloca i1, i1 0
- %nop211 = alloca i1, i1 0
- %nop212 = alloca i1, i1 0
- %nop213 = alloca i1, i1 0
- %nop214 = alloca i1, i1 0
- %nop215 = alloca i1, i1 0
- %nop216 = alloca i1, i1 0
- %nop217 = alloca i1, i1 0
- %nop218 = alloca i1, i1 0
- %nop219 = alloca i1, i1 0
- %nop220 = alloca i1, i1 0
- %nop221 = alloca i1, i1 0
- %nop222 = alloca i1, i1 0
- %nop223 = alloca i1, i1 0
- %nop224 = alloca i1, i1 0
- %nop225 = alloca i1, i1 0
- %nop226 = alloca i1, i1 0
- %nop227 = alloca i1, i1 0
- %nop228 = alloca i1, i1 0
- %nop229 = alloca i1, i1 0
- %nop230 = alloca i1, i1 0
- %nop231 = alloca i1, i1 0
- %nop232 = alloca i1, i1 0
- %nop233 = alloca i1, i1 0
- %nop234 = alloca i1, i1 0
- %nop235 = alloca i1, i1 0
- %nop236 = alloca i1, i1 0
- %nop237 = alloca i1, i1 0
- %nop238 = alloca i1, i1 0
- %nop239 = alloca i1, i1 0
- %nop240 = alloca i1, i1 0
- %nop241 = alloca i1, i1 0
- %nop242 = alloca i1, i1 0
- %nop243 = alloca i1, i1 0
- %nop244 = alloca i1, i1 0
- %nop245 = alloca i1, i1 0
- %nop246 = alloca i1, i1 0
- %nop247 = alloca i1, i1 0
- %nop248 = alloca i1, i1 0
- %nop249 = alloca i1, i1 0
- %nop250 = alloca i1, i1 0
- %nop251 = alloca i1, i1 0
- %nop252 = alloca i1, i1 0
- %nop253 = alloca i1, i1 0
- %nop254 = alloca i1, i1 0
- %nop255 = alloca i1, i1 0
- %nop256 = alloca i1, i1 0
- %nop257 = alloca i1, i1 0
- %nop258 = alloca i1, i1 0
- %nop259 = alloca i1, i1 0
- %nop260 = alloca i1, i1 0
- %nop261 = alloca i1, i1 0
- %nop262 = alloca i1, i1 0
- %nop263 = alloca i1, i1 0
- %nop264 = alloca i1, i1 0
- %nop265 = alloca i1, i1 0
- %nop266 = alloca i1, i1 0
- %nop267 = alloca i1, i1 0
- %nop268 = alloca i1, i1 0
- %nop269 = alloca i1, i1 0
- %nop270 = alloca i1, i1 0
- %nop271 = alloca i1, i1 0
- %nop272 = alloca i1, i1 0
- %nop273 = alloca i1, i1 0
- %nop274 = alloca i1, i1 0
- %nop275 = alloca i1, i1 0
- %nop276 = alloca i1, i1 0
- %nop277 = alloca i1, i1 0
- %nop278 = alloca i1, i1 0
- %nop279 = alloca i1, i1 0
- %nop280 = alloca i1, i1 0
- %nop281 = alloca i1, i1 0
- %nop282 = alloca i1, i1 0
- %nop283 = alloca i1, i1 0
- %nop284 = alloca i1, i1 0
- %nop285 = alloca i1, i1 0
- %nop286 = alloca i1, i1 0
- %nop287 = alloca i1, i1 0
- %nop288 = alloca i1, i1 0
- %nop289 = alloca i1, i1 0
- %nop290 = alloca i1, i1 0
- %nop291 = alloca i1, i1 0
- %nop292 = alloca i1, i1 0
- %nop293 = alloca i1, i1 0
- %nop294 = alloca i1, i1 0
- %nop295 = alloca i1, i1 0
- %nop296 = alloca i1, i1 0
- %nop297 = alloca i1, i1 0
- %nop298 = alloca i1, i1 0
- %nop299 = alloca i1, i1 0
- %nop300 = alloca i1, i1 0
- %nop301 = alloca i1, i1 0
- %nop302 = alloca i1, i1 0
- %nop303 = alloca i1, i1 0
- %nop304 = alloca i1, i1 0
- %nop305 = alloca i1, i1 0
- %nop306 = alloca i1, i1 0
- %nop307 = alloca i1, i1 0
- %nop308 = alloca i1, i1 0
- %nop309 = alloca i1, i1 0
- %nop310 = alloca i1, i1 0
- %nop311 = alloca i1, i1 0
- %nop312 = alloca i1, i1 0
- %nop313 = alloca i1, i1 0
- %nop314 = alloca i1, i1 0
- %nop315 = alloca i1, i1 0
- %nop316 = alloca i1, i1 0
- %nop317 = alloca i1, i1 0
- %nop318 = alloca i1, i1 0
- %nop319 = alloca i1, i1 0
- %nop320 = alloca i1, i1 0
- %nop321 = alloca i1, i1 0
- %nop322 = alloca i1, i1 0
- %nop323 = alloca i1, i1 0
- %nop324 = alloca i1, i1 0
- %nop325 = alloca i1, i1 0
- %nop326 = alloca i1, i1 0
- %nop327 = alloca i1, i1 0
- %nop328 = alloca i1, i1 0
- %nop329 = alloca i1, i1 0
- %nop330 = alloca i1, i1 0
- %nop331 = alloca i1, i1 0
- %nop332 = alloca i1, i1 0
- %nop333 = alloca i1, i1 0
- %nop334 = alloca i1, i1 0
- %nop335 = alloca i1, i1 0
- %nop336 = alloca i1, i1 0
- %nop337 = alloca i1, i1 0
- %nop338 = alloca i1, i1 0
- %nop339 = alloca i1, i1 0
- %nop340 = alloca i1, i1 0
- %nop341 = alloca i1, i1 0
- %nop342 = alloca i1, i1 0
- %nop343 = alloca i1, i1 0
- %nop344 = alloca i1, i1 0
- %nop345 = alloca i1, i1 0
- %nop346 = alloca i1, i1 0
- %nop347 = alloca i1, i1 0
- %nop348 = alloca i1, i1 0
- %nop349 = alloca i1, i1 0
- %nop350 = alloca i1, i1 0
- %nop351 = alloca i1, i1 0
- %nop352 = alloca i1, i1 0
- %nop353 = alloca i1, i1 0
- %nop354 = alloca i1, i1 0
- %nop355 = alloca i1, i1 0
- %nop356 = alloca i1, i1 0
- %nop357 = alloca i1, i1 0
- %nop358 = alloca i1, i1 0
- %nop359 = alloca i1, i1 0
- %nop360 = alloca i1, i1 0
- %nop361 = alloca i1, i1 0
- %nop362 = alloca i1, i1 0
- %nop363 = alloca i1, i1 0
- %nop364 = alloca i1, i1 0
- %nop365 = alloca i1, i1 0
- %nop366 = alloca i1, i1 0
- %nop367 = alloca i1, i1 0
- %nop368 = alloca i1, i1 0
- %nop369 = alloca i1, i1 0
- %nop370 = alloca i1, i1 0
- %nop371 = alloca i1, i1 0
- %nop372 = alloca i1, i1 0
- %nop373 = alloca i1, i1 0
- %nop374 = alloca i1, i1 0
- %nop375 = alloca i1, i1 0
- %nop376 = alloca i1, i1 0
- %nop377 = alloca i1, i1 0
- %nop378 = alloca i1, i1 0
- %nop379 = alloca i1, i1 0
- %nop380 = alloca i1, i1 0
- %nop381 = alloca i1, i1 0
- %nop382 = alloca i1, i1 0
- %nop383 = alloca i1, i1 0
- %nop384 = alloca i1, i1 0
- %nop385 = alloca i1, i1 0
- %nop386 = alloca i1, i1 0
- %nop387 = alloca i1, i1 0
- %nop388 = alloca i1, i1 0
- %nop389 = alloca i1, i1 0
- %nop390 = alloca i1, i1 0
- %nop391 = alloca i1, i1 0
- %nop392 = alloca i1, i1 0
- %nop393 = alloca i1, i1 0
- %nop394 = alloca i1, i1 0
- %nop395 = alloca i1, i1 0
- %nop396 = alloca i1, i1 0
- %nop397 = alloca i1, i1 0
- %nop398 = alloca i1, i1 0
- %nop399 = alloca i1, i1 0
- %nop400 = alloca i1, i1 0
- %nop401 = alloca i1, i1 0
- %nop402 = alloca i1, i1 0
- %nop403 = alloca i1, i1 0
- %nop404 = alloca i1, i1 0
- %nop405 = alloca i1, i1 0
- %nop406 = alloca i1, i1 0
- %nop407 = alloca i1, i1 0
- %nop408 = alloca i1, i1 0
- %nop409 = alloca i1, i1 0
- %nop410 = alloca i1, i1 0
- %nop411 = alloca i1, i1 0
- %nop412 = alloca i1, i1 0
- %nop413 = alloca i1, i1 0
- %nop414 = alloca i1, i1 0
- %nop415 = alloca i1, i1 0
- %nop416 = alloca i1, i1 0
- %nop417 = alloca i1, i1 0
- %nop418 = alloca i1, i1 0
- %nop419 = alloca i1, i1 0
- %nop420 = alloca i1, i1 0
- %nop421 = alloca i1, i1 0
- %nop422 = alloca i1, i1 0
- %nop423 = alloca i1, i1 0
- %nop424 = alloca i1, i1 0
- %nop425 = alloca i1, i1 0
- %nop426 = alloca i1, i1 0
- %nop427 = alloca i1, i1 0
- %nop428 = alloca i1, i1 0
- %nop429 = alloca i1, i1 0
- %nop430 = alloca i1, i1 0
- %nop431 = alloca i1, i1 0
- %nop432 = alloca i1, i1 0
- %nop433 = alloca i1, i1 0
- %nop434 = alloca i1, i1 0
- %nop435 = alloca i1, i1 0
- %nop436 = alloca i1, i1 0
- %nop437 = alloca i1, i1 0
- %nop438 = alloca i1, i1 0
- %nop439 = alloca i1, i1 0
- %nop440 = alloca i1, i1 0
- %nop441 = alloca i1, i1 0
- %nop442 = alloca i1, i1 0
- %nop443 = alloca i1, i1 0
- %nop444 = alloca i1, i1 0
- %nop445 = alloca i1, i1 0
- %nop446 = alloca i1, i1 0
- %nop447 = alloca i1, i1 0
- %nop448 = alloca i1, i1 0
- %nop449 = alloca i1, i1 0
- %nop450 = alloca i1, i1 0
- %nop451 = alloca i1, i1 0
- %nop452 = alloca i1, i1 0
- %nop453 = alloca i1, i1 0
- %nop454 = alloca i1, i1 0
- %nop455 = alloca i1, i1 0
- %nop456 = alloca i1, i1 0
- %nop457 = alloca i1, i1 0
- %nop458 = alloca i1, i1 0
- %nop459 = alloca i1, i1 0
- %nop460 = alloca i1, i1 0
- %nop461 = alloca i1, i1 0
- %nop462 = alloca i1, i1 0
- %nop463 = alloca i1, i1 0
- %nop464 = alloca i1, i1 0
- %nop465 = alloca i1, i1 0
- %nop466 = alloca i1, i1 0
- %nop467 = alloca i1, i1 0
- %nop468 = alloca i1, i1 0
- %nop469 = alloca i1, i1 0
- %nop470 = alloca i1, i1 0
- %nop471 = alloca i1, i1 0
- %nop472 = alloca i1, i1 0
- %nop473 = alloca i1, i1 0
- %nop474 = alloca i1, i1 0
- %nop475 = alloca i1, i1 0
- %nop476 = alloca i1, i1 0
- %nop477 = alloca i1, i1 0
- %nop478 = alloca i1, i1 0
- %nop479 = alloca i1, i1 0
- %nop480 = alloca i1, i1 0
- %nop481 = alloca i1, i1 0
- %nop482 = alloca i1, i1 0
- %nop483 = alloca i1, i1 0
- %nop484 = alloca i1, i1 0
- %nop485 = alloca i1, i1 0
- %nop486 = alloca i1, i1 0
- %nop487 = alloca i1, i1 0
- %nop488 = alloca i1, i1 0
- %nop489 = alloca i1, i1 0
- %nop490 = alloca i1, i1 0
- %nop491 = alloca i1, i1 0
- %nop492 = alloca i1, i1 0
- %nop493 = alloca i1, i1 0
- %nop494 = alloca i1, i1 0
- %nop495 = alloca i1, i1 0
- %nop496 = alloca i1, i1 0
- %nop497 = alloca i1, i1 0
- %nop498 = alloca i1, i1 0
- %nop499 = alloca i1, i1 0
- %nop500 = alloca i1, i1 0
- %nop501 = alloca i1, i1 0
- %nop502 = alloca i1, i1 0
- %nop503 = alloca i1, i1 0
- %nop504 = alloca i1, i1 0
- %nop505 = alloca i1, i1 0
- %nop506 = alloca i1, i1 0
- %nop507 = alloca i1, i1 0
- %nop508 = alloca i1, i1 0
- %nop509 = alloca i1, i1 0
- %nop510 = alloca i1, i1 0
- %nop511 = alloca i1, i1 0
- %nop512 = alloca i1, i1 0
- %nop513 = alloca i1, i1 0
- %nop514 = alloca i1, i1 0
- %nop515 = alloca i1, i1 0
- %nop516 = alloca i1, i1 0
- %nop517 = alloca i1, i1 0
- %nop518 = alloca i1, i1 0
- %nop519 = alloca i1, i1 0
- %nop520 = alloca i1, i1 0
- %nop521 = alloca i1, i1 0
- %nop522 = alloca i1, i1 0
- %nop523 = alloca i1, i1 0
- %nop524 = alloca i1, i1 0
- %nop525 = alloca i1, i1 0
- %nop526 = alloca i1, i1 0
- %nop527 = alloca i1, i1 0
- %nop528 = alloca i1, i1 0
- %nop529 = alloca i1, i1 0
- %nop530 = alloca i1, i1 0
- %nop531 = alloca i1, i1 0
- %nop532 = alloca i1, i1 0
- %nop533 = alloca i1, i1 0
- %nop534 = alloca i1, i1 0
- %nop535 = alloca i1, i1 0
- %nop536 = alloca i1, i1 0
- %nop537 = alloca i1, i1 0
- %nop538 = alloca i1, i1 0
- %nop539 = alloca i1, i1 0
- %nop540 = alloca i1, i1 0
- %nop541 = alloca i1, i1 0
- %nop542 = alloca i1, i1 0
- %nop543 = alloca i1, i1 0
- %nop544 = alloca i1, i1 0
- %nop545 = alloca i1, i1 0
- %nop546 = alloca i1, i1 0
- %nop547 = alloca i1, i1 0
- %nop548 = alloca i1, i1 0
- %nop549 = alloca i1, i1 0
- %nop550 = alloca i1, i1 0
- %nop551 = alloca i1, i1 0
- %nop552 = alloca i1, i1 0
- %nop553 = alloca i1, i1 0
- %nop554 = alloca i1, i1 0
- %nop555 = alloca i1, i1 0
- %nop556 = alloca i1, i1 0
- %nop557 = alloca i1, i1 0
- %nop558 = alloca i1, i1 0
- %nop559 = alloca i1, i1 0
- %nop560 = alloca i1, i1 0
- %nop561 = alloca i1, i1 0
- %nop562 = alloca i1, i1 0
- %nop563 = alloca i1, i1 0
- %nop564 = alloca i1, i1 0
- %nop565 = alloca i1, i1 0
- %nop566 = alloca i1, i1 0
- %nop567 = alloca i1, i1 0
- %nop568 = alloca i1, i1 0
- %nop569 = alloca i1, i1 0
- %nop570 = alloca i1, i1 0
- %nop571 = alloca i1, i1 0
- %nop572 = alloca i1, i1 0
- %nop573 = alloca i1, i1 0
- %nop574 = alloca i1, i1 0
- %nop575 = alloca i1, i1 0
- %nop576 = alloca i1, i1 0
- %nop577 = alloca i1, i1 0
- %nop578 = alloca i1, i1 0
- %nop579 = alloca i1, i1 0
- %nop580 = alloca i1, i1 0
- %nop581 = alloca i1, i1 0
- %nop582 = alloca i1, i1 0
- %nop583 = alloca i1, i1 0
- %nop584 = alloca i1, i1 0
- %nop585 = alloca i1, i1 0
- %nop586 = alloca i1, i1 0
- %nop587 = alloca i1, i1 0
- %nop588 = alloca i1, i1 0
- %nop589 = alloca i1, i1 0
- %nop590 = alloca i1, i1 0
- %nop591 = alloca i1, i1 0
- %nop592 = alloca i1, i1 0
- %nop593 = alloca i1, i1 0
- %nop594 = alloca i1, i1 0
- %nop595 = alloca i1, i1 0
- %nop596 = alloca i1, i1 0
- %nop597 = alloca i1, i1 0
- %nop598 = alloca i1, i1 0
- %nop599 = alloca i1, i1 0
- %nop600 = alloca i1, i1 0
- %nop601 = alloca i1, i1 0
- %nop602 = alloca i1, i1 0
- %nop603 = alloca i1, i1 0
- %nop604 = alloca i1, i1 0
- %nop605 = alloca i1, i1 0
- %nop606 = alloca i1, i1 0
- %nop607 = alloca i1, i1 0
- %nop608 = alloca i1, i1 0
- %nop609 = alloca i1, i1 0
- %nop610 = alloca i1, i1 0
- %nop611 = alloca i1, i1 0
- %nop612 = alloca i1, i1 0
- %nop613 = alloca i1, i1 0
- %nop614 = alloca i1, i1 0
- %nop615 = alloca i1, i1 0
- %nop616 = alloca i1, i1 0
- %nop617 = alloca i1, i1 0
- %nop618 = alloca i1, i1 0
- %nop619 = alloca i1, i1 0
- %nop620 = alloca i1, i1 0
- %nop621 = alloca i1, i1 0
- %nop622 = alloca i1, i1 0
- %nop623 = alloca i1, i1 0
- %nop624 = alloca i1, i1 0
- %nop625 = alloca i1, i1 0
- %nop626 = alloca i1, i1 0
- %nop627 = alloca i1, i1 0
- %nop628 = alloca i1, i1 0
- %nop629 = alloca i1, i1 0
- %nop630 = alloca i1, i1 0
- %nop631 = alloca i1, i1 0
- %nop632 = alloca i1, i1 0
- %nop633 = alloca i1, i1 0
- %nop634 = alloca i1, i1 0
- %nop635 = alloca i1, i1 0
- %nop636 = alloca i1, i1 0
- %nop637 = alloca i1, i1 0
- %nop638 = alloca i1, i1 0
- %nop639 = alloca i1, i1 0
- %nop640 = alloca i1, i1 0
- %nop641 = alloca i1, i1 0
- %nop642 = alloca i1, i1 0
- %nop643 = alloca i1, i1 0
- %nop644 = alloca i1, i1 0
- %nop645 = alloca i1, i1 0
- %nop646 = alloca i1, i1 0
- %nop647 = alloca i1, i1 0
- %nop648 = alloca i1, i1 0
- %nop649 = alloca i1, i1 0
- %nop650 = alloca i1, i1 0
- %nop651 = alloca i1, i1 0
- %nop652 = alloca i1, i1 0
- %nop653 = alloca i1, i1 0
- %nop654 = alloca i1, i1 0
- %nop655 = alloca i1, i1 0
- %nop656 = alloca i1, i1 0
- %nop657 = alloca i1, i1 0
- %nop658 = alloca i1, i1 0
- %nop659 = alloca i1, i1 0
- %nop660 = alloca i1, i1 0
- %nop661 = alloca i1, i1 0
- %nop662 = alloca i1, i1 0
- %nop663 = alloca i1, i1 0
- %nop664 = alloca i1, i1 0
- %nop665 = alloca i1, i1 0
- %nop666 = alloca i1, i1 0
- %nop667 = alloca i1, i1 0
- %nop668 = alloca i1, i1 0
- %nop669 = alloca i1, i1 0
- %nop670 = alloca i1, i1 0
- %nop671 = alloca i1, i1 0
- %nop672 = alloca i1, i1 0
- %nop673 = alloca i1, i1 0
- %nop674 = alloca i1, i1 0
- %nop675 = alloca i1, i1 0
- %nop676 = alloca i1, i1 0
- %nop677 = alloca i1, i1 0
- %nop678 = alloca i1, i1 0
- %nop679 = alloca i1, i1 0
- %nop680 = alloca i1, i1 0
- %nop681 = alloca i1, i1 0
- %nop682 = alloca i1, i1 0
- %nop683 = alloca i1, i1 0
- %nop684 = alloca i1, i1 0
- %nop685 = alloca i1, i1 0
- %nop686 = alloca i1, i1 0
- %nop687 = alloca i1, i1 0
- %nop688 = alloca i1, i1 0
- %nop689 = alloca i1, i1 0
- %nop690 = alloca i1, i1 0
- %nop691 = alloca i1, i1 0
- %nop692 = alloca i1, i1 0
- %nop693 = alloca i1, i1 0
- %nop694 = alloca i1, i1 0
- %nop695 = alloca i1, i1 0
- %nop696 = alloca i1, i1 0
- %nop697 = alloca i1, i1 0
- %nop698 = alloca i1, i1 0
- %nop699 = alloca i1, i1 0
- %nop700 = alloca i1, i1 0
- %nop701 = alloca i1, i1 0
- %nop702 = alloca i1, i1 0
- %nop703 = alloca i1, i1 0
- %nop704 = alloca i1, i1 0
- %nop705 = alloca i1, i1 0
- %nop706 = alloca i1, i1 0
- %nop707 = alloca i1, i1 0
- %nop708 = alloca i1, i1 0
- %nop709 = alloca i1, i1 0
- %nop710 = alloca i1, i1 0
- %nop711 = alloca i1, i1 0
- %nop712 = alloca i1, i1 0
- %nop713 = alloca i1, i1 0
- %nop714 = alloca i1, i1 0
- %nop715 = alloca i1, i1 0
- %nop716 = alloca i1, i1 0
- %nop717 = alloca i1, i1 0
- %nop718 = alloca i1, i1 0
- %nop719 = alloca i1, i1 0
- %nop720 = alloca i1, i1 0
- %nop721 = alloca i1, i1 0
- %nop722 = alloca i1, i1 0
- %nop723 = alloca i1, i1 0
- %nop724 = alloca i1, i1 0
- %nop725 = alloca i1, i1 0
- %nop726 = alloca i1, i1 0
- %nop727 = alloca i1, i1 0
- %nop728 = alloca i1, i1 0
- %nop729 = alloca i1, i1 0
- %nop730 = alloca i1, i1 0
- %nop731 = alloca i1, i1 0
- %nop732 = alloca i1, i1 0
- %nop733 = alloca i1, i1 0
- %nop734 = alloca i1, i1 0
- %nop735 = alloca i1, i1 0
- %nop736 = alloca i1, i1 0
- %nop737 = alloca i1, i1 0
- %nop738 = alloca i1, i1 0
- %nop739 = alloca i1, i1 0
- %nop740 = alloca i1, i1 0
- %nop741 = alloca i1, i1 0
- %nop742 = alloca i1, i1 0
- %nop743 = alloca i1, i1 0
- %nop744 = alloca i1, i1 0
- %nop745 = alloca i1, i1 0
- %nop746 = alloca i1, i1 0
- %nop747 = alloca i1, i1 0
- %nop748 = alloca i1, i1 0
- %nop749 = alloca i1, i1 0
- %nop750 = alloca i1, i1 0
- %nop751 = alloca i1, i1 0
- %nop752 = alloca i1, i1 0
- %nop753 = alloca i1, i1 0
- %nop754 = alloca i1, i1 0
- %nop755 = alloca i1, i1 0
- %nop756 = alloca i1, i1 0
- %nop757 = alloca i1, i1 0
- %nop758 = alloca i1, i1 0
- %nop759 = alloca i1, i1 0
- %nop760 = alloca i1, i1 0
- %nop761 = alloca i1, i1 0
- %nop762 = alloca i1, i1 0
- %nop763 = alloca i1, i1 0
- %nop764 = alloca i1, i1 0
- %nop765 = alloca i1, i1 0
- %nop766 = alloca i1, i1 0
- %nop767 = alloca i1, i1 0
- %nop768 = alloca i1, i1 0
- %nop769 = alloca i1, i1 0
- %nop770 = alloca i1, i1 0
- %nop771 = alloca i1, i1 0
- %nop772 = alloca i1, i1 0
- %nop773 = alloca i1, i1 0
- %nop774 = alloca i1, i1 0
- %nop775 = alloca i1, i1 0
- %nop776 = alloca i1, i1 0
- %nop777 = alloca i1, i1 0
- %nop778 = alloca i1, i1 0
- %nop779 = alloca i1, i1 0
- %nop780 = alloca i1, i1 0
- %nop781 = alloca i1, i1 0
- %nop782 = alloca i1, i1 0
- %nop783 = alloca i1, i1 0
- %nop784 = alloca i1, i1 0
- %nop785 = alloca i1, i1 0
- %nop786 = alloca i1, i1 0
- %nop787 = alloca i1, i1 0
- %nop788 = alloca i1, i1 0
- %nop789 = alloca i1, i1 0
- %nop790 = alloca i1, i1 0
- %nop791 = alloca i1, i1 0
- %nop792 = alloca i1, i1 0
- %nop793 = alloca i1, i1 0
- %nop794 = alloca i1, i1 0
- %nop795 = alloca i1, i1 0
- %nop796 = alloca i1, i1 0
- %nop797 = alloca i1, i1 0
- %nop798 = alloca i1, i1 0
- %nop799 = alloca i1, i1 0
- %nop800 = alloca i1, i1 0
- %nop801 = alloca i1, i1 0
- %nop802 = alloca i1, i1 0
- %nop803 = alloca i1, i1 0
- %nop804 = alloca i1, i1 0
- %nop805 = alloca i1, i1 0
- %nop806 = alloca i1, i1 0
- %nop807 = alloca i1, i1 0
- %nop808 = alloca i1, i1 0
- %nop809 = alloca i1, i1 0
- %nop810 = alloca i1, i1 0
- %nop811 = alloca i1, i1 0
- %nop812 = alloca i1, i1 0
- %nop813 = alloca i1, i1 0
- %nop814 = alloca i1, i1 0
- %nop815 = alloca i1, i1 0
- %nop816 = alloca i1, i1 0
- %nop817 = alloca i1, i1 0
- %nop818 = alloca i1, i1 0
- %nop819 = alloca i1, i1 0
- %nop820 = alloca i1, i1 0
- %nop821 = alloca i1, i1 0
- %nop822 = alloca i1, i1 0
- %nop823 = alloca i1, i1 0
- %nop824 = alloca i1, i1 0
- %nop825 = alloca i1, i1 0
- %nop826 = alloca i1, i1 0
- %nop827 = alloca i1, i1 0
- %nop828 = alloca i1, i1 0
- %nop829 = alloca i1, i1 0
- %nop830 = alloca i1, i1 0
- %nop831 = alloca i1, i1 0
- %nop832 = alloca i1, i1 0
- %nop833 = alloca i1, i1 0
- %nop834 = alloca i1, i1 0
- %nop835 = alloca i1, i1 0
- %nop836 = alloca i1, i1 0
- %nop837 = alloca i1, i1 0
- %nop838 = alloca i1, i1 0
- %nop839 = alloca i1, i1 0
- %nop840 = alloca i1, i1 0
- %nop841 = alloca i1, i1 0
- %nop842 = alloca i1, i1 0
- %nop843 = alloca i1, i1 0
- %nop844 = alloca i1, i1 0
- %nop845 = alloca i1, i1 0
- %nop846 = alloca i1, i1 0
- %nop847 = alloca i1, i1 0
- %nop848 = alloca i1, i1 0
- %nop849 = alloca i1, i1 0
- %nop850 = alloca i1, i1 0
- %nop851 = alloca i1, i1 0
- %nop852 = alloca i1, i1 0
- %nop853 = alloca i1, i1 0
- %nop854 = alloca i1, i1 0
- %nop855 = alloca i1, i1 0
- %nop856 = alloca i1, i1 0
- %nop857 = alloca i1, i1 0
- %nop858 = alloca i1, i1 0
- %nop859 = alloca i1, i1 0
- %nop860 = alloca i1, i1 0
- %nop861 = alloca i1, i1 0
- %nop862 = alloca i1, i1 0
- %nop863 = alloca i1, i1 0
- %nop864 = alloca i1, i1 0
- %nop865 = alloca i1, i1 0
- %nop866 = alloca i1, i1 0
- %nop867 = alloca i1, i1 0
- %nop868 = alloca i1, i1 0
- %nop869 = alloca i1, i1 0
- %nop870 = alloca i1, i1 0
- %nop871 = alloca i1, i1 0
- %nop872 = alloca i1, i1 0
- %nop873 = alloca i1, i1 0
- %nop874 = alloca i1, i1 0
- %nop875 = alloca i1, i1 0
- %nop876 = alloca i1, i1 0
- %nop877 = alloca i1, i1 0
- %nop878 = alloca i1, i1 0
- %nop879 = alloca i1, i1 0
- %nop880 = alloca i1, i1 0
- %nop881 = alloca i1, i1 0
- %nop882 = alloca i1, i1 0
- %nop883 = alloca i1, i1 0
- %nop884 = alloca i1, i1 0
- %nop885 = alloca i1, i1 0
- %nop886 = alloca i1, i1 0
- %nop887 = alloca i1, i1 0
- %nop888 = alloca i1, i1 0
- %nop889 = alloca i1, i1 0
- %nop890 = alloca i1, i1 0
- %nop891 = alloca i1, i1 0
- %nop892 = alloca i1, i1 0
- %nop893 = alloca i1, i1 0
- %nop894 = alloca i1, i1 0
- %nop895 = alloca i1, i1 0
- %nop896 = alloca i1, i1 0
- %nop897 = alloca i1, i1 0
- %nop898 = alloca i1, i1 0
- %nop899 = alloca i1, i1 0
- %nop900 = alloca i1, i1 0
- %nop901 = alloca i1, i1 0
- %nop902 = alloca i1, i1 0
- %nop903 = alloca i1, i1 0
- %nop904 = alloca i1, i1 0
- %nop905 = alloca i1, i1 0
- %nop906 = alloca i1, i1 0
- %nop907 = alloca i1, i1 0
- %nop908 = alloca i1, i1 0
- %nop909 = alloca i1, i1 0
- %nop910 = alloca i1, i1 0
- %nop911 = alloca i1, i1 0
- %nop912 = alloca i1, i1 0
- %nop913 = alloca i1, i1 0
- %nop914 = alloca i1, i1 0
- %nop915 = alloca i1, i1 0
- %nop916 = alloca i1, i1 0
- %nop917 = alloca i1, i1 0
- %nop918 = alloca i1, i1 0
- %nop919 = alloca i1, i1 0
- %nop920 = alloca i1, i1 0
- %nop921 = alloca i1, i1 0
- %nop922 = alloca i1, i1 0
- %nop923 = alloca i1, i1 0
- %nop924 = alloca i1, i1 0
- %nop925 = alloca i1, i1 0
- %nop926 = alloca i1, i1 0
- %nop927 = alloca i1, i1 0
- %nop928 = alloca i1, i1 0
- %nop929 = alloca i1, i1 0
- %nop930 = alloca i1, i1 0
- %nop931 = alloca i1, i1 0
- %nop932 = alloca i1, i1 0
- %nop933 = alloca i1, i1 0
- %nop934 = alloca i1, i1 0
- %nop935 = alloca i1, i1 0
- %nop936 = alloca i1, i1 0
- %nop937 = alloca i1, i1 0
- %nop938 = alloca i1, i1 0
- %nop939 = alloca i1, i1 0
- %nop940 = alloca i1, i1 0
- %nop941 = alloca i1, i1 0
- %nop942 = alloca i1, i1 0
- %nop943 = alloca i1, i1 0
- %nop944 = alloca i1, i1 0
- %nop945 = alloca i1, i1 0
- %nop946 = alloca i1, i1 0
- %nop947 = alloca i1, i1 0
- %nop948 = alloca i1, i1 0
- %nop949 = alloca i1, i1 0
- %nop950 = alloca i1, i1 0
- %nop951 = alloca i1, i1 0
- %nop952 = alloca i1, i1 0
- %nop953 = alloca i1, i1 0
- %nop954 = alloca i1, i1 0
- %nop955 = alloca i1, i1 0
- %nop956 = alloca i1, i1 0
- %nop957 = alloca i1, i1 0
- %nop958 = alloca i1, i1 0
- %nop959 = alloca i1, i1 0
- %nop960 = alloca i1, i1 0
- %nop961 = alloca i1, i1 0
- %nop962 = alloca i1, i1 0
- %nop963 = alloca i1, i1 0
- %nop964 = alloca i1, i1 0
- %nop965 = alloca i1, i1 0
- %nop966 = alloca i1, i1 0
- %nop967 = alloca i1, i1 0
- %nop968 = alloca i1, i1 0
- %nop969 = alloca i1, i1 0
- %nop970 = alloca i1, i1 0
- %nop971 = alloca i1, i1 0
- %nop972 = alloca i1, i1 0
- %nop973 = alloca i1, i1 0
- %nop974 = alloca i1, i1 0
- %nop975 = alloca i1, i1 0
- %nop976 = alloca i1, i1 0
- %nop977 = alloca i1, i1 0
- %nop978 = alloca i1, i1 0
- %nop979 = alloca i1, i1 0
- %nop980 = alloca i1, i1 0
- %nop981 = alloca i1, i1 0
- %nop982 = alloca i1, i1 0
- %nop983 = alloca i1, i1 0
- %nop984 = alloca i1, i1 0
- %nop985 = alloca i1, i1 0
- %nop986 = alloca i1, i1 0
- %nop987 = alloca i1, i1 0
- %nop988 = alloca i1, i1 0
- %nop989 = alloca i1, i1 0
- %nop990 = alloca i1, i1 0
- %nop991 = alloca i1, i1 0
- %nop992 = alloca i1, i1 0
- %nop993 = alloca i1, i1 0
- %nop994 = alloca i1, i1 0
- %nop995 = alloca i1, i1 0
- %nop996 = alloca i1, i1 0
- %nop997 = alloca i1, i1 0
- %nop998 = alloca i1, i1 0
- %nop999 = alloca i1, i1 0
- %nop1000 = alloca i1, i1 0
- %nop1001 = alloca i1, i1 0
- %nop1002 = alloca i1, i1 0
- %nop1003 = alloca i1, i1 0
- %nop1004 = alloca i1, i1 0
- %nop1005 = alloca i1, i1 0
- %nop1006 = alloca i1, i1 0
- %nop1007 = alloca i1, i1 0
- %nop1008 = alloca i1, i1 0
- %nop1009 = alloca i1, i1 0
- %nop1010 = alloca i1, i1 0
- %nop1011 = alloca i1, i1 0
- %nop1012 = alloca i1, i1 0
- %nop1013 = alloca i1, i1 0
- %nop1014 = alloca i1, i1 0
- %nop1015 = alloca i1, i1 0
- %nop1016 = alloca i1, i1 0
- %nop1017 = alloca i1, i1 0
- %nop1018 = alloca i1, i1 0
- %nop1019 = alloca i1, i1 0
- %nop1020 = alloca i1, i1 0
- %nop1021 = alloca i1, i1 0
- %nop1022 = alloca i1, i1 0
- %nop1023 = alloca i1, i1 0
- %nop1024 = alloca i1, i1 0
- %nop1025 = alloca i1, i1 0
- %nop1026 = alloca i1, i1 0
- %nop1027 = alloca i1, i1 0
- %nop1028 = alloca i1, i1 0
- %nop1029 = alloca i1, i1 0
- %nop1030 = alloca i1, i1 0
- %nop1031 = alloca i1, i1 0
- %nop1032 = alloca i1, i1 0
- %nop1033 = alloca i1, i1 0
- %nop1034 = alloca i1, i1 0
- %nop1035 = alloca i1, i1 0
- %nop1036 = alloca i1, i1 0
- %nop1037 = alloca i1, i1 0
- %nop1038 = alloca i1, i1 0
- %nop1039 = alloca i1, i1 0
- %nop1040 = alloca i1, i1 0
- %nop1041 = alloca i1, i1 0
- %nop1042 = alloca i1, i1 0
- %nop1043 = alloca i1, i1 0
- %nop1044 = alloca i1, i1 0
- %nop1045 = alloca i1, i1 0
- %nop1046 = alloca i1, i1 0
- %nop1047 = alloca i1, i1 0
- %nop1048 = alloca i1, i1 0
- %nop1049 = alloca i1, i1 0
- %nop1050 = alloca i1, i1 0
- %nop1051 = alloca i1, i1 0
- %nop1052 = alloca i1, i1 0
- %nop1053 = alloca i1, i1 0
- %nop1054 = alloca i1, i1 0
- %nop1055 = alloca i1, i1 0
- %nop1056 = alloca i1, i1 0
- %nop1057 = alloca i1, i1 0
- %nop1058 = alloca i1, i1 0
- %nop1059 = alloca i1, i1 0
- %nop1060 = alloca i1, i1 0
- %nop1061 = alloca i1, i1 0
- %nop1062 = alloca i1, i1 0
- %nop1063 = alloca i1, i1 0
- %nop1064 = alloca i1, i1 0
- %nop1065 = alloca i1, i1 0
- %nop1066 = alloca i1, i1 0
- %nop1067 = alloca i1, i1 0
- %nop1068 = alloca i1, i1 0
- %nop1069 = alloca i1, i1 0
- %nop1070 = alloca i1, i1 0
- %nop1071 = alloca i1, i1 0
- %nop1072 = alloca i1, i1 0
- %nop1073 = alloca i1, i1 0
- %nop1074 = alloca i1, i1 0
- %nop1075 = alloca i1, i1 0
- %nop1076 = alloca i1, i1 0
- %nop1077 = alloca i1, i1 0
- %nop1078 = alloca i1, i1 0
- %nop1079 = alloca i1, i1 0
- %nop1080 = alloca i1, i1 0
- %nop1081 = alloca i1, i1 0
- %nop1082 = alloca i1, i1 0
- %nop1083 = alloca i1, i1 0
- %nop1084 = alloca i1, i1 0
- %nop1085 = alloca i1, i1 0
- %nop1086 = alloca i1, i1 0
- %nop1087 = alloca i1, i1 0
- %nop1088 = alloca i1, i1 0
- %nop1089 = alloca i1, i1 0
- %nop1090 = alloca i1, i1 0
- %nop1091 = alloca i1, i1 0
- %nop1092 = alloca i1, i1 0
- %nop1093 = alloca i1, i1 0
- %nop1094 = alloca i1, i1 0
- %nop1095 = alloca i1, i1 0
- %nop1096 = alloca i1, i1 0
- %nop1097 = alloca i1, i1 0
- %nop1098 = alloca i1, i1 0
- %nop1099 = alloca i1, i1 0
- %nop1100 = alloca i1, i1 0
- %nop1101 = alloca i1, i1 0
- %nop1102 = alloca i1, i1 0
- %nop1103 = alloca i1, i1 0
- %nop1104 = alloca i1, i1 0
- %nop1105 = alloca i1, i1 0
- %nop1106 = alloca i1, i1 0
- %nop1107 = alloca i1, i1 0
- %nop1108 = alloca i1, i1 0
- %nop1109 = alloca i1, i1 0
- %nop1110 = alloca i1, i1 0
- %nop1111 = alloca i1, i1 0
- %nop1112 = alloca i1, i1 0
- %nop1113 = alloca i1, i1 0
- %nop1114 = alloca i1, i1 0
- %nop1115 = alloca i1, i1 0
- %nop1116 = alloca i1, i1 0
- %nop1117 = alloca i1, i1 0
- %nop1118 = alloca i1, i1 0
- %nop1119 = alloca i1, i1 0
- %nop1120 = alloca i1, i1 0
- %nop1121 = alloca i1, i1 0
- %nop1122 = alloca i1, i1 0
- %nop1123 = alloca i1, i1 0
- %nop1124 = alloca i1, i1 0
- %nop1125 = alloca i1, i1 0
- %nop1126 = alloca i1, i1 0
- %nop1127 = alloca i1, i1 0
- %nop1128 = alloca i1, i1 0
- %nop1129 = alloca i1, i1 0
- %nop1130 = alloca i1, i1 0
- %nop1131 = alloca i1, i1 0
- %nop1132 = alloca i1, i1 0
- %nop1133 = alloca i1, i1 0
- %nop1134 = alloca i1, i1 0
- %nop1135 = alloca i1, i1 0
- %nop1136 = alloca i1, i1 0
- %nop1137 = alloca i1, i1 0
- %nop1138 = alloca i1, i1 0
- %nop1139 = alloca i1, i1 0
- %nop1140 = alloca i1, i1 0
- %nop1141 = alloca i1, i1 0
- %nop1142 = alloca i1, i1 0
- %nop1143 = alloca i1, i1 0
- %nop1144 = alloca i1, i1 0
- %nop1145 = alloca i1, i1 0
- %nop1146 = alloca i1, i1 0
- %nop1147 = alloca i1, i1 0
- %nop1148 = alloca i1, i1 0
- %nop1149 = alloca i1, i1 0
- %nop1150 = alloca i1, i1 0
- %nop1151 = alloca i1, i1 0
- %nop1152 = alloca i1, i1 0
- %nop1153 = alloca i1, i1 0
- %nop1154 = alloca i1, i1 0
- %nop1155 = alloca i1, i1 0
- %nop1156 = alloca i1, i1 0
- %nop1157 = alloca i1, i1 0
- %nop1158 = alloca i1, i1 0
- %nop1159 = alloca i1, i1 0
- %nop1160 = alloca i1, i1 0
- %nop1161 = alloca i1, i1 0
- %nop1162 = alloca i1, i1 0
- %nop1163 = alloca i1, i1 0
- %nop1164 = alloca i1, i1 0
- %nop1165 = alloca i1, i1 0
- %nop1166 = alloca i1, i1 0
- %nop1167 = alloca i1, i1 0
- %nop1168 = alloca i1, i1 0
- %nop1169 = alloca i1, i1 0
- %nop1170 = alloca i1, i1 0
- %nop1171 = alloca i1, i1 0
- %nop1172 = alloca i1, i1 0
- %nop1173 = alloca i1, i1 0
- %nop1174 = alloca i1, i1 0
- %nop1175 = alloca i1, i1 0
- %nop1176 = alloca i1, i1 0
- %nop1177 = alloca i1, i1 0
- %nop1178 = alloca i1, i1 0
- %nop1179 = alloca i1, i1 0
- %nop1180 = alloca i1, i1 0
- %nop1181 = alloca i1, i1 0
- %nop1182 = alloca i1, i1 0
- %nop1183 = alloca i1, i1 0
- %nop1184 = alloca i1, i1 0
- %nop1185 = alloca i1, i1 0
- %nop1186 = alloca i1, i1 0
- %nop1187 = alloca i1, i1 0
- %nop1188 = alloca i1, i1 0
- %nop1189 = alloca i1, i1 0
- %nop1190 = alloca i1, i1 0
- %nop1191 = alloca i1, i1 0
- %nop1192 = alloca i1, i1 0
- %nop1193 = alloca i1, i1 0
- %nop1194 = alloca i1, i1 0
- %nop1195 = alloca i1, i1 0
- %nop1196 = alloca i1, i1 0
- %nop1197 = alloca i1, i1 0
- %nop1198 = alloca i1, i1 0
- %nop1199 = alloca i1, i1 0
- %nop1200 = alloca i1, i1 0
- %nop1201 = alloca i1, i1 0
- %nop1202 = alloca i1, i1 0
- %nop1203 = alloca i1, i1 0
- %nop1204 = alloca i1, i1 0
- %nop1205 = alloca i1, i1 0
- %nop1206 = alloca i1, i1 0
- %nop1207 = alloca i1, i1 0
- %nop1208 = alloca i1, i1 0
- %nop1209 = alloca i1, i1 0
- %nop1210 = alloca i1, i1 0
- %nop1211 = alloca i1, i1 0
- %nop1212 = alloca i1, i1 0
- %nop1213 = alloca i1, i1 0
- %nop1214 = alloca i1, i1 0
- %nop1215 = alloca i1, i1 0
- %nop1216 = alloca i1, i1 0
- %nop1217 = alloca i1, i1 0
- %nop1218 = alloca i1, i1 0
- %nop1219 = alloca i1, i1 0
- %nop1220 = alloca i1, i1 0
- %nop1221 = alloca i1, i1 0
- %nop1222 = alloca i1, i1 0
- %nop1223 = alloca i1, i1 0
- %nop1224 = alloca i1, i1 0
- %nop1225 = alloca i1, i1 0
- %nop1226 = alloca i1, i1 0
- %nop1227 = alloca i1, i1 0
- %nop1228 = alloca i1, i1 0
- %nop1229 = alloca i1, i1 0
- %nop1230 = alloca i1, i1 0
- %nop1231 = alloca i1, i1 0
- %nop1232 = alloca i1, i1 0
- %nop1233 = alloca i1, i1 0
- %nop1234 = alloca i1, i1 0
- %nop1235 = alloca i1, i1 0
- %nop1236 = alloca i1, i1 0
- %nop1237 = alloca i1, i1 0
- %nop1238 = alloca i1, i1 0
- %nop1239 = alloca i1, i1 0
- %nop1240 = alloca i1, i1 0
- %nop1241 = alloca i1, i1 0
- %nop1242 = alloca i1, i1 0
- %nop1243 = alloca i1, i1 0
- %nop1244 = alloca i1, i1 0
- %nop1245 = alloca i1, i1 0
- %nop1246 = alloca i1, i1 0
- %nop1247 = alloca i1, i1 0
- %nop1248 = alloca i1, i1 0
- %nop1249 = alloca i1, i1 0
- %nop1250 = alloca i1, i1 0
- %nop1251 = alloca i1, i1 0
- %nop1252 = alloca i1, i1 0
- %nop1253 = alloca i1, i1 0
- %nop1254 = alloca i1, i1 0
- %nop1255 = alloca i1, i1 0
- %nop1256 = alloca i1, i1 0
- %nop1257 = alloca i1, i1 0
- %nop1258 = alloca i1, i1 0
- %nop1259 = alloca i1, i1 0
- %nop1260 = alloca i1, i1 0
- %nop1261 = alloca i1, i1 0
- %nop1262 = alloca i1, i1 0
- %nop1263 = alloca i1, i1 0
- %nop1264 = alloca i1, i1 0
- %nop1265 = alloca i1, i1 0
- %nop1266 = alloca i1, i1 0
- %nop1267 = alloca i1, i1 0
- %nop1268 = alloca i1, i1 0
- %nop1269 = alloca i1, i1 0
- %nop1270 = alloca i1, i1 0
- %nop1271 = alloca i1, i1 0
- %nop1272 = alloca i1, i1 0
- %nop1273 = alloca i1, i1 0
- %nop1274 = alloca i1, i1 0
- %nop1275 = alloca i1, i1 0
- %nop1276 = alloca i1, i1 0
- %nop1277 = alloca i1, i1 0
- %nop1278 = alloca i1, i1 0
- %nop1279 = alloca i1, i1 0
- %nop1280 = alloca i1, i1 0
- %nop1281 = alloca i1, i1 0
- %nop1282 = alloca i1, i1 0
- %nop1283 = alloca i1, i1 0
- %nop1284 = alloca i1, i1 0
- %nop1285 = alloca i1, i1 0
- %nop1286 = alloca i1, i1 0
- %nop1287 = alloca i1, i1 0
- %nop1288 = alloca i1, i1 0
- %nop1289 = alloca i1, i1 0
- %nop1290 = alloca i1, i1 0
- %nop1291 = alloca i1, i1 0
- %nop1292 = alloca i1, i1 0
- %nop1293 = alloca i1, i1 0
- %nop1294 = alloca i1, i1 0
- %nop1295 = alloca i1, i1 0
- %nop1296 = alloca i1, i1 0
- %nop1297 = alloca i1, i1 0
- %nop1298 = alloca i1, i1 0
- %nop1299 = alloca i1, i1 0
- %nop1300 = alloca i1, i1 0
- %nop1301 = alloca i1, i1 0
- %nop1302 = alloca i1, i1 0
- %nop1303 = alloca i1, i1 0
- %nop1304 = alloca i1, i1 0
- %nop1305 = alloca i1, i1 0
- %nop1306 = alloca i1, i1 0
- %nop1307 = alloca i1, i1 0
- %nop1308 = alloca i1, i1 0
- %nop1309 = alloca i1, i1 0
- %nop1310 = alloca i1, i1 0
- %nop1311 = alloca i1, i1 0
- %nop1312 = alloca i1, i1 0
- %nop1313 = alloca i1, i1 0
- %nop1314 = alloca i1, i1 0
- %nop1315 = alloca i1, i1 0
- %nop1316 = alloca i1, i1 0
- %nop1317 = alloca i1, i1 0
- %nop1318 = alloca i1, i1 0
- %nop1319 = alloca i1, i1 0
- %nop1320 = alloca i1, i1 0
- %nop1321 = alloca i1, i1 0
- %nop1322 = alloca i1, i1 0
- %nop1323 = alloca i1, i1 0
- %nop1324 = alloca i1, i1 0
- %nop1325 = alloca i1, i1 0
- %nop1326 = alloca i1, i1 0
- %nop1327 = alloca i1, i1 0
- %nop1328 = alloca i1, i1 0
- %nop1329 = alloca i1, i1 0
- %nop1330 = alloca i1, i1 0
- %nop1331 = alloca i1, i1 0
- %nop1332 = alloca i1, i1 0
- %nop1333 = alloca i1, i1 0
- %nop1334 = alloca i1, i1 0
- %nop1335 = alloca i1, i1 0
- %nop1336 = alloca i1, i1 0
- %nop1337 = alloca i1, i1 0
- %nop1338 = alloca i1, i1 0
- %nop1339 = alloca i1, i1 0
- %nop1340 = alloca i1, i1 0
- %nop1341 = alloca i1, i1 0
- %nop1342 = alloca i1, i1 0
- %nop1343 = alloca i1, i1 0
- %nop1344 = alloca i1, i1 0
- %nop1345 = alloca i1, i1 0
- %nop1346 = alloca i1, i1 0
- %nop1347 = alloca i1, i1 0
- %nop1348 = alloca i1, i1 0
- %nop1349 = alloca i1, i1 0
- %nop1350 = alloca i1, i1 0
- %nop1351 = alloca i1, i1 0
- %nop1352 = alloca i1, i1 0
- %nop1353 = alloca i1, i1 0
- %nop1354 = alloca i1, i1 0
- %nop1355 = alloca i1, i1 0
- %nop1356 = alloca i1, i1 0
- %nop1357 = alloca i1, i1 0
- %nop1358 = alloca i1, i1 0
- %nop1359 = alloca i1, i1 0
- %nop1360 = alloca i1, i1 0
- %nop1361 = alloca i1, i1 0
- %nop1362 = alloca i1, i1 0
- %nop1363 = alloca i1, i1 0
- %nop1364 = alloca i1, i1 0
- %nop1365 = alloca i1, i1 0
- %nop1366 = alloca i1, i1 0
- %nop1367 = alloca i1, i1 0
- %nop1368 = alloca i1, i1 0
- %nop1369 = alloca i1, i1 0
- %nop1370 = alloca i1, i1 0
- %nop1371 = alloca i1, i1 0
- %nop1372 = alloca i1, i1 0
- %nop1373 = alloca i1, i1 0
- %nop1374 = alloca i1, i1 0
- %nop1375 = alloca i1, i1 0
- %nop1376 = alloca i1, i1 0
- %nop1377 = alloca i1, i1 0
- %nop1378 = alloca i1, i1 0
- %nop1379 = alloca i1, i1 0
- %nop1380 = alloca i1, i1 0
- %nop1381 = alloca i1, i1 0
- %nop1382 = alloca i1, i1 0
- %nop1383 = alloca i1, i1 0
- %nop1384 = alloca i1, i1 0
- %nop1385 = alloca i1, i1 0
- %nop1386 = alloca i1, i1 0
- %nop1387 = alloca i1, i1 0
- %nop1388 = alloca i1, i1 0
- %nop1389 = alloca i1, i1 0
- %nop1390 = alloca i1, i1 0
- %nop1391 = alloca i1, i1 0
- %nop1392 = alloca i1, i1 0
- %nop1393 = alloca i1, i1 0
- %nop1394 = alloca i1, i1 0
- %nop1395 = alloca i1, i1 0
- %nop1396 = alloca i1, i1 0
- %nop1397 = alloca i1, i1 0
- %nop1398 = alloca i1, i1 0
- %nop1399 = alloca i1, i1 0
- %nop1400 = alloca i1, i1 0
- %nop1401 = alloca i1, i1 0
- %nop1402 = alloca i1, i1 0
- %nop1403 = alloca i1, i1 0
- %nop1404 = alloca i1, i1 0
- %nop1405 = alloca i1, i1 0
- %nop1406 = alloca i1, i1 0
- %nop1407 = alloca i1, i1 0
- %nop1408 = alloca i1, i1 0
- %nop1409 = alloca i1, i1 0
- %nop1410 = alloca i1, i1 0
- %nop1411 = alloca i1, i1 0
- %nop1412 = alloca i1, i1 0
- %nop1413 = alloca i1, i1 0
- %nop1414 = alloca i1, i1 0
- %nop1415 = alloca i1, i1 0
- %nop1416 = alloca i1, i1 0
- %nop1417 = alloca i1, i1 0
- %nop1418 = alloca i1, i1 0
- %nop1419 = alloca i1, i1 0
- %nop1420 = alloca i1, i1 0
- %nop1421 = alloca i1, i1 0
- %nop1422 = alloca i1, i1 0
- %nop1423 = alloca i1, i1 0
- %nop1424 = alloca i1, i1 0
- %nop1425 = alloca i1, i1 0
- %nop1426 = alloca i1, i1 0
- %nop1427 = alloca i1, i1 0
- %nop1428 = alloca i1, i1 0
- %nop1429 = alloca i1, i1 0
- %nop1430 = alloca i1, i1 0
- %nop1431 = alloca i1, i1 0
- %nop1432 = alloca i1, i1 0
- %nop1433 = alloca i1, i1 0
- %nop1434 = alloca i1, i1 0
- %nop1435 = alloca i1, i1 0
- %nop1436 = alloca i1, i1 0
- %nop1437 = alloca i1, i1 0
- %nop1438 = alloca i1, i1 0
- %nop1439 = alloca i1, i1 0
- %nop1440 = alloca i1, i1 0
- %nop1441 = alloca i1, i1 0
- %nop1442 = alloca i1, i1 0
- %nop1443 = alloca i1, i1 0
- %nop1444 = alloca i1, i1 0
- %nop1445 = alloca i1, i1 0
- %nop1446 = alloca i1, i1 0
- %nop1447 = alloca i1, i1 0
- %nop1448 = alloca i1, i1 0
- %nop1449 = alloca i1, i1 0
- %nop1450 = alloca i1, i1 0
- %nop1451 = alloca i1, i1 0
- %nop1452 = alloca i1, i1 0
- %nop1453 = alloca i1, i1 0
- %nop1454 = alloca i1, i1 0
- %nop1455 = alloca i1, i1 0
- %nop1456 = alloca i1, i1 0
- %nop1457 = alloca i1, i1 0
- %nop1458 = alloca i1, i1 0
- %nop1459 = alloca i1, i1 0
- %nop1460 = alloca i1, i1 0
- %nop1461 = alloca i1, i1 0
- %nop1462 = alloca i1, i1 0
- %nop1463 = alloca i1, i1 0
- %nop1464 = alloca i1, i1 0
- %nop1465 = alloca i1, i1 0
- %nop1466 = alloca i1, i1 0
- %nop1467 = alloca i1, i1 0
- %nop1468 = alloca i1, i1 0
- %nop1469 = alloca i1, i1 0
- %nop1470 = alloca i1, i1 0
- %nop1471 = alloca i1, i1 0
- %nop1472 = alloca i1, i1 0
- %nop1473 = alloca i1, i1 0
- %nop1474 = alloca i1, i1 0
- %nop1475 = alloca i1, i1 0
- %nop1476 = alloca i1, i1 0
- %nop1477 = alloca i1, i1 0
- %nop1478 = alloca i1, i1 0
- %nop1479 = alloca i1, i1 0
- %nop1480 = alloca i1, i1 0
- %nop1481 = alloca i1, i1 0
- %nop1482 = alloca i1, i1 0
- %nop1483 = alloca i1, i1 0
- %nop1484 = alloca i1, i1 0
- %nop1485 = alloca i1, i1 0
- %nop1486 = alloca i1, i1 0
- %nop1487 = alloca i1, i1 0
- %nop1488 = alloca i1, i1 0
- %nop1489 = alloca i1, i1 0
- %nop1490 = alloca i1, i1 0
- %nop1491 = alloca i1, i1 0
- %nop1492 = alloca i1, i1 0
- %nop1493 = alloca i1, i1 0
- %nop1494 = alloca i1, i1 0
- %nop1495 = alloca i1, i1 0
- %nop1496 = alloca i1, i1 0
- %nop1497 = alloca i1, i1 0
- %nop1498 = alloca i1, i1 0
- %nop1499 = alloca i1, i1 0
- %nop1500 = alloca i1, i1 0
- %nop1501 = alloca i1, i1 0
- %nop1502 = alloca i1, i1 0
- %nop1503 = alloca i1, i1 0
- %nop1504 = alloca i1, i1 0
- %nop1505 = alloca i1, i1 0
- %nop1506 = alloca i1, i1 0
- %nop1507 = alloca i1, i1 0
- %nop1508 = alloca i1, i1 0
- %nop1509 = alloca i1, i1 0
- %nop1510 = alloca i1, i1 0
- %nop1511 = alloca i1, i1 0
- %nop1512 = alloca i1, i1 0
- %nop1513 = alloca i1, i1 0
- %nop1514 = alloca i1, i1 0
- %nop1515 = alloca i1, i1 0
- %nop1516 = alloca i1, i1 0
- %nop1517 = alloca i1, i1 0
- %nop1518 = alloca i1, i1 0
- %nop1519 = alloca i1, i1 0
- %nop1520 = alloca i1, i1 0
- %nop1521 = alloca i1, i1 0
- %nop1522 = alloca i1, i1 0
- %nop1523 = alloca i1, i1 0
- %nop1524 = alloca i1, i1 0
- %nop1525 = alloca i1, i1 0
- %nop1526 = alloca i1, i1 0
- %nop1527 = alloca i1, i1 0
- %nop1528 = alloca i1, i1 0
- %nop1529 = alloca i1, i1 0
- %nop1530 = alloca i1, i1 0
- %nop1531 = alloca i1, i1 0
- %nop1532 = alloca i1, i1 0
- %nop1533 = alloca i1, i1 0
- %nop1534 = alloca i1, i1 0
- %nop1535 = alloca i1, i1 0
- %nop1536 = alloca i1, i1 0
- %nop1537 = alloca i1, i1 0
- %nop1538 = alloca i1, i1 0
- %nop1539 = alloca i1, i1 0
- %nop1540 = alloca i1, i1 0
- %nop1541 = alloca i1, i1 0
- %nop1542 = alloca i1, i1 0
- %nop1543 = alloca i1, i1 0
- %nop1544 = alloca i1, i1 0
- %nop1545 = alloca i1, i1 0
- %nop1546 = alloca i1, i1 0
- %nop1547 = alloca i1, i1 0
- %nop1548 = alloca i1, i1 0
- %nop1549 = alloca i1, i1 0
- %nop1550 = alloca i1, i1 0
- %nop1551 = alloca i1, i1 0
- %nop1552 = alloca i1, i1 0
- %nop1553 = alloca i1, i1 0
- %nop1554 = alloca i1, i1 0
- %nop1555 = alloca i1, i1 0
- %nop1556 = alloca i1, i1 0
- %nop1557 = alloca i1, i1 0
- %nop1558 = alloca i1, i1 0
- %nop1559 = alloca i1, i1 0
- %nop1560 = alloca i1, i1 0
- %nop1561 = alloca i1, i1 0
- %nop1562 = alloca i1, i1 0
- %nop1563 = alloca i1, i1 0
- %nop1564 = alloca i1, i1 0
- %nop1565 = alloca i1, i1 0
- %nop1566 = alloca i1, i1 0
- %nop1567 = alloca i1, i1 0
- %nop1568 = alloca i1, i1 0
- %nop1569 = alloca i1, i1 0
- %nop1570 = alloca i1, i1 0
- %nop1571 = alloca i1, i1 0
- %nop1572 = alloca i1, i1 0
- %nop1573 = alloca i1, i1 0
- %nop1574 = alloca i1, i1 0
- %nop1575 = alloca i1, i1 0
- %nop1576 = alloca i1, i1 0
- %nop1577 = alloca i1, i1 0
- %nop1578 = alloca i1, i1 0
- %nop1579 = alloca i1, i1 0
- %nop1580 = alloca i1, i1 0
- %nop1581 = alloca i1, i1 0
- %nop1582 = alloca i1, i1 0
- %nop1583 = alloca i1, i1 0
- %nop1584 = alloca i1, i1 0
- %nop1585 = alloca i1, i1 0
- %nop1586 = alloca i1, i1 0
- %nop1587 = alloca i1, i1 0
- %nop1588 = alloca i1, i1 0
- %nop1589 = alloca i1, i1 0
- %nop1590 = alloca i1, i1 0
- %nop1591 = alloca i1, i1 0
- %nop1592 = alloca i1, i1 0
- %nop1593 = alloca i1, i1 0
- %nop1594 = alloca i1, i1 0
- %nop1595 = alloca i1, i1 0
- %nop1596 = alloca i1, i1 0
- %nop1597 = alloca i1, i1 0
- %nop1598 = alloca i1, i1 0
- %nop1599 = alloca i1, i1 0
- %nop1600 = alloca i1, i1 0
- %nop1601 = alloca i1, i1 0
- %nop1602 = alloca i1, i1 0
- %nop1603 = alloca i1, i1 0
- %nop1604 = alloca i1, i1 0
- %nop1605 = alloca i1, i1 0
- %nop1606 = alloca i1, i1 0
- %nop1607 = alloca i1, i1 0
- %nop1608 = alloca i1, i1 0
- %nop1609 = alloca i1, i1 0
- %nop1610 = alloca i1, i1 0
- %nop1611 = alloca i1, i1 0
- %nop1612 = alloca i1, i1 0
- %nop1613 = alloca i1, i1 0
- %nop1614 = alloca i1, i1 0
- %nop1615 = alloca i1, i1 0
- %nop1616 = alloca i1, i1 0
- %nop1617 = alloca i1, i1 0
- %nop1618 = alloca i1, i1 0
- %nop1619 = alloca i1, i1 0
- %nop1620 = alloca i1, i1 0
- %nop1621 = alloca i1, i1 0
- %nop1622 = alloca i1, i1 0
- %nop1623 = alloca i1, i1 0
- %nop1624 = alloca i1, i1 0
- %nop1625 = alloca i1, i1 0
- %nop1626 = alloca i1, i1 0
- %nop1627 = alloca i1, i1 0
- %nop1628 = alloca i1, i1 0
- %nop1629 = alloca i1, i1 0
- %nop1630 = alloca i1, i1 0
- %nop1631 = alloca i1, i1 0
- %nop1632 = alloca i1, i1 0
- %nop1633 = alloca i1, i1 0
- %nop1634 = alloca i1, i1 0
- %nop1635 = alloca i1, i1 0
- %nop1636 = alloca i1, i1 0
- %nop1637 = alloca i1, i1 0
- %nop1638 = alloca i1, i1 0
- %nop1639 = alloca i1, i1 0
- %nop1640 = alloca i1, i1 0
- %nop1641 = alloca i1, i1 0
- %nop1642 = alloca i1, i1 0
- %nop1643 = alloca i1, i1 0
- %nop1644 = alloca i1, i1 0
- %nop1645 = alloca i1, i1 0
- %nop1646 = alloca i1, i1 0
- %nop1647 = alloca i1, i1 0
- %nop1648 = alloca i1, i1 0
- %nop1649 = alloca i1, i1 0
- %nop1650 = alloca i1, i1 0
- %nop1651 = alloca i1, i1 0
- %nop1652 = alloca i1, i1 0
- %nop1653 = alloca i1, i1 0
- %nop1654 = alloca i1, i1 0
- %nop1655 = alloca i1, i1 0
- %nop1656 = alloca i1, i1 0
- %nop1657 = alloca i1, i1 0
- %nop1658 = alloca i1, i1 0
- %nop1659 = alloca i1, i1 0
- %nop1660 = alloca i1, i1 0
- %nop1661 = alloca i1, i1 0
- %nop1662 = alloca i1, i1 0
- %nop1663 = alloca i1, i1 0
- %nop1664 = alloca i1, i1 0
- %nop1665 = alloca i1, i1 0
- %nop1666 = alloca i1, i1 0
- %nop1667 = alloca i1, i1 0
- %nop1668 = alloca i1, i1 0
- %nop1669 = alloca i1, i1 0
- %nop1670 = alloca i1, i1 0
- %nop1671 = alloca i1, i1 0
- %nop1672 = alloca i1, i1 0
- %nop1673 = alloca i1, i1 0
- %nop1674 = alloca i1, i1 0
- %nop1675 = alloca i1, i1 0
- %nop1676 = alloca i1, i1 0
- %nop1677 = alloca i1, i1 0
- %nop1678 = alloca i1, i1 0
- %nop1679 = alloca i1, i1 0
- %nop1680 = alloca i1, i1 0
- %nop1681 = alloca i1, i1 0
- %nop1682 = alloca i1, i1 0
- %nop1683 = alloca i1, i1 0
- %nop1684 = alloca i1, i1 0
- %nop1685 = alloca i1, i1 0
- %nop1686 = alloca i1, i1 0
- %nop1687 = alloca i1, i1 0
- %nop1688 = alloca i1, i1 0
- %nop1689 = alloca i1, i1 0
- %nop1690 = alloca i1, i1 0
- %nop1691 = alloca i1, i1 0
- %nop1692 = alloca i1, i1 0
- %nop1693 = alloca i1, i1 0
- %nop1694 = alloca i1, i1 0
- %nop1695 = alloca i1, i1 0
- %nop1696 = alloca i1, i1 0
- %nop1697 = alloca i1, i1 0
- %nop1698 = alloca i1, i1 0
- %nop1699 = alloca i1, i1 0
- %nop1700 = alloca i1, i1 0
- %nop1701 = alloca i1, i1 0
- %nop1702 = alloca i1, i1 0
- %nop1703 = alloca i1, i1 0
- %nop1704 = alloca i1, i1 0
- %nop1705 = alloca i1, i1 0
- %nop1706 = alloca i1, i1 0
- %nop1707 = alloca i1, i1 0
- %nop1708 = alloca i1, i1 0
- %nop1709 = alloca i1, i1 0
- %nop1710 = alloca i1, i1 0
- %nop1711 = alloca i1, i1 0
- %nop1712 = alloca i1, i1 0
- %nop1713 = alloca i1, i1 0
- %nop1714 = alloca i1, i1 0
- %nop1715 = alloca i1, i1 0
- %nop1716 = alloca i1, i1 0
- %nop1717 = alloca i1, i1 0
- %nop1718 = alloca i1, i1 0
- %nop1719 = alloca i1, i1 0
- %nop1720 = alloca i1, i1 0
- %nop1721 = alloca i1, i1 0
- %nop1722 = alloca i1, i1 0
- %nop1723 = alloca i1, i1 0
- %nop1724 = alloca i1, i1 0
- %nop1725 = alloca i1, i1 0
- %nop1726 = alloca i1, i1 0
- %nop1727 = alloca i1, i1 0
- %nop1728 = alloca i1, i1 0
- %nop1729 = alloca i1, i1 0
- %nop1730 = alloca i1, i1 0
- %nop1731 = alloca i1, i1 0
- %nop1732 = alloca i1, i1 0
- %nop1733 = alloca i1, i1 0
- %nop1734 = alloca i1, i1 0
- %nop1735 = alloca i1, i1 0
- %nop1736 = alloca i1, i1 0
- %nop1737 = alloca i1, i1 0
- %nop1738 = alloca i1, i1 0
- %nop1739 = alloca i1, i1 0
- %nop1740 = alloca i1, i1 0
- %nop1741 = alloca i1, i1 0
- %nop1742 = alloca i1, i1 0
- %nop1743 = alloca i1, i1 0
- %nop1744 = alloca i1, i1 0
- %nop1745 = alloca i1, i1 0
- %nop1746 = alloca i1, i1 0
- %nop1747 = alloca i1, i1 0
- %nop1748 = alloca i1, i1 0
- %nop1749 = alloca i1, i1 0
- %nop1750 = alloca i1, i1 0
- %nop1751 = alloca i1, i1 0
- %nop1752 = alloca i1, i1 0
- %nop1753 = alloca i1, i1 0
- %nop1754 = alloca i1, i1 0
- %nop1755 = alloca i1, i1 0
- %nop1756 = alloca i1, i1 0
- %nop1757 = alloca i1, i1 0
- %nop1758 = alloca i1, i1 0
- %nop1759 = alloca i1, i1 0
- %nop1760 = alloca i1, i1 0
- %nop1761 = alloca i1, i1 0
- %nop1762 = alloca i1, i1 0
- %nop1763 = alloca i1, i1 0
- %nop1764 = alloca i1, i1 0
- %nop1765 = alloca i1, i1 0
- %nop1766 = alloca i1, i1 0
- %nop1767 = alloca i1, i1 0
- %nop1768 = alloca i1, i1 0
- %nop1769 = alloca i1, i1 0
- %nop1770 = alloca i1, i1 0
- %nop1771 = alloca i1, i1 0
- %nop1772 = alloca i1, i1 0
- %nop1773 = alloca i1, i1 0
- %nop1774 = alloca i1, i1 0
- %nop1775 = alloca i1, i1 0
- %nop1776 = alloca i1, i1 0
- %nop1777 = alloca i1, i1 0
- %nop1778 = alloca i1, i1 0
- %nop1779 = alloca i1, i1 0
- %nop1780 = alloca i1, i1 0
- %nop1781 = alloca i1, i1 0
- %nop1782 = alloca i1, i1 0
- %nop1783 = alloca i1, i1 0
- %nop1784 = alloca i1, i1 0
- %nop1785 = alloca i1, i1 0
- %nop1786 = alloca i1, i1 0
- %nop1787 = alloca i1, i1 0
- %nop1788 = alloca i1, i1 0
- %nop1789 = alloca i1, i1 0
- %nop1790 = alloca i1, i1 0
- %nop1791 = alloca i1, i1 0
- %nop1792 = alloca i1, i1 0
- %nop1793 = alloca i1, i1 0
- %nop1794 = alloca i1, i1 0
- %nop1795 = alloca i1, i1 0
- %nop1796 = alloca i1, i1 0
- %nop1797 = alloca i1, i1 0
- %nop1798 = alloca i1, i1 0
- %nop1799 = alloca i1, i1 0
- %nop1800 = alloca i1, i1 0
- %nop1801 = alloca i1, i1 0
- %nop1802 = alloca i1, i1 0
- %nop1803 = alloca i1, i1 0
- %nop1804 = alloca i1, i1 0
- %nop1805 = alloca i1, i1 0
- %nop1806 = alloca i1, i1 0
- %nop1807 = alloca i1, i1 0
- %nop1808 = alloca i1, i1 0
- %nop1809 = alloca i1, i1 0
- %nop1810 = alloca i1, i1 0
- %nop1811 = alloca i1, i1 0
- %nop1812 = alloca i1, i1 0
- %nop1813 = alloca i1, i1 0
- %nop1814 = alloca i1, i1 0
- %nop1815 = alloca i1, i1 0
- %nop1816 = alloca i1, i1 0
- %nop1817 = alloca i1, i1 0
- %nop1818 = alloca i1, i1 0
- %nop1819 = alloca i1, i1 0
- %nop1820 = alloca i1, i1 0
- %nop1821 = alloca i1, i1 0
- %nop1822 = alloca i1, i1 0
- %nop1823 = alloca i1, i1 0
- %nop1824 = alloca i1, i1 0
- %nop1825 = alloca i1, i1 0
- %nop1826 = alloca i1, i1 0
- %nop1827 = alloca i1, i1 0
- %nop1828 = alloca i1, i1 0
- %nop1829 = alloca i1, i1 0
- %nop1830 = alloca i1, i1 0
- %nop1831 = alloca i1, i1 0
- %nop1832 = alloca i1, i1 0
- %nop1833 = alloca i1, i1 0
- %nop1834 = alloca i1, i1 0
- %nop1835 = alloca i1, i1 0
- %nop1836 = alloca i1, i1 0
- %nop1837 = alloca i1, i1 0
- %nop1838 = alloca i1, i1 0
- %nop1839 = alloca i1, i1 0
- %nop1840 = alloca i1, i1 0
- %nop1841 = alloca i1, i1 0
- %nop1842 = alloca i1, i1 0
- %nop1843 = alloca i1, i1 0
- %nop1844 = alloca i1, i1 0
- %nop1845 = alloca i1, i1 0
- %nop1846 = alloca i1, i1 0
- %nop1847 = alloca i1, i1 0
- %nop1848 = alloca i1, i1 0
- %nop1849 = alloca i1, i1 0
- %nop1850 = alloca i1, i1 0
- %nop1851 = alloca i1, i1 0
- %nop1852 = alloca i1, i1 0
- %nop1853 = alloca i1, i1 0
- %nop1854 = alloca i1, i1 0
- %nop1855 = alloca i1, i1 0
- %nop1856 = alloca i1, i1 0
- %nop1857 = alloca i1, i1 0
- %nop1858 = alloca i1, i1 0
- %nop1859 = alloca i1, i1 0
- %nop1860 = alloca i1, i1 0
- %nop1861 = alloca i1, i1 0
- %nop1862 = alloca i1, i1 0
- %nop1863 = alloca i1, i1 0
- %nop1864 = alloca i1, i1 0
- %nop1865 = alloca i1, i1 0
- %nop1866 = alloca i1, i1 0
- %nop1867 = alloca i1, i1 0
- %nop1868 = alloca i1, i1 0
- %nop1869 = alloca i1, i1 0
- %nop1870 = alloca i1, i1 0
- %nop1871 = alloca i1, i1 0
- %nop1872 = alloca i1, i1 0
- %nop1873 = alloca i1, i1 0
- %nop1874 = alloca i1, i1 0
- %nop1875 = alloca i1, i1 0
- %nop1876 = alloca i1, i1 0
- %nop1877 = alloca i1, i1 0
- %nop1878 = alloca i1, i1 0
- %nop1879 = alloca i1, i1 0
- %nop1880 = alloca i1, i1 0
- %nop1881 = alloca i1, i1 0
- %nop1882 = alloca i1, i1 0
- %nop1883 = alloca i1, i1 0
- %nop1884 = alloca i1, i1 0
- %nop1885 = alloca i1, i1 0
- %nop1886 = alloca i1, i1 0
- %nop1887 = alloca i1, i1 0
- %nop1888 = alloca i1, i1 0
- %nop1889 = alloca i1, i1 0
- %nop1890 = alloca i1, i1 0
- %nop1891 = alloca i1, i1 0
- %nop1892 = alloca i1, i1 0
- %nop1893 = alloca i1, i1 0
- %nop1894 = alloca i1, i1 0
- %nop1895 = alloca i1, i1 0
- %nop1896 = alloca i1, i1 0
- %nop1897 = alloca i1, i1 0
- %nop1898 = alloca i1, i1 0
- %nop1899 = alloca i1, i1 0
- %nop1900 = alloca i1, i1 0
- %nop1901 = alloca i1, i1 0
- %nop1902 = alloca i1, i1 0
- %nop1903 = alloca i1, i1 0
- %nop1904 = alloca i1, i1 0
- %nop1905 = alloca i1, i1 0
- %nop1906 = alloca i1, i1 0
- %nop1907 = alloca i1, i1 0
- %nop1908 = alloca i1, i1 0
- %nop1909 = alloca i1, i1 0
- %nop1910 = alloca i1, i1 0
- %nop1911 = alloca i1, i1 0
- %nop1912 = alloca i1, i1 0
- %nop1913 = alloca i1, i1 0
- %nop1914 = alloca i1, i1 0
- %nop1915 = alloca i1, i1 0
- %nop1916 = alloca i1, i1 0
- %nop1917 = alloca i1, i1 0
- %nop1918 = alloca i1, i1 0
- %nop1919 = alloca i1, i1 0
- %nop1920 = alloca i1, i1 0
- %nop1921 = alloca i1, i1 0
- %nop1922 = alloca i1, i1 0
- %nop1923 = alloca i1, i1 0
- %nop1924 = alloca i1, i1 0
- %nop1925 = alloca i1, i1 0
- %nop1926 = alloca i1, i1 0
- %nop1927 = alloca i1, i1 0
- %nop1928 = alloca i1, i1 0
- %nop1929 = alloca i1, i1 0
- %nop1930 = alloca i1, i1 0
- %nop1931 = alloca i1, i1 0
- %nop1932 = alloca i1, i1 0
- %nop1933 = alloca i1, i1 0
- %nop1934 = alloca i1, i1 0
- %nop1935 = alloca i1, i1 0
- %nop1936 = alloca i1, i1 0
- %nop1937 = alloca i1, i1 0
- %nop1938 = alloca i1, i1 0
- %nop1939 = alloca i1, i1 0
- %nop1940 = alloca i1, i1 0
- %nop1941 = alloca i1, i1 0
- %nop1942 = alloca i1, i1 0
- %nop1943 = alloca i1, i1 0
- %nop1944 = alloca i1, i1 0
- %nop1945 = alloca i1, i1 0
- %nop1946 = alloca i1, i1 0
- %nop1947 = alloca i1, i1 0
- %nop1948 = alloca i1, i1 0
- %nop1949 = alloca i1, i1 0
- %nop1950 = alloca i1, i1 0
- %nop1951 = alloca i1, i1 0
- %nop1952 = alloca i1, i1 0
- %nop1953 = alloca i1, i1 0
- %nop1954 = alloca i1, i1 0
- %nop1955 = alloca i1, i1 0
- %nop1956 = alloca i1, i1 0
- %nop1957 = alloca i1, i1 0
- %nop1958 = alloca i1, i1 0
- %nop1959 = alloca i1, i1 0
- %nop1960 = alloca i1, i1 0
- %nop1961 = alloca i1, i1 0
- %nop1962 = alloca i1, i1 0
- %nop1963 = alloca i1, i1 0
- %nop1964 = alloca i1, i1 0
- %nop1965 = alloca i1, i1 0
- %nop1966 = alloca i1, i1 0
- %nop1967 = alloca i1, i1 0
- %nop1968 = alloca i1, i1 0
- %nop1969 = alloca i1, i1 0
- %nop1970 = alloca i1, i1 0
- %nop1971 = alloca i1, i1 0
- %nop1972 = alloca i1, i1 0
- %nop1973 = alloca i1, i1 0
- %nop1974 = alloca i1, i1 0
- %nop1975 = alloca i1, i1 0
- %nop1976 = alloca i1, i1 0
- %nop1977 = alloca i1, i1 0
- %nop1978 = alloca i1, i1 0
- %nop1979 = alloca i1, i1 0
- %nop1980 = alloca i1, i1 0
- %nop1981 = alloca i1, i1 0
- %nop1982 = alloca i1, i1 0
- %nop1983 = alloca i1, i1 0
- %nop1984 = alloca i1, i1 0
- %nop1985 = alloca i1, i1 0
- %nop1986 = alloca i1, i1 0
- %nop1987 = alloca i1, i1 0
- %nop1988 = alloca i1, i1 0
- %nop1989 = alloca i1, i1 0
- %nop1990 = alloca i1, i1 0
- %nop1991 = alloca i1, i1 0
- %nop1992 = alloca i1, i1 0
- %nop1993 = alloca i1, i1 0
- %nop1994 = alloca i1, i1 0
- %nop1995 = alloca i1, i1 0
- %nop1996 = alloca i1, i1 0
- %nop1997 = alloca i1, i1 0
- %nop1998 = alloca i1, i1 0
- %nop1999 = alloca i1, i1 0
- %nop2000 = alloca i1, i1 0
- %nop2001 = alloca i1, i1 0
- %nop2002 = alloca i1, i1 0
- %nop2003 = alloca i1, i1 0
- %nop2004 = alloca i1, i1 0
- %nop2005 = alloca i1, i1 0
- %nop2006 = alloca i1, i1 0
- %nop2007 = alloca i1, i1 0
- %nop2008 = alloca i1, i1 0
- %nop2009 = alloca i1, i1 0
- %nop2010 = alloca i1, i1 0
- %nop2011 = alloca i1, i1 0
- %nop2012 = alloca i1, i1 0
- %nop2013 = alloca i1, i1 0
- %nop2014 = alloca i1, i1 0
- %nop2015 = alloca i1, i1 0
- %nop2016 = alloca i1, i1 0
- %nop2017 = alloca i1, i1 0
- %nop2018 = alloca i1, i1 0
- %nop2019 = alloca i1, i1 0
- %nop2020 = alloca i1, i1 0
- %nop2021 = alloca i1, i1 0
- %nop2022 = alloca i1, i1 0
- %nop2023 = alloca i1, i1 0
- %nop2024 = alloca i1, i1 0
- %nop2025 = alloca i1, i1 0
- %nop2026 = alloca i1, i1 0
- %nop2027 = alloca i1, i1 0
- %nop2028 = alloca i1, i1 0
- %nop2029 = alloca i1, i1 0
- %nop2030 = alloca i1, i1 0
- %nop2031 = alloca i1, i1 0
- %nop2032 = alloca i1, i1 0
- %nop2033 = alloca i1, i1 0
- %nop2034 = alloca i1, i1 0
- %nop2035 = alloca i1, i1 0
- %nop2036 = alloca i1, i1 0
- %nop2037 = alloca i1, i1 0
- %nop2038 = alloca i1, i1 0
- %nop2039 = alloca i1, i1 0
- %nop2040 = alloca i1, i1 0
- %nop2041 = alloca i1, i1 0
- %nop2042 = alloca i1, i1 0
- %nop2043 = alloca i1, i1 0
- %nop2044 = alloca i1, i1 0
- %nop2045 = alloca i1, i1 0
- %nop2046 = alloca i1, i1 0
- %nop2047 = alloca i1, i1 0
- %nop2048 = alloca i1, i1 0
- %nop2049 = alloca i1, i1 0
- %nop2050 = alloca i1, i1 0
- %nop2051 = alloca i1, i1 0
- %nop2052 = alloca i1, i1 0
- %nop2053 = alloca i1, i1 0
- %nop2054 = alloca i1, i1 0
- %nop2055 = alloca i1, i1 0
- %nop2056 = alloca i1, i1 0
- %nop2057 = alloca i1, i1 0
- %nop2058 = alloca i1, i1 0
- %nop2059 = alloca i1, i1 0
- %nop2060 = alloca i1, i1 0
- %nop2061 = alloca i1, i1 0
- %nop2062 = alloca i1, i1 0
- %nop2063 = alloca i1, i1 0
- %nop2064 = alloca i1, i1 0
- %nop2065 = alloca i1, i1 0
- %nop2066 = alloca i1, i1 0
- %nop2067 = alloca i1, i1 0
- %nop2068 = alloca i1, i1 0
- %nop2069 = alloca i1, i1 0
- %nop2070 = alloca i1, i1 0
- %nop2071 = alloca i1, i1 0
- %nop2072 = alloca i1, i1 0
- %nop2073 = alloca i1, i1 0
- %nop2074 = alloca i1, i1 0
- %nop2075 = alloca i1, i1 0
- %nop2076 = alloca i1, i1 0
- %nop2077 = alloca i1, i1 0
- %nop2078 = alloca i1, i1 0
- %nop2079 = alloca i1, i1 0
- %nop2080 = alloca i1, i1 0
- %nop2081 = alloca i1, i1 0
- %nop2082 = alloca i1, i1 0
- %nop2083 = alloca i1, i1 0
- %nop2084 = alloca i1, i1 0
- %nop2085 = alloca i1, i1 0
- %nop2086 = alloca i1, i1 0
- %nop2087 = alloca i1, i1 0
- %nop2088 = alloca i1, i1 0
- %nop2089 = alloca i1, i1 0
- %nop2090 = alloca i1, i1 0
- %nop2091 = alloca i1, i1 0
- %nop2092 = alloca i1, i1 0
- %nop2093 = alloca i1, i1 0
- %nop2094 = alloca i1, i1 0
- %nop2095 = alloca i1, i1 0
- %nop2096 = alloca i1, i1 0
- %nop2097 = alloca i1, i1 0
- %nop2098 = alloca i1, i1 0
- %nop2099 = alloca i1, i1 0
- %nop2100 = alloca i1, i1 0
- %nop2101 = alloca i1, i1 0
- %nop2102 = alloca i1, i1 0
- %nop2103 = alloca i1, i1 0
- %nop2104 = alloca i1, i1 0
- %nop2105 = alloca i1, i1 0
- %nop2106 = alloca i1, i1 0
- %nop2107 = alloca i1, i1 0
- %nop2108 = alloca i1, i1 0
- %nop2109 = alloca i1, i1 0
- %nop2110 = alloca i1, i1 0
- %nop2111 = alloca i1, i1 0
- %nop2112 = alloca i1, i1 0
- %nop2113 = alloca i1, i1 0
- %nop2114 = alloca i1, i1 0
- %nop2115 = alloca i1, i1 0
- %nop2116 = alloca i1, i1 0
- %nop2117 = alloca i1, i1 0
- %nop2118 = alloca i1, i1 0
- %nop2119 = alloca i1, i1 0
- %nop2120 = alloca i1, i1 0
- %nop2121 = alloca i1, i1 0
- %nop2122 = alloca i1, i1 0
- %nop2123 = alloca i1, i1 0
- %nop2124 = alloca i1, i1 0
- %nop2125 = alloca i1, i1 0
- %nop2126 = alloca i1, i1 0
- %nop2127 = alloca i1, i1 0
- %nop2128 = alloca i1, i1 0
- %nop2129 = alloca i1, i1 0
- %nop2130 = alloca i1, i1 0
- %nop2131 = alloca i1, i1 0
- %nop2132 = alloca i1, i1 0
- %nop2133 = alloca i1, i1 0
- %nop2134 = alloca i1, i1 0
- %nop2135 = alloca i1, i1 0
- %nop2136 = alloca i1, i1 0
- %nop2137 = alloca i1, i1 0
- %nop2138 = alloca i1, i1 0
- %nop2139 = alloca i1, i1 0
- %nop2140 = alloca i1, i1 0
- %nop2141 = alloca i1, i1 0
- %nop2142 = alloca i1, i1 0
- %nop2143 = alloca i1, i1 0
- %nop2144 = alloca i1, i1 0
- %nop2145 = alloca i1, i1 0
- %nop2146 = alloca i1, i1 0
- %nop2147 = alloca i1, i1 0
- %nop2148 = alloca i1, i1 0
- %nop2149 = alloca i1, i1 0
- %nop2150 = alloca i1, i1 0
- %nop2151 = alloca i1, i1 0
- %nop2152 = alloca i1, i1 0
- %nop2153 = alloca i1, i1 0
- %nop2154 = alloca i1, i1 0
- %nop2155 = alloca i1, i1 0
- %nop2156 = alloca i1, i1 0
- %nop2157 = alloca i1, i1 0
- %nop2158 = alloca i1, i1 0
- %nop2159 = alloca i1, i1 0
- %nop2160 = alloca i1, i1 0
- %nop2161 = alloca i1, i1 0
- %nop2162 = alloca i1, i1 0
- %nop2163 = alloca i1, i1 0
- %nop2164 = alloca i1, i1 0
- %nop2165 = alloca i1, i1 0
- %nop2166 = alloca i1, i1 0
- %nop2167 = alloca i1, i1 0
- %nop2168 = alloca i1, i1 0
- %nop2169 = alloca i1, i1 0
- %nop2170 = alloca i1, i1 0
- %nop2171 = alloca i1, i1 0
- %nop2172 = alloca i1, i1 0
- %nop2173 = alloca i1, i1 0
- %nop2174 = alloca i1, i1 0
- %nop2175 = alloca i1, i1 0
- %nop2176 = alloca i1, i1 0
- %nop2177 = alloca i1, i1 0
- %nop2178 = alloca i1, i1 0
- %nop2179 = alloca i1, i1 0
- %nop2180 = alloca i1, i1 0
- %nop2181 = alloca i1, i1 0
- %nop2182 = alloca i1, i1 0
- %nop2183 = alloca i1, i1 0
- %nop2184 = alloca i1, i1 0
- %nop2185 = alloca i1, i1 0
- %nop2186 = alloca i1, i1 0
- %nop2187 = alloca i1, i1 0
- %nop2188 = alloca i1, i1 0
- %nop2189 = alloca i1, i1 0
- %nop2190 = alloca i1, i1 0
- %nop2191 = alloca i1, i1 0
- %nop2192 = alloca i1, i1 0
- %nop2193 = alloca i1, i1 0
- %nop2194 = alloca i1, i1 0
- %nop2195 = alloca i1, i1 0
- %nop2196 = alloca i1, i1 0
- %nop2197 = alloca i1, i1 0
- %nop2198 = alloca i1, i1 0
- %nop2199 = alloca i1, i1 0
- %nop2200 = alloca i1, i1 0
- %nop2201 = alloca i1, i1 0
- %nop2202 = alloca i1, i1 0
- %nop2203 = alloca i1, i1 0
- %nop2204 = alloca i1, i1 0
- %nop2205 = alloca i1, i1 0
- %nop2206 = alloca i1, i1 0
- %nop2207 = alloca i1, i1 0
- %nop2208 = alloca i1, i1 0
- %nop2209 = alloca i1, i1 0
- %nop2210 = alloca i1, i1 0
- %nop2211 = alloca i1, i1 0
- %nop2212 = alloca i1, i1 0
- %nop2213 = alloca i1, i1 0
- %nop2214 = alloca i1, i1 0
- %nop2215 = alloca i1, i1 0
- %nop2216 = alloca i1, i1 0
- %nop2217 = alloca i1, i1 0
- %nop2218 = alloca i1, i1 0
- %nop2219 = alloca i1, i1 0
- %nop2220 = alloca i1, i1 0
- %nop2221 = alloca i1, i1 0
- %nop2222 = alloca i1, i1 0
- %nop2223 = alloca i1, i1 0
- %nop2224 = alloca i1, i1 0
- %nop2225 = alloca i1, i1 0
- %nop2226 = alloca i1, i1 0
- %nop2227 = alloca i1, i1 0
- %nop2228 = alloca i1, i1 0
- %nop2229 = alloca i1, i1 0
- %nop2230 = alloca i1, i1 0
- %nop2231 = alloca i1, i1 0
- %nop2232 = alloca i1, i1 0
- %nop2233 = alloca i1, i1 0
- %nop2234 = alloca i1, i1 0
- %nop2235 = alloca i1, i1 0
- %nop2236 = alloca i1, i1 0
- %nop2237 = alloca i1, i1 0
- %nop2238 = alloca i1, i1 0
- %nop2239 = alloca i1, i1 0
- %nop2240 = alloca i1, i1 0
- %nop2241 = alloca i1, i1 0
- %nop2242 = alloca i1, i1 0
- %nop2243 = alloca i1, i1 0
- %nop2244 = alloca i1, i1 0
- %nop2245 = alloca i1, i1 0
- %nop2246 = alloca i1, i1 0
- %nop2247 = alloca i1, i1 0
- %nop2248 = alloca i1, i1 0
- %nop2249 = alloca i1, i1 0
- %nop2250 = alloca i1, i1 0
- %nop2251 = alloca i1, i1 0
- %nop2252 = alloca i1, i1 0
- %nop2253 = alloca i1, i1 0
- %nop2254 = alloca i1, i1 0
- %nop2255 = alloca i1, i1 0
- %nop2256 = alloca i1, i1 0
- %nop2257 = alloca i1, i1 0
- %nop2258 = alloca i1, i1 0
- %nop2259 = alloca i1, i1 0
- %nop2260 = alloca i1, i1 0
- %nop2261 = alloca i1, i1 0
- %nop2262 = alloca i1, i1 0
- %nop2263 = alloca i1, i1 0
- %nop2264 = alloca i1, i1 0
- %nop2265 = alloca i1, i1 0
- %nop2266 = alloca i1, i1 0
- %nop2267 = alloca i1, i1 0
- %nop2268 = alloca i1, i1 0
- %nop2269 = alloca i1, i1 0
- %nop2270 = alloca i1, i1 0
- %nop2271 = alloca i1, i1 0
- %nop2272 = alloca i1, i1 0
- %nop2273 = alloca i1, i1 0
- %nop2274 = alloca i1, i1 0
- %nop2275 = alloca i1, i1 0
- %nop2276 = alloca i1, i1 0
- %nop2277 = alloca i1, i1 0
- %nop2278 = alloca i1, i1 0
- %nop2279 = alloca i1, i1 0
- %nop2280 = alloca i1, i1 0
- %nop2281 = alloca i1, i1 0
- %nop2282 = alloca i1, i1 0
- %nop2283 = alloca i1, i1 0
- %nop2284 = alloca i1, i1 0
- %nop2285 = alloca i1, i1 0
- %nop2286 = alloca i1, i1 0
- %nop2287 = alloca i1, i1 0
- %nop2288 = alloca i1, i1 0
- %nop2289 = alloca i1, i1 0
- %nop2290 = alloca i1, i1 0
- %nop2291 = alloca i1, i1 0
- %nop2292 = alloca i1, i1 0
- %nop2293 = alloca i1, i1 0
- %nop2294 = alloca i1, i1 0
- %nop2295 = alloca i1, i1 0
- %nop2296 = alloca i1, i1 0
- %nop2297 = alloca i1, i1 0
- %nop2298 = alloca i1, i1 0
- %nop2299 = alloca i1, i1 0
- %nop2300 = alloca i1, i1 0
- %nop2301 = alloca i1, i1 0
- %nop2302 = alloca i1, i1 0
- %nop2303 = alloca i1, i1 0
- %nop2304 = alloca i1, i1 0
- %nop2305 = alloca i1, i1 0
- %nop2306 = alloca i1, i1 0
- %nop2307 = alloca i1, i1 0
- %nop2308 = alloca i1, i1 0
- %nop2309 = alloca i1, i1 0
- %nop2310 = alloca i1, i1 0
- %nop2311 = alloca i1, i1 0
- %nop2312 = alloca i1, i1 0
- %nop2313 = alloca i1, i1 0
- %nop2314 = alloca i1, i1 0
- %nop2315 = alloca i1, i1 0
- %nop2316 = alloca i1, i1 0
- %nop2317 = alloca i1, i1 0
- %nop2318 = alloca i1, i1 0
- %nop2319 = alloca i1, i1 0
- %nop2320 = alloca i1, i1 0
- %nop2321 = alloca i1, i1 0
- %nop2322 = alloca i1, i1 0
- %nop2323 = alloca i1, i1 0
- %nop2324 = alloca i1, i1 0
- %nop2325 = alloca i1, i1 0
- %nop2326 = alloca i1, i1 0
- %nop2327 = alloca i1, i1 0
- %nop2328 = alloca i1, i1 0
- %nop2329 = alloca i1, i1 0
- %nop2330 = alloca i1, i1 0
- %nop2331 = alloca i1, i1 0
- %nop2332 = alloca i1, i1 0
- %nop2333 = alloca i1, i1 0
- %nop2334 = alloca i1, i1 0
- %nop2335 = alloca i1, i1 0
- %nop2336 = alloca i1, i1 0
- %nop2337 = alloca i1, i1 0
- %nop2338 = alloca i1, i1 0
- %nop2339 = alloca i1, i1 0
- %nop2340 = alloca i1, i1 0
- %nop2341 = alloca i1, i1 0
- %nop2342 = alloca i1, i1 0
- %nop2343 = alloca i1, i1 0
- %nop2344 = alloca i1, i1 0
- %nop2345 = alloca i1, i1 0
- %nop2346 = alloca i1, i1 0
- %nop2347 = alloca i1, i1 0
- %nop2348 = alloca i1, i1 0
- %nop2349 = alloca i1, i1 0
- %nop2350 = alloca i1, i1 0
- %nop2351 = alloca i1, i1 0
- %nop2352 = alloca i1, i1 0
- %nop2353 = alloca i1, i1 0
- %nop2354 = alloca i1, i1 0
- %nop2355 = alloca i1, i1 0
- %nop2356 = alloca i1, i1 0
- %nop2357 = alloca i1, i1 0
- %nop2358 = alloca i1, i1 0
- %nop2359 = alloca i1, i1 0
- %nop2360 = alloca i1, i1 0
- %nop2361 = alloca i1, i1 0
- %nop2362 = alloca i1, i1 0
- %nop2363 = alloca i1, i1 0
- %nop2364 = alloca i1, i1 0
- %nop2365 = alloca i1, i1 0
- %nop2366 = alloca i1, i1 0
- %nop2367 = alloca i1, i1 0
- %nop2368 = alloca i1, i1 0
- %nop2369 = alloca i1, i1 0
- %nop2370 = alloca i1, i1 0
- %nop2371 = alloca i1, i1 0
- %nop2372 = alloca i1, i1 0
- %nop2373 = alloca i1, i1 0
- %nop2374 = alloca i1, i1 0
- %nop2375 = alloca i1, i1 0
- %nop2376 = alloca i1, i1 0
- %nop2377 = alloca i1, i1 0
- %nop2378 = alloca i1, i1 0
- %nop2379 = alloca i1, i1 0
- %nop2380 = alloca i1, i1 0
- %nop2381 = alloca i1, i1 0
- %nop2382 = alloca i1, i1 0
- %nop2383 = alloca i1, i1 0
- %nop2384 = alloca i1, i1 0
- %nop2385 = alloca i1, i1 0
- %nop2386 = alloca i1, i1 0
- %nop2387 = alloca i1, i1 0
- %nop2388 = alloca i1, i1 0
- %nop2389 = alloca i1, i1 0
- %nop2390 = alloca i1, i1 0
- %nop2391 = alloca i1, i1 0
- %nop2392 = alloca i1, i1 0
- %nop2393 = alloca i1, i1 0
- %nop2394 = alloca i1, i1 0
- %nop2395 = alloca i1, i1 0
- %nop2396 = alloca i1, i1 0
- %nop2397 = alloca i1, i1 0
- %nop2398 = alloca i1, i1 0
- %nop2399 = alloca i1, i1 0
- %nop2400 = alloca i1, i1 0
- %nop2401 = alloca i1, i1 0
- %nop2402 = alloca i1, i1 0
- %nop2403 = alloca i1, i1 0
- %nop2404 = alloca i1, i1 0
- %nop2405 = alloca i1, i1 0
- %nop2406 = alloca i1, i1 0
- %nop2407 = alloca i1, i1 0
- %nop2408 = alloca i1, i1 0
- %nop2409 = alloca i1, i1 0
- %nop2410 = alloca i1, i1 0
- %nop2411 = alloca i1, i1 0
- %nop2412 = alloca i1, i1 0
- %nop2413 = alloca i1, i1 0
- %nop2414 = alloca i1, i1 0
- %nop2415 = alloca i1, i1 0
- %nop2416 = alloca i1, i1 0
- %nop2417 = alloca i1, i1 0
- %nop2418 = alloca i1, i1 0
- %nop2419 = alloca i1, i1 0
- %nop2420 = alloca i1, i1 0
- %nop2421 = alloca i1, i1 0
- %nop2422 = alloca i1, i1 0
- %nop2423 = alloca i1, i1 0
- %nop2424 = alloca i1, i1 0
- %nop2425 = alloca i1, i1 0
- %nop2426 = alloca i1, i1 0
- %nop2427 = alloca i1, i1 0
- %nop2428 = alloca i1, i1 0
- %nop2429 = alloca i1, i1 0
- %nop2430 = alloca i1, i1 0
- %nop2431 = alloca i1, i1 0
- %nop2432 = alloca i1, i1 0
- %nop2433 = alloca i1, i1 0
- %nop2434 = alloca i1, i1 0
- %nop2435 = alloca i1, i1 0
- %nop2436 = alloca i1, i1 0
- %nop2437 = alloca i1, i1 0
- %nop2438 = alloca i1, i1 0
- %nop2439 = alloca i1, i1 0
- %nop2440 = alloca i1, i1 0
- %nop2441 = alloca i1, i1 0
- %nop2442 = alloca i1, i1 0
- %nop2443 = alloca i1, i1 0
- %nop2444 = alloca i1, i1 0
- %nop2445 = alloca i1, i1 0
- %nop2446 = alloca i1, i1 0
- %nop2447 = alloca i1, i1 0
- %nop2448 = alloca i1, i1 0
- %nop2449 = alloca i1, i1 0
- %nop2450 = alloca i1, i1 0
- %nop2451 = alloca i1, i1 0
- %nop2452 = alloca i1, i1 0
- %nop2453 = alloca i1, i1 0
- %nop2454 = alloca i1, i1 0
- %nop2455 = alloca i1, i1 0
- %nop2456 = alloca i1, i1 0
- %nop2457 = alloca i1, i1 0
- %nop2458 = alloca i1, i1 0
- %nop2459 = alloca i1, i1 0
- %nop2460 = alloca i1, i1 0
- %nop2461 = alloca i1, i1 0
- %nop2462 = alloca i1, i1 0
- %nop2463 = alloca i1, i1 0
- %nop2464 = alloca i1, i1 0
- %nop2465 = alloca i1, i1 0
- %nop2466 = alloca i1, i1 0
- %nop2467 = alloca i1, i1 0
- %nop2468 = alloca i1, i1 0
- %nop2469 = alloca i1, i1 0
- %nop2470 = alloca i1, i1 0
- %nop2471 = alloca i1, i1 0
- %nop2472 = alloca i1, i1 0
- %nop2473 = alloca i1, i1 0
- %nop2474 = alloca i1, i1 0
- %nop2475 = alloca i1, i1 0
- %nop2476 = alloca i1, i1 0
- %nop2477 = alloca i1, i1 0
- %nop2478 = alloca i1, i1 0
- %nop2479 = alloca i1, i1 0
- %nop2480 = alloca i1, i1 0
- %nop2481 = alloca i1, i1 0
- %nop2482 = alloca i1, i1 0
- %nop2483 = alloca i1, i1 0
- %nop2484 = alloca i1, i1 0
- %nop2485 = alloca i1, i1 0
- %nop2486 = alloca i1, i1 0
- %nop2487 = alloca i1, i1 0
- %nop2488 = alloca i1, i1 0
- %nop2489 = alloca i1, i1 0
- %nop2490 = alloca i1, i1 0
- %nop2491 = alloca i1, i1 0
- %nop2492 = alloca i1, i1 0
- %nop2493 = alloca i1, i1 0
- %nop2494 = alloca i1, i1 0
- %nop2495 = alloca i1, i1 0
- %nop2496 = alloca i1, i1 0
- %nop2497 = alloca i1, i1 0
- %nop2498 = alloca i1, i1 0
- %nop2499 = alloca i1, i1 0
- %nop2500 = alloca i1, i1 0
- %nop2501 = alloca i1, i1 0
- %nop2502 = alloca i1, i1 0
- %nop2503 = alloca i1, i1 0
- %nop2504 = alloca i1, i1 0
- %nop2505 = alloca i1, i1 0
- %nop2506 = alloca i1, i1 0
- %nop2507 = alloca i1, i1 0
- %nop2508 = alloca i1, i1 0
- %nop2509 = alloca i1, i1 0
- %nop2510 = alloca i1, i1 0
- %nop2511 = alloca i1, i1 0
- %nop2512 = alloca i1, i1 0
- %nop2513 = alloca i1, i1 0
- %nop2514 = alloca i1, i1 0
- %nop2515 = alloca i1, i1 0
- %nop2516 = alloca i1, i1 0
- %nop2517 = alloca i1, i1 0
- %nop2518 = alloca i1, i1 0
- %nop2519 = alloca i1, i1 0
- %nop2520 = alloca i1, i1 0
- %nop2521 = alloca i1, i1 0
- %nop2522 = alloca i1, i1 0
- %nop2523 = alloca i1, i1 0
- %nop2524 = alloca i1, i1 0
- %nop2525 = alloca i1, i1 0
- %nop2526 = alloca i1, i1 0
- %nop2527 = alloca i1, i1 0
- %nop2528 = alloca i1, i1 0
- %nop2529 = alloca i1, i1 0
- %nop2530 = alloca i1, i1 0
- %nop2531 = alloca i1, i1 0
- %nop2532 = alloca i1, i1 0
- %nop2533 = alloca i1, i1 0
- %nop2534 = alloca i1, i1 0
- %nop2535 = alloca i1, i1 0
- %nop2536 = alloca i1, i1 0
- %nop2537 = alloca i1, i1 0
- %nop2538 = alloca i1, i1 0
- %nop2539 = alloca i1, i1 0
- %nop2540 = alloca i1, i1 0
- %nop2541 = alloca i1, i1 0
- %nop2542 = alloca i1, i1 0
- %nop2543 = alloca i1, i1 0
- %nop2544 = alloca i1, i1 0
- %nop2545 = alloca i1, i1 0
- %nop2546 = alloca i1, i1 0
- %nop2547 = alloca i1, i1 0
- %nop2548 = alloca i1, i1 0
- %nop2549 = alloca i1, i1 0
- %nop2550 = alloca i1, i1 0
- %nop2551 = alloca i1, i1 0
- %nop2552 = alloca i1, i1 0
- %nop2553 = alloca i1, i1 0
- %nop2554 = alloca i1, i1 0
- %nop2555 = alloca i1, i1 0
- %nop2556 = alloca i1, i1 0
- %nop2557 = alloca i1, i1 0
- %nop2558 = alloca i1, i1 0
- %nop2559 = alloca i1, i1 0
- %nop2560 = alloca i1, i1 0
- %nop2561 = alloca i1, i1 0
- %nop2562 = alloca i1, i1 0
- %nop2563 = alloca i1, i1 0
- %nop2564 = alloca i1, i1 0
- %nop2565 = alloca i1, i1 0
- %nop2566 = alloca i1, i1 0
- %nop2567 = alloca i1, i1 0
- %nop2568 = alloca i1, i1 0
- %nop2569 = alloca i1, i1 0
- %nop2570 = alloca i1, i1 0
- %nop2571 = alloca i1, i1 0
- %nop2572 = alloca i1, i1 0
- %nop2573 = alloca i1, i1 0
- %nop2574 = alloca i1, i1 0
- %nop2575 = alloca i1, i1 0
- %nop2576 = alloca i1, i1 0
- %nop2577 = alloca i1, i1 0
- %nop2578 = alloca i1, i1 0
- %nop2579 = alloca i1, i1 0
- %nop2580 = alloca i1, i1 0
- %nop2581 = alloca i1, i1 0
- %nop2582 = alloca i1, i1 0
- %nop2583 = alloca i1, i1 0
- %nop2584 = alloca i1, i1 0
- %nop2585 = alloca i1, i1 0
- %nop2586 = alloca i1, i1 0
- %nop2587 = alloca i1, i1 0
- %nop2588 = alloca i1, i1 0
- %nop2589 = alloca i1, i1 0
- %nop2590 = alloca i1, i1 0
- %nop2591 = alloca i1, i1 0
- %nop2592 = alloca i1, i1 0
- %nop2593 = alloca i1, i1 0
- %nop2594 = alloca i1, i1 0
- %nop2595 = alloca i1, i1 0
- %nop2596 = alloca i1, i1 0
- %nop2597 = alloca i1, i1 0
- %nop2598 = alloca i1, i1 0
- %nop2599 = alloca i1, i1 0
- %nop2600 = alloca i1, i1 0
- %nop2601 = alloca i1, i1 0
- %nop2602 = alloca i1, i1 0
- %nop2603 = alloca i1, i1 0
- %nop2604 = alloca i1, i1 0
- %nop2605 = alloca i1, i1 0
- %nop2606 = alloca i1, i1 0
- %nop2607 = alloca i1, i1 0
- %nop2608 = alloca i1, i1 0
- %nop2609 = alloca i1, i1 0
- %nop2610 = alloca i1, i1 0
- %nop2611 = alloca i1, i1 0
- %nop2612 = alloca i1, i1 0
- %nop2613 = alloca i1, i1 0
- %nop2614 = alloca i1, i1 0
- %nop2615 = alloca i1, i1 0
- %nop2616 = alloca i1, i1 0
- %nop2617 = alloca i1, i1 0
- %nop2618 = alloca i1, i1 0
- %nop2619 = alloca i1, i1 0
- %nop2620 = alloca i1, i1 0
- %nop2621 = alloca i1, i1 0
- %nop2622 = alloca i1, i1 0
- %nop2623 = alloca i1, i1 0
- %nop2624 = alloca i1, i1 0
- %nop2625 = alloca i1, i1 0
- %nop2626 = alloca i1, i1 0
- %nop2627 = alloca i1, i1 0
- %nop2628 = alloca i1, i1 0
- %nop2629 = alloca i1, i1 0
- %nop2630 = alloca i1, i1 0
- %nop2631 = alloca i1, i1 0
- %nop2632 = alloca i1, i1 0
- %nop2633 = alloca i1, i1 0
- %nop2634 = alloca i1, i1 0
- %nop2635 = alloca i1, i1 0
- %nop2636 = alloca i1, i1 0
- %nop2637 = alloca i1, i1 0
- %nop2638 = alloca i1, i1 0
- %nop2639 = alloca i1, i1 0
- %nop2640 = alloca i1, i1 0
- %nop2641 = alloca i1, i1 0
- %nop2642 = alloca i1, i1 0
- %nop2643 = alloca i1, i1 0
- %nop2644 = alloca i1, i1 0
- %nop2645 = alloca i1, i1 0
- %nop2646 = alloca i1, i1 0
- %nop2647 = alloca i1, i1 0
- %nop2648 = alloca i1, i1 0
- %nop2649 = alloca i1, i1 0
- %nop2650 = alloca i1, i1 0
- %nop2651 = alloca i1, i1 0
- %nop2652 = alloca i1, i1 0
- %nop2653 = alloca i1, i1 0
- %nop2654 = alloca i1, i1 0
- %nop2655 = alloca i1, i1 0
- %nop2656 = alloca i1, i1 0
- %nop2657 = alloca i1, i1 0
- %nop2658 = alloca i1, i1 0
- %nop2659 = alloca i1, i1 0
- %nop2660 = alloca i1, i1 0
- %nop2661 = alloca i1, i1 0
- %nop2662 = alloca i1, i1 0
- %nop2663 = alloca i1, i1 0
- %nop2664 = alloca i1, i1 0
- %nop2665 = alloca i1, i1 0
- %nop2666 = alloca i1, i1 0
- %nop2667 = alloca i1, i1 0
- %nop2668 = alloca i1, i1 0
- %nop2669 = alloca i1, i1 0
- %nop2670 = alloca i1, i1 0
- %nop2671 = alloca i1, i1 0
- %nop2672 = alloca i1, i1 0
- %nop2673 = alloca i1, i1 0
- %nop2674 = alloca i1, i1 0
- %nop2675 = alloca i1, i1 0
- %nop2676 = alloca i1, i1 0
- %nop2677 = alloca i1, i1 0
- %nop2678 = alloca i1, i1 0
- %nop2679 = alloca i1, i1 0
- %nop2680 = alloca i1, i1 0
- %nop2681 = alloca i1, i1 0
- %nop2682 = alloca i1, i1 0
- %nop2683 = alloca i1, i1 0
- %nop2684 = alloca i1, i1 0
- %nop2685 = alloca i1, i1 0
- %nop2686 = alloca i1, i1 0
- %nop2687 = alloca i1, i1 0
- %nop2688 = alloca i1, i1 0
- %nop2689 = alloca i1, i1 0
- %nop2690 = alloca i1, i1 0
- %nop2691 = alloca i1, i1 0
- %nop2692 = alloca i1, i1 0
- %nop2693 = alloca i1, i1 0
- %nop2694 = alloca i1, i1 0
- %nop2695 = alloca i1, i1 0
- %nop2696 = alloca i1, i1 0
- %nop2697 = alloca i1, i1 0
- %nop2698 = alloca i1, i1 0
- %nop2699 = alloca i1, i1 0
- %nop2700 = alloca i1, i1 0
- %nop2701 = alloca i1, i1 0
- %nop2702 = alloca i1, i1 0
- %nop2703 = alloca i1, i1 0
- %nop2704 = alloca i1, i1 0
- %nop2705 = alloca i1, i1 0
- %nop2706 = alloca i1, i1 0
- %nop2707 = alloca i1, i1 0
- %nop2708 = alloca i1, i1 0
- %nop2709 = alloca i1, i1 0
- %nop2710 = alloca i1, i1 0
- %nop2711 = alloca i1, i1 0
- %nop2712 = alloca i1, i1 0
- %nop2713 = alloca i1, i1 0
- %nop2714 = alloca i1, i1 0
- %nop2715 = alloca i1, i1 0
- %nop2716 = alloca i1, i1 0
- %nop2717 = alloca i1, i1 0
- %nop2718 = alloca i1, i1 0
- %nop2719 = alloca i1, i1 0
- %nop2720 = alloca i1, i1 0
- %nop2721 = alloca i1, i1 0
- %nop2722 = alloca i1, i1 0
- %nop2723 = alloca i1, i1 0
- %nop2724 = alloca i1, i1 0
- %nop2725 = alloca i1, i1 0
- %nop2726 = alloca i1, i1 0
- %nop2727 = alloca i1, i1 0
- %nop2728 = alloca i1, i1 0
- %nop2729 = alloca i1, i1 0
- %nop2730 = alloca i1, i1 0
- %nop2731 = alloca i1, i1 0
- %nop2732 = alloca i1, i1 0
- %nop2733 = alloca i1, i1 0
- %nop2734 = alloca i1, i1 0
- %nop2735 = alloca i1, i1 0
- %nop2736 = alloca i1, i1 0
- %nop2737 = alloca i1, i1 0
- %nop2738 = alloca i1, i1 0
- %nop2739 = alloca i1, i1 0
- %nop2740 = alloca i1, i1 0
- %nop2741 = alloca i1, i1 0
- %nop2742 = alloca i1, i1 0
- %nop2743 = alloca i1, i1 0
- %nop2744 = alloca i1, i1 0
- %nop2745 = alloca i1, i1 0
- %nop2746 = alloca i1, i1 0
- %nop2747 = alloca i1, i1 0
- %nop2748 = alloca i1, i1 0
- %nop2749 = alloca i1, i1 0
- %nop2750 = alloca i1, i1 0
- %nop2751 = alloca i1, i1 0
- %nop2752 = alloca i1, i1 0
- %nop2753 = alloca i1, i1 0
- %nop2754 = alloca i1, i1 0
- %nop2755 = alloca i1, i1 0
- %nop2756 = alloca i1, i1 0
- %nop2757 = alloca i1, i1 0
- %nop2758 = alloca i1, i1 0
- %nop2759 = alloca i1, i1 0
- %nop2760 = alloca i1, i1 0
- %nop2761 = alloca i1, i1 0
- %nop2762 = alloca i1, i1 0
- %nop2763 = alloca i1, i1 0
- %nop2764 = alloca i1, i1 0
- %nop2765 = alloca i1, i1 0
- %nop2766 = alloca i1, i1 0
- %nop2767 = alloca i1, i1 0
- %nop2768 = alloca i1, i1 0
- %nop2769 = alloca i1, i1 0
- %nop2770 = alloca i1, i1 0
- %nop2771 = alloca i1, i1 0
- %nop2772 = alloca i1, i1 0
- %nop2773 = alloca i1, i1 0
- %nop2774 = alloca i1, i1 0
- %nop2775 = alloca i1, i1 0
- %nop2776 = alloca i1, i1 0
- %nop2777 = alloca i1, i1 0
- %nop2778 = alloca i1, i1 0
- %nop2779 = alloca i1, i1 0
- %nop2780 = alloca i1, i1 0
- %nop2781 = alloca i1, i1 0
- %nop2782 = alloca i1, i1 0
- %nop2783 = alloca i1, i1 0
- %nop2784 = alloca i1, i1 0
- %nop2785 = alloca i1, i1 0
- %nop2786 = alloca i1, i1 0
- %nop2787 = alloca i1, i1 0
- %nop2788 = alloca i1, i1 0
- %nop2789 = alloca i1, i1 0
- %nop2790 = alloca i1, i1 0
- %nop2791 = alloca i1, i1 0
- %nop2792 = alloca i1, i1 0
- %nop2793 = alloca i1, i1 0
- %nop2794 = alloca i1, i1 0
- %nop2795 = alloca i1, i1 0
- %nop2796 = alloca i1, i1 0
- %nop2797 = alloca i1, i1 0
- %nop2798 = alloca i1, i1 0
- %nop2799 = alloca i1, i1 0
- %nop2800 = alloca i1, i1 0
- %nop2801 = alloca i1, i1 0
- %nop2802 = alloca i1, i1 0
- %nop2803 = alloca i1, i1 0
- %nop2804 = alloca i1, i1 0
- %nop2805 = alloca i1, i1 0
- %nop2806 = alloca i1, i1 0
- %nop2807 = alloca i1, i1 0
- %nop2808 = alloca i1, i1 0
- %nop2809 = alloca i1, i1 0
- %nop2810 = alloca i1, i1 0
- %nop2811 = alloca i1, i1 0
- %nop2812 = alloca i1, i1 0
- %nop2813 = alloca i1, i1 0
- %nop2814 = alloca i1, i1 0
- %nop2815 = alloca i1, i1 0
- %nop2816 = alloca i1, i1 0
- %nop2817 = alloca i1, i1 0
- %nop2818 = alloca i1, i1 0
- %nop2819 = alloca i1, i1 0
- %nop2820 = alloca i1, i1 0
- %nop2821 = alloca i1, i1 0
- %nop2822 = alloca i1, i1 0
- %nop2823 = alloca i1, i1 0
- %nop2824 = alloca i1, i1 0
- %nop2825 = alloca i1, i1 0
- %nop2826 = alloca i1, i1 0
- %nop2827 = alloca i1, i1 0
- %nop2828 = alloca i1, i1 0
- %nop2829 = alloca i1, i1 0
- %nop2830 = alloca i1, i1 0
- %nop2831 = alloca i1, i1 0
- %nop2832 = alloca i1, i1 0
- %nop2833 = alloca i1, i1 0
- %nop2834 = alloca i1, i1 0
- %nop2835 = alloca i1, i1 0
- %nop2836 = alloca i1, i1 0
- %nop2837 = alloca i1, i1 0
- %nop2838 = alloca i1, i1 0
- %nop2839 = alloca i1, i1 0
- %nop2840 = alloca i1, i1 0
- %nop2841 = alloca i1, i1 0
- %nop2842 = alloca i1, i1 0
- %nop2843 = alloca i1, i1 0
- %nop2844 = alloca i1, i1 0
- %nop2845 = alloca i1, i1 0
- %nop2846 = alloca i1, i1 0
- %nop2847 = alloca i1, i1 0
- %nop2848 = alloca i1, i1 0
- %nop2849 = alloca i1, i1 0
- %nop2850 = alloca i1, i1 0
- %nop2851 = alloca i1, i1 0
- %nop2852 = alloca i1, i1 0
- %nop2853 = alloca i1, i1 0
- %nop2854 = alloca i1, i1 0
- %nop2855 = alloca i1, i1 0
- %nop2856 = alloca i1, i1 0
- %nop2857 = alloca i1, i1 0
- %nop2858 = alloca i1, i1 0
- %nop2859 = alloca i1, i1 0
- %nop2860 = alloca i1, i1 0
- %nop2861 = alloca i1, i1 0
- %nop2862 = alloca i1, i1 0
- %nop2863 = alloca i1, i1 0
- %nop2864 = alloca i1, i1 0
- %nop2865 = alloca i1, i1 0
- %nop2866 = alloca i1, i1 0
- %nop2867 = alloca i1, i1 0
- %nop2868 = alloca i1, i1 0
- %nop2869 = alloca i1, i1 0
- %nop2870 = alloca i1, i1 0
- %nop2871 = alloca i1, i1 0
- %nop2872 = alloca i1, i1 0
- %nop2873 = alloca i1, i1 0
- %nop2874 = alloca i1, i1 0
- %nop2875 = alloca i1, i1 0
- %nop2876 = alloca i1, i1 0
- %nop2877 = alloca i1, i1 0
- %nop2878 = alloca i1, i1 0
- %nop2879 = alloca i1, i1 0
- %nop2880 = alloca i1, i1 0
- %nop2881 = alloca i1, i1 0
- %nop2882 = alloca i1, i1 0
- %nop2883 = alloca i1, i1 0
- %nop2884 = alloca i1, i1 0
- %nop2885 = alloca i1, i1 0
- %nop2886 = alloca i1, i1 0
- %nop2887 = alloca i1, i1 0
- %nop2888 = alloca i1, i1 0
- %nop2889 = alloca i1, i1 0
- %nop2890 = alloca i1, i1 0
- %nop2891 = alloca i1, i1 0
- %nop2892 = alloca i1, i1 0
- %nop2893 = alloca i1, i1 0
- %nop2894 = alloca i1, i1 0
- %nop2895 = alloca i1, i1 0
- %nop2896 = alloca i1, i1 0
- %nop2897 = alloca i1, i1 0
- %nop2898 = alloca i1, i1 0
- %nop2899 = alloca i1, i1 0
- %nop2900 = alloca i1, i1 0
- %nop2901 = alloca i1, i1 0
- %nop2902 = alloca i1, i1 0
- %nop2903 = alloca i1, i1 0
- %nop2904 = alloca i1, i1 0
- %nop2905 = alloca i1, i1 0
- %nop2906 = alloca i1, i1 0
- %nop2907 = alloca i1, i1 0
- %nop2908 = alloca i1, i1 0
- %nop2909 = alloca i1, i1 0
- %nop2910 = alloca i1, i1 0
- %nop2911 = alloca i1, i1 0
- %nop2912 = alloca i1, i1 0
- %nop2913 = alloca i1, i1 0
- %nop2914 = alloca i1, i1 0
- %nop2915 = alloca i1, i1 0
- %nop2916 = alloca i1, i1 0
- %nop2917 = alloca i1, i1 0
- %nop2918 = alloca i1, i1 0
- %nop2919 = alloca i1, i1 0
- %nop2920 = alloca i1, i1 0
- %nop2921 = alloca i1, i1 0
- %nop2922 = alloca i1, i1 0
- %nop2923 = alloca i1, i1 0
- %nop2924 = alloca i1, i1 0
- %nop2925 = alloca i1, i1 0
- %nop2926 = alloca i1, i1 0
- %nop2927 = alloca i1, i1 0
- %nop2928 = alloca i1, i1 0
- %nop2929 = alloca i1, i1 0
- %nop2930 = alloca i1, i1 0
- %nop2931 = alloca i1, i1 0
- %nop2932 = alloca i1, i1 0
- %nop2933 = alloca i1, i1 0
- %nop2934 = alloca i1, i1 0
- %nop2935 = alloca i1, i1 0
- %nop2936 = alloca i1, i1 0
- %nop2937 = alloca i1, i1 0
- %nop2938 = alloca i1, i1 0
- %nop2939 = alloca i1, i1 0
- %nop2940 = alloca i1, i1 0
- %nop2941 = alloca i1, i1 0
- %nop2942 = alloca i1, i1 0
- %nop2943 = alloca i1, i1 0
- %nop2944 = alloca i1, i1 0
- %nop2945 = alloca i1, i1 0
- %nop2946 = alloca i1, i1 0
- %nop2947 = alloca i1, i1 0
- %nop2948 = alloca i1, i1 0
- %nop2949 = alloca i1, i1 0
- %nop2950 = alloca i1, i1 0
- %nop2951 = alloca i1, i1 0
- %nop2952 = alloca i1, i1 0
- %nop2953 = alloca i1, i1 0
- %nop2954 = alloca i1, i1 0
- %nop2955 = alloca i1, i1 0
- %nop2956 = alloca i1, i1 0
- %nop2957 = alloca i1, i1 0
- %nop2958 = alloca i1, i1 0
- %nop2959 = alloca i1, i1 0
- %nop2960 = alloca i1, i1 0
- %nop2961 = alloca i1, i1 0
- %nop2962 = alloca i1, i1 0
- %nop2963 = alloca i1, i1 0
- %nop2964 = alloca i1, i1 0
- %nop2965 = alloca i1, i1 0
- %nop2966 = alloca i1, i1 0
- %nop2967 = alloca i1, i1 0
- %nop2968 = alloca i1, i1 0
- %nop2969 = alloca i1, i1 0
- %nop2970 = alloca i1, i1 0
- %nop2971 = alloca i1, i1 0
- %nop2972 = alloca i1, i1 0
- %nop2973 = alloca i1, i1 0
- %nop2974 = alloca i1, i1 0
- %nop2975 = alloca i1, i1 0
- %nop2976 = alloca i1, i1 0
- %nop2977 = alloca i1, i1 0
- %nop2978 = alloca i1, i1 0
- %nop2979 = alloca i1, i1 0
- %nop2980 = alloca i1, i1 0
- %nop2981 = alloca i1, i1 0
- %nop2982 = alloca i1, i1 0
- %nop2983 = alloca i1, i1 0
- %nop2984 = alloca i1, i1 0
- %nop2985 = alloca i1, i1 0
- %nop2986 = alloca i1, i1 0
- %nop2987 = alloca i1, i1 0
- %nop2988 = alloca i1, i1 0
- %nop2989 = alloca i1, i1 0
- %nop2990 = alloca i1, i1 0
- %nop2991 = alloca i1, i1 0
- %nop2992 = alloca i1, i1 0
- %nop2993 = alloca i1, i1 0
- %nop2994 = alloca i1, i1 0
- %nop2995 = alloca i1, i1 0
- %nop2996 = alloca i1, i1 0
- %nop2997 = alloca i1, i1 0
- %nop2998 = alloca i1, i1 0
- %nop2999 = alloca i1, i1 0
- %nop3000 = alloca i1, i1 0
- %nop3001 = alloca i1, i1 0
- %nop3002 = alloca i1, i1 0
- %nop3003 = alloca i1, i1 0
- %nop3004 = alloca i1, i1 0
- %nop3005 = alloca i1, i1 0
- %nop3006 = alloca i1, i1 0
- %nop3007 = alloca i1, i1 0
- %nop3008 = alloca i1, i1 0
- %nop3009 = alloca i1, i1 0
- %nop3010 = alloca i1, i1 0
- %nop3011 = alloca i1, i1 0
- %nop3012 = alloca i1, i1 0
- %nop3013 = alloca i1, i1 0
- %nop3014 = alloca i1, i1 0
- %nop3015 = alloca i1, i1 0
- %nop3016 = alloca i1, i1 0
- %nop3017 = alloca i1, i1 0
- %nop3018 = alloca i1, i1 0
- %nop3019 = alloca i1, i1 0
- %nop3020 = alloca i1, i1 0
- %nop3021 = alloca i1, i1 0
- %nop3022 = alloca i1, i1 0
- %nop3023 = alloca i1, i1 0
- %nop3024 = alloca i1, i1 0
- %nop3025 = alloca i1, i1 0
- %nop3026 = alloca i1, i1 0
- %nop3027 = alloca i1, i1 0
- %nop3028 = alloca i1, i1 0
- %nop3029 = alloca i1, i1 0
- %nop3030 = alloca i1, i1 0
- %nop3031 = alloca i1, i1 0
- %nop3032 = alloca i1, i1 0
- %nop3033 = alloca i1, i1 0
- %nop3034 = alloca i1, i1 0
- %nop3035 = alloca i1, i1 0
- %nop3036 = alloca i1, i1 0
- %nop3037 = alloca i1, i1 0
- %nop3038 = alloca i1, i1 0
- %nop3039 = alloca i1, i1 0
- %nop3040 = alloca i1, i1 0
- %nop3041 = alloca i1, i1 0
- %nop3042 = alloca i1, i1 0
- %nop3043 = alloca i1, i1 0
- %nop3044 = alloca i1, i1 0
- %nop3045 = alloca i1, i1 0
- %nop3046 = alloca i1, i1 0
- %nop3047 = alloca i1, i1 0
- %nop3048 = alloca i1, i1 0
- %nop3049 = alloca i1, i1 0
- %nop3050 = alloca i1, i1 0
- %nop3051 = alloca i1, i1 0
- %nop3052 = alloca i1, i1 0
- %nop3053 = alloca i1, i1 0
- %nop3054 = alloca i1, i1 0
- %nop3055 = alloca i1, i1 0
- %nop3056 = alloca i1, i1 0
- %nop3057 = alloca i1, i1 0
- %nop3058 = alloca i1, i1 0
- %nop3059 = alloca i1, i1 0
- %nop3060 = alloca i1, i1 0
- %nop3061 = alloca i1, i1 0
- %nop3062 = alloca i1, i1 0
- %nop3063 = alloca i1, i1 0
- %nop3064 = alloca i1, i1 0
- %nop3065 = alloca i1, i1 0
- %nop3066 = alloca i1, i1 0
- %nop3067 = alloca i1, i1 0
- %nop3068 = alloca i1, i1 0
- %nop3069 = alloca i1, i1 0
- %nop3070 = alloca i1, i1 0
- %nop3071 = alloca i1, i1 0
- %nop3072 = alloca i1, i1 0
- %nop3073 = alloca i1, i1 0
- %nop3074 = alloca i1, i1 0
- %nop3075 = alloca i1, i1 0
- %nop3076 = alloca i1, i1 0
- %nop3077 = alloca i1, i1 0
- %nop3078 = alloca i1, i1 0
- %nop3079 = alloca i1, i1 0
- %nop3080 = alloca i1, i1 0
- %nop3081 = alloca i1, i1 0
- %nop3082 = alloca i1, i1 0
- %nop3083 = alloca i1, i1 0
- %nop3084 = alloca i1, i1 0
- %nop3085 = alloca i1, i1 0
- %nop3086 = alloca i1, i1 0
- %nop3087 = alloca i1, i1 0
- %nop3088 = alloca i1, i1 0
- %nop3089 = alloca i1, i1 0
- %nop3090 = alloca i1, i1 0
- %nop3091 = alloca i1, i1 0
- %nop3092 = alloca i1, i1 0
- %nop3093 = alloca i1, i1 0
- %nop3094 = alloca i1, i1 0
- %nop3095 = alloca i1, i1 0
- %nop3096 = alloca i1, i1 0
- %nop3097 = alloca i1, i1 0
- %nop3098 = alloca i1, i1 0
- %nop3099 = alloca i1, i1 0
- %nop3100 = alloca i1, i1 0
- %nop3101 = alloca i1, i1 0
- %nop3102 = alloca i1, i1 0
- %nop3103 = alloca i1, i1 0
- %nop3104 = alloca i1, i1 0
- %nop3105 = alloca i1, i1 0
- %nop3106 = alloca i1, i1 0
- %nop3107 = alloca i1, i1 0
- %nop3108 = alloca i1, i1 0
- %nop3109 = alloca i1, i1 0
- %nop3110 = alloca i1, i1 0
- %nop3111 = alloca i1, i1 0
- %nop3112 = alloca i1, i1 0
- %nop3113 = alloca i1, i1 0
- %nop3114 = alloca i1, i1 0
- %nop3115 = alloca i1, i1 0
- %nop3116 = alloca i1, i1 0
- %nop3117 = alloca i1, i1 0
- %nop3118 = alloca i1, i1 0
- %nop3119 = alloca i1, i1 0
- %nop3120 = alloca i1, i1 0
- %nop3121 = alloca i1, i1 0
- %nop3122 = alloca i1, i1 0
- %nop3123 = alloca i1, i1 0
- %nop3124 = alloca i1, i1 0
- %nop3125 = alloca i1, i1 0
- %nop3126 = alloca i1, i1 0
- %nop3127 = alloca i1, i1 0
- %nop3128 = alloca i1, i1 0
- %nop3129 = alloca i1, i1 0
- %nop3130 = alloca i1, i1 0
- %nop3131 = alloca i1, i1 0
- %nop3132 = alloca i1, i1 0
- %nop3133 = alloca i1, i1 0
- %nop3134 = alloca i1, i1 0
- %nop3135 = alloca i1, i1 0
- %nop3136 = alloca i1, i1 0
- %nop3137 = alloca i1, i1 0
- %nop3138 = alloca i1, i1 0
- %nop3139 = alloca i1, i1 0
- %nop3140 = alloca i1, i1 0
- %nop3141 = alloca i1, i1 0
- %nop3142 = alloca i1, i1 0
- %nop3143 = alloca i1, i1 0
- %nop3144 = alloca i1, i1 0
- %nop3145 = alloca i1, i1 0
- %nop3146 = alloca i1, i1 0
- %nop3147 = alloca i1, i1 0
- %nop3148 = alloca i1, i1 0
- %nop3149 = alloca i1, i1 0
- %nop3150 = alloca i1, i1 0
- %nop3151 = alloca i1, i1 0
- %nop3152 = alloca i1, i1 0
- %nop3153 = alloca i1, i1 0
- %nop3154 = alloca i1, i1 0
- %nop3155 = alloca i1, i1 0
- %nop3156 = alloca i1, i1 0
- %nop3157 = alloca i1, i1 0
- %nop3158 = alloca i1, i1 0
- %nop3159 = alloca i1, i1 0
- %nop3160 = alloca i1, i1 0
- %nop3161 = alloca i1, i1 0
- %nop3162 = alloca i1, i1 0
- %nop3163 = alloca i1, i1 0
- %nop3164 = alloca i1, i1 0
- %nop3165 = alloca i1, i1 0
- %nop3166 = alloca i1, i1 0
- %nop3167 = alloca i1, i1 0
- %nop3168 = alloca i1, i1 0
- %nop3169 = alloca i1, i1 0
- %nop3170 = alloca i1, i1 0
- %nop3171 = alloca i1, i1 0
- %nop3172 = alloca i1, i1 0
- %nop3173 = alloca i1, i1 0
- %nop3174 = alloca i1, i1 0
- %nop3175 = alloca i1, i1 0
- %nop3176 = alloca i1, i1 0
- %nop3177 = alloca i1, i1 0
- %nop3178 = alloca i1, i1 0
- %nop3179 = alloca i1, i1 0
- %nop3180 = alloca i1, i1 0
- %nop3181 = alloca i1, i1 0
- %nop3182 = alloca i1, i1 0
- %nop3183 = alloca i1, i1 0
- %nop3184 = alloca i1, i1 0
- %nop3185 = alloca i1, i1 0
- %nop3186 = alloca i1, i1 0
- %nop3187 = alloca i1, i1 0
- %nop3188 = alloca i1, i1 0
- %nop3189 = alloca i1, i1 0
- %nop3190 = alloca i1, i1 0
- %nop3191 = alloca i1, i1 0
- %nop3192 = alloca i1, i1 0
- %nop3193 = alloca i1, i1 0
- %nop3194 = alloca i1, i1 0
- %nop3195 = alloca i1, i1 0
- %nop3196 = alloca i1, i1 0
- %nop3197 = alloca i1, i1 0
- %nop3198 = alloca i1, i1 0
- %nop3199 = alloca i1, i1 0
- %nop3200 = alloca i1, i1 0
- %nop3201 = alloca i1, i1 0
- %nop3202 = alloca i1, i1 0
- %nop3203 = alloca i1, i1 0
- %nop3204 = alloca i1, i1 0
- %nop3205 = alloca i1, i1 0
- %nop3206 = alloca i1, i1 0
- %nop3207 = alloca i1, i1 0
- %nop3208 = alloca i1, i1 0
- %nop3209 = alloca i1, i1 0
- %nop3210 = alloca i1, i1 0
- %nop3211 = alloca i1, i1 0
- %nop3212 = alloca i1, i1 0
- %nop3213 = alloca i1, i1 0
- %nop3214 = alloca i1, i1 0
- %nop3215 = alloca i1, i1 0
- %nop3216 = alloca i1, i1 0
- %nop3217 = alloca i1, i1 0
- %nop3218 = alloca i1, i1 0
- %nop3219 = alloca i1, i1 0
- %nop3220 = alloca i1, i1 0
- %nop3221 = alloca i1, i1 0
- %nop3222 = alloca i1, i1 0
- %nop3223 = alloca i1, i1 0
- %nop3224 = alloca i1, i1 0
- %nop3225 = alloca i1, i1 0
- %nop3226 = alloca i1, i1 0
- %nop3227 = alloca i1, i1 0
- %nop3228 = alloca i1, i1 0
- %nop3229 = alloca i1, i1 0
- %nop3230 = alloca i1, i1 0
- %nop3231 = alloca i1, i1 0
- %nop3232 = alloca i1, i1 0
- %nop3233 = alloca i1, i1 0
- %nop3234 = alloca i1, i1 0
- %nop3235 = alloca i1, i1 0
- %nop3236 = alloca i1, i1 0
- %nop3237 = alloca i1, i1 0
- %nop3238 = alloca i1, i1 0
- %nop3239 = alloca i1, i1 0
- %nop3240 = alloca i1, i1 0
- %nop3241 = alloca i1, i1 0
- %nop3242 = alloca i1, i1 0
- %nop3243 = alloca i1, i1 0
- %nop3244 = alloca i1, i1 0
- %nop3245 = alloca i1, i1 0
- %nop3246 = alloca i1, i1 0
- %nop3247 = alloca i1, i1 0
- %nop3248 = alloca i1, i1 0
- %nop3249 = alloca i1, i1 0
- %nop3250 = alloca i1, i1 0
- %nop3251 = alloca i1, i1 0
- %nop3252 = alloca i1, i1 0
- %nop3253 = alloca i1, i1 0
- %nop3254 = alloca i1, i1 0
- %nop3255 = alloca i1, i1 0
- %nop3256 = alloca i1, i1 0
- %nop3257 = alloca i1, i1 0
- %nop3258 = alloca i1, i1 0
- %nop3259 = alloca i1, i1 0
- %nop3260 = alloca i1, i1 0
- %nop3261 = alloca i1, i1 0
- %nop3262 = alloca i1, i1 0
- %nop3263 = alloca i1, i1 0
- %nop3264 = alloca i1, i1 0
- %nop3265 = alloca i1, i1 0
- %nop3266 = alloca i1, i1 0
- %nop3267 = alloca i1, i1 0
- %nop3268 = alloca i1, i1 0
- %nop3269 = alloca i1, i1 0
- %nop3270 = alloca i1, i1 0
- %nop3271 = alloca i1, i1 0
- %nop3272 = alloca i1, i1 0
- %nop3273 = alloca i1, i1 0
- %nop3274 = alloca i1, i1 0
- %nop3275 = alloca i1, i1 0
- %nop3276 = alloca i1, i1 0
- %nop3277 = alloca i1, i1 0
- %nop3278 = alloca i1, i1 0
- %nop3279 = alloca i1, i1 0
- %nop3280 = alloca i1, i1 0
- %nop3281 = alloca i1, i1 0
- %nop3282 = alloca i1, i1 0
- %nop3283 = alloca i1, i1 0
- %nop3284 = alloca i1, i1 0
- %nop3285 = alloca i1, i1 0
- %nop3286 = alloca i1, i1 0
- %nop3287 = alloca i1, i1 0
- %nop3288 = alloca i1, i1 0
- %nop3289 = alloca i1, i1 0
- %nop3290 = alloca i1, i1 0
- %nop3291 = alloca i1, i1 0
- %nop3292 = alloca i1, i1 0
- %nop3293 = alloca i1, i1 0
- %nop3294 = alloca i1, i1 0
- %nop3295 = alloca i1, i1 0
- %nop3296 = alloca i1, i1 0
- %nop3297 = alloca i1, i1 0
- %nop3298 = alloca i1, i1 0
- %nop3299 = alloca i1, i1 0
- %nop3300 = alloca i1, i1 0
- %nop3301 = alloca i1, i1 0
- %nop3302 = alloca i1, i1 0
- %nop3303 = alloca i1, i1 0
- %nop3304 = alloca i1, i1 0
- %nop3305 = alloca i1, i1 0
- %nop3306 = alloca i1, i1 0
- %nop3307 = alloca i1, i1 0
- %nop3308 = alloca i1, i1 0
- %nop3309 = alloca i1, i1 0
- %nop3310 = alloca i1, i1 0
- %nop3311 = alloca i1, i1 0
- %nop3312 = alloca i1, i1 0
- %nop3313 = alloca i1, i1 0
- %nop3314 = alloca i1, i1 0
- %nop3315 = alloca i1, i1 0
- %nop3316 = alloca i1, i1 0
- %nop3317 = alloca i1, i1 0
- %nop3318 = alloca i1, i1 0
- %nop3319 = alloca i1, i1 0
- %nop3320 = alloca i1, i1 0
- %nop3321 = alloca i1, i1 0
- %nop3322 = alloca i1, i1 0
- %nop3323 = alloca i1, i1 0
- %nop3324 = alloca i1, i1 0
- %nop3325 = alloca i1, i1 0
- %nop3326 = alloca i1, i1 0
- %nop3327 = alloca i1, i1 0
- %nop3328 = alloca i1, i1 0
- %nop3329 = alloca i1, i1 0
- %nop3330 = alloca i1, i1 0
- %nop3331 = alloca i1, i1 0
- %nop3332 = alloca i1, i1 0
- %nop3333 = alloca i1, i1 0
- %nop3334 = alloca i1, i1 0
- %nop3335 = alloca i1, i1 0
- %nop3336 = alloca i1, i1 0
- %nop3337 = alloca i1, i1 0
- %nop3338 = alloca i1, i1 0
- %nop3339 = alloca i1, i1 0
- %nop3340 = alloca i1, i1 0
- %nop3341 = alloca i1, i1 0
- %nop3342 = alloca i1, i1 0
- %nop3343 = alloca i1, i1 0
- %nop3344 = alloca i1, i1 0
- %nop3345 = alloca i1, i1 0
- %nop3346 = alloca i1, i1 0
- %nop3347 = alloca i1, i1 0
- %nop3348 = alloca i1, i1 0
- %nop3349 = alloca i1, i1 0
- %nop3350 = alloca i1, i1 0
- %nop3351 = alloca i1, i1 0
- %nop3352 = alloca i1, i1 0
- %nop3353 = alloca i1, i1 0
- %nop3354 = alloca i1, i1 0
- %nop3355 = alloca i1, i1 0
- %nop3356 = alloca i1, i1 0
- %nop3357 = alloca i1, i1 0
- %nop3358 = alloca i1, i1 0
- %nop3359 = alloca i1, i1 0
- %nop3360 = alloca i1, i1 0
- %nop3361 = alloca i1, i1 0
- %nop3362 = alloca i1, i1 0
- %nop3363 = alloca i1, i1 0
- %nop3364 = alloca i1, i1 0
- %nop3365 = alloca i1, i1 0
- %nop3366 = alloca i1, i1 0
- %nop3367 = alloca i1, i1 0
- %nop3368 = alloca i1, i1 0
- %nop3369 = alloca i1, i1 0
- %nop3370 = alloca i1, i1 0
- %nop3371 = alloca i1, i1 0
- %nop3372 = alloca i1, i1 0
- %nop3373 = alloca i1, i1 0
- %nop3374 = alloca i1, i1 0
- %nop3375 = alloca i1, i1 0
- %nop3376 = alloca i1, i1 0
- %nop3377 = alloca i1, i1 0
- %nop3378 = alloca i1, i1 0
- %nop3379 = alloca i1, i1 0
- %nop3380 = alloca i1, i1 0
- %nop3381 = alloca i1, i1 0
- %nop3382 = alloca i1, i1 0
- %nop3383 = alloca i1, i1 0
- %nop3384 = alloca i1, i1 0
- %nop3385 = alloca i1, i1 0
- %nop3386 = alloca i1, i1 0
- %nop3387 = alloca i1, i1 0
- %nop3388 = alloca i1, i1 0
- %nop3389 = alloca i1, i1 0
- %nop3390 = alloca i1, i1 0
- %nop3391 = alloca i1, i1 0
- %nop3392 = alloca i1, i1 0
- %nop3393 = alloca i1, i1 0
- %nop3394 = alloca i1, i1 0
- %nop3395 = alloca i1, i1 0
- %nop3396 = alloca i1, i1 0
- %nop3397 = alloca i1, i1 0
- %nop3398 = alloca i1, i1 0
- %nop3399 = alloca i1, i1 0
- %nop3400 = alloca i1, i1 0
- %nop3401 = alloca i1, i1 0
- %nop3402 = alloca i1, i1 0
- %nop3403 = alloca i1, i1 0
- %nop3404 = alloca i1, i1 0
- %nop3405 = alloca i1, i1 0
- %nop3406 = alloca i1, i1 0
- %nop3407 = alloca i1, i1 0
- %nop3408 = alloca i1, i1 0
- %nop3409 = alloca i1, i1 0
- %nop3410 = alloca i1, i1 0
- %nop3411 = alloca i1, i1 0
- %nop3412 = alloca i1, i1 0
- %nop3413 = alloca i1, i1 0
- %nop3414 = alloca i1, i1 0
- %nop3415 = alloca i1, i1 0
- %nop3416 = alloca i1, i1 0
- %nop3417 = alloca i1, i1 0
- %nop3418 = alloca i1, i1 0
- %nop3419 = alloca i1, i1 0
- %nop3420 = alloca i1, i1 0
- %nop3421 = alloca i1, i1 0
- %nop3422 = alloca i1, i1 0
- %nop3423 = alloca i1, i1 0
- %nop3424 = alloca i1, i1 0
- %nop3425 = alloca i1, i1 0
- %nop3426 = alloca i1, i1 0
- %nop3427 = alloca i1, i1 0
- %nop3428 = alloca i1, i1 0
- %nop3429 = alloca i1, i1 0
- %nop3430 = alloca i1, i1 0
- %nop3431 = alloca i1, i1 0
- %nop3432 = alloca i1, i1 0
- %nop3433 = alloca i1, i1 0
- %nop3434 = alloca i1, i1 0
- %nop3435 = alloca i1, i1 0
- %nop3436 = alloca i1, i1 0
- %nop3437 = alloca i1, i1 0
- %nop3438 = alloca i1, i1 0
- %nop3439 = alloca i1, i1 0
- %nop3440 = alloca i1, i1 0
- %nop3441 = alloca i1, i1 0
- %nop3442 = alloca i1, i1 0
- %nop3443 = alloca i1, i1 0
- %nop3444 = alloca i1, i1 0
- %nop3445 = alloca i1, i1 0
- %nop3446 = alloca i1, i1 0
- %nop3447 = alloca i1, i1 0
- %nop3448 = alloca i1, i1 0
- %nop3449 = alloca i1, i1 0
- %nop3450 = alloca i1, i1 0
- %nop3451 = alloca i1, i1 0
- %nop3452 = alloca i1, i1 0
- %nop3453 = alloca i1, i1 0
- %nop3454 = alloca i1, i1 0
- %nop3455 = alloca i1, i1 0
- %nop3456 = alloca i1, i1 0
- %nop3457 = alloca i1, i1 0
- %nop3458 = alloca i1, i1 0
- %nop3459 = alloca i1, i1 0
- %nop3460 = alloca i1, i1 0
- %nop3461 = alloca i1, i1 0
- %nop3462 = alloca i1, i1 0
- %nop3463 = alloca i1, i1 0
- %nop3464 = alloca i1, i1 0
- %nop3465 = alloca i1, i1 0
- %nop3466 = alloca i1, i1 0
- %nop3467 = alloca i1, i1 0
- %nop3468 = alloca i1, i1 0
- %nop3469 = alloca i1, i1 0
- %nop3470 = alloca i1, i1 0
- %nop3471 = alloca i1, i1 0
- %nop3472 = alloca i1, i1 0
- %nop3473 = alloca i1, i1 0
- %nop3474 = alloca i1, i1 0
- %nop3475 = alloca i1, i1 0
- %nop3476 = alloca i1, i1 0
- %nop3477 = alloca i1, i1 0
- %nop3478 = alloca i1, i1 0
- %nop3479 = alloca i1, i1 0
- %nop3480 = alloca i1, i1 0
- %nop3481 = alloca i1, i1 0
- %nop3482 = alloca i1, i1 0
- %nop3483 = alloca i1, i1 0
- %nop3484 = alloca i1, i1 0
- %nop3485 = alloca i1, i1 0
- %nop3486 = alloca i1, i1 0
- %nop3487 = alloca i1, i1 0
- %nop3488 = alloca i1, i1 0
- %nop3489 = alloca i1, i1 0
- %nop3490 = alloca i1, i1 0
- %nop3491 = alloca i1, i1 0
- %nop3492 = alloca i1, i1 0
- %nop3493 = alloca i1, i1 0
- %nop3494 = alloca i1, i1 0
- %nop3495 = alloca i1, i1 0
- %nop3496 = alloca i1, i1 0
- %nop3497 = alloca i1, i1 0
- %nop3498 = alloca i1, i1 0
- %nop3499 = alloca i1, i1 0
- %nop3500 = alloca i1, i1 0
- %nop3501 = alloca i1, i1 0
- %nop3502 = alloca i1, i1 0
- %nop3503 = alloca i1, i1 0
- %nop3504 = alloca i1, i1 0
- %nop3505 = alloca i1, i1 0
- %nop3506 = alloca i1, i1 0
- %nop3507 = alloca i1, i1 0
- %nop3508 = alloca i1, i1 0
- %nop3509 = alloca i1, i1 0
- %nop3510 = alloca i1, i1 0
- %nop3511 = alloca i1, i1 0
- %nop3512 = alloca i1, i1 0
- %nop3513 = alloca i1, i1 0
- %nop3514 = alloca i1, i1 0
- %nop3515 = alloca i1, i1 0
- %nop3516 = alloca i1, i1 0
- %nop3517 = alloca i1, i1 0
- %nop3518 = alloca i1, i1 0
- %nop3519 = alloca i1, i1 0
- %nop3520 = alloca i1, i1 0
- %nop3521 = alloca i1, i1 0
- %nop3522 = alloca i1, i1 0
- %nop3523 = alloca i1, i1 0
- %nop3524 = alloca i1, i1 0
- %nop3525 = alloca i1, i1 0
- %nop3526 = alloca i1, i1 0
- %nop3527 = alloca i1, i1 0
- %nop3528 = alloca i1, i1 0
- %nop3529 = alloca i1, i1 0
- %nop3530 = alloca i1, i1 0
- %nop3531 = alloca i1, i1 0
- %nop3532 = alloca i1, i1 0
- %nop3533 = alloca i1, i1 0
- %nop3534 = alloca i1, i1 0
- %nop3535 = alloca i1, i1 0
- %nop3536 = alloca i1, i1 0
- %nop3537 = alloca i1, i1 0
- %nop3538 = alloca i1, i1 0
- %nop3539 = alloca i1, i1 0
- %nop3540 = alloca i1, i1 0
- %nop3541 = alloca i1, i1 0
- %nop3542 = alloca i1, i1 0
- %nop3543 = alloca i1, i1 0
- %nop3544 = alloca i1, i1 0
- %nop3545 = alloca i1, i1 0
- %nop3546 = alloca i1, i1 0
- %nop3547 = alloca i1, i1 0
- %nop3548 = alloca i1, i1 0
- %nop3549 = alloca i1, i1 0
- %nop3550 = alloca i1, i1 0
- %nop3551 = alloca i1, i1 0
- %nop3552 = alloca i1, i1 0
- %nop3553 = alloca i1, i1 0
- %nop3554 = alloca i1, i1 0
- %nop3555 = alloca i1, i1 0
- %nop3556 = alloca i1, i1 0
- %nop3557 = alloca i1, i1 0
- %nop3558 = alloca i1, i1 0
- %nop3559 = alloca i1, i1 0
- %nop3560 = alloca i1, i1 0
- %nop3561 = alloca i1, i1 0
- %nop3562 = alloca i1, i1 0
- %nop3563 = alloca i1, i1 0
- %nop3564 = alloca i1, i1 0
- %nop3565 = alloca i1, i1 0
- %nop3566 = alloca i1, i1 0
- %nop3567 = alloca i1, i1 0
- %nop3568 = alloca i1, i1 0
- %nop3569 = alloca i1, i1 0
- %nop3570 = alloca i1, i1 0
- %nop3571 = alloca i1, i1 0
- %nop3572 = alloca i1, i1 0
- %nop3573 = alloca i1, i1 0
- %nop3574 = alloca i1, i1 0
- %nop3575 = alloca i1, i1 0
- %nop3576 = alloca i1, i1 0
- %nop3577 = alloca i1, i1 0
- %nop3578 = alloca i1, i1 0
- %nop3579 = alloca i1, i1 0
- %nop3580 = alloca i1, i1 0
- %nop3581 = alloca i1, i1 0
- %nop3582 = alloca i1, i1 0
- %nop3583 = alloca i1, i1 0
- %nop3584 = alloca i1, i1 0
- %nop3585 = alloca i1, i1 0
- %nop3586 = alloca i1, i1 0
- %nop3587 = alloca i1, i1 0
- %nop3588 = alloca i1, i1 0
- %nop3589 = alloca i1, i1 0
- %nop3590 = alloca i1, i1 0
- %nop3591 = alloca i1, i1 0
- %nop3592 = alloca i1, i1 0
- %nop3593 = alloca i1, i1 0
- %nop3594 = alloca i1, i1 0
- %nop3595 = alloca i1, i1 0
- %nop3596 = alloca i1, i1 0
- %nop3597 = alloca i1, i1 0
- %nop3598 = alloca i1, i1 0
- %nop3599 = alloca i1, i1 0
- %nop3600 = alloca i1, i1 0
- %nop3601 = alloca i1, i1 0
- %nop3602 = alloca i1, i1 0
- %nop3603 = alloca i1, i1 0
- %nop3604 = alloca i1, i1 0
- %nop3605 = alloca i1, i1 0
- %nop3606 = alloca i1, i1 0
- %nop3607 = alloca i1, i1 0
- %nop3608 = alloca i1, i1 0
- %nop3609 = alloca i1, i1 0
- %nop3610 = alloca i1, i1 0
- %nop3611 = alloca i1, i1 0
- %nop3612 = alloca i1, i1 0
- %nop3613 = alloca i1, i1 0
- %nop3614 = alloca i1, i1 0
- %nop3615 = alloca i1, i1 0
- %nop3616 = alloca i1, i1 0
- %nop3617 = alloca i1, i1 0
- %nop3618 = alloca i1, i1 0
- %nop3619 = alloca i1, i1 0
- %nop3620 = alloca i1, i1 0
- %nop3621 = alloca i1, i1 0
- %nop3622 = alloca i1, i1 0
- %nop3623 = alloca i1, i1 0
- %nop3624 = alloca i1, i1 0
- %nop3625 = alloca i1, i1 0
- %nop3626 = alloca i1, i1 0
- %nop3627 = alloca i1, i1 0
- %nop3628 = alloca i1, i1 0
- %nop3629 = alloca i1, i1 0
- %nop3630 = alloca i1, i1 0
- %nop3631 = alloca i1, i1 0
- %nop3632 = alloca i1, i1 0
- %nop3633 = alloca i1, i1 0
- %nop3634 = alloca i1, i1 0
- %nop3635 = alloca i1, i1 0
- %nop3636 = alloca i1, i1 0
- %nop3637 = alloca i1, i1 0
- %nop3638 = alloca i1, i1 0
- %nop3639 = alloca i1, i1 0
- %nop3640 = alloca i1, i1 0
- %nop3641 = alloca i1, i1 0
- %nop3642 = alloca i1, i1 0
- %nop3643 = alloca i1, i1 0
- %nop3644 = alloca i1, i1 0
- %nop3645 = alloca i1, i1 0
- %nop3646 = alloca i1, i1 0
- %nop3647 = alloca i1, i1 0
- %nop3648 = alloca i1, i1 0
- %nop3649 = alloca i1, i1 0
- %nop3650 = alloca i1, i1 0
- %nop3651 = alloca i1, i1 0
- %nop3652 = alloca i1, i1 0
- %nop3653 = alloca i1, i1 0
- %nop3654 = alloca i1, i1 0
- %nop3655 = alloca i1, i1 0
- %nop3656 = alloca i1, i1 0
- %nop3657 = alloca i1, i1 0
- %nop3658 = alloca i1, i1 0
- %nop3659 = alloca i1, i1 0
- %nop3660 = alloca i1, i1 0
- %nop3661 = alloca i1, i1 0
- %nop3662 = alloca i1, i1 0
- %nop3663 = alloca i1, i1 0
- %nop3664 = alloca i1, i1 0
- %nop3665 = alloca i1, i1 0
- %nop3666 = alloca i1, i1 0
- %nop3667 = alloca i1, i1 0
- %nop3668 = alloca i1, i1 0
- %nop3669 = alloca i1, i1 0
- %nop3670 = alloca i1, i1 0
- %nop3671 = alloca i1, i1 0
- %nop3672 = alloca i1, i1 0
- %nop3673 = alloca i1, i1 0
- %nop3674 = alloca i1, i1 0
- %nop3675 = alloca i1, i1 0
- %nop3676 = alloca i1, i1 0
- %nop3677 = alloca i1, i1 0
- %nop3678 = alloca i1, i1 0
- %nop3679 = alloca i1, i1 0
- %nop3680 = alloca i1, i1 0
- %nop3681 = alloca i1, i1 0
- %nop3682 = alloca i1, i1 0
- %nop3683 = alloca i1, i1 0
- %nop3684 = alloca i1, i1 0
- %nop3685 = alloca i1, i1 0
- %nop3686 = alloca i1, i1 0
- %nop3687 = alloca i1, i1 0
- %nop3688 = alloca i1, i1 0
- %nop3689 = alloca i1, i1 0
- %nop3690 = alloca i1, i1 0
- %nop3691 = alloca i1, i1 0
- %nop3692 = alloca i1, i1 0
- %nop3693 = alloca i1, i1 0
- %nop3694 = alloca i1, i1 0
- %nop3695 = alloca i1, i1 0
- %nop3696 = alloca i1, i1 0
- %nop3697 = alloca i1, i1 0
- %nop3698 = alloca i1, i1 0
- %nop3699 = alloca i1, i1 0
- %nop3700 = alloca i1, i1 0
- %nop3701 = alloca i1, i1 0
- %nop3702 = alloca i1, i1 0
- %nop3703 = alloca i1, i1 0
- %nop3704 = alloca i1, i1 0
- %nop3705 = alloca i1, i1 0
- %nop3706 = alloca i1, i1 0
- %nop3707 = alloca i1, i1 0
- %nop3708 = alloca i1, i1 0
- %nop3709 = alloca i1, i1 0
- %nop3710 = alloca i1, i1 0
- %nop3711 = alloca i1, i1 0
- %nop3712 = alloca i1, i1 0
- %nop3713 = alloca i1, i1 0
- %nop3714 = alloca i1, i1 0
- %nop3715 = alloca i1, i1 0
- %nop3716 = alloca i1, i1 0
- %nop3717 = alloca i1, i1 0
- %nop3718 = alloca i1, i1 0
- %nop3719 = alloca i1, i1 0
- %nop3720 = alloca i1, i1 0
- %nop3721 = alloca i1, i1 0
- %nop3722 = alloca i1, i1 0
- %nop3723 = alloca i1, i1 0
- %nop3724 = alloca i1, i1 0
- %nop3725 = alloca i1, i1 0
- %nop3726 = alloca i1, i1 0
- %nop3727 = alloca i1, i1 0
- %nop3728 = alloca i1, i1 0
- %nop3729 = alloca i1, i1 0
- %nop3730 = alloca i1, i1 0
- %nop3731 = alloca i1, i1 0
- %nop3732 = alloca i1, i1 0
- %nop3733 = alloca i1, i1 0
- %nop3734 = alloca i1, i1 0
- %nop3735 = alloca i1, i1 0
- %nop3736 = alloca i1, i1 0
- %nop3737 = alloca i1, i1 0
- %nop3738 = alloca i1, i1 0
- %nop3739 = alloca i1, i1 0
- %nop3740 = alloca i1, i1 0
- %nop3741 = alloca i1, i1 0
- %nop3742 = alloca i1, i1 0
- %nop3743 = alloca i1, i1 0
- %nop3744 = alloca i1, i1 0
- %nop3745 = alloca i1, i1 0
- %nop3746 = alloca i1, i1 0
- %nop3747 = alloca i1, i1 0
- %nop3748 = alloca i1, i1 0
- %nop3749 = alloca i1, i1 0
- %nop3750 = alloca i1, i1 0
- %nop3751 = alloca i1, i1 0
- %nop3752 = alloca i1, i1 0
- %nop3753 = alloca i1, i1 0
- %nop3754 = alloca i1, i1 0
- %nop3755 = alloca i1, i1 0
- %nop3756 = alloca i1, i1 0
- %nop3757 = alloca i1, i1 0
- %nop3758 = alloca i1, i1 0
- %nop3759 = alloca i1, i1 0
- %nop3760 = alloca i1, i1 0
- %nop3761 = alloca i1, i1 0
- %nop3762 = alloca i1, i1 0
- %nop3763 = alloca i1, i1 0
- %nop3764 = alloca i1, i1 0
- %nop3765 = alloca i1, i1 0
- %nop3766 = alloca i1, i1 0
- %nop3767 = alloca i1, i1 0
- %nop3768 = alloca i1, i1 0
- %nop3769 = alloca i1, i1 0
- %nop3770 = alloca i1, i1 0
- %nop3771 = alloca i1, i1 0
- %nop3772 = alloca i1, i1 0
- %nop3773 = alloca i1, i1 0
- %nop3774 = alloca i1, i1 0
- %nop3775 = alloca i1, i1 0
- %nop3776 = alloca i1, i1 0
- %nop3777 = alloca i1, i1 0
- %nop3778 = alloca i1, i1 0
- %nop3779 = alloca i1, i1 0
- %nop3780 = alloca i1, i1 0
- %nop3781 = alloca i1, i1 0
- %nop3782 = alloca i1, i1 0
- %nop3783 = alloca i1, i1 0
- %nop3784 = alloca i1, i1 0
- %nop3785 = alloca i1, i1 0
- %nop3786 = alloca i1, i1 0
- %nop3787 = alloca i1, i1 0
- %nop3788 = alloca i1, i1 0
- %nop3789 = alloca i1, i1 0
- %nop3790 = alloca i1, i1 0
- %nop3791 = alloca i1, i1 0
- %nop3792 = alloca i1, i1 0
- %nop3793 = alloca i1, i1 0
- %nop3794 = alloca i1, i1 0
- %nop3795 = alloca i1, i1 0
- %nop3796 = alloca i1, i1 0
- %nop3797 = alloca i1, i1 0
- %nop3798 = alloca i1, i1 0
- %nop3799 = alloca i1, i1 0
- %nop3800 = alloca i1, i1 0
- %nop3801 = alloca i1, i1 0
- %nop3802 = alloca i1, i1 0
- %nop3803 = alloca i1, i1 0
- %nop3804 = alloca i1, i1 0
- %nop3805 = alloca i1, i1 0
- %nop3806 = alloca i1, i1 0
- %nop3807 = alloca i1, i1 0
- %nop3808 = alloca i1, i1 0
- %nop3809 = alloca i1, i1 0
- %nop3810 = alloca i1, i1 0
- %nop3811 = alloca i1, i1 0
- %nop3812 = alloca i1, i1 0
- %nop3813 = alloca i1, i1 0
- %nop3814 = alloca i1, i1 0
- %nop3815 = alloca i1, i1 0
- %nop3816 = alloca i1, i1 0
- %nop3817 = alloca i1, i1 0
- %nop3818 = alloca i1, i1 0
- %nop3819 = alloca i1, i1 0
- %nop3820 = alloca i1, i1 0
- %nop3821 = alloca i1, i1 0
- %nop3822 = alloca i1, i1 0
- %nop3823 = alloca i1, i1 0
- %nop3824 = alloca i1, i1 0
- %nop3825 = alloca i1, i1 0
- %nop3826 = alloca i1, i1 0
- %nop3827 = alloca i1, i1 0
- %nop3828 = alloca i1, i1 0
- %nop3829 = alloca i1, i1 0
- %nop3830 = alloca i1, i1 0
- %nop3831 = alloca i1, i1 0
- %nop3832 = alloca i1, i1 0
- %nop3833 = alloca i1, i1 0
- %nop3834 = alloca i1, i1 0
- %nop3835 = alloca i1, i1 0
- %nop3836 = alloca i1, i1 0
- %nop3837 = alloca i1, i1 0
- %nop3838 = alloca i1, i1 0
- %nop3839 = alloca i1, i1 0
- %nop3840 = alloca i1, i1 0
- %nop3841 = alloca i1, i1 0
- %nop3842 = alloca i1, i1 0
- %nop3843 = alloca i1, i1 0
- %nop3844 = alloca i1, i1 0
- %nop3845 = alloca i1, i1 0
- %nop3846 = alloca i1, i1 0
- %nop3847 = alloca i1, i1 0
- %nop3848 = alloca i1, i1 0
- %nop3849 = alloca i1, i1 0
- %nop3850 = alloca i1, i1 0
- %nop3851 = alloca i1, i1 0
- %nop3852 = alloca i1, i1 0
- %nop3853 = alloca i1, i1 0
- %nop3854 = alloca i1, i1 0
- %nop3855 = alloca i1, i1 0
- %nop3856 = alloca i1, i1 0
- %nop3857 = alloca i1, i1 0
- %nop3858 = alloca i1, i1 0
- %nop3859 = alloca i1, i1 0
- %nop3860 = alloca i1, i1 0
- %nop3861 = alloca i1, i1 0
- %nop3862 = alloca i1, i1 0
- %nop3863 = alloca i1, i1 0
- %nop3864 = alloca i1, i1 0
- %nop3865 = alloca i1, i1 0
- %nop3866 = alloca i1, i1 0
- %nop3867 = alloca i1, i1 0
- %nop3868 = alloca i1, i1 0
- %nop3869 = alloca i1, i1 0
- %nop3870 = alloca i1, i1 0
- %nop3871 = alloca i1, i1 0
- %nop3872 = alloca i1, i1 0
- %nop3873 = alloca i1, i1 0
- %nop3874 = alloca i1, i1 0
- %nop3875 = alloca i1, i1 0
- %nop3876 = alloca i1, i1 0
- %nop3877 = alloca i1, i1 0
- %nop3878 = alloca i1, i1 0
- %nop3879 = alloca i1, i1 0
- %nop3880 = alloca i1, i1 0
- %nop3881 = alloca i1, i1 0
- %nop3882 = alloca i1, i1 0
- %nop3883 = alloca i1, i1 0
- %nop3884 = alloca i1, i1 0
- %nop3885 = alloca i1, i1 0
- %nop3886 = alloca i1, i1 0
- %nop3887 = alloca i1, i1 0
- %nop3888 = alloca i1, i1 0
- %nop3889 = alloca i1, i1 0
- %nop3890 = alloca i1, i1 0
- %nop3891 = alloca i1, i1 0
- %nop3892 = alloca i1, i1 0
- %nop3893 = alloca i1, i1 0
- %nop3894 = alloca i1, i1 0
- %nop3895 = alloca i1, i1 0
- %nop3896 = alloca i1, i1 0
- %nop3897 = alloca i1, i1 0
- %nop3898 = alloca i1, i1 0
- %nop3899 = alloca i1, i1 0
- %nop3900 = alloca i1, i1 0
- %nop3901 = alloca i1, i1 0
- %nop3902 = alloca i1, i1 0
- %nop3903 = alloca i1, i1 0
- %nop3904 = alloca i1, i1 0
- %nop3905 = alloca i1, i1 0
- %nop3906 = alloca i1, i1 0
- %nop3907 = alloca i1, i1 0
- %nop3908 = alloca i1, i1 0
- %nop3909 = alloca i1, i1 0
- %nop3910 = alloca i1, i1 0
- %nop3911 = alloca i1, i1 0
- %nop3912 = alloca i1, i1 0
- %nop3913 = alloca i1, i1 0
- %nop3914 = alloca i1, i1 0
- %nop3915 = alloca i1, i1 0
- %nop3916 = alloca i1, i1 0
- %nop3917 = alloca i1, i1 0
- %nop3918 = alloca i1, i1 0
- %nop3919 = alloca i1, i1 0
- %nop3920 = alloca i1, i1 0
- %nop3921 = alloca i1, i1 0
- %nop3922 = alloca i1, i1 0
- %nop3923 = alloca i1, i1 0
- %nop3924 = alloca i1, i1 0
- %nop3925 = alloca i1, i1 0
- %nop3926 = alloca i1, i1 0
- %nop3927 = alloca i1, i1 0
- %nop3928 = alloca i1, i1 0
- %nop3929 = alloca i1, i1 0
- %nop3930 = alloca i1, i1 0
- %nop3931 = alloca i1, i1 0
- %nop3932 = alloca i1, i1 0
- %nop3933 = alloca i1, i1 0
- %nop3934 = alloca i1, i1 0
- %nop3935 = alloca i1, i1 0
- %nop3936 = alloca i1, i1 0
- %nop3937 = alloca i1, i1 0
- %nop3938 = alloca i1, i1 0
- %nop3939 = alloca i1, i1 0
- %nop3940 = alloca i1, i1 0
- %nop3941 = alloca i1, i1 0
- %nop3942 = alloca i1, i1 0
- %nop3943 = alloca i1, i1 0
- %nop3944 = alloca i1, i1 0
- %nop3945 = alloca i1, i1 0
- %nop3946 = alloca i1, i1 0
- %nop3947 = alloca i1, i1 0
- %nop3948 = alloca i1, i1 0
- %nop3949 = alloca i1, i1 0
- %nop3950 = alloca i1, i1 0
- %nop3951 = alloca i1, i1 0
- %nop3952 = alloca i1, i1 0
- %nop3953 = alloca i1, i1 0
- %nop3954 = alloca i1, i1 0
- %nop3955 = alloca i1, i1 0
- %nop3956 = alloca i1, i1 0
- %nop3957 = alloca i1, i1 0
- %nop3958 = alloca i1, i1 0
- %nop3959 = alloca i1, i1 0
- %nop3960 = alloca i1, i1 0
- %nop3961 = alloca i1, i1 0
- %nop3962 = alloca i1, i1 0
- %nop3963 = alloca i1, i1 0
- %nop3964 = alloca i1, i1 0
- %nop3965 = alloca i1, i1 0
- %nop3966 = alloca i1, i1 0
- %nop3967 = alloca i1, i1 0
- %nop3968 = alloca i1, i1 0
- %nop3969 = alloca i1, i1 0
- %nop3970 = alloca i1, i1 0
- %nop3971 = alloca i1, i1 0
- %nop3972 = alloca i1, i1 0
- %nop3973 = alloca i1, i1 0
- %nop3974 = alloca i1, i1 0
- %nop3975 = alloca i1, i1 0
- %nop3976 = alloca i1, i1 0
- %nop3977 = alloca i1, i1 0
- %nop3978 = alloca i1, i1 0
- %nop3979 = alloca i1, i1 0
- %nop3980 = alloca i1, i1 0
- %nop3981 = alloca i1, i1 0
- %nop3982 = alloca i1, i1 0
- %nop3983 = alloca i1, i1 0
- %nop3984 = alloca i1, i1 0
- %nop3985 = alloca i1, i1 0
- %nop3986 = alloca i1, i1 0
- %nop3987 = alloca i1, i1 0
- %nop3988 = alloca i1, i1 0
- %nop3989 = alloca i1, i1 0
- %nop3990 = alloca i1, i1 0
- %nop3991 = alloca i1, i1 0
- %nop3992 = alloca i1, i1 0
- %nop3993 = alloca i1, i1 0
- %nop3994 = alloca i1, i1 0
- %nop3995 = alloca i1, i1 0
- %nop3996 = alloca i1, i1 0
- %nop3997 = alloca i1, i1 0
- %nop3998 = alloca i1, i1 0
- %nop3999 = alloca i1, i1 0
- %nop4000 = alloca i1, i1 0
- %nop4001 = alloca i1, i1 0
- %nop4002 = alloca i1, i1 0
- %nop4003 = alloca i1, i1 0
- %nop4004 = alloca i1, i1 0
- %nop4005 = alloca i1, i1 0
- %nop4006 = alloca i1, i1 0
- %nop4007 = alloca i1, i1 0
- %nop4008 = alloca i1, i1 0
- %nop4009 = alloca i1, i1 0
- %nop4010 = alloca i1, i1 0
- %nop4011 = alloca i1, i1 0
- %nop4012 = alloca i1, i1 0
- %nop4013 = alloca i1, i1 0
- %nop4014 = alloca i1, i1 0
- %nop4015 = alloca i1, i1 0
- %nop4016 = alloca i1, i1 0
- %nop4017 = alloca i1, i1 0
- %nop4018 = alloca i1, i1 0
- %nop4019 = alloca i1, i1 0
- %nop4020 = alloca i1, i1 0
- %nop4021 = alloca i1, i1 0
- %nop4022 = alloca i1, i1 0
- %nop4023 = alloca i1, i1 0
- %nop4024 = alloca i1, i1 0
- %nop4025 = alloca i1, i1 0
- %nop4026 = alloca i1, i1 0
- %nop4027 = alloca i1, i1 0
- %nop4028 = alloca i1, i1 0
- %nop4029 = alloca i1, i1 0
- %nop4030 = alloca i1, i1 0
- %nop4031 = alloca i1, i1 0
- %nop4032 = alloca i1, i1 0
- %nop4033 = alloca i1, i1 0
- %nop4034 = alloca i1, i1 0
- %nop4035 = alloca i1, i1 0
- %nop4036 = alloca i1, i1 0
- %nop4037 = alloca i1, i1 0
- %nop4038 = alloca i1, i1 0
- %nop4039 = alloca i1, i1 0
- %nop4040 = alloca i1, i1 0
- %nop4041 = alloca i1, i1 0
- %nop4042 = alloca i1, i1 0
- %nop4043 = alloca i1, i1 0
- %nop4044 = alloca i1, i1 0
- %nop4045 = alloca i1, i1 0
- %nop4046 = alloca i1, i1 0
- %nop4047 = alloca i1, i1 0
- %nop4048 = alloca i1, i1 0
- %nop4049 = alloca i1, i1 0
- %nop4050 = alloca i1, i1 0
- %nop4051 = alloca i1, i1 0
- %nop4052 = alloca i1, i1 0
- %nop4053 = alloca i1, i1 0
- %nop4054 = alloca i1, i1 0
- %nop4055 = alloca i1, i1 0
- %nop4056 = alloca i1, i1 0
- %nop4057 = alloca i1, i1 0
- %nop4058 = alloca i1, i1 0
- %nop4059 = alloca i1, i1 0
- %nop4060 = alloca i1, i1 0
- %nop4061 = alloca i1, i1 0
- %nop4062 = alloca i1, i1 0
- %nop4063 = alloca i1, i1 0
- %nop4064 = alloca i1, i1 0
- %nop4065 = alloca i1, i1 0
- %nop4066 = alloca i1, i1 0
- %nop4067 = alloca i1, i1 0
- %nop4068 = alloca i1, i1 0
- %nop4069 = alloca i1, i1 0
- %nop4070 = alloca i1, i1 0
- %nop4071 = alloca i1, i1 0
- %nop4072 = alloca i1, i1 0
- %nop4073 = alloca i1, i1 0
- %nop4074 = alloca i1, i1 0
- %nop4075 = alloca i1, i1 0
- %nop4076 = alloca i1, i1 0
- %nop4077 = alloca i1, i1 0
- %nop4078 = alloca i1, i1 0
- %nop4079 = alloca i1, i1 0
- %nop4080 = alloca i1, i1 0
- %nop4081 = alloca i1, i1 0
- %nop4082 = alloca i1, i1 0
- %nop4083 = alloca i1, i1 0
- %nop4084 = alloca i1, i1 0
- %nop4085 = alloca i1, i1 0
- %nop4086 = alloca i1, i1 0
- %nop4087 = alloca i1, i1 0
- %nop4088 = alloca i1, i1 0
- %nop4089 = alloca i1, i1 0
- %nop4090 = alloca i1, i1 0
- %nop4091 = alloca i1, i1 0
- %nop4092 = alloca i1, i1 0
- %nop4093 = alloca i1, i1 0
- %nop4094 = alloca i1, i1 0
- %nop4095 = alloca i1, i1 0
- %nop4096 = alloca i1, i1 0
- %nop4097 = alloca i1, i1 0
- %nop4098 = alloca i1, i1 0
- %nop4099 = alloca i1, i1 0
- %nop4100 = alloca i1, i1 0
- %nop4101 = alloca i1, i1 0
- %nop4102 = alloca i1, i1 0
- %nop4103 = alloca i1, i1 0
- %nop4104 = alloca i1, i1 0
- %nop4105 = alloca i1, i1 0
- %nop4106 = alloca i1, i1 0
- %nop4107 = alloca i1, i1 0
- %nop4108 = alloca i1, i1 0
- %nop4109 = alloca i1, i1 0
- %nop4110 = alloca i1, i1 0
- %nop4111 = alloca i1, i1 0
- %nop4112 = alloca i1, i1 0
- %nop4113 = alloca i1, i1 0
- %nop4114 = alloca i1, i1 0
- %nop4115 = alloca i1, i1 0
- %nop4116 = alloca i1, i1 0
- %nop4117 = alloca i1, i1 0
- %nop4118 = alloca i1, i1 0
- %nop4119 = alloca i1, i1 0
- %nop4120 = alloca i1, i1 0
- %nop4121 = alloca i1, i1 0
- %nop4122 = alloca i1, i1 0
- %nop4123 = alloca i1, i1 0
- %nop4124 = alloca i1, i1 0
- %nop4125 = alloca i1, i1 0
- %nop4126 = alloca i1, i1 0
- %nop4127 = alloca i1, i1 0
- %nop4128 = alloca i1, i1 0
- %nop4129 = alloca i1, i1 0
- %nop4130 = alloca i1, i1 0
- %nop4131 = alloca i1, i1 0
- %nop4132 = alloca i1, i1 0
- %nop4133 = alloca i1, i1 0
- %nop4134 = alloca i1, i1 0
- %nop4135 = alloca i1, i1 0
- %nop4136 = alloca i1, i1 0
- %nop4137 = alloca i1, i1 0
- %nop4138 = alloca i1, i1 0
- %nop4139 = alloca i1, i1 0
- %nop4140 = alloca i1, i1 0
- %nop4141 = alloca i1, i1 0
- %nop4142 = alloca i1, i1 0
- %nop4143 = alloca i1, i1 0
- %nop4144 = alloca i1, i1 0
- %nop4145 = alloca i1, i1 0
- %nop4146 = alloca i1, i1 0
- %nop4147 = alloca i1, i1 0
- %nop4148 = alloca i1, i1 0
- %nop4149 = alloca i1, i1 0
- %nop4150 = alloca i1, i1 0
- %nop4151 = alloca i1, i1 0
- %nop4152 = alloca i1, i1 0
- %nop4153 = alloca i1, i1 0
- %nop4154 = alloca i1, i1 0
- %nop4155 = alloca i1, i1 0
- %nop4156 = alloca i1, i1 0
- %nop4157 = alloca i1, i1 0
- %nop4158 = alloca i1, i1 0
- %nop4159 = alloca i1, i1 0
- %nop4160 = alloca i1, i1 0
- %nop4161 = alloca i1, i1 0
- %nop4162 = alloca i1, i1 0
- %nop4163 = alloca i1, i1 0
- %nop4164 = alloca i1, i1 0
- %nop4165 = alloca i1, i1 0
- %nop4166 = alloca i1, i1 0
- %nop4167 = alloca i1, i1 0
- %nop4168 = alloca i1, i1 0
- %nop4169 = alloca i1, i1 0
- %nop4170 = alloca i1, i1 0
- %nop4171 = alloca i1, i1 0
- %nop4172 = alloca i1, i1 0
- %nop4173 = alloca i1, i1 0
- %nop4174 = alloca i1, i1 0
- %nop4175 = alloca i1, i1 0
- %nop4176 = alloca i1, i1 0
- %nop4177 = alloca i1, i1 0
- %nop4178 = alloca i1, i1 0
- %nop4179 = alloca i1, i1 0
- %nop4180 = alloca i1, i1 0
- %nop4181 = alloca i1, i1 0
- %nop4182 = alloca i1, i1 0
- %nop4183 = alloca i1, i1 0
- %nop4184 = alloca i1, i1 0
- %nop4185 = alloca i1, i1 0
- %nop4186 = alloca i1, i1 0
- %nop4187 = alloca i1, i1 0
- %nop4188 = alloca i1, i1 0
- %nop4189 = alloca i1, i1 0
- %nop4190 = alloca i1, i1 0
- %nop4191 = alloca i1, i1 0
- %nop4192 = alloca i1, i1 0
- %nop4193 = alloca i1, i1 0
- %nop4194 = alloca i1, i1 0
- %nop4195 = alloca i1, i1 0
- %nop4196 = alloca i1, i1 0
- %nop4197 = alloca i1, i1 0
- %nop4198 = alloca i1, i1 0
- %nop4199 = alloca i1, i1 0
- %nop4200 = alloca i1, i1 0
- %nop4201 = alloca i1, i1 0
- %nop4202 = alloca i1, i1 0
- %nop4203 = alloca i1, i1 0
- %nop4204 = alloca i1, i1 0
- %nop4205 = alloca i1, i1 0
- %nop4206 = alloca i1, i1 0
- %nop4207 = alloca i1, i1 0
- %nop4208 = alloca i1, i1 0
- %nop4209 = alloca i1, i1 0
- %nop4210 = alloca i1, i1 0
- %nop4211 = alloca i1, i1 0
- %nop4212 = alloca i1, i1 0
- %nop4213 = alloca i1, i1 0
- %nop4214 = alloca i1, i1 0
- %nop4215 = alloca i1, i1 0
- %nop4216 = alloca i1, i1 0
- %nop4217 = alloca i1, i1 0
- %nop4218 = alloca i1, i1 0
- %nop4219 = alloca i1, i1 0
- %nop4220 = alloca i1, i1 0
- %nop4221 = alloca i1, i1 0
- %nop4222 = alloca i1, i1 0
- %nop4223 = alloca i1, i1 0
- %nop4224 = alloca i1, i1 0
- %nop4225 = alloca i1, i1 0
- %nop4226 = alloca i1, i1 0
- %nop4227 = alloca i1, i1 0
- %nop4228 = alloca i1, i1 0
- %nop4229 = alloca i1, i1 0
- %nop4230 = alloca i1, i1 0
- %nop4231 = alloca i1, i1 0
- %nop4232 = alloca i1, i1 0
- %nop4233 = alloca i1, i1 0
- %nop4234 = alloca i1, i1 0
- %nop4235 = alloca i1, i1 0
- %nop4236 = alloca i1, i1 0
- %nop4237 = alloca i1, i1 0
- %nop4238 = alloca i1, i1 0
- %nop4239 = alloca i1, i1 0
- %nop4240 = alloca i1, i1 0
- %nop4241 = alloca i1, i1 0
- %nop4242 = alloca i1, i1 0
- %nop4243 = alloca i1, i1 0
- %nop4244 = alloca i1, i1 0
- %nop4245 = alloca i1, i1 0
- %nop4246 = alloca i1, i1 0
- %nop4247 = alloca i1, i1 0
- %nop4248 = alloca i1, i1 0
- %nop4249 = alloca i1, i1 0
- %nop4250 = alloca i1, i1 0
- %nop4251 = alloca i1, i1 0
- %nop4252 = alloca i1, i1 0
- %nop4253 = alloca i1, i1 0
- %nop4254 = alloca i1, i1 0
- %nop4255 = alloca i1, i1 0
- %nop4256 = alloca i1, i1 0
- %nop4257 = alloca i1, i1 0
- %nop4258 = alloca i1, i1 0
- %nop4259 = alloca i1, i1 0
- %nop4260 = alloca i1, i1 0
- %nop4261 = alloca i1, i1 0
- %nop4262 = alloca i1, i1 0
- %nop4263 = alloca i1, i1 0
- %nop4264 = alloca i1, i1 0
- %nop4265 = alloca i1, i1 0
- %nop4266 = alloca i1, i1 0
- %nop4267 = alloca i1, i1 0
- %nop4268 = alloca i1, i1 0
- %nop4269 = alloca i1, i1 0
- %nop4270 = alloca i1, i1 0
- %nop4271 = alloca i1, i1 0
- %nop4272 = alloca i1, i1 0
- %nop4273 = alloca i1, i1 0
- %nop4274 = alloca i1, i1 0
- %nop4275 = alloca i1, i1 0
- %nop4276 = alloca i1, i1 0
- %nop4277 = alloca i1, i1 0
- %nop4278 = alloca i1, i1 0
- %nop4279 = alloca i1, i1 0
- %nop4280 = alloca i1, i1 0
- %nop4281 = alloca i1, i1 0
- %nop4282 = alloca i1, i1 0
- %nop4283 = alloca i1, i1 0
- %nop4284 = alloca i1, i1 0
- %nop4285 = alloca i1, i1 0
- %nop4286 = alloca i1, i1 0
- %nop4287 = alloca i1, i1 0
- %nop4288 = alloca i1, i1 0
- %nop4289 = alloca i1, i1 0
- %nop4290 = alloca i1, i1 0
- %nop4291 = alloca i1, i1 0
- %nop4292 = alloca i1, i1 0
- %nop4293 = alloca i1, i1 0
- %nop4294 = alloca i1, i1 0
- %nop4295 = alloca i1, i1 0
- %nop4296 = alloca i1, i1 0
- %nop4297 = alloca i1, i1 0
- %nop4298 = alloca i1, i1 0
- %nop4299 = alloca i1, i1 0
- %nop4300 = alloca i1, i1 0
- %nop4301 = alloca i1, i1 0
- %nop4302 = alloca i1, i1 0
- %nop4303 = alloca i1, i1 0
- %nop4304 = alloca i1, i1 0
- %nop4305 = alloca i1, i1 0
- %nop4306 = alloca i1, i1 0
- %nop4307 = alloca i1, i1 0
- %nop4308 = alloca i1, i1 0
- %nop4309 = alloca i1, i1 0
- %nop4310 = alloca i1, i1 0
- %nop4311 = alloca i1, i1 0
- %nop4312 = alloca i1, i1 0
- %nop4313 = alloca i1, i1 0
- %nop4314 = alloca i1, i1 0
- %nop4315 = alloca i1, i1 0
- %nop4316 = alloca i1, i1 0
- %nop4317 = alloca i1, i1 0
- %nop4318 = alloca i1, i1 0
- %nop4319 = alloca i1, i1 0
- %nop4320 = alloca i1, i1 0
- %nop4321 = alloca i1, i1 0
- %nop4322 = alloca i1, i1 0
- %nop4323 = alloca i1, i1 0
- %nop4324 = alloca i1, i1 0
- %nop4325 = alloca i1, i1 0
- %nop4326 = alloca i1, i1 0
- %nop4327 = alloca i1, i1 0
- %nop4328 = alloca i1, i1 0
- %nop4329 = alloca i1, i1 0
- %nop4330 = alloca i1, i1 0
- %nop4331 = alloca i1, i1 0
- %nop4332 = alloca i1, i1 0
- %nop4333 = alloca i1, i1 0
- %nop4334 = alloca i1, i1 0
- %nop4335 = alloca i1, i1 0
- %nop4336 = alloca i1, i1 0
- %nop4337 = alloca i1, i1 0
- %nop4338 = alloca i1, i1 0
- %nop4339 = alloca i1, i1 0
- %nop4340 = alloca i1, i1 0
- %nop4341 = alloca i1, i1 0
- %nop4342 = alloca i1, i1 0
- %nop4343 = alloca i1, i1 0
- %nop4344 = alloca i1, i1 0
- %nop4345 = alloca i1, i1 0
- %nop4346 = alloca i1, i1 0
- %nop4347 = alloca i1, i1 0
- %nop4348 = alloca i1, i1 0
- %nop4349 = alloca i1, i1 0
- %nop4350 = alloca i1, i1 0
- %nop4351 = alloca i1, i1 0
- %nop4352 = alloca i1, i1 0
- %nop4353 = alloca i1, i1 0
- %nop4354 = alloca i1, i1 0
- %nop4355 = alloca i1, i1 0
- %nop4356 = alloca i1, i1 0
- %nop4357 = alloca i1, i1 0
- %nop4358 = alloca i1, i1 0
- %nop4359 = alloca i1, i1 0
- %nop4360 = alloca i1, i1 0
- %nop4361 = alloca i1, i1 0
- %nop4362 = alloca i1, i1 0
- %nop4363 = alloca i1, i1 0
- %nop4364 = alloca i1, i1 0
- %nop4365 = alloca i1, i1 0
- %nop4366 = alloca i1, i1 0
- %nop4367 = alloca i1, i1 0
- %nop4368 = alloca i1, i1 0
- %nop4369 = alloca i1, i1 0
- %nop4370 = alloca i1, i1 0
- %nop4371 = alloca i1, i1 0
- %nop4372 = alloca i1, i1 0
- %nop4373 = alloca i1, i1 0
- %nop4374 = alloca i1, i1 0
- %nop4375 = alloca i1, i1 0
- %nop4376 = alloca i1, i1 0
- %nop4377 = alloca i1, i1 0
- %nop4378 = alloca i1, i1 0
- %nop4379 = alloca i1, i1 0
- %nop4380 = alloca i1, i1 0
- %nop4381 = alloca i1, i1 0
- %nop4382 = alloca i1, i1 0
- %nop4383 = alloca i1, i1 0
- %nop4384 = alloca i1, i1 0
- %nop4385 = alloca i1, i1 0
- %nop4386 = alloca i1, i1 0
- %nop4387 = alloca i1, i1 0
- %nop4388 = alloca i1, i1 0
- %nop4389 = alloca i1, i1 0
- %nop4390 = alloca i1, i1 0
- %nop4391 = alloca i1, i1 0
- %nop4392 = alloca i1, i1 0
- %nop4393 = alloca i1, i1 0
- %nop4394 = alloca i1, i1 0
- %nop4395 = alloca i1, i1 0
- %nop4396 = alloca i1, i1 0
- %nop4397 = alloca i1, i1 0
- %nop4398 = alloca i1, i1 0
- %nop4399 = alloca i1, i1 0
- %nop4400 = alloca i1, i1 0
- %nop4401 = alloca i1, i1 0
- %nop4402 = alloca i1, i1 0
- %nop4403 = alloca i1, i1 0
- %nop4404 = alloca i1, i1 0
- %nop4405 = alloca i1, i1 0
- %nop4406 = alloca i1, i1 0
- %nop4407 = alloca i1, i1 0
- %nop4408 = alloca i1, i1 0
- %nop4409 = alloca i1, i1 0
- %nop4410 = alloca i1, i1 0
- %nop4411 = alloca i1, i1 0
- %nop4412 = alloca i1, i1 0
- %nop4413 = alloca i1, i1 0
- %nop4414 = alloca i1, i1 0
- %nop4415 = alloca i1, i1 0
- %nop4416 = alloca i1, i1 0
- %nop4417 = alloca i1, i1 0
- %nop4418 = alloca i1, i1 0
- %nop4419 = alloca i1, i1 0
- %nop4420 = alloca i1, i1 0
- %nop4421 = alloca i1, i1 0
- %nop4422 = alloca i1, i1 0
- %nop4423 = alloca i1, i1 0
- %nop4424 = alloca i1, i1 0
- %nop4425 = alloca i1, i1 0
- %nop4426 = alloca i1, i1 0
- %nop4427 = alloca i1, i1 0
- %nop4428 = alloca i1, i1 0
- %nop4429 = alloca i1, i1 0
- %nop4430 = alloca i1, i1 0
- %nop4431 = alloca i1, i1 0
- %nop4432 = alloca i1, i1 0
- %nop4433 = alloca i1, i1 0
- %nop4434 = alloca i1, i1 0
- %nop4435 = alloca i1, i1 0
- %nop4436 = alloca i1, i1 0
- %nop4437 = alloca i1, i1 0
- %nop4438 = alloca i1, i1 0
- %nop4439 = alloca i1, i1 0
- %nop4440 = alloca i1, i1 0
- %nop4441 = alloca i1, i1 0
- %nop4442 = alloca i1, i1 0
- %nop4443 = alloca i1, i1 0
- %nop4444 = alloca i1, i1 0
- %nop4445 = alloca i1, i1 0
- %nop4446 = alloca i1, i1 0
- %nop4447 = alloca i1, i1 0
- %nop4448 = alloca i1, i1 0
- %nop4449 = alloca i1, i1 0
- %nop4450 = alloca i1, i1 0
- %nop4451 = alloca i1, i1 0
- %nop4452 = alloca i1, i1 0
- %nop4453 = alloca i1, i1 0
- %nop4454 = alloca i1, i1 0
- %nop4455 = alloca i1, i1 0
- %nop4456 = alloca i1, i1 0
- %nop4457 = alloca i1, i1 0
- %nop4458 = alloca i1, i1 0
- %nop4459 = alloca i1, i1 0
- %nop4460 = alloca i1, i1 0
- %nop4461 = alloca i1, i1 0
- %nop4462 = alloca i1, i1 0
- %nop4463 = alloca i1, i1 0
- %nop4464 = alloca i1, i1 0
- %nop4465 = alloca i1, i1 0
- %nop4466 = alloca i1, i1 0
- %nop4467 = alloca i1, i1 0
- %nop4468 = alloca i1, i1 0
- %nop4469 = alloca i1, i1 0
- %nop4470 = alloca i1, i1 0
- %nop4471 = alloca i1, i1 0
- %nop4472 = alloca i1, i1 0
- %nop4473 = alloca i1, i1 0
- %nop4474 = alloca i1, i1 0
- %nop4475 = alloca i1, i1 0
- %nop4476 = alloca i1, i1 0
- %nop4477 = alloca i1, i1 0
- %nop4478 = alloca i1, i1 0
- %nop4479 = alloca i1, i1 0
- %nop4480 = alloca i1, i1 0
- %nop4481 = alloca i1, i1 0
- %nop4482 = alloca i1, i1 0
- %nop4483 = alloca i1, i1 0
- %nop4484 = alloca i1, i1 0
- %nop4485 = alloca i1, i1 0
- %nop4486 = alloca i1, i1 0
- %nop4487 = alloca i1, i1 0
- %nop4488 = alloca i1, i1 0
- %nop4489 = alloca i1, i1 0
- %nop4490 = alloca i1, i1 0
- %nop4491 = alloca i1, i1 0
- %nop4492 = alloca i1, i1 0
- %nop4493 = alloca i1, i1 0
- %nop4494 = alloca i1, i1 0
- %nop4495 = alloca i1, i1 0
- %nop4496 = alloca i1, i1 0
- %nop4497 = alloca i1, i1 0
- %nop4498 = alloca i1, i1 0
- %nop4499 = alloca i1, i1 0
- %nop4500 = alloca i1, i1 0
- %nop4501 = alloca i1, i1 0
- %nop4502 = alloca i1, i1 0
- %nop4503 = alloca i1, i1 0
- %nop4504 = alloca i1, i1 0
- %nop4505 = alloca i1, i1 0
- %nop4506 = alloca i1, i1 0
- %nop4507 = alloca i1, i1 0
- %nop4508 = alloca i1, i1 0
- %nop4509 = alloca i1, i1 0
- %nop4510 = alloca i1, i1 0
- %nop4511 = alloca i1, i1 0
- %nop4512 = alloca i1, i1 0
- %nop4513 = alloca i1, i1 0
- %nop4514 = alloca i1, i1 0
- %nop4515 = alloca i1, i1 0
- %nop4516 = alloca i1, i1 0
- %nop4517 = alloca i1, i1 0
- %nop4518 = alloca i1, i1 0
- %nop4519 = alloca i1, i1 0
- %nop4520 = alloca i1, i1 0
- %nop4521 = alloca i1, i1 0
- %nop4522 = alloca i1, i1 0
- %nop4523 = alloca i1, i1 0
- %nop4524 = alloca i1, i1 0
- %nop4525 = alloca i1, i1 0
- %nop4526 = alloca i1, i1 0
- %nop4527 = alloca i1, i1 0
- %nop4528 = alloca i1, i1 0
- %nop4529 = alloca i1, i1 0
- %nop4530 = alloca i1, i1 0
- %nop4531 = alloca i1, i1 0
- %nop4532 = alloca i1, i1 0
- %nop4533 = alloca i1, i1 0
- %nop4534 = alloca i1, i1 0
- %nop4535 = alloca i1, i1 0
- %nop4536 = alloca i1, i1 0
- %nop4537 = alloca i1, i1 0
- %nop4538 = alloca i1, i1 0
- %nop4539 = alloca i1, i1 0
- %nop4540 = alloca i1, i1 0
- %nop4541 = alloca i1, i1 0
- %nop4542 = alloca i1, i1 0
- %nop4543 = alloca i1, i1 0
- %nop4544 = alloca i1, i1 0
- %nop4545 = alloca i1, i1 0
- %nop4546 = alloca i1, i1 0
- %nop4547 = alloca i1, i1 0
- %nop4548 = alloca i1, i1 0
- %nop4549 = alloca i1, i1 0
- %nop4550 = alloca i1, i1 0
- %nop4551 = alloca i1, i1 0
- %nop4552 = alloca i1, i1 0
- %nop4553 = alloca i1, i1 0
- %nop4554 = alloca i1, i1 0
- %nop4555 = alloca i1, i1 0
- %nop4556 = alloca i1, i1 0
- %nop4557 = alloca i1, i1 0
- %nop4558 = alloca i1, i1 0
- %nop4559 = alloca i1, i1 0
- %nop4560 = alloca i1, i1 0
- %nop4561 = alloca i1, i1 0
- %nop4562 = alloca i1, i1 0
- %nop4563 = alloca i1, i1 0
- %nop4564 = alloca i1, i1 0
- %nop4565 = alloca i1, i1 0
- %nop4566 = alloca i1, i1 0
- %nop4567 = alloca i1, i1 0
- %nop4568 = alloca i1, i1 0
- %nop4569 = alloca i1, i1 0
- %nop4570 = alloca i1, i1 0
- %nop4571 = alloca i1, i1 0
- %nop4572 = alloca i1, i1 0
- %nop4573 = alloca i1, i1 0
- %nop4574 = alloca i1, i1 0
- %nop4575 = alloca i1, i1 0
- %nop4576 = alloca i1, i1 0
- %nop4577 = alloca i1, i1 0
- %nop4578 = alloca i1, i1 0
- %nop4579 = alloca i1, i1 0
- %nop4580 = alloca i1, i1 0
- %nop4581 = alloca i1, i1 0
- %nop4582 = alloca i1, i1 0
- %nop4583 = alloca i1, i1 0
- %nop4584 = alloca i1, i1 0
- %nop4585 = alloca i1, i1 0
- %nop4586 = alloca i1, i1 0
- %nop4587 = alloca i1, i1 0
- %nop4588 = alloca i1, i1 0
- %nop4589 = alloca i1, i1 0
- %nop4590 = alloca i1, i1 0
- %nop4591 = alloca i1, i1 0
- %nop4592 = alloca i1, i1 0
- %nop4593 = alloca i1, i1 0
- %nop4594 = alloca i1, i1 0
- %nop4595 = alloca i1, i1 0
- %nop4596 = alloca i1, i1 0
- %nop4597 = alloca i1, i1 0
- %nop4598 = alloca i1, i1 0
- %nop4599 = alloca i1, i1 0
- %nop4600 = alloca i1, i1 0
- %nop4601 = alloca i1, i1 0
- %nop4602 = alloca i1, i1 0
- %nop4603 = alloca i1, i1 0
- %nop4604 = alloca i1, i1 0
- %nop4605 = alloca i1, i1 0
- %nop4606 = alloca i1, i1 0
- %nop4607 = alloca i1, i1 0
- %nop4608 = alloca i1, i1 0
- %nop4609 = alloca i1, i1 0
- %nop4610 = alloca i1, i1 0
- %nop4611 = alloca i1, i1 0
- %nop4612 = alloca i1, i1 0
- %nop4613 = alloca i1, i1 0
- %nop4614 = alloca i1, i1 0
- %nop4615 = alloca i1, i1 0
- %nop4616 = alloca i1, i1 0
- %nop4617 = alloca i1, i1 0
- %nop4618 = alloca i1, i1 0
- %nop4619 = alloca i1, i1 0
- %nop4620 = alloca i1, i1 0
- %nop4621 = alloca i1, i1 0
- %nop4622 = alloca i1, i1 0
- %nop4623 = alloca i1, i1 0
- %nop4624 = alloca i1, i1 0
- %nop4625 = alloca i1, i1 0
- %nop4626 = alloca i1, i1 0
- %nop4627 = alloca i1, i1 0
- %nop4628 = alloca i1, i1 0
- %nop4629 = alloca i1, i1 0
- %nop4630 = alloca i1, i1 0
- %nop4631 = alloca i1, i1 0
- %nop4632 = alloca i1, i1 0
- %nop4633 = alloca i1, i1 0
- %nop4634 = alloca i1, i1 0
- %nop4635 = alloca i1, i1 0
- %nop4636 = alloca i1, i1 0
- %nop4637 = alloca i1, i1 0
- %nop4638 = alloca i1, i1 0
- %nop4639 = alloca i1, i1 0
- %nop4640 = alloca i1, i1 0
- %nop4641 = alloca i1, i1 0
- %nop4642 = alloca i1, i1 0
- %nop4643 = alloca i1, i1 0
- %nop4644 = alloca i1, i1 0
- %nop4645 = alloca i1, i1 0
- %nop4646 = alloca i1, i1 0
- %nop4647 = alloca i1, i1 0
- %nop4648 = alloca i1, i1 0
- %nop4649 = alloca i1, i1 0
- %nop4650 = alloca i1, i1 0
- %nop4651 = alloca i1, i1 0
- %nop4652 = alloca i1, i1 0
- %nop4653 = alloca i1, i1 0
- %nop4654 = alloca i1, i1 0
- %nop4655 = alloca i1, i1 0
- %nop4656 = alloca i1, i1 0
- %nop4657 = alloca i1, i1 0
- %nop4658 = alloca i1, i1 0
- %nop4659 = alloca i1, i1 0
- %nop4660 = alloca i1, i1 0
- %nop4661 = alloca i1, i1 0
- %nop4662 = alloca i1, i1 0
- %nop4663 = alloca i1, i1 0
- %nop4664 = alloca i1, i1 0
- %nop4665 = alloca i1, i1 0
- %nop4666 = alloca i1, i1 0
- %nop4667 = alloca i1, i1 0
- %nop4668 = alloca i1, i1 0
- %nop4669 = alloca i1, i1 0
- %nop4670 = alloca i1, i1 0
- %nop4671 = alloca i1, i1 0
- %nop4672 = alloca i1, i1 0
- %nop4673 = alloca i1, i1 0
- %nop4674 = alloca i1, i1 0
- %nop4675 = alloca i1, i1 0
- %nop4676 = alloca i1, i1 0
- %nop4677 = alloca i1, i1 0
- %nop4678 = alloca i1, i1 0
- %nop4679 = alloca i1, i1 0
- %nop4680 = alloca i1, i1 0
- %nop4681 = alloca i1, i1 0
- %nop4682 = alloca i1, i1 0
- %nop4683 = alloca i1, i1 0
- %nop4684 = alloca i1, i1 0
- %nop4685 = alloca i1, i1 0
- %nop4686 = alloca i1, i1 0
- %nop4687 = alloca i1, i1 0
- %nop4688 = alloca i1, i1 0
- %nop4689 = alloca i1, i1 0
- %nop4690 = alloca i1, i1 0
- %nop4691 = alloca i1, i1 0
- %nop4692 = alloca i1, i1 0
- %nop4693 = alloca i1, i1 0
- %nop4694 = alloca i1, i1 0
- %nop4695 = alloca i1, i1 0
- %nop4696 = alloca i1, i1 0
- %nop4697 = alloca i1, i1 0
- %nop4698 = alloca i1, i1 0
- %nop4699 = alloca i1, i1 0
- %nop4700 = alloca i1, i1 0
- %nop4701 = alloca i1, i1 0
- %nop4702 = alloca i1, i1 0
- %nop4703 = alloca i1, i1 0
- %nop4704 = alloca i1, i1 0
- %nop4705 = alloca i1, i1 0
- %nop4706 = alloca i1, i1 0
- %nop4707 = alloca i1, i1 0
- %nop4708 = alloca i1, i1 0
- %nop4709 = alloca i1, i1 0
- %nop4710 = alloca i1, i1 0
- %nop4711 = alloca i1, i1 0
- %nop4712 = alloca i1, i1 0
- %nop4713 = alloca i1, i1 0
- %nop4714 = alloca i1, i1 0
- %nop4715 = alloca i1, i1 0
- %nop4716 = alloca i1, i1 0
- %nop4717 = alloca i1, i1 0
- %nop4718 = alloca i1, i1 0
- %nop4719 = alloca i1, i1 0
- %nop4720 = alloca i1, i1 0
- %nop4721 = alloca i1, i1 0
- %nop4722 = alloca i1, i1 0
- %nop4723 = alloca i1, i1 0
- %nop4724 = alloca i1, i1 0
- %nop4725 = alloca i1, i1 0
- %nop4726 = alloca i1, i1 0
- %nop4727 = alloca i1, i1 0
- %nop4728 = alloca i1, i1 0
- %nop4729 = alloca i1, i1 0
- %nop4730 = alloca i1, i1 0
- %nop4731 = alloca i1, i1 0
- %nop4732 = alloca i1, i1 0
- %nop4733 = alloca i1, i1 0
- %nop4734 = alloca i1, i1 0
- %nop4735 = alloca i1, i1 0
- %nop4736 = alloca i1, i1 0
- %nop4737 = alloca i1, i1 0
- %nop4738 = alloca i1, i1 0
- %nop4739 = alloca i1, i1 0
- %nop4740 = alloca i1, i1 0
- %nop4741 = alloca i1, i1 0
- %nop4742 = alloca i1, i1 0
- %nop4743 = alloca i1, i1 0
- %nop4744 = alloca i1, i1 0
- %nop4745 = alloca i1, i1 0
- %nop4746 = alloca i1, i1 0
- %nop4747 = alloca i1, i1 0
- %nop4748 = alloca i1, i1 0
- %nop4749 = alloca i1, i1 0
- %nop4750 = alloca i1, i1 0
- %nop4751 = alloca i1, i1 0
- %nop4752 = alloca i1, i1 0
- %nop4753 = alloca i1, i1 0
- %nop4754 = alloca i1, i1 0
- %nop4755 = alloca i1, i1 0
- %nop4756 = alloca i1, i1 0
- %nop4757 = alloca i1, i1 0
- %nop4758 = alloca i1, i1 0
- %nop4759 = alloca i1, i1 0
- %nop4760 = alloca i1, i1 0
- %nop4761 = alloca i1, i1 0
- %nop4762 = alloca i1, i1 0
- %nop4763 = alloca i1, i1 0
- %nop4764 = alloca i1, i1 0
- %nop4765 = alloca i1, i1 0
- %nop4766 = alloca i1, i1 0
- %nop4767 = alloca i1, i1 0
- %nop4768 = alloca i1, i1 0
- %nop4769 = alloca i1, i1 0
- %nop4770 = alloca i1, i1 0
- %nop4771 = alloca i1, i1 0
- %nop4772 = alloca i1, i1 0
- %nop4773 = alloca i1, i1 0
- %nop4774 = alloca i1, i1 0
- %nop4775 = alloca i1, i1 0
- %nop4776 = alloca i1, i1 0
- %nop4777 = alloca i1, i1 0
- %nop4778 = alloca i1, i1 0
- %nop4779 = alloca i1, i1 0
- %nop4780 = alloca i1, i1 0
- %nop4781 = alloca i1, i1 0
- %nop4782 = alloca i1, i1 0
- %nop4783 = alloca i1, i1 0
- %nop4784 = alloca i1, i1 0
- %nop4785 = alloca i1, i1 0
- %nop4786 = alloca i1, i1 0
- %nop4787 = alloca i1, i1 0
- %nop4788 = alloca i1, i1 0
- %nop4789 = alloca i1, i1 0
- %nop4790 = alloca i1, i1 0
- %nop4791 = alloca i1, i1 0
- %nop4792 = alloca i1, i1 0
- %nop4793 = alloca i1, i1 0
- %nop4794 = alloca i1, i1 0
- %nop4795 = alloca i1, i1 0
- %nop4796 = alloca i1, i1 0
- %nop4797 = alloca i1, i1 0
- %nop4798 = alloca i1, i1 0
- %nop4799 = alloca i1, i1 0
- %nop4800 = alloca i1, i1 0
- %nop4801 = alloca i1, i1 0
- %nop4802 = alloca i1, i1 0
- %nop4803 = alloca i1, i1 0
- %nop4804 = alloca i1, i1 0
- %nop4805 = alloca i1, i1 0
- %nop4806 = alloca i1, i1 0
- %nop4807 = alloca i1, i1 0
- %nop4808 = alloca i1, i1 0
- %nop4809 = alloca i1, i1 0
- %nop4810 = alloca i1, i1 0
- %nop4811 = alloca i1, i1 0
- %nop4812 = alloca i1, i1 0
- %nop4813 = alloca i1, i1 0
- %nop4814 = alloca i1, i1 0
- %nop4815 = alloca i1, i1 0
- %nop4816 = alloca i1, i1 0
- %nop4817 = alloca i1, i1 0
- %nop4818 = alloca i1, i1 0
- %nop4819 = alloca i1, i1 0
- %nop4820 = alloca i1, i1 0
- %nop4821 = alloca i1, i1 0
- %nop4822 = alloca i1, i1 0
- %nop4823 = alloca i1, i1 0
- %nop4824 = alloca i1, i1 0
- %nop4825 = alloca i1, i1 0
- %nop4826 = alloca i1, i1 0
- %nop4827 = alloca i1, i1 0
- %nop4828 = alloca i1, i1 0
- %nop4829 = alloca i1, i1 0
- %nop4830 = alloca i1, i1 0
- %nop4831 = alloca i1, i1 0
- %nop4832 = alloca i1, i1 0
- %nop4833 = alloca i1, i1 0
- %nop4834 = alloca i1, i1 0
- %nop4835 = alloca i1, i1 0
- %nop4836 = alloca i1, i1 0
- %nop4837 = alloca i1, i1 0
- %nop4838 = alloca i1, i1 0
- %nop4839 = alloca i1, i1 0
- %nop4840 = alloca i1, i1 0
- %nop4841 = alloca i1, i1 0
- %nop4842 = alloca i1, i1 0
- %nop4843 = alloca i1, i1 0
- %nop4844 = alloca i1, i1 0
- %nop4845 = alloca i1, i1 0
- %nop4846 = alloca i1, i1 0
- %nop4847 = alloca i1, i1 0
- %nop4848 = alloca i1, i1 0
- %nop4849 = alloca i1, i1 0
- %nop4850 = alloca i1, i1 0
- %nop4851 = alloca i1, i1 0
- %nop4852 = alloca i1, i1 0
- %nop4853 = alloca i1, i1 0
- %nop4854 = alloca i1, i1 0
- %nop4855 = alloca i1, i1 0
- %nop4856 = alloca i1, i1 0
- %nop4857 = alloca i1, i1 0
- %nop4858 = alloca i1, i1 0
- %nop4859 = alloca i1, i1 0
- %nop4860 = alloca i1, i1 0
- %nop4861 = alloca i1, i1 0
- %nop4862 = alloca i1, i1 0
- %nop4863 = alloca i1, i1 0
- %nop4864 = alloca i1, i1 0
- %nop4865 = alloca i1, i1 0
- %nop4866 = alloca i1, i1 0
- %nop4867 = alloca i1, i1 0
- %nop4868 = alloca i1, i1 0
- %nop4869 = alloca i1, i1 0
- %nop4870 = alloca i1, i1 0
- %nop4871 = alloca i1, i1 0
- %nop4872 = alloca i1, i1 0
- %nop4873 = alloca i1, i1 0
- %nop4874 = alloca i1, i1 0
- %nop4875 = alloca i1, i1 0
- %nop4876 = alloca i1, i1 0
- %nop4877 = alloca i1, i1 0
- %nop4878 = alloca i1, i1 0
- %nop4879 = alloca i1, i1 0
- %nop4880 = alloca i1, i1 0
- %nop4881 = alloca i1, i1 0
- %nop4882 = alloca i1, i1 0
- %nop4883 = alloca i1, i1 0
- %nop4884 = alloca i1, i1 0
- %nop4885 = alloca i1, i1 0
- %nop4886 = alloca i1, i1 0
- %nop4887 = alloca i1, i1 0
- %nop4888 = alloca i1, i1 0
- %nop4889 = alloca i1, i1 0
- %nop4890 = alloca i1, i1 0
- %nop4891 = alloca i1, i1 0
- %nop4892 = alloca i1, i1 0
- %nop4893 = alloca i1, i1 0
- %nop4894 = alloca i1, i1 0
- %nop4895 = alloca i1, i1 0
- %nop4896 = alloca i1, i1 0
- %nop4897 = alloca i1, i1 0
- %nop4898 = alloca i1, i1 0
- %nop4899 = alloca i1, i1 0
- %nop4900 = alloca i1, i1 0
- %nop4901 = alloca i1, i1 0
- %nop4902 = alloca i1, i1 0
- %nop4903 = alloca i1, i1 0
- %nop4904 = alloca i1, i1 0
- %nop4905 = alloca i1, i1 0
- %nop4906 = alloca i1, i1 0
- %nop4907 = alloca i1, i1 0
- %nop4908 = alloca i1, i1 0
- %nop4909 = alloca i1, i1 0
- %nop4910 = alloca i1, i1 0
- %nop4911 = alloca i1, i1 0
- %nop4912 = alloca i1, i1 0
- %nop4913 = alloca i1, i1 0
- %nop4914 = alloca i1, i1 0
- %nop4915 = alloca i1, i1 0
- %nop4916 = alloca i1, i1 0
- %nop4917 = alloca i1, i1 0
- %nop4918 = alloca i1, i1 0
- %nop4919 = alloca i1, i1 0
- %nop4920 = alloca i1, i1 0
- %nop4921 = alloca i1, i1 0
- %nop4922 = alloca i1, i1 0
- %nop4923 = alloca i1, i1 0
- %nop4924 = alloca i1, i1 0
- %nop4925 = alloca i1, i1 0
- %nop4926 = alloca i1, i1 0
- %nop4927 = alloca i1, i1 0
- %nop4928 = alloca i1, i1 0
- %nop4929 = alloca i1, i1 0
- %nop4930 = alloca i1, i1 0
- %nop4931 = alloca i1, i1 0
- %nop4932 = alloca i1, i1 0
- %nop4933 = alloca i1, i1 0
- %nop4934 = alloca i1, i1 0
- %nop4935 = alloca i1, i1 0
- %nop4936 = alloca i1, i1 0
- %nop4937 = alloca i1, i1 0
- %nop4938 = alloca i1, i1 0
- %nop4939 = alloca i1, i1 0
- %nop4940 = alloca i1, i1 0
- %nop4941 = alloca i1, i1 0
- %nop4942 = alloca i1, i1 0
- %nop4943 = alloca i1, i1 0
- %nop4944 = alloca i1, i1 0
- %nop4945 = alloca i1, i1 0
- %nop4946 = alloca i1, i1 0
- %nop4947 = alloca i1, i1 0
- %nop4948 = alloca i1, i1 0
- %nop4949 = alloca i1, i1 0
- %nop4950 = alloca i1, i1 0
- %nop4951 = alloca i1, i1 0
- %nop4952 = alloca i1, i1 0
- %nop4953 = alloca i1, i1 0
- %nop4954 = alloca i1, i1 0
- %nop4955 = alloca i1, i1 0
- %nop4956 = alloca i1, i1 0
- %nop4957 = alloca i1, i1 0
- %nop4958 = alloca i1, i1 0
- %nop4959 = alloca i1, i1 0
- %nop4960 = alloca i1, i1 0
- %nop4961 = alloca i1, i1 0
- %nop4962 = alloca i1, i1 0
- %nop4963 = alloca i1, i1 0
- %nop4964 = alloca i1, i1 0
- %nop4965 = alloca i1, i1 0
- %nop4966 = alloca i1, i1 0
- %nop4967 = alloca i1, i1 0
- %nop4968 = alloca i1, i1 0
- %nop4969 = alloca i1, i1 0
- %nop4970 = alloca i1, i1 0
- %nop4971 = alloca i1, i1 0
- %nop4972 = alloca i1, i1 0
- %nop4973 = alloca i1, i1 0
- %nop4974 = alloca i1, i1 0
- %nop4975 = alloca i1, i1 0
- %nop4976 = alloca i1, i1 0
- %nop4977 = alloca i1, i1 0
- %nop4978 = alloca i1, i1 0
- %nop4979 = alloca i1, i1 0
- %nop4980 = alloca i1, i1 0
- %nop4981 = alloca i1, i1 0
- %nop4982 = alloca i1, i1 0
- %nop4983 = alloca i1, i1 0
- %nop4984 = alloca i1, i1 0
- %nop4985 = alloca i1, i1 0
- %nop4986 = alloca i1, i1 0
- %nop4987 = alloca i1, i1 0
- %nop4988 = alloca i1, i1 0
- %nop4989 = alloca i1, i1 0
- %nop4990 = alloca i1, i1 0
- %nop4991 = alloca i1, i1 0
- %nop4992 = alloca i1, i1 0
- %nop4993 = alloca i1, i1 0
- %nop4994 = alloca i1, i1 0
- %nop4995 = alloca i1, i1 0
- %nop4996 = alloca i1, i1 0
- %nop4997 = alloca i1, i1 0
- %nop4998 = alloca i1, i1 0
- %nop4999 = alloca i1, i1 0
- %nop5000 = alloca i1, i1 0
- %nop5001 = alloca i1, i1 0
- %nop5002 = alloca i1, i1 0
- %nop5003 = alloca i1, i1 0
- %nop5004 = alloca i1, i1 0
- %nop5005 = alloca i1, i1 0
- %nop5006 = alloca i1, i1 0
- %nop5007 = alloca i1, i1 0
- %nop5008 = alloca i1, i1 0
- %nop5009 = alloca i1, i1 0
- %nop5010 = alloca i1, i1 0
- %nop5011 = alloca i1, i1 0
- %nop5012 = alloca i1, i1 0
- %nop5013 = alloca i1, i1 0
- %nop5014 = alloca i1, i1 0
- %nop5015 = alloca i1, i1 0
- %nop5016 = alloca i1, i1 0
- %nop5017 = alloca i1, i1 0
- %nop5018 = alloca i1, i1 0
- %nop5019 = alloca i1, i1 0
- %nop5020 = alloca i1, i1 0
- %nop5021 = alloca i1, i1 0
- %nop5022 = alloca i1, i1 0
- %nop5023 = alloca i1, i1 0
- %nop5024 = alloca i1, i1 0
- %nop5025 = alloca i1, i1 0
- %nop5026 = alloca i1, i1 0
- %nop5027 = alloca i1, i1 0
- %nop5028 = alloca i1, i1 0
- %nop5029 = alloca i1, i1 0
- %nop5030 = alloca i1, i1 0
- %nop5031 = alloca i1, i1 0
- %nop5032 = alloca i1, i1 0
- %nop5033 = alloca i1, i1 0
- %nop5034 = alloca i1, i1 0
- %nop5035 = alloca i1, i1 0
- %nop5036 = alloca i1, i1 0
- %nop5037 = alloca i1, i1 0
- %nop5038 = alloca i1, i1 0
- %nop5039 = alloca i1, i1 0
- %nop5040 = alloca i1, i1 0
- %nop5041 = alloca i1, i1 0
- %nop5042 = alloca i1, i1 0
- %nop5043 = alloca i1, i1 0
- %nop5044 = alloca i1, i1 0
- %nop5045 = alloca i1, i1 0
- %nop5046 = alloca i1, i1 0
- %nop5047 = alloca i1, i1 0
- %nop5048 = alloca i1, i1 0
- %nop5049 = alloca i1, i1 0
- %nop5050 = alloca i1, i1 0
- %nop5051 = alloca i1, i1 0
- %nop5052 = alloca i1, i1 0
- %nop5053 = alloca i1, i1 0
- %nop5054 = alloca i1, i1 0
- %nop5055 = alloca i1, i1 0
- %nop5056 = alloca i1, i1 0
- %nop5057 = alloca i1, i1 0
- %nop5058 = alloca i1, i1 0
- %nop5059 = alloca i1, i1 0
- %nop5060 = alloca i1, i1 0
- %nop5061 = alloca i1, i1 0
- %nop5062 = alloca i1, i1 0
- %nop5063 = alloca i1, i1 0
- %nop5064 = alloca i1, i1 0
- %nop5065 = alloca i1, i1 0
- %nop5066 = alloca i1, i1 0
- %nop5067 = alloca i1, i1 0
- %nop5068 = alloca i1, i1 0
- %nop5069 = alloca i1, i1 0
- %nop5070 = alloca i1, i1 0
- %nop5071 = alloca i1, i1 0
- %nop5072 = alloca i1, i1 0
- %nop5073 = alloca i1, i1 0
- %nop5074 = alloca i1, i1 0
- %nop5075 = alloca i1, i1 0
- %nop5076 = alloca i1, i1 0
- %nop5077 = alloca i1, i1 0
- %nop5078 = alloca i1, i1 0
- %nop5079 = alloca i1, i1 0
- %nop5080 = alloca i1, i1 0
- %nop5081 = alloca i1, i1 0
- %nop5082 = alloca i1, i1 0
- %nop5083 = alloca i1, i1 0
- %nop5084 = alloca i1, i1 0
- %nop5085 = alloca i1, i1 0
- %nop5086 = alloca i1, i1 0
- %nop5087 = alloca i1, i1 0
- %nop5088 = alloca i1, i1 0
- %nop5089 = alloca i1, i1 0
- %nop5090 = alloca i1, i1 0
- %nop5091 = alloca i1, i1 0
- %nop5092 = alloca i1, i1 0
- %nop5093 = alloca i1, i1 0
- %nop5094 = alloca i1, i1 0
- %nop5095 = alloca i1, i1 0
- %nop5096 = alloca i1, i1 0
- %nop5097 = alloca i1, i1 0
- %nop5098 = alloca i1, i1 0
- %nop5099 = alloca i1, i1 0
- %nop5100 = alloca i1, i1 0
- %nop5101 = alloca i1, i1 0
- %nop5102 = alloca i1, i1 0
- %nop5103 = alloca i1, i1 0
- %nop5104 = alloca i1, i1 0
- %nop5105 = alloca i1, i1 0
- %nop5106 = alloca i1, i1 0
- %nop5107 = alloca i1, i1 0
- %nop5108 = alloca i1, i1 0
- %nop5109 = alloca i1, i1 0
- %nop5110 = alloca i1, i1 0
- %nop5111 = alloca i1, i1 0
- %nop5112 = alloca i1, i1 0
- %nop5113 = alloca i1, i1 0
- %nop5114 = alloca i1, i1 0
- %nop5115 = alloca i1, i1 0
- %nop5116 = alloca i1, i1 0
- %nop5117 = alloca i1, i1 0
- %nop5118 = alloca i1, i1 0
- %nop5119 = alloca i1, i1 0
- %nop5120 = alloca i1, i1 0
- %nop5121 = alloca i1, i1 0
- %nop5122 = alloca i1, i1 0
- %nop5123 = alloca i1, i1 0
- %nop5124 = alloca i1, i1 0
- %nop5125 = alloca i1, i1 0
- %nop5126 = alloca i1, i1 0
- %nop5127 = alloca i1, i1 0
- %nop5128 = alloca i1, i1 0
- %nop5129 = alloca i1, i1 0
- %nop5130 = alloca i1, i1 0
- %nop5131 = alloca i1, i1 0
- %nop5132 = alloca i1, i1 0
- %nop5133 = alloca i1, i1 0
- %nop5134 = alloca i1, i1 0
- %nop5135 = alloca i1, i1 0
- %nop5136 = alloca i1, i1 0
- %nop5137 = alloca i1, i1 0
- %nop5138 = alloca i1, i1 0
- %nop5139 = alloca i1, i1 0
- %nop5140 = alloca i1, i1 0
- %nop5141 = alloca i1, i1 0
- %nop5142 = alloca i1, i1 0
- %nop5143 = alloca i1, i1 0
- %nop5144 = alloca i1, i1 0
- %nop5145 = alloca i1, i1 0
- %nop5146 = alloca i1, i1 0
- %nop5147 = alloca i1, i1 0
- %nop5148 = alloca i1, i1 0
- %nop5149 = alloca i1, i1 0
- %nop5150 = alloca i1, i1 0
- %nop5151 = alloca i1, i1 0
- %nop5152 = alloca i1, i1 0
- %nop5153 = alloca i1, i1 0
- %nop5154 = alloca i1, i1 0
- %nop5155 = alloca i1, i1 0
- %nop5156 = alloca i1, i1 0
- %nop5157 = alloca i1, i1 0
- %nop5158 = alloca i1, i1 0
- %nop5159 = alloca i1, i1 0
- %nop5160 = alloca i1, i1 0
- %nop5161 = alloca i1, i1 0
- %nop5162 = alloca i1, i1 0
- %nop5163 = alloca i1, i1 0
- %nop5164 = alloca i1, i1 0
- %nop5165 = alloca i1, i1 0
- %nop5166 = alloca i1, i1 0
- %nop5167 = alloca i1, i1 0
- %nop5168 = alloca i1, i1 0
- %nop5169 = alloca i1, i1 0
- %nop5170 = alloca i1, i1 0
- %nop5171 = alloca i1, i1 0
- %nop5172 = alloca i1, i1 0
- %nop5173 = alloca i1, i1 0
- %nop5174 = alloca i1, i1 0
- %nop5175 = alloca i1, i1 0
- %nop5176 = alloca i1, i1 0
- %nop5177 = alloca i1, i1 0
- %nop5178 = alloca i1, i1 0
- %nop5179 = alloca i1, i1 0
- %nop5180 = alloca i1, i1 0
- %nop5181 = alloca i1, i1 0
- %nop5182 = alloca i1, i1 0
- %nop5183 = alloca i1, i1 0
- %nop5184 = alloca i1, i1 0
- %nop5185 = alloca i1, i1 0
- %nop5186 = alloca i1, i1 0
- %nop5187 = alloca i1, i1 0
- %nop5188 = alloca i1, i1 0
- %nop5189 = alloca i1, i1 0
- %nop5190 = alloca i1, i1 0
- %nop5191 = alloca i1, i1 0
- %nop5192 = alloca i1, i1 0
- %nop5193 = alloca i1, i1 0
- %nop5194 = alloca i1, i1 0
- %nop5195 = alloca i1, i1 0
- %nop5196 = alloca i1, i1 0
- %nop5197 = alloca i1, i1 0
- %nop5198 = alloca i1, i1 0
- %nop5199 = alloca i1, i1 0
- %nop5200 = alloca i1, i1 0
- %nop5201 = alloca i1, i1 0
- %nop5202 = alloca i1, i1 0
- %nop5203 = alloca i1, i1 0
- %nop5204 = alloca i1, i1 0
- %nop5205 = alloca i1, i1 0
- %nop5206 = alloca i1, i1 0
- %nop5207 = alloca i1, i1 0
- %nop5208 = alloca i1, i1 0
- %nop5209 = alloca i1, i1 0
- %nop5210 = alloca i1, i1 0
- %nop5211 = alloca i1, i1 0
- %nop5212 = alloca i1, i1 0
- %nop5213 = alloca i1, i1 0
- %nop5214 = alloca i1, i1 0
- %nop5215 = alloca i1, i1 0
- %nop5216 = alloca i1, i1 0
- %nop5217 = alloca i1, i1 0
- %nop5218 = alloca i1, i1 0
- %nop5219 = alloca i1, i1 0
- %nop5220 = alloca i1, i1 0
- %nop5221 = alloca i1, i1 0
- %nop5222 = alloca i1, i1 0
- %nop5223 = alloca i1, i1 0
- %nop5224 = alloca i1, i1 0
- %nop5225 = alloca i1, i1 0
- %nop5226 = alloca i1, i1 0
- %nop5227 = alloca i1, i1 0
- %nop5228 = alloca i1, i1 0
- %nop5229 = alloca i1, i1 0
- %nop5230 = alloca i1, i1 0
- %nop5231 = alloca i1, i1 0
- %nop5232 = alloca i1, i1 0
- %nop5233 = alloca i1, i1 0
- %nop5234 = alloca i1, i1 0
- %nop5235 = alloca i1, i1 0
- %nop5236 = alloca i1, i1 0
- %nop5237 = alloca i1, i1 0
- %nop5238 = alloca i1, i1 0
- %nop5239 = alloca i1, i1 0
- %nop5240 = alloca i1, i1 0
- %nop5241 = alloca i1, i1 0
- %nop5242 = alloca i1, i1 0
- %nop5243 = alloca i1, i1 0
- %nop5244 = alloca i1, i1 0
- %nop5245 = alloca i1, i1 0
- %nop5246 = alloca i1, i1 0
- %nop5247 = alloca i1, i1 0
- %nop5248 = alloca i1, i1 0
- %nop5249 = alloca i1, i1 0
- %nop5250 = alloca i1, i1 0
- %nop5251 = alloca i1, i1 0
- %nop5252 = alloca i1, i1 0
- %nop5253 = alloca i1, i1 0
- %nop5254 = alloca i1, i1 0
- %nop5255 = alloca i1, i1 0
- %nop5256 = alloca i1, i1 0
- %nop5257 = alloca i1, i1 0
- %nop5258 = alloca i1, i1 0
- %nop5259 = alloca i1, i1 0
- %nop5260 = alloca i1, i1 0
- %nop5261 = alloca i1, i1 0
- %nop5262 = alloca i1, i1 0
- %nop5263 = alloca i1, i1 0
- %nop5264 = alloca i1, i1 0
- %nop5265 = alloca i1, i1 0
- %nop5266 = alloca i1, i1 0
- %nop5267 = alloca i1, i1 0
- %nop5268 = alloca i1, i1 0
- %nop5269 = alloca i1, i1 0
- %nop5270 = alloca i1, i1 0
- %nop5271 = alloca i1, i1 0
- %nop5272 = alloca i1, i1 0
- %nop5273 = alloca i1, i1 0
- %nop5274 = alloca i1, i1 0
- %nop5275 = alloca i1, i1 0
- %nop5276 = alloca i1, i1 0
- %nop5277 = alloca i1, i1 0
- %nop5278 = alloca i1, i1 0
- %nop5279 = alloca i1, i1 0
- %nop5280 = alloca i1, i1 0
- %nop5281 = alloca i1, i1 0
- %nop5282 = alloca i1, i1 0
- %nop5283 = alloca i1, i1 0
- %nop5284 = alloca i1, i1 0
- %nop5285 = alloca i1, i1 0
- %nop5286 = alloca i1, i1 0
- %nop5287 = alloca i1, i1 0
- %nop5288 = alloca i1, i1 0
- %nop5289 = alloca i1, i1 0
- %nop5290 = alloca i1, i1 0
- %nop5291 = alloca i1, i1 0
- %nop5292 = alloca i1, i1 0
- %nop5293 = alloca i1, i1 0
- %nop5294 = alloca i1, i1 0
- %nop5295 = alloca i1, i1 0
- %nop5296 = alloca i1, i1 0
- %nop5297 = alloca i1, i1 0
- %nop5298 = alloca i1, i1 0
- %nop5299 = alloca i1, i1 0
- %nop5300 = alloca i1, i1 0
- %nop5301 = alloca i1, i1 0
- %nop5302 = alloca i1, i1 0
- %nop5303 = alloca i1, i1 0
- %nop5304 = alloca i1, i1 0
- %nop5305 = alloca i1, i1 0
- %nop5306 = alloca i1, i1 0
- %nop5307 = alloca i1, i1 0
- %nop5308 = alloca i1, i1 0
- %nop5309 = alloca i1, i1 0
- %nop5310 = alloca i1, i1 0
- %nop5311 = alloca i1, i1 0
- %nop5312 = alloca i1, i1 0
- %nop5313 = alloca i1, i1 0
- %nop5314 = alloca i1, i1 0
- %nop5315 = alloca i1, i1 0
- %nop5316 = alloca i1, i1 0
- %nop5317 = alloca i1, i1 0
- %nop5318 = alloca i1, i1 0
- %nop5319 = alloca i1, i1 0
- %nop5320 = alloca i1, i1 0
- %nop5321 = alloca i1, i1 0
- %nop5322 = alloca i1, i1 0
- %nop5323 = alloca i1, i1 0
- %nop5324 = alloca i1, i1 0
- %nop5325 = alloca i1, i1 0
- %nop5326 = alloca i1, i1 0
- %nop5327 = alloca i1, i1 0
- %nop5328 = alloca i1, i1 0
- %nop5329 = alloca i1, i1 0
- %nop5330 = alloca i1, i1 0
- %nop5331 = alloca i1, i1 0
- %nop5332 = alloca i1, i1 0
- %nop5333 = alloca i1, i1 0
- %nop5334 = alloca i1, i1 0
- %nop5335 = alloca i1, i1 0
- %nop5336 = alloca i1, i1 0
- %nop5337 = alloca i1, i1 0
- %nop5338 = alloca i1, i1 0
- %nop5339 = alloca i1, i1 0
- %nop5340 = alloca i1, i1 0
- %nop5341 = alloca i1, i1 0
- %nop5342 = alloca i1, i1 0
- %nop5343 = alloca i1, i1 0
- %nop5344 = alloca i1, i1 0
- %nop5345 = alloca i1, i1 0
- %nop5346 = alloca i1, i1 0
- %nop5347 = alloca i1, i1 0
- %nop5348 = alloca i1, i1 0
- %nop5349 = alloca i1, i1 0
- %nop5350 = alloca i1, i1 0
- %nop5351 = alloca i1, i1 0
- %nop5352 = alloca i1, i1 0
- %nop5353 = alloca i1, i1 0
- %nop5354 = alloca i1, i1 0
- %nop5355 = alloca i1, i1 0
- %nop5356 = alloca i1, i1 0
- %nop5357 = alloca i1, i1 0
- %nop5358 = alloca i1, i1 0
- %nop5359 = alloca i1, i1 0
- %nop5360 = alloca i1, i1 0
- %nop5361 = alloca i1, i1 0
- %nop5362 = alloca i1, i1 0
- %nop5363 = alloca i1, i1 0
- %nop5364 = alloca i1, i1 0
- %nop5365 = alloca i1, i1 0
- %nop5366 = alloca i1, i1 0
- %nop5367 = alloca i1, i1 0
- %nop5368 = alloca i1, i1 0
- %nop5369 = alloca i1, i1 0
- %nop5370 = alloca i1, i1 0
- %nop5371 = alloca i1, i1 0
- %nop5372 = alloca i1, i1 0
- %nop5373 = alloca i1, i1 0
- %nop5374 = alloca i1, i1 0
- %nop5375 = alloca i1, i1 0
- %nop5376 = alloca i1, i1 0
- %nop5377 = alloca i1, i1 0
- %nop5378 = alloca i1, i1 0
- %nop5379 = alloca i1, i1 0
- %nop5380 = alloca i1, i1 0
- %nop5381 = alloca i1, i1 0
- %nop5382 = alloca i1, i1 0
- %nop5383 = alloca i1, i1 0
- %nop5384 = alloca i1, i1 0
- %nop5385 = alloca i1, i1 0
- %nop5386 = alloca i1, i1 0
- %nop5387 = alloca i1, i1 0
- %nop5388 = alloca i1, i1 0
- %nop5389 = alloca i1, i1 0
- %nop5390 = alloca i1, i1 0
- %nop5391 = alloca i1, i1 0
- %nop5392 = alloca i1, i1 0
- %nop5393 = alloca i1, i1 0
- %nop5394 = alloca i1, i1 0
- %nop5395 = alloca i1, i1 0
- %nop5396 = alloca i1, i1 0
- %nop5397 = alloca i1, i1 0
- %nop5398 = alloca i1, i1 0
- %nop5399 = alloca i1, i1 0
- %nop5400 = alloca i1, i1 0
- %nop5401 = alloca i1, i1 0
- %nop5402 = alloca i1, i1 0
- %nop5403 = alloca i1, i1 0
- %nop5404 = alloca i1, i1 0
- %nop5405 = alloca i1, i1 0
- %nop5406 = alloca i1, i1 0
- %nop5407 = alloca i1, i1 0
- %nop5408 = alloca i1, i1 0
- %nop5409 = alloca i1, i1 0
- %nop5410 = alloca i1, i1 0
- %nop5411 = alloca i1, i1 0
- %nop5412 = alloca i1, i1 0
- %nop5413 = alloca i1, i1 0
- %nop5414 = alloca i1, i1 0
- %nop5415 = alloca i1, i1 0
- %nop5416 = alloca i1, i1 0
- %nop5417 = alloca i1, i1 0
- %nop5418 = alloca i1, i1 0
- %nop5419 = alloca i1, i1 0
- %nop5420 = alloca i1, i1 0
- %nop5421 = alloca i1, i1 0
- %nop5422 = alloca i1, i1 0
- %nop5423 = alloca i1, i1 0
- %nop5424 = alloca i1, i1 0
- %nop5425 = alloca i1, i1 0
- %nop5426 = alloca i1, i1 0
- %nop5427 = alloca i1, i1 0
- %nop5428 = alloca i1, i1 0
- %nop5429 = alloca i1, i1 0
- %nop5430 = alloca i1, i1 0
- %nop5431 = alloca i1, i1 0
- %nop5432 = alloca i1, i1 0
- %nop5433 = alloca i1, i1 0
- %nop5434 = alloca i1, i1 0
- %nop5435 = alloca i1, i1 0
- %nop5436 = alloca i1, i1 0
- %nop5437 = alloca i1, i1 0
- %nop5438 = alloca i1, i1 0
- %nop5439 = alloca i1, i1 0
- %nop5440 = alloca i1, i1 0
- %nop5441 = alloca i1, i1 0
- %nop5442 = alloca i1, i1 0
- %nop5443 = alloca i1, i1 0
- %nop5444 = alloca i1, i1 0
- %nop5445 = alloca i1, i1 0
- %nop5446 = alloca i1, i1 0
- %nop5447 = alloca i1, i1 0
- %nop5448 = alloca i1, i1 0
- %nop5449 = alloca i1, i1 0
- %nop5450 = alloca i1, i1 0
- %nop5451 = alloca i1, i1 0
- %nop5452 = alloca i1, i1 0
- %nop5453 = alloca i1, i1 0
- %nop5454 = alloca i1, i1 0
- %nop5455 = alloca i1, i1 0
- %nop5456 = alloca i1, i1 0
- %nop5457 = alloca i1, i1 0
- %nop5458 = alloca i1, i1 0
- %nop5459 = alloca i1, i1 0
- %nop5460 = alloca i1, i1 0
- %nop5461 = alloca i1, i1 0
- %nop5462 = alloca i1, i1 0
- %nop5463 = alloca i1, i1 0
- %nop5464 = alloca i1, i1 0
- %nop5465 = alloca i1, i1 0
- %nop5466 = alloca i1, i1 0
- %nop5467 = alloca i1, i1 0
- %nop5468 = alloca i1, i1 0
- %nop5469 = alloca i1, i1 0
- %nop5470 = alloca i1, i1 0
- %nop5471 = alloca i1, i1 0
- %nop5472 = alloca i1, i1 0
- %nop5473 = alloca i1, i1 0
- %nop5474 = alloca i1, i1 0
- %nop5475 = alloca i1, i1 0
- %nop5476 = alloca i1, i1 0
- %nop5477 = alloca i1, i1 0
- %nop5478 = alloca i1, i1 0
- %nop5479 = alloca i1, i1 0
- %nop5480 = alloca i1, i1 0
- %nop5481 = alloca i1, i1 0
- %nop5482 = alloca i1, i1 0
- %nop5483 = alloca i1, i1 0
- %nop5484 = alloca i1, i1 0
- %nop5485 = alloca i1, i1 0
- %nop5486 = alloca i1, i1 0
- %nop5487 = alloca i1, i1 0
- %nop5488 = alloca i1, i1 0
- %nop5489 = alloca i1, i1 0
- %nop5490 = alloca i1, i1 0
- %nop5491 = alloca i1, i1 0
- %nop5492 = alloca i1, i1 0
- %nop5493 = alloca i1, i1 0
- %nop5494 = alloca i1, i1 0
- %nop5495 = alloca i1, i1 0
- %nop5496 = alloca i1, i1 0
- %nop5497 = alloca i1, i1 0
- %nop5498 = alloca i1, i1 0
- %nop5499 = alloca i1, i1 0
- %nop5500 = alloca i1, i1 0
- %nop5501 = alloca i1, i1 0
- %nop5502 = alloca i1, i1 0
- %nop5503 = alloca i1, i1 0
- %nop5504 = alloca i1, i1 0
- %nop5505 = alloca i1, i1 0
- %nop5506 = alloca i1, i1 0
- %nop5507 = alloca i1, i1 0
- %nop5508 = alloca i1, i1 0
- %nop5509 = alloca i1, i1 0
- %nop5510 = alloca i1, i1 0
- %nop5511 = alloca i1, i1 0
- %nop5512 = alloca i1, i1 0
- %nop5513 = alloca i1, i1 0
- %nop5514 = alloca i1, i1 0
- %nop5515 = alloca i1, i1 0
- %nop5516 = alloca i1, i1 0
- %nop5517 = alloca i1, i1 0
- %nop5518 = alloca i1, i1 0
- %nop5519 = alloca i1, i1 0
- %nop5520 = alloca i1, i1 0
- %nop5521 = alloca i1, i1 0
- %nop5522 = alloca i1, i1 0
- %nop5523 = alloca i1, i1 0
- %nop5524 = alloca i1, i1 0
- %nop5525 = alloca i1, i1 0
- %nop5526 = alloca i1, i1 0
- %nop5527 = alloca i1, i1 0
- %nop5528 = alloca i1, i1 0
- %nop5529 = alloca i1, i1 0
- %nop5530 = alloca i1, i1 0
- %nop5531 = alloca i1, i1 0
- %nop5532 = alloca i1, i1 0
- %nop5533 = alloca i1, i1 0
- %nop5534 = alloca i1, i1 0
- %nop5535 = alloca i1, i1 0
- %nop5536 = alloca i1, i1 0
- %nop5537 = alloca i1, i1 0
- %nop5538 = alloca i1, i1 0
- %nop5539 = alloca i1, i1 0
- %nop5540 = alloca i1, i1 0
- %nop5541 = alloca i1, i1 0
- %nop5542 = alloca i1, i1 0
- %nop5543 = alloca i1, i1 0
- %nop5544 = alloca i1, i1 0
- %nop5545 = alloca i1, i1 0
- %nop5546 = alloca i1, i1 0
- %nop5547 = alloca i1, i1 0
- %nop5548 = alloca i1, i1 0
- %nop5549 = alloca i1, i1 0
- %nop5550 = alloca i1, i1 0
- %nop5551 = alloca i1, i1 0
- %nop5552 = alloca i1, i1 0
- %nop5553 = alloca i1, i1 0
- %nop5554 = alloca i1, i1 0
- %nop5555 = alloca i1, i1 0
- %nop5556 = alloca i1, i1 0
- %nop5557 = alloca i1, i1 0
- %nop5558 = alloca i1, i1 0
- %nop5559 = alloca i1, i1 0
- %nop5560 = alloca i1, i1 0
- %nop5561 = alloca i1, i1 0
- %nop5562 = alloca i1, i1 0
- %nop5563 = alloca i1, i1 0
- %nop5564 = alloca i1, i1 0
- %nop5565 = alloca i1, i1 0
- %nop5566 = alloca i1, i1 0
- %nop5567 = alloca i1, i1 0
- %nop5568 = alloca i1, i1 0
- %nop5569 = alloca i1, i1 0
- %nop5570 = alloca i1, i1 0
- %nop5571 = alloca i1, i1 0
- %nop5572 = alloca i1, i1 0
- %nop5573 = alloca i1, i1 0
- %nop5574 = alloca i1, i1 0
- %nop5575 = alloca i1, i1 0
- %nop5576 = alloca i1, i1 0
- %nop5577 = alloca i1, i1 0
- %nop5578 = alloca i1, i1 0
- %nop5579 = alloca i1, i1 0
- %nop5580 = alloca i1, i1 0
- %nop5581 = alloca i1, i1 0
- %nop5582 = alloca i1, i1 0
- %nop5583 = alloca i1, i1 0
- %nop5584 = alloca i1, i1 0
- %nop5585 = alloca i1, i1 0
- %nop5586 = alloca i1, i1 0
- %nop5587 = alloca i1, i1 0
- %nop5588 = alloca i1, i1 0
- %nop5589 = alloca i1, i1 0
- %nop5590 = alloca i1, i1 0
- %nop5591 = alloca i1, i1 0
- %nop5592 = alloca i1, i1 0
- %nop5593 = alloca i1, i1 0
- %nop5594 = alloca i1, i1 0
- %nop5595 = alloca i1, i1 0
- %nop5596 = alloca i1, i1 0
- %nop5597 = alloca i1, i1 0
- %nop5598 = alloca i1, i1 0
- %nop5599 = alloca i1, i1 0
- %nop5600 = alloca i1, i1 0
- %nop5601 = alloca i1, i1 0
- %nop5602 = alloca i1, i1 0
- %nop5603 = alloca i1, i1 0
- %nop5604 = alloca i1, i1 0
- %nop5605 = alloca i1, i1 0
- %nop5606 = alloca i1, i1 0
- %nop5607 = alloca i1, i1 0
- %nop5608 = alloca i1, i1 0
- %nop5609 = alloca i1, i1 0
- %nop5610 = alloca i1, i1 0
- %nop5611 = alloca i1, i1 0
- %nop5612 = alloca i1, i1 0
- %nop5613 = alloca i1, i1 0
- %nop5614 = alloca i1, i1 0
- %nop5615 = alloca i1, i1 0
- %nop5616 = alloca i1, i1 0
- %nop5617 = alloca i1, i1 0
- %nop5618 = alloca i1, i1 0
- %nop5619 = alloca i1, i1 0
- %nop5620 = alloca i1, i1 0
- %nop5621 = alloca i1, i1 0
- %nop5622 = alloca i1, i1 0
- %nop5623 = alloca i1, i1 0
- %nop5624 = alloca i1, i1 0
- %nop5625 = alloca i1, i1 0
- %nop5626 = alloca i1, i1 0
- %nop5627 = alloca i1, i1 0
- %nop5628 = alloca i1, i1 0
- %nop5629 = alloca i1, i1 0
- %nop5630 = alloca i1, i1 0
- %nop5631 = alloca i1, i1 0
- %nop5632 = alloca i1, i1 0
- %nop5633 = alloca i1, i1 0
- %nop5634 = alloca i1, i1 0
- %nop5635 = alloca i1, i1 0
- %nop5636 = alloca i1, i1 0
- %nop5637 = alloca i1, i1 0
- %nop5638 = alloca i1, i1 0
- %nop5639 = alloca i1, i1 0
- %nop5640 = alloca i1, i1 0
- %nop5641 = alloca i1, i1 0
- %nop5642 = alloca i1, i1 0
- %nop5643 = alloca i1, i1 0
- %nop5644 = alloca i1, i1 0
- %nop5645 = alloca i1, i1 0
- %nop5646 = alloca i1, i1 0
- %nop5647 = alloca i1, i1 0
- %nop5648 = alloca i1, i1 0
- %nop5649 = alloca i1, i1 0
- %nop5650 = alloca i1, i1 0
- %nop5651 = alloca i1, i1 0
- %nop5652 = alloca i1, i1 0
- %nop5653 = alloca i1, i1 0
- %nop5654 = alloca i1, i1 0
- %nop5655 = alloca i1, i1 0
- %nop5656 = alloca i1, i1 0
- %nop5657 = alloca i1, i1 0
- %nop5658 = alloca i1, i1 0
- %nop5659 = alloca i1, i1 0
- %nop5660 = alloca i1, i1 0
- %nop5661 = alloca i1, i1 0
- %nop5662 = alloca i1, i1 0
- %nop5663 = alloca i1, i1 0
- %nop5664 = alloca i1, i1 0
- %nop5665 = alloca i1, i1 0
- %nop5666 = alloca i1, i1 0
- %nop5667 = alloca i1, i1 0
- %nop5668 = alloca i1, i1 0
- %nop5669 = alloca i1, i1 0
- %nop5670 = alloca i1, i1 0
- %nop5671 = alloca i1, i1 0
- %nop5672 = alloca i1, i1 0
- %nop5673 = alloca i1, i1 0
- %nop5674 = alloca i1, i1 0
- %nop5675 = alloca i1, i1 0
- %nop5676 = alloca i1, i1 0
- %nop5677 = alloca i1, i1 0
- %nop5678 = alloca i1, i1 0
- %nop5679 = alloca i1, i1 0
- %nop5680 = alloca i1, i1 0
- %nop5681 = alloca i1, i1 0
- %nop5682 = alloca i1, i1 0
- %nop5683 = alloca i1, i1 0
- %nop5684 = alloca i1, i1 0
- %nop5685 = alloca i1, i1 0
- %nop5686 = alloca i1, i1 0
- %nop5687 = alloca i1, i1 0
- %nop5688 = alloca i1, i1 0
- %nop5689 = alloca i1, i1 0
- %nop5690 = alloca i1, i1 0
- %nop5691 = alloca i1, i1 0
- %nop5692 = alloca i1, i1 0
- %nop5693 = alloca i1, i1 0
- %nop5694 = alloca i1, i1 0
- %nop5695 = alloca i1, i1 0
- %nop5696 = alloca i1, i1 0
- %nop5697 = alloca i1, i1 0
- %nop5698 = alloca i1, i1 0
- %nop5699 = alloca i1, i1 0
- %nop5700 = alloca i1, i1 0
- %nop5701 = alloca i1, i1 0
- %nop5702 = alloca i1, i1 0
- %nop5703 = alloca i1, i1 0
- %nop5704 = alloca i1, i1 0
- %nop5705 = alloca i1, i1 0
- %nop5706 = alloca i1, i1 0
- %nop5707 = alloca i1, i1 0
- %nop5708 = alloca i1, i1 0
- %nop5709 = alloca i1, i1 0
- %nop5710 = alloca i1, i1 0
- %nop5711 = alloca i1, i1 0
- %nop5712 = alloca i1, i1 0
- %nop5713 = alloca i1, i1 0
- %nop5714 = alloca i1, i1 0
- %nop5715 = alloca i1, i1 0
- %nop5716 = alloca i1, i1 0
- %nop5717 = alloca i1, i1 0
- %nop5718 = alloca i1, i1 0
- %nop5719 = alloca i1, i1 0
- %nop5720 = alloca i1, i1 0
- %nop5721 = alloca i1, i1 0
- %nop5722 = alloca i1, i1 0
- %nop5723 = alloca i1, i1 0
- %nop5724 = alloca i1, i1 0
- %nop5725 = alloca i1, i1 0
- %nop5726 = alloca i1, i1 0
- %nop5727 = alloca i1, i1 0
- %nop5728 = alloca i1, i1 0
- %nop5729 = alloca i1, i1 0
- %nop5730 = alloca i1, i1 0
- %nop5731 = alloca i1, i1 0
- %nop5732 = alloca i1, i1 0
- %nop5733 = alloca i1, i1 0
- %nop5734 = alloca i1, i1 0
- %nop5735 = alloca i1, i1 0
- %nop5736 = alloca i1, i1 0
- %nop5737 = alloca i1, i1 0
- %nop5738 = alloca i1, i1 0
- %nop5739 = alloca i1, i1 0
- %nop5740 = alloca i1, i1 0
- %nop5741 = alloca i1, i1 0
- %nop5742 = alloca i1, i1 0
- %nop5743 = alloca i1, i1 0
- %nop5744 = alloca i1, i1 0
- %nop5745 = alloca i1, i1 0
- %nop5746 = alloca i1, i1 0
- %nop5747 = alloca i1, i1 0
- %nop5748 = alloca i1, i1 0
- %nop5749 = alloca i1, i1 0
- %nop5750 = alloca i1, i1 0
- %nop5751 = alloca i1, i1 0
- %nop5752 = alloca i1, i1 0
- %nop5753 = alloca i1, i1 0
- %nop5754 = alloca i1, i1 0
- %nop5755 = alloca i1, i1 0
- %nop5756 = alloca i1, i1 0
- %nop5757 = alloca i1, i1 0
- %nop5758 = alloca i1, i1 0
- %nop5759 = alloca i1, i1 0
- %nop5760 = alloca i1, i1 0
- %nop5761 = alloca i1, i1 0
- %nop5762 = alloca i1, i1 0
- %nop5763 = alloca i1, i1 0
- %nop5764 = alloca i1, i1 0
- %nop5765 = alloca i1, i1 0
- %nop5766 = alloca i1, i1 0
- %nop5767 = alloca i1, i1 0
- %nop5768 = alloca i1, i1 0
- %nop5769 = alloca i1, i1 0
- %nop5770 = alloca i1, i1 0
- %nop5771 = alloca i1, i1 0
- %nop5772 = alloca i1, i1 0
- %nop5773 = alloca i1, i1 0
- %nop5774 = alloca i1, i1 0
- %nop5775 = alloca i1, i1 0
- %nop5776 = alloca i1, i1 0
- %nop5777 = alloca i1, i1 0
- %nop5778 = alloca i1, i1 0
- %nop5779 = alloca i1, i1 0
- %nop5780 = alloca i1, i1 0
- %nop5781 = alloca i1, i1 0
- %nop5782 = alloca i1, i1 0
- %nop5783 = alloca i1, i1 0
- %nop5784 = alloca i1, i1 0
- %nop5785 = alloca i1, i1 0
- %nop5786 = alloca i1, i1 0
- %nop5787 = alloca i1, i1 0
- %nop5788 = alloca i1, i1 0
- %nop5789 = alloca i1, i1 0
- %nop5790 = alloca i1, i1 0
- %nop5791 = alloca i1, i1 0
- %nop5792 = alloca i1, i1 0
- %nop5793 = alloca i1, i1 0
- %nop5794 = alloca i1, i1 0
- %nop5795 = alloca i1, i1 0
- %nop5796 = alloca i1, i1 0
- %nop5797 = alloca i1, i1 0
- %nop5798 = alloca i1, i1 0
- %nop5799 = alloca i1, i1 0
- %nop5800 = alloca i1, i1 0
- %nop5801 = alloca i1, i1 0
- %nop5802 = alloca i1, i1 0
- %nop5803 = alloca i1, i1 0
- %nop5804 = alloca i1, i1 0
- %nop5805 = alloca i1, i1 0
- %nop5806 = alloca i1, i1 0
- %nop5807 = alloca i1, i1 0
- %nop5808 = alloca i1, i1 0
- %nop5809 = alloca i1, i1 0
- %nop5810 = alloca i1, i1 0
- %nop5811 = alloca i1, i1 0
- %nop5812 = alloca i1, i1 0
- %nop5813 = alloca i1, i1 0
- %nop5814 = alloca i1, i1 0
- %nop5815 = alloca i1, i1 0
- %nop5816 = alloca i1, i1 0
- %nop5817 = alloca i1, i1 0
- %nop5818 = alloca i1, i1 0
- %nop5819 = alloca i1, i1 0
- %nop5820 = alloca i1, i1 0
- %nop5821 = alloca i1, i1 0
- %nop5822 = alloca i1, i1 0
- %nop5823 = alloca i1, i1 0
- %nop5824 = alloca i1, i1 0
- %nop5825 = alloca i1, i1 0
- %nop5826 = alloca i1, i1 0
- %nop5827 = alloca i1, i1 0
- %nop5828 = alloca i1, i1 0
- %nop5829 = alloca i1, i1 0
- %nop5830 = alloca i1, i1 0
- %nop5831 = alloca i1, i1 0
- %nop5832 = alloca i1, i1 0
- %nop5833 = alloca i1, i1 0
- %nop5834 = alloca i1, i1 0
- %nop5835 = alloca i1, i1 0
- %nop5836 = alloca i1, i1 0
- %nop5837 = alloca i1, i1 0
- %nop5838 = alloca i1, i1 0
- %nop5839 = alloca i1, i1 0
- %nop5840 = alloca i1, i1 0
- %nop5841 = alloca i1, i1 0
- %nop5842 = alloca i1, i1 0
- %nop5843 = alloca i1, i1 0
- %nop5844 = alloca i1, i1 0
- %nop5845 = alloca i1, i1 0
- %nop5846 = alloca i1, i1 0
- %nop5847 = alloca i1, i1 0
- %nop5848 = alloca i1, i1 0
- %nop5849 = alloca i1, i1 0
- %nop5850 = alloca i1, i1 0
- %nop5851 = alloca i1, i1 0
- %nop5852 = alloca i1, i1 0
- %nop5853 = alloca i1, i1 0
- %nop5854 = alloca i1, i1 0
- %nop5855 = alloca i1, i1 0
- %nop5856 = alloca i1, i1 0
- %nop5857 = alloca i1, i1 0
- %nop5858 = alloca i1, i1 0
- %nop5859 = alloca i1, i1 0
- %nop5860 = alloca i1, i1 0
- %nop5861 = alloca i1, i1 0
- %nop5862 = alloca i1, i1 0
- %nop5863 = alloca i1, i1 0
- %nop5864 = alloca i1, i1 0
- %nop5865 = alloca i1, i1 0
- %nop5866 = alloca i1, i1 0
- %nop5867 = alloca i1, i1 0
- %nop5868 = alloca i1, i1 0
- %nop5869 = alloca i1, i1 0
- %nop5870 = alloca i1, i1 0
- %nop5871 = alloca i1, i1 0
- %nop5872 = alloca i1, i1 0
- %nop5873 = alloca i1, i1 0
- %nop5874 = alloca i1, i1 0
- %nop5875 = alloca i1, i1 0
- %nop5876 = alloca i1, i1 0
- %nop5877 = alloca i1, i1 0
- %nop5878 = alloca i1, i1 0
- %nop5879 = alloca i1, i1 0
- %nop5880 = alloca i1, i1 0
- %nop5881 = alloca i1, i1 0
- %nop5882 = alloca i1, i1 0
- %nop5883 = alloca i1, i1 0
- %nop5884 = alloca i1, i1 0
- %nop5885 = alloca i1, i1 0
- %nop5886 = alloca i1, i1 0
- %nop5887 = alloca i1, i1 0
- %nop5888 = alloca i1, i1 0
- %nop5889 = alloca i1, i1 0
- %nop5890 = alloca i1, i1 0
- %nop5891 = alloca i1, i1 0
- %nop5892 = alloca i1, i1 0
- %nop5893 = alloca i1, i1 0
- %nop5894 = alloca i1, i1 0
- %nop5895 = alloca i1, i1 0
- %nop5896 = alloca i1, i1 0
- %nop5897 = alloca i1, i1 0
- %nop5898 = alloca i1, i1 0
- %nop5899 = alloca i1, i1 0
- %nop5900 = alloca i1, i1 0
- %nop5901 = alloca i1, i1 0
- %nop5902 = alloca i1, i1 0
- %nop5903 = alloca i1, i1 0
- %nop5904 = alloca i1, i1 0
- %nop5905 = alloca i1, i1 0
- %nop5906 = alloca i1, i1 0
- %nop5907 = alloca i1, i1 0
- %nop5908 = alloca i1, i1 0
- %nop5909 = alloca i1, i1 0
- %nop5910 = alloca i1, i1 0
- %nop5911 = alloca i1, i1 0
- %nop5912 = alloca i1, i1 0
- %nop5913 = alloca i1, i1 0
- %nop5914 = alloca i1, i1 0
- %nop5915 = alloca i1, i1 0
- %nop5916 = alloca i1, i1 0
- %nop5917 = alloca i1, i1 0
- %nop5918 = alloca i1, i1 0
- %nop5919 = alloca i1, i1 0
- %nop5920 = alloca i1, i1 0
- %nop5921 = alloca i1, i1 0
- %nop5922 = alloca i1, i1 0
- %nop5923 = alloca i1, i1 0
- %nop5924 = alloca i1, i1 0
- %nop5925 = alloca i1, i1 0
- %nop5926 = alloca i1, i1 0
- %nop5927 = alloca i1, i1 0
- %nop5928 = alloca i1, i1 0
- %nop5929 = alloca i1, i1 0
- %nop5930 = alloca i1, i1 0
- %nop5931 = alloca i1, i1 0
- %nop5932 = alloca i1, i1 0
- %nop5933 = alloca i1, i1 0
- %nop5934 = alloca i1, i1 0
- %nop5935 = alloca i1, i1 0
- %nop5936 = alloca i1, i1 0
- %nop5937 = alloca i1, i1 0
- %nop5938 = alloca i1, i1 0
- %nop5939 = alloca i1, i1 0
- %nop5940 = alloca i1, i1 0
- %nop5941 = alloca i1, i1 0
- %nop5942 = alloca i1, i1 0
- %nop5943 = alloca i1, i1 0
- %nop5944 = alloca i1, i1 0
- %nop5945 = alloca i1, i1 0
- %nop5946 = alloca i1, i1 0
- %nop5947 = alloca i1, i1 0
- %nop5948 = alloca i1, i1 0
- %nop5949 = alloca i1, i1 0
- %nop5950 = alloca i1, i1 0
- %nop5951 = alloca i1, i1 0
- %nop5952 = alloca i1, i1 0
- %nop5953 = alloca i1, i1 0
- %nop5954 = alloca i1, i1 0
- %nop5955 = alloca i1, i1 0
- %nop5956 = alloca i1, i1 0
- %nop5957 = alloca i1, i1 0
- %nop5958 = alloca i1, i1 0
- %nop5959 = alloca i1, i1 0
- %nop5960 = alloca i1, i1 0
- %nop5961 = alloca i1, i1 0
- %nop5962 = alloca i1, i1 0
- %nop5963 = alloca i1, i1 0
- %nop5964 = alloca i1, i1 0
- %nop5965 = alloca i1, i1 0
- %nop5966 = alloca i1, i1 0
- %nop5967 = alloca i1, i1 0
- %nop5968 = alloca i1, i1 0
- %nop5969 = alloca i1, i1 0
- %nop5970 = alloca i1, i1 0
- %nop5971 = alloca i1, i1 0
- %nop5972 = alloca i1, i1 0
- %nop5973 = alloca i1, i1 0
- %nop5974 = alloca i1, i1 0
- %nop5975 = alloca i1, i1 0
- %nop5976 = alloca i1, i1 0
- %nop5977 = alloca i1, i1 0
- %nop5978 = alloca i1, i1 0
- %nop5979 = alloca i1, i1 0
- %nop5980 = alloca i1, i1 0
- %nop5981 = alloca i1, i1 0
- %nop5982 = alloca i1, i1 0
- %nop5983 = alloca i1, i1 0
- %nop5984 = alloca i1, i1 0
- %nop5985 = alloca i1, i1 0
- %nop5986 = alloca i1, i1 0
- %nop5987 = alloca i1, i1 0
- %nop5988 = alloca i1, i1 0
- %nop5989 = alloca i1, i1 0
- %nop5990 = alloca i1, i1 0
- %nop5991 = alloca i1, i1 0
- %nop5992 = alloca i1, i1 0
- %nop5993 = alloca i1, i1 0
- %nop5994 = alloca i1, i1 0
- %nop5995 = alloca i1, i1 0
- %nop5996 = alloca i1, i1 0
- %nop5997 = alloca i1, i1 0
- %nop5998 = alloca i1, i1 0
- %nop5999 = alloca i1, i1 0
- %nop6000 = alloca i1, i1 0
- %nop6001 = alloca i1, i1 0
- %nop6002 = alloca i1, i1 0
- %nop6003 = alloca i1, i1 0
- %nop6004 = alloca i1, i1 0
- %nop6005 = alloca i1, i1 0
- %nop6006 = alloca i1, i1 0
- %nop6007 = alloca i1, i1 0
- %nop6008 = alloca i1, i1 0
- %nop6009 = alloca i1, i1 0
- %nop6010 = alloca i1, i1 0
- %nop6011 = alloca i1, i1 0
- %nop6012 = alloca i1, i1 0
- %nop6013 = alloca i1, i1 0
- %nop6014 = alloca i1, i1 0
- %nop6015 = alloca i1, i1 0
- %nop6016 = alloca i1, i1 0
- %nop6017 = alloca i1, i1 0
- %nop6018 = alloca i1, i1 0
- %nop6019 = alloca i1, i1 0
- %nop6020 = alloca i1, i1 0
- %nop6021 = alloca i1, i1 0
- %nop6022 = alloca i1, i1 0
- %nop6023 = alloca i1, i1 0
- %nop6024 = alloca i1, i1 0
- %nop6025 = alloca i1, i1 0
- %nop6026 = alloca i1, i1 0
- %nop6027 = alloca i1, i1 0
- %nop6028 = alloca i1, i1 0
- %nop6029 = alloca i1, i1 0
- %nop6030 = alloca i1, i1 0
- %nop6031 = alloca i1, i1 0
- %nop6032 = alloca i1, i1 0
- %nop6033 = alloca i1, i1 0
- %nop6034 = alloca i1, i1 0
- %nop6035 = alloca i1, i1 0
- %nop6036 = alloca i1, i1 0
- %nop6037 = alloca i1, i1 0
- %nop6038 = alloca i1, i1 0
- %nop6039 = alloca i1, i1 0
- %nop6040 = alloca i1, i1 0
- %nop6041 = alloca i1, i1 0
- %nop6042 = alloca i1, i1 0
- %nop6043 = alloca i1, i1 0
- %nop6044 = alloca i1, i1 0
- %nop6045 = alloca i1, i1 0
- %nop6046 = alloca i1, i1 0
- %nop6047 = alloca i1, i1 0
- %nop6048 = alloca i1, i1 0
- %nop6049 = alloca i1, i1 0
- %nop6050 = alloca i1, i1 0
- %nop6051 = alloca i1, i1 0
- %nop6052 = alloca i1, i1 0
- %nop6053 = alloca i1, i1 0
- %nop6054 = alloca i1, i1 0
- %nop6055 = alloca i1, i1 0
- %nop6056 = alloca i1, i1 0
- %nop6057 = alloca i1, i1 0
- %nop6058 = alloca i1, i1 0
- %nop6059 = alloca i1, i1 0
- %nop6060 = alloca i1, i1 0
- %nop6061 = alloca i1, i1 0
- %nop6062 = alloca i1, i1 0
- %nop6063 = alloca i1, i1 0
- %nop6064 = alloca i1, i1 0
- %nop6065 = alloca i1, i1 0
- %nop6066 = alloca i1, i1 0
- %nop6067 = alloca i1, i1 0
- %nop6068 = alloca i1, i1 0
- %nop6069 = alloca i1, i1 0
- %nop6070 = alloca i1, i1 0
- %nop6071 = alloca i1, i1 0
- %nop6072 = alloca i1, i1 0
- %nop6073 = alloca i1, i1 0
- %nop6074 = alloca i1, i1 0
- %nop6075 = alloca i1, i1 0
- %nop6076 = alloca i1, i1 0
- %nop6077 = alloca i1, i1 0
- %nop6078 = alloca i1, i1 0
- %nop6079 = alloca i1, i1 0
- %nop6080 = alloca i1, i1 0
- %nop6081 = alloca i1, i1 0
- %nop6082 = alloca i1, i1 0
- %nop6083 = alloca i1, i1 0
- %nop6084 = alloca i1, i1 0
- %nop6085 = alloca i1, i1 0
- %nop6086 = alloca i1, i1 0
- %nop6087 = alloca i1, i1 0
- %nop6088 = alloca i1, i1 0
- %nop6089 = alloca i1, i1 0
- %nop6090 = alloca i1, i1 0
- %nop6091 = alloca i1, i1 0
- %nop6092 = alloca i1, i1 0
- %nop6093 = alloca i1, i1 0
- %nop6094 = alloca i1, i1 0
- %nop6095 = alloca i1, i1 0
- %nop6096 = alloca i1, i1 0
- %nop6097 = alloca i1, i1 0
- %nop6098 = alloca i1, i1 0
- %nop6099 = alloca i1, i1 0
- %nop6100 = alloca i1, i1 0
- %nop6101 = alloca i1, i1 0
- %nop6102 = alloca i1, i1 0
- %nop6103 = alloca i1, i1 0
- %nop6104 = alloca i1, i1 0
- %nop6105 = alloca i1, i1 0
- %nop6106 = alloca i1, i1 0
- %nop6107 = alloca i1, i1 0
- %nop6108 = alloca i1, i1 0
- %nop6109 = alloca i1, i1 0
- %nop6110 = alloca i1, i1 0
- %nop6111 = alloca i1, i1 0
- %nop6112 = alloca i1, i1 0
- %nop6113 = alloca i1, i1 0
- %nop6114 = alloca i1, i1 0
- %nop6115 = alloca i1, i1 0
- %nop6116 = alloca i1, i1 0
- %nop6117 = alloca i1, i1 0
- %nop6118 = alloca i1, i1 0
- %nop6119 = alloca i1, i1 0
- %nop6120 = alloca i1, i1 0
- %nop6121 = alloca i1, i1 0
- %nop6122 = alloca i1, i1 0
- %nop6123 = alloca i1, i1 0
- %nop6124 = alloca i1, i1 0
- %nop6125 = alloca i1, i1 0
- %nop6126 = alloca i1, i1 0
- %nop6127 = alloca i1, i1 0
- %nop6128 = alloca i1, i1 0
- %nop6129 = alloca i1, i1 0
- %nop6130 = alloca i1, i1 0
- %nop6131 = alloca i1, i1 0
- %nop6132 = alloca i1, i1 0
- %nop6133 = alloca i1, i1 0
- %nop6134 = alloca i1, i1 0
- %nop6135 = alloca i1, i1 0
- %nop6136 = alloca i1, i1 0
- %nop6137 = alloca i1, i1 0
- %nop6138 = alloca i1, i1 0
- %nop6139 = alloca i1, i1 0
- %nop6140 = alloca i1, i1 0
- %nop6141 = alloca i1, i1 0
- %nop6142 = alloca i1, i1 0
- %nop6143 = alloca i1, i1 0
- %nop6144 = alloca i1, i1 0
- %nop6145 = alloca i1, i1 0
- %nop6146 = alloca i1, i1 0
- %nop6147 = alloca i1, i1 0
- %nop6148 = alloca i1, i1 0
- %nop6149 = alloca i1, i1 0
- %nop6150 = alloca i1, i1 0
- %nop6151 = alloca i1, i1 0
- %nop6152 = alloca i1, i1 0
- %nop6153 = alloca i1, i1 0
- %nop6154 = alloca i1, i1 0
- %nop6155 = alloca i1, i1 0
- %nop6156 = alloca i1, i1 0
- %nop6157 = alloca i1, i1 0
- %nop6158 = alloca i1, i1 0
- %nop6159 = alloca i1, i1 0
- %nop6160 = alloca i1, i1 0
- %nop6161 = alloca i1, i1 0
- %nop6162 = alloca i1, i1 0
- %nop6163 = alloca i1, i1 0
- %nop6164 = alloca i1, i1 0
- %nop6165 = alloca i1, i1 0
- %nop6166 = alloca i1, i1 0
- %nop6167 = alloca i1, i1 0
- %nop6168 = alloca i1, i1 0
- %nop6169 = alloca i1, i1 0
- %nop6170 = alloca i1, i1 0
- %nop6171 = alloca i1, i1 0
- %nop6172 = alloca i1, i1 0
- %nop6173 = alloca i1, i1 0
- %nop6174 = alloca i1, i1 0
- %nop6175 = alloca i1, i1 0
- %nop6176 = alloca i1, i1 0
- %nop6177 = alloca i1, i1 0
- %nop6178 = alloca i1, i1 0
- %nop6179 = alloca i1, i1 0
- %nop6180 = alloca i1, i1 0
- %nop6181 = alloca i1, i1 0
- %nop6182 = alloca i1, i1 0
- %nop6183 = alloca i1, i1 0
- %nop6184 = alloca i1, i1 0
- %nop6185 = alloca i1, i1 0
- %nop6186 = alloca i1, i1 0
- %nop6187 = alloca i1, i1 0
- %nop6188 = alloca i1, i1 0
- %nop6189 = alloca i1, i1 0
- %nop6190 = alloca i1, i1 0
- %nop6191 = alloca i1, i1 0
- %nop6192 = alloca i1, i1 0
- %nop6193 = alloca i1, i1 0
- %nop6194 = alloca i1, i1 0
- %nop6195 = alloca i1, i1 0
- %nop6196 = alloca i1, i1 0
- %nop6197 = alloca i1, i1 0
- %nop6198 = alloca i1, i1 0
- %nop6199 = alloca i1, i1 0
- %nop6200 = alloca i1, i1 0
- %nop6201 = alloca i1, i1 0
- %nop6202 = alloca i1, i1 0
- %nop6203 = alloca i1, i1 0
- %nop6204 = alloca i1, i1 0
- %nop6205 = alloca i1, i1 0
- %nop6206 = alloca i1, i1 0
- %nop6207 = alloca i1, i1 0
- %nop6208 = alloca i1, i1 0
- %nop6209 = alloca i1, i1 0
- %nop6210 = alloca i1, i1 0
- %nop6211 = alloca i1, i1 0
- %nop6212 = alloca i1, i1 0
- %nop6213 = alloca i1, i1 0
- %nop6214 = alloca i1, i1 0
- %nop6215 = alloca i1, i1 0
- %nop6216 = alloca i1, i1 0
- %nop6217 = alloca i1, i1 0
- %nop6218 = alloca i1, i1 0
- %nop6219 = alloca i1, i1 0
- %nop6220 = alloca i1, i1 0
- %nop6221 = alloca i1, i1 0
- %nop6222 = alloca i1, i1 0
- %nop6223 = alloca i1, i1 0
- %nop6224 = alloca i1, i1 0
- %nop6225 = alloca i1, i1 0
- %nop6226 = alloca i1, i1 0
- %nop6227 = alloca i1, i1 0
- %nop6228 = alloca i1, i1 0
- %nop6229 = alloca i1, i1 0
- %nop6230 = alloca i1, i1 0
- %nop6231 = alloca i1, i1 0
- %nop6232 = alloca i1, i1 0
- %nop6233 = alloca i1, i1 0
- %nop6234 = alloca i1, i1 0
- %nop6235 = alloca i1, i1 0
- %nop6236 = alloca i1, i1 0
- %nop6237 = alloca i1, i1 0
- %nop6238 = alloca i1, i1 0
- %nop6239 = alloca i1, i1 0
- %nop6240 = alloca i1, i1 0
- %nop6241 = alloca i1, i1 0
- %nop6242 = alloca i1, i1 0
- %nop6243 = alloca i1, i1 0
- %nop6244 = alloca i1, i1 0
- %nop6245 = alloca i1, i1 0
- %nop6246 = alloca i1, i1 0
- %nop6247 = alloca i1, i1 0
- %nop6248 = alloca i1, i1 0
- %nop6249 = alloca i1, i1 0
- %nop6250 = alloca i1, i1 0
- %nop6251 = alloca i1, i1 0
- %nop6252 = alloca i1, i1 0
- %nop6253 = alloca i1, i1 0
- %nop6254 = alloca i1, i1 0
- %nop6255 = alloca i1, i1 0
- %nop6256 = alloca i1, i1 0
- %nop6257 = alloca i1, i1 0
- %nop6258 = alloca i1, i1 0
- %nop6259 = alloca i1, i1 0
- %nop6260 = alloca i1, i1 0
- %nop6261 = alloca i1, i1 0
- %nop6262 = alloca i1, i1 0
- %nop6263 = alloca i1, i1 0
- %nop6264 = alloca i1, i1 0
- %nop6265 = alloca i1, i1 0
- %nop6266 = alloca i1, i1 0
- %nop6267 = alloca i1, i1 0
- %nop6268 = alloca i1, i1 0
- %nop6269 = alloca i1, i1 0
- %nop6270 = alloca i1, i1 0
- %nop6271 = alloca i1, i1 0
- %nop6272 = alloca i1, i1 0
- %nop6273 = alloca i1, i1 0
- %nop6274 = alloca i1, i1 0
- %nop6275 = alloca i1, i1 0
- %nop6276 = alloca i1, i1 0
- %nop6277 = alloca i1, i1 0
- %nop6278 = alloca i1, i1 0
- %nop6279 = alloca i1, i1 0
- %nop6280 = alloca i1, i1 0
- %nop6281 = alloca i1, i1 0
- %nop6282 = alloca i1, i1 0
- %nop6283 = alloca i1, i1 0
- %nop6284 = alloca i1, i1 0
- %nop6285 = alloca i1, i1 0
- %nop6286 = alloca i1, i1 0
- %nop6287 = alloca i1, i1 0
- %nop6288 = alloca i1, i1 0
- %nop6289 = alloca i1, i1 0
- %nop6290 = alloca i1, i1 0
- %nop6291 = alloca i1, i1 0
- %nop6292 = alloca i1, i1 0
- %nop6293 = alloca i1, i1 0
- %nop6294 = alloca i1, i1 0
- %nop6295 = alloca i1, i1 0
- %nop6296 = alloca i1, i1 0
- %nop6297 = alloca i1, i1 0
- %nop6298 = alloca i1, i1 0
- %nop6299 = alloca i1, i1 0
- %nop6300 = alloca i1, i1 0
- %nop6301 = alloca i1, i1 0
- %nop6302 = alloca i1, i1 0
- %nop6303 = alloca i1, i1 0
- %nop6304 = alloca i1, i1 0
- %nop6305 = alloca i1, i1 0
- %nop6306 = alloca i1, i1 0
- %nop6307 = alloca i1, i1 0
- %nop6308 = alloca i1, i1 0
- %nop6309 = alloca i1, i1 0
- %nop6310 = alloca i1, i1 0
- %nop6311 = alloca i1, i1 0
- %nop6312 = alloca i1, i1 0
- %nop6313 = alloca i1, i1 0
- %nop6314 = alloca i1, i1 0
- %nop6315 = alloca i1, i1 0
- %nop6316 = alloca i1, i1 0
- %nop6317 = alloca i1, i1 0
- %nop6318 = alloca i1, i1 0
- %nop6319 = alloca i1, i1 0
- %nop6320 = alloca i1, i1 0
- %nop6321 = alloca i1, i1 0
- %nop6322 = alloca i1, i1 0
- %nop6323 = alloca i1, i1 0
- %nop6324 = alloca i1, i1 0
- %nop6325 = alloca i1, i1 0
- %nop6326 = alloca i1, i1 0
- %nop6327 = alloca i1, i1 0
- %nop6328 = alloca i1, i1 0
- %nop6329 = alloca i1, i1 0
- %nop6330 = alloca i1, i1 0
- %nop6331 = alloca i1, i1 0
- %nop6332 = alloca i1, i1 0
- %nop6333 = alloca i1, i1 0
- %nop6334 = alloca i1, i1 0
- %nop6335 = alloca i1, i1 0
- %nop6336 = alloca i1, i1 0
- %nop6337 = alloca i1, i1 0
- %nop6338 = alloca i1, i1 0
- %nop6339 = alloca i1, i1 0
- %nop6340 = alloca i1, i1 0
- %nop6341 = alloca i1, i1 0
- %nop6342 = alloca i1, i1 0
- %nop6343 = alloca i1, i1 0
- %nop6344 = alloca i1, i1 0
- %nop6345 = alloca i1, i1 0
- %nop6346 = alloca i1, i1 0
- %nop6347 = alloca i1, i1 0
- %nop6348 = alloca i1, i1 0
- %nop6349 = alloca i1, i1 0
- %nop6350 = alloca i1, i1 0
- %nop6351 = alloca i1, i1 0
- %nop6352 = alloca i1, i1 0
- %nop6353 = alloca i1, i1 0
- %nop6354 = alloca i1, i1 0
- %nop6355 = alloca i1, i1 0
- %nop6356 = alloca i1, i1 0
- %nop6357 = alloca i1, i1 0
- %nop6358 = alloca i1, i1 0
- %nop6359 = alloca i1, i1 0
- %nop6360 = alloca i1, i1 0
- %nop6361 = alloca i1, i1 0
- %nop6362 = alloca i1, i1 0
- %nop6363 = alloca i1, i1 0
- %nop6364 = alloca i1, i1 0
- %nop6365 = alloca i1, i1 0
- %nop6366 = alloca i1, i1 0
- %nop6367 = alloca i1, i1 0
- %nop6368 = alloca i1, i1 0
- %nop6369 = alloca i1, i1 0
- %nop6370 = alloca i1, i1 0
- %nop6371 = alloca i1, i1 0
- %nop6372 = alloca i1, i1 0
- %nop6373 = alloca i1, i1 0
- %nop6374 = alloca i1, i1 0
- %nop6375 = alloca i1, i1 0
- %nop6376 = alloca i1, i1 0
- %nop6377 = alloca i1, i1 0
- %nop6378 = alloca i1, i1 0
- %nop6379 = alloca i1, i1 0
- %nop6380 = alloca i1, i1 0
- %nop6381 = alloca i1, i1 0
- %nop6382 = alloca i1, i1 0
- %nop6383 = alloca i1, i1 0
- %nop6384 = alloca i1, i1 0
- %nop6385 = alloca i1, i1 0
- %nop6386 = alloca i1, i1 0
- %nop6387 = alloca i1, i1 0
- %nop6388 = alloca i1, i1 0
- %nop6389 = alloca i1, i1 0
- %nop6390 = alloca i1, i1 0
- %nop6391 = alloca i1, i1 0
- %nop6392 = alloca i1, i1 0
- %nop6393 = alloca i1, i1 0
- %nop6394 = alloca i1, i1 0
- %nop6395 = alloca i1, i1 0
- %nop6396 = alloca i1, i1 0
- %nop6397 = alloca i1, i1 0
- %nop6398 = alloca i1, i1 0
- %nop6399 = alloca i1, i1 0
- %nop6400 = alloca i1, i1 0
- %nop6401 = alloca i1, i1 0
- %nop6402 = alloca i1, i1 0
- %nop6403 = alloca i1, i1 0
- %nop6404 = alloca i1, i1 0
- %nop6405 = alloca i1, i1 0
- %nop6406 = alloca i1, i1 0
- %nop6407 = alloca i1, i1 0
- %nop6408 = alloca i1, i1 0
- %nop6409 = alloca i1, i1 0
- %nop6410 = alloca i1, i1 0
- %nop6411 = alloca i1, i1 0
- %nop6412 = alloca i1, i1 0
- %nop6413 = alloca i1, i1 0
- %nop6414 = alloca i1, i1 0
- %nop6415 = alloca i1, i1 0
- %nop6416 = alloca i1, i1 0
- %nop6417 = alloca i1, i1 0
- %nop6418 = alloca i1, i1 0
- %nop6419 = alloca i1, i1 0
- %nop6420 = alloca i1, i1 0
- %nop6421 = alloca i1, i1 0
- %nop6422 = alloca i1, i1 0
- %nop6423 = alloca i1, i1 0
- %nop6424 = alloca i1, i1 0
- %nop6425 = alloca i1, i1 0
- %nop6426 = alloca i1, i1 0
- %nop6427 = alloca i1, i1 0
- %nop6428 = alloca i1, i1 0
- %nop6429 = alloca i1, i1 0
- %nop6430 = alloca i1, i1 0
- %nop6431 = alloca i1, i1 0
- %nop6432 = alloca i1, i1 0
- %nop6433 = alloca i1, i1 0
- %nop6434 = alloca i1, i1 0
- %nop6435 = alloca i1, i1 0
- %nop6436 = alloca i1, i1 0
- %nop6437 = alloca i1, i1 0
- %nop6438 = alloca i1, i1 0
- %nop6439 = alloca i1, i1 0
- %nop6440 = alloca i1, i1 0
- %nop6441 = alloca i1, i1 0
- %nop6442 = alloca i1, i1 0
- %nop6443 = alloca i1, i1 0
- %nop6444 = alloca i1, i1 0
- %nop6445 = alloca i1, i1 0
- %nop6446 = alloca i1, i1 0
- %nop6447 = alloca i1, i1 0
- %nop6448 = alloca i1, i1 0
- %nop6449 = alloca i1, i1 0
- %nop6450 = alloca i1, i1 0
- %nop6451 = alloca i1, i1 0
- %nop6452 = alloca i1, i1 0
- %nop6453 = alloca i1, i1 0
- %nop6454 = alloca i1, i1 0
- %nop6455 = alloca i1, i1 0
- %nop6456 = alloca i1, i1 0
- %nop6457 = alloca i1, i1 0
- %nop6458 = alloca i1, i1 0
- %nop6459 = alloca i1, i1 0
- %nop6460 = alloca i1, i1 0
- %nop6461 = alloca i1, i1 0
- %nop6462 = alloca i1, i1 0
- %nop6463 = alloca i1, i1 0
- %nop6464 = alloca i1, i1 0
- %nop6465 = alloca i1, i1 0
- %nop6466 = alloca i1, i1 0
- %nop6467 = alloca i1, i1 0
- %nop6468 = alloca i1, i1 0
- %nop6469 = alloca i1, i1 0
- %nop6470 = alloca i1, i1 0
- %nop6471 = alloca i1, i1 0
- %nop6472 = alloca i1, i1 0
- %nop6473 = alloca i1, i1 0
- %nop6474 = alloca i1, i1 0
- %nop6475 = alloca i1, i1 0
- %nop6476 = alloca i1, i1 0
- %nop6477 = alloca i1, i1 0
- %nop6478 = alloca i1, i1 0
- %nop6479 = alloca i1, i1 0
- %nop6480 = alloca i1, i1 0
- %nop6481 = alloca i1, i1 0
- %nop6482 = alloca i1, i1 0
- %nop6483 = alloca i1, i1 0
- %nop6484 = alloca i1, i1 0
- %nop6485 = alloca i1, i1 0
- %nop6486 = alloca i1, i1 0
- %nop6487 = alloca i1, i1 0
- %nop6488 = alloca i1, i1 0
- %nop6489 = alloca i1, i1 0
- %nop6490 = alloca i1, i1 0
- %nop6491 = alloca i1, i1 0
- %nop6492 = alloca i1, i1 0
- %nop6493 = alloca i1, i1 0
- %nop6494 = alloca i1, i1 0
- %nop6495 = alloca i1, i1 0
- %nop6496 = alloca i1, i1 0
- %nop6497 = alloca i1, i1 0
- %nop6498 = alloca i1, i1 0
- %nop6499 = alloca i1, i1 0
- %nop6500 = alloca i1, i1 0
- %nop6501 = alloca i1, i1 0
- %nop6502 = alloca i1, i1 0
- %nop6503 = alloca i1, i1 0
- %nop6504 = alloca i1, i1 0
- %nop6505 = alloca i1, i1 0
- %nop6506 = alloca i1, i1 0
- %nop6507 = alloca i1, i1 0
- %nop6508 = alloca i1, i1 0
- %nop6509 = alloca i1, i1 0
- %nop6510 = alloca i1, i1 0
- %nop6511 = alloca i1, i1 0
- %nop6512 = alloca i1, i1 0
- %nop6513 = alloca i1, i1 0
- %nop6514 = alloca i1, i1 0
- %nop6515 = alloca i1, i1 0
- %nop6516 = alloca i1, i1 0
- %nop6517 = alloca i1, i1 0
- %nop6518 = alloca i1, i1 0
- %nop6519 = alloca i1, i1 0
- %nop6520 = alloca i1, i1 0
- %nop6521 = alloca i1, i1 0
- %nop6522 = alloca i1, i1 0
- %nop6523 = alloca i1, i1 0
- %nop6524 = alloca i1, i1 0
- %nop6525 = alloca i1, i1 0
- %nop6526 = alloca i1, i1 0
- %nop6527 = alloca i1, i1 0
- %nop6528 = alloca i1, i1 0
- %nop6529 = alloca i1, i1 0
- %nop6530 = alloca i1, i1 0
- %nop6531 = alloca i1, i1 0
- %nop6532 = alloca i1, i1 0
- %nop6533 = alloca i1, i1 0
- %nop6534 = alloca i1, i1 0
- %nop6535 = alloca i1, i1 0
- %nop6536 = alloca i1, i1 0
- %nop6537 = alloca i1, i1 0
- %nop6538 = alloca i1, i1 0
- %nop6539 = alloca i1, i1 0
- %nop6540 = alloca i1, i1 0
- %nop6541 = alloca i1, i1 0
- %nop6542 = alloca i1, i1 0
- %nop6543 = alloca i1, i1 0
- %nop6544 = alloca i1, i1 0
- %nop6545 = alloca i1, i1 0
- %nop6546 = alloca i1, i1 0
- %nop6547 = alloca i1, i1 0
- %nop6548 = alloca i1, i1 0
- %nop6549 = alloca i1, i1 0
- %nop6550 = alloca i1, i1 0
- %nop6551 = alloca i1, i1 0
- %nop6552 = alloca i1, i1 0
- %nop6553 = alloca i1, i1 0
- %nop6554 = alloca i1, i1 0
- %nop6555 = alloca i1, i1 0
- %nop6556 = alloca i1, i1 0
- %nop6557 = alloca i1, i1 0
- %nop6558 = alloca i1, i1 0
- %nop6559 = alloca i1, i1 0
- %nop6560 = alloca i1, i1 0
- %nop6561 = alloca i1, i1 0
- %nop6562 = alloca i1, i1 0
- %nop6563 = alloca i1, i1 0
- %nop6564 = alloca i1, i1 0
- %nop6565 = alloca i1, i1 0
- %nop6566 = alloca i1, i1 0
- %nop6567 = alloca i1, i1 0
- %nop6568 = alloca i1, i1 0
- %nop6569 = alloca i1, i1 0
- %nop6570 = alloca i1, i1 0
- %nop6571 = alloca i1, i1 0
- %nop6572 = alloca i1, i1 0
- %nop6573 = alloca i1, i1 0
- %nop6574 = alloca i1, i1 0
- %nop6575 = alloca i1, i1 0
- %nop6576 = alloca i1, i1 0
- %nop6577 = alloca i1, i1 0
- %nop6578 = alloca i1, i1 0
- %nop6579 = alloca i1, i1 0
- %nop6580 = alloca i1, i1 0
- %nop6581 = alloca i1, i1 0
- %nop6582 = alloca i1, i1 0
- %nop6583 = alloca i1, i1 0
- %nop6584 = alloca i1, i1 0
- %nop6585 = alloca i1, i1 0
- %nop6586 = alloca i1, i1 0
- %nop6587 = alloca i1, i1 0
- %nop6588 = alloca i1, i1 0
- %nop6589 = alloca i1, i1 0
- %nop6590 = alloca i1, i1 0
- %nop6591 = alloca i1, i1 0
- %nop6592 = alloca i1, i1 0
- %nop6593 = alloca i1, i1 0
- %nop6594 = alloca i1, i1 0
- %nop6595 = alloca i1, i1 0
- %nop6596 = alloca i1, i1 0
- %nop6597 = alloca i1, i1 0
- %nop6598 = alloca i1, i1 0
- %nop6599 = alloca i1, i1 0
- %nop6600 = alloca i1, i1 0
- %nop6601 = alloca i1, i1 0
- %nop6602 = alloca i1, i1 0
- %nop6603 = alloca i1, i1 0
- %nop6604 = alloca i1, i1 0
- %nop6605 = alloca i1, i1 0
- %nop6606 = alloca i1, i1 0
- %nop6607 = alloca i1, i1 0
- %nop6608 = alloca i1, i1 0
- %nop6609 = alloca i1, i1 0
- %nop6610 = alloca i1, i1 0
- %nop6611 = alloca i1, i1 0
- %nop6612 = alloca i1, i1 0
- %nop6613 = alloca i1, i1 0
- %nop6614 = alloca i1, i1 0
- %nop6615 = alloca i1, i1 0
- %nop6616 = alloca i1, i1 0
- %nop6617 = alloca i1, i1 0
- %nop6618 = alloca i1, i1 0
- %nop6619 = alloca i1, i1 0
- %nop6620 = alloca i1, i1 0
- %nop6621 = alloca i1, i1 0
- %nop6622 = alloca i1, i1 0
- %nop6623 = alloca i1, i1 0
- %nop6624 = alloca i1, i1 0
- %nop6625 = alloca i1, i1 0
- %nop6626 = alloca i1, i1 0
- %nop6627 = alloca i1, i1 0
- %nop6628 = alloca i1, i1 0
- %nop6629 = alloca i1, i1 0
- %nop6630 = alloca i1, i1 0
- %nop6631 = alloca i1, i1 0
- %nop6632 = alloca i1, i1 0
- %nop6633 = alloca i1, i1 0
- %nop6634 = alloca i1, i1 0
- %nop6635 = alloca i1, i1 0
- %nop6636 = alloca i1, i1 0
- %nop6637 = alloca i1, i1 0
- %nop6638 = alloca i1, i1 0
- %nop6639 = alloca i1, i1 0
- %nop6640 = alloca i1, i1 0
- %nop6641 = alloca i1, i1 0
- %nop6642 = alloca i1, i1 0
- %nop6643 = alloca i1, i1 0
- %nop6644 = alloca i1, i1 0
- %nop6645 = alloca i1, i1 0
- %nop6646 = alloca i1, i1 0
- %nop6647 = alloca i1, i1 0
- %nop6648 = alloca i1, i1 0
- %nop6649 = alloca i1, i1 0
- %nop6650 = alloca i1, i1 0
- %nop6651 = alloca i1, i1 0
- %nop6652 = alloca i1, i1 0
- %nop6653 = alloca i1, i1 0
- %nop6654 = alloca i1, i1 0
- %nop6655 = alloca i1, i1 0
- %nop6656 = alloca i1, i1 0
- %nop6657 = alloca i1, i1 0
- %nop6658 = alloca i1, i1 0
- %nop6659 = alloca i1, i1 0
- %nop6660 = alloca i1, i1 0
- %nop6661 = alloca i1, i1 0
- %nop6662 = alloca i1, i1 0
- %nop6663 = alloca i1, i1 0
- %nop6664 = alloca i1, i1 0
- %nop6665 = alloca i1, i1 0
- %nop6666 = alloca i1, i1 0
- %nop6667 = alloca i1, i1 0
- %nop6668 = alloca i1, i1 0
- %nop6669 = alloca i1, i1 0
- %nop6670 = alloca i1, i1 0
- %nop6671 = alloca i1, i1 0
- %nop6672 = alloca i1, i1 0
- %nop6673 = alloca i1, i1 0
- %nop6674 = alloca i1, i1 0
- %nop6675 = alloca i1, i1 0
- %nop6676 = alloca i1, i1 0
- %nop6677 = alloca i1, i1 0
- %nop6678 = alloca i1, i1 0
- %nop6679 = alloca i1, i1 0
- %nop6680 = alloca i1, i1 0
- %nop6681 = alloca i1, i1 0
- %nop6682 = alloca i1, i1 0
- %nop6683 = alloca i1, i1 0
- %nop6684 = alloca i1, i1 0
- %nop6685 = alloca i1, i1 0
- %nop6686 = alloca i1, i1 0
- %nop6687 = alloca i1, i1 0
- %nop6688 = alloca i1, i1 0
- %nop6689 = alloca i1, i1 0
- %nop6690 = alloca i1, i1 0
- %nop6691 = alloca i1, i1 0
- %nop6692 = alloca i1, i1 0
- %nop6693 = alloca i1, i1 0
- %nop6694 = alloca i1, i1 0
- %nop6695 = alloca i1, i1 0
- %nop6696 = alloca i1, i1 0
- %nop6697 = alloca i1, i1 0
- %nop6698 = alloca i1, i1 0
- %nop6699 = alloca i1, i1 0
- %nop6700 = alloca i1, i1 0
- %nop6701 = alloca i1, i1 0
- %nop6702 = alloca i1, i1 0
- %nop6703 = alloca i1, i1 0
- %nop6704 = alloca i1, i1 0
- %nop6705 = alloca i1, i1 0
- %nop6706 = alloca i1, i1 0
- %nop6707 = alloca i1, i1 0
- %nop6708 = alloca i1, i1 0
- %nop6709 = alloca i1, i1 0
- %nop6710 = alloca i1, i1 0
- %nop6711 = alloca i1, i1 0
- %nop6712 = alloca i1, i1 0
- %nop6713 = alloca i1, i1 0
- %nop6714 = alloca i1, i1 0
- %nop6715 = alloca i1, i1 0
- %nop6716 = alloca i1, i1 0
- %nop6717 = alloca i1, i1 0
- %nop6718 = alloca i1, i1 0
- %nop6719 = alloca i1, i1 0
- %nop6720 = alloca i1, i1 0
- %nop6721 = alloca i1, i1 0
- %nop6722 = alloca i1, i1 0
- %nop6723 = alloca i1, i1 0
- %nop6724 = alloca i1, i1 0
- %nop6725 = alloca i1, i1 0
- %nop6726 = alloca i1, i1 0
- %nop6727 = alloca i1, i1 0
- %nop6728 = alloca i1, i1 0
- %nop6729 = alloca i1, i1 0
- %nop6730 = alloca i1, i1 0
- %nop6731 = alloca i1, i1 0
- %nop6732 = alloca i1, i1 0
- %nop6733 = alloca i1, i1 0
- %nop6734 = alloca i1, i1 0
- %nop6735 = alloca i1, i1 0
- %nop6736 = alloca i1, i1 0
- %nop6737 = alloca i1, i1 0
- %nop6738 = alloca i1, i1 0
- %nop6739 = alloca i1, i1 0
- %nop6740 = alloca i1, i1 0
- %nop6741 = alloca i1, i1 0
- %nop6742 = alloca i1, i1 0
- %nop6743 = alloca i1, i1 0
- %nop6744 = alloca i1, i1 0
- %nop6745 = alloca i1, i1 0
- %nop6746 = alloca i1, i1 0
- %nop6747 = alloca i1, i1 0
- %nop6748 = alloca i1, i1 0
- %nop6749 = alloca i1, i1 0
- %nop6750 = alloca i1, i1 0
- %nop6751 = alloca i1, i1 0
- %nop6752 = alloca i1, i1 0
- %nop6753 = alloca i1, i1 0
- %nop6754 = alloca i1, i1 0
- %nop6755 = alloca i1, i1 0
- %nop6756 = alloca i1, i1 0
- %nop6757 = alloca i1, i1 0
- %nop6758 = alloca i1, i1 0
- %nop6759 = alloca i1, i1 0
- %nop6760 = alloca i1, i1 0
- %nop6761 = alloca i1, i1 0
- %nop6762 = alloca i1, i1 0
- %nop6763 = alloca i1, i1 0
- %nop6764 = alloca i1, i1 0
- %nop6765 = alloca i1, i1 0
- %nop6766 = alloca i1, i1 0
- %nop6767 = alloca i1, i1 0
- %nop6768 = alloca i1, i1 0
- %nop6769 = alloca i1, i1 0
- %nop6770 = alloca i1, i1 0
- %nop6771 = alloca i1, i1 0
- %nop6772 = alloca i1, i1 0
- %nop6773 = alloca i1, i1 0
- %nop6774 = alloca i1, i1 0
- %nop6775 = alloca i1, i1 0
- %nop6776 = alloca i1, i1 0
- %nop6777 = alloca i1, i1 0
- %nop6778 = alloca i1, i1 0
- %nop6779 = alloca i1, i1 0
- %nop6780 = alloca i1, i1 0
- %nop6781 = alloca i1, i1 0
- %nop6782 = alloca i1, i1 0
- %nop6783 = alloca i1, i1 0
- %nop6784 = alloca i1, i1 0
- %nop6785 = alloca i1, i1 0
- %nop6786 = alloca i1, i1 0
- %nop6787 = alloca i1, i1 0
- %nop6788 = alloca i1, i1 0
- %nop6789 = alloca i1, i1 0
- %nop6790 = alloca i1, i1 0
- %nop6791 = alloca i1, i1 0
- %nop6792 = alloca i1, i1 0
- %nop6793 = alloca i1, i1 0
- %nop6794 = alloca i1, i1 0
- %nop6795 = alloca i1, i1 0
- %nop6796 = alloca i1, i1 0
- %nop6797 = alloca i1, i1 0
- %nop6798 = alloca i1, i1 0
- %nop6799 = alloca i1, i1 0
- %nop6800 = alloca i1, i1 0
- %nop6801 = alloca i1, i1 0
- %nop6802 = alloca i1, i1 0
- %nop6803 = alloca i1, i1 0
- %nop6804 = alloca i1, i1 0
- %nop6805 = alloca i1, i1 0
- %nop6806 = alloca i1, i1 0
- %nop6807 = alloca i1, i1 0
- %nop6808 = alloca i1, i1 0
- %nop6809 = alloca i1, i1 0
- %nop6810 = alloca i1, i1 0
- %nop6811 = alloca i1, i1 0
- %nop6812 = alloca i1, i1 0
- %nop6813 = alloca i1, i1 0
- %nop6814 = alloca i1, i1 0
- %nop6815 = alloca i1, i1 0
- %nop6816 = alloca i1, i1 0
- %nop6817 = alloca i1, i1 0
- %nop6818 = alloca i1, i1 0
- %nop6819 = alloca i1, i1 0
- %nop6820 = alloca i1, i1 0
- %nop6821 = alloca i1, i1 0
- %nop6822 = alloca i1, i1 0
- %nop6823 = alloca i1, i1 0
- %nop6824 = alloca i1, i1 0
- %nop6825 = alloca i1, i1 0
- %nop6826 = alloca i1, i1 0
- %nop6827 = alloca i1, i1 0
- %nop6828 = alloca i1, i1 0
- %nop6829 = alloca i1, i1 0
- %nop6830 = alloca i1, i1 0
- %nop6831 = alloca i1, i1 0
- %nop6832 = alloca i1, i1 0
- %nop6833 = alloca i1, i1 0
- %nop6834 = alloca i1, i1 0
- %nop6835 = alloca i1, i1 0
- %nop6836 = alloca i1, i1 0
- %nop6837 = alloca i1, i1 0
- %nop6838 = alloca i1, i1 0
- %nop6839 = alloca i1, i1 0
- %nop6840 = alloca i1, i1 0
- %nop6841 = alloca i1, i1 0
- %nop6842 = alloca i1, i1 0
- %nop6843 = alloca i1, i1 0
- %nop6844 = alloca i1, i1 0
- %nop6845 = alloca i1, i1 0
- %nop6846 = alloca i1, i1 0
- %nop6847 = alloca i1, i1 0
- %nop6848 = alloca i1, i1 0
- %nop6849 = alloca i1, i1 0
- %nop6850 = alloca i1, i1 0
- %nop6851 = alloca i1, i1 0
- %nop6852 = alloca i1, i1 0
- %nop6853 = alloca i1, i1 0
- %nop6854 = alloca i1, i1 0
- %nop6855 = alloca i1, i1 0
- %nop6856 = alloca i1, i1 0
- %nop6857 = alloca i1, i1 0
- %nop6858 = alloca i1, i1 0
- %nop6859 = alloca i1, i1 0
- %nop6860 = alloca i1, i1 0
- %nop6861 = alloca i1, i1 0
- %nop6862 = alloca i1, i1 0
- %nop6863 = alloca i1, i1 0
- %nop6864 = alloca i1, i1 0
- %nop6865 = alloca i1, i1 0
- %nop6866 = alloca i1, i1 0
- %nop6867 = alloca i1, i1 0
- %nop6868 = alloca i1, i1 0
- %nop6869 = alloca i1, i1 0
- %nop6870 = alloca i1, i1 0
- %nop6871 = alloca i1, i1 0
- %nop6872 = alloca i1, i1 0
- %nop6873 = alloca i1, i1 0
- %nop6874 = alloca i1, i1 0
- %nop6875 = alloca i1, i1 0
- %nop6876 = alloca i1, i1 0
- %nop6877 = alloca i1, i1 0
- %nop6878 = alloca i1, i1 0
- %nop6879 = alloca i1, i1 0
- %nop6880 = alloca i1, i1 0
- %nop6881 = alloca i1, i1 0
- %nop6882 = alloca i1, i1 0
- %nop6883 = alloca i1, i1 0
- %nop6884 = alloca i1, i1 0
- %nop6885 = alloca i1, i1 0
- %nop6886 = alloca i1, i1 0
- %nop6887 = alloca i1, i1 0
- %nop6888 = alloca i1, i1 0
- %nop6889 = alloca i1, i1 0
- %nop6890 = alloca i1, i1 0
- %nop6891 = alloca i1, i1 0
- %nop6892 = alloca i1, i1 0
- %nop6893 = alloca i1, i1 0
- %nop6894 = alloca i1, i1 0
- %nop6895 = alloca i1, i1 0
- %nop6896 = alloca i1, i1 0
- %nop6897 = alloca i1, i1 0
- %nop6898 = alloca i1, i1 0
- %nop6899 = alloca i1, i1 0
- %nop6900 = alloca i1, i1 0
- %nop6901 = alloca i1, i1 0
- %nop6902 = alloca i1, i1 0
- %nop6903 = alloca i1, i1 0
- %nop6904 = alloca i1, i1 0
- %nop6905 = alloca i1, i1 0
- %nop6906 = alloca i1, i1 0
- %nop6907 = alloca i1, i1 0
- %nop6908 = alloca i1, i1 0
- %nop6909 = alloca i1, i1 0
- %nop6910 = alloca i1, i1 0
- %nop6911 = alloca i1, i1 0
- %nop6912 = alloca i1, i1 0
- %nop6913 = alloca i1, i1 0
- %nop6914 = alloca i1, i1 0
- %nop6915 = alloca i1, i1 0
- %nop6916 = alloca i1, i1 0
- %nop6917 = alloca i1, i1 0
- %nop6918 = alloca i1, i1 0
- %nop6919 = alloca i1, i1 0
- %nop6920 = alloca i1, i1 0
- %nop6921 = alloca i1, i1 0
- %nop6922 = alloca i1, i1 0
- %nop6923 = alloca i1, i1 0
- %nop6924 = alloca i1, i1 0
- %nop6925 = alloca i1, i1 0
- %nop6926 = alloca i1, i1 0
- %nop6927 = alloca i1, i1 0
- %nop6928 = alloca i1, i1 0
- %nop6929 = alloca i1, i1 0
- %nop6930 = alloca i1, i1 0
- %nop6931 = alloca i1, i1 0
- %nop6932 = alloca i1, i1 0
- %nop6933 = alloca i1, i1 0
- %nop6934 = alloca i1, i1 0
- %nop6935 = alloca i1, i1 0
- %nop6936 = alloca i1, i1 0
- %nop6937 = alloca i1, i1 0
- %nop6938 = alloca i1, i1 0
- %nop6939 = alloca i1, i1 0
- %nop6940 = alloca i1, i1 0
- %nop6941 = alloca i1, i1 0
- %nop6942 = alloca i1, i1 0
- %nop6943 = alloca i1, i1 0
- %nop6944 = alloca i1, i1 0
- %nop6945 = alloca i1, i1 0
- %nop6946 = alloca i1, i1 0
- %nop6947 = alloca i1, i1 0
- %nop6948 = alloca i1, i1 0
- %nop6949 = alloca i1, i1 0
- %nop6950 = alloca i1, i1 0
- %nop6951 = alloca i1, i1 0
- %nop6952 = alloca i1, i1 0
- %nop6953 = alloca i1, i1 0
- %nop6954 = alloca i1, i1 0
- %nop6955 = alloca i1, i1 0
- %nop6956 = alloca i1, i1 0
- %nop6957 = alloca i1, i1 0
- %nop6958 = alloca i1, i1 0
- %nop6959 = alloca i1, i1 0
- %nop6960 = alloca i1, i1 0
- %nop6961 = alloca i1, i1 0
- %nop6962 = alloca i1, i1 0
- %nop6963 = alloca i1, i1 0
- %nop6964 = alloca i1, i1 0
- %nop6965 = alloca i1, i1 0
- %nop6966 = alloca i1, i1 0
- %nop6967 = alloca i1, i1 0
- %nop6968 = alloca i1, i1 0
- %nop6969 = alloca i1, i1 0
- %nop6970 = alloca i1, i1 0
- %nop6971 = alloca i1, i1 0
- %nop6972 = alloca i1, i1 0
- %nop6973 = alloca i1, i1 0
- %nop6974 = alloca i1, i1 0
- %nop6975 = alloca i1, i1 0
- %nop6976 = alloca i1, i1 0
- %nop6977 = alloca i1, i1 0
- %nop6978 = alloca i1, i1 0
- %nop6979 = alloca i1, i1 0
- %nop6980 = alloca i1, i1 0
- %nop6981 = alloca i1, i1 0
- %nop6982 = alloca i1, i1 0
- %nop6983 = alloca i1, i1 0
- %nop6984 = alloca i1, i1 0
- %nop6985 = alloca i1, i1 0
- %nop6986 = alloca i1, i1 0
- %nop6987 = alloca i1, i1 0
- %nop6988 = alloca i1, i1 0
- %nop6989 = alloca i1, i1 0
- %nop6990 = alloca i1, i1 0
- %nop6991 = alloca i1, i1 0
- %nop6992 = alloca i1, i1 0
- %nop6993 = alloca i1, i1 0
- %nop6994 = alloca i1, i1 0
- %nop6995 = alloca i1, i1 0
- %nop6996 = alloca i1, i1 0
- %nop6997 = alloca i1, i1 0
- %nop6998 = alloca i1, i1 0
- %nop6999 = alloca i1, i1 0
- %nop7000 = alloca i1, i1 0
- %nop7001 = alloca i1, i1 0
- %nop7002 = alloca i1, i1 0
- %nop7003 = alloca i1, i1 0
- %nop7004 = alloca i1, i1 0
- %nop7005 = alloca i1, i1 0
- %nop7006 = alloca i1, i1 0
- %nop7007 = alloca i1, i1 0
- %nop7008 = alloca i1, i1 0
- %nop7009 = alloca i1, i1 0
- %nop7010 = alloca i1, i1 0
- %nop7011 = alloca i1, i1 0
- %nop7012 = alloca i1, i1 0
- %nop7013 = alloca i1, i1 0
- %nop7014 = alloca i1, i1 0
- %nop7015 = alloca i1, i1 0
- %nop7016 = alloca i1, i1 0
- %nop7017 = alloca i1, i1 0
- %nop7018 = alloca i1, i1 0
- %nop7019 = alloca i1, i1 0
- %nop7020 = alloca i1, i1 0
- %nop7021 = alloca i1, i1 0
- %nop7022 = alloca i1, i1 0
- %nop7023 = alloca i1, i1 0
- %nop7024 = alloca i1, i1 0
- %nop7025 = alloca i1, i1 0
- %nop7026 = alloca i1, i1 0
- %nop7027 = alloca i1, i1 0
- %nop7028 = alloca i1, i1 0
- %nop7029 = alloca i1, i1 0
- %nop7030 = alloca i1, i1 0
- %nop7031 = alloca i1, i1 0
- %nop7032 = alloca i1, i1 0
- %nop7033 = alloca i1, i1 0
- %nop7034 = alloca i1, i1 0
- %nop7035 = alloca i1, i1 0
- %nop7036 = alloca i1, i1 0
- %nop7037 = alloca i1, i1 0
- %nop7038 = alloca i1, i1 0
- %nop7039 = alloca i1, i1 0
- %nop7040 = alloca i1, i1 0
- %nop7041 = alloca i1, i1 0
- %nop7042 = alloca i1, i1 0
- %nop7043 = alloca i1, i1 0
- %nop7044 = alloca i1, i1 0
- %nop7045 = alloca i1, i1 0
- %nop7046 = alloca i1, i1 0
- %nop7047 = alloca i1, i1 0
- %nop7048 = alloca i1, i1 0
- %nop7049 = alloca i1, i1 0
- %nop7050 = alloca i1, i1 0
- %nop7051 = alloca i1, i1 0
- %nop7052 = alloca i1, i1 0
- %nop7053 = alloca i1, i1 0
- %nop7054 = alloca i1, i1 0
- %nop7055 = alloca i1, i1 0
- %nop7056 = alloca i1, i1 0
- %nop7057 = alloca i1, i1 0
- %nop7058 = alloca i1, i1 0
- %nop7059 = alloca i1, i1 0
- %nop7060 = alloca i1, i1 0
- %nop7061 = alloca i1, i1 0
- %nop7062 = alloca i1, i1 0
- %nop7063 = alloca i1, i1 0
- %nop7064 = alloca i1, i1 0
- %nop7065 = alloca i1, i1 0
- %nop7066 = alloca i1, i1 0
- %nop7067 = alloca i1, i1 0
- %nop7068 = alloca i1, i1 0
- %nop7069 = alloca i1, i1 0
- %nop7070 = alloca i1, i1 0
- %nop7071 = alloca i1, i1 0
- %nop7072 = alloca i1, i1 0
- %nop7073 = alloca i1, i1 0
- %nop7074 = alloca i1, i1 0
- %nop7075 = alloca i1, i1 0
- %nop7076 = alloca i1, i1 0
- %nop7077 = alloca i1, i1 0
- %nop7078 = alloca i1, i1 0
- %nop7079 = alloca i1, i1 0
- %nop7080 = alloca i1, i1 0
- %nop7081 = alloca i1, i1 0
- %nop7082 = alloca i1, i1 0
- %nop7083 = alloca i1, i1 0
- %nop7084 = alloca i1, i1 0
- %nop7085 = alloca i1, i1 0
- %nop7086 = alloca i1, i1 0
- %nop7087 = alloca i1, i1 0
- %nop7088 = alloca i1, i1 0
- %nop7089 = alloca i1, i1 0
- %nop7090 = alloca i1, i1 0
- %nop7091 = alloca i1, i1 0
- %nop7092 = alloca i1, i1 0
- %nop7093 = alloca i1, i1 0
- %nop7094 = alloca i1, i1 0
- %nop7095 = alloca i1, i1 0
- %nop7096 = alloca i1, i1 0
- %nop7097 = alloca i1, i1 0
- %nop7098 = alloca i1, i1 0
- %nop7099 = alloca i1, i1 0
- %nop7100 = alloca i1, i1 0
- %nop7101 = alloca i1, i1 0
- %nop7102 = alloca i1, i1 0
- %nop7103 = alloca i1, i1 0
- %nop7104 = alloca i1, i1 0
- %nop7105 = alloca i1, i1 0
- %nop7106 = alloca i1, i1 0
- %nop7107 = alloca i1, i1 0
- %nop7108 = alloca i1, i1 0
- %nop7109 = alloca i1, i1 0
- %nop7110 = alloca i1, i1 0
- %nop7111 = alloca i1, i1 0
- %nop7112 = alloca i1, i1 0
- %nop7113 = alloca i1, i1 0
- %nop7114 = alloca i1, i1 0
- %nop7115 = alloca i1, i1 0
- %nop7116 = alloca i1, i1 0
- %nop7117 = alloca i1, i1 0
- %nop7118 = alloca i1, i1 0
- %nop7119 = alloca i1, i1 0
- %nop7120 = alloca i1, i1 0
- %nop7121 = alloca i1, i1 0
- %nop7122 = alloca i1, i1 0
- %nop7123 = alloca i1, i1 0
- %nop7124 = alloca i1, i1 0
- %nop7125 = alloca i1, i1 0
- %nop7126 = alloca i1, i1 0
- %nop7127 = alloca i1, i1 0
- %nop7128 = alloca i1, i1 0
- %nop7129 = alloca i1, i1 0
- %nop7130 = alloca i1, i1 0
- %nop7131 = alloca i1, i1 0
- %nop7132 = alloca i1, i1 0
- %nop7133 = alloca i1, i1 0
- %nop7134 = alloca i1, i1 0
- %nop7135 = alloca i1, i1 0
- %nop7136 = alloca i1, i1 0
- %nop7137 = alloca i1, i1 0
- %nop7138 = alloca i1, i1 0
- %nop7139 = alloca i1, i1 0
- %nop7140 = alloca i1, i1 0
- %nop7141 = alloca i1, i1 0
- %nop7142 = alloca i1, i1 0
- %nop7143 = alloca i1, i1 0
- %nop7144 = alloca i1, i1 0
- %nop7145 = alloca i1, i1 0
- %nop7146 = alloca i1, i1 0
- %nop7147 = alloca i1, i1 0
- %nop7148 = alloca i1, i1 0
- %nop7149 = alloca i1, i1 0
- %nop7150 = alloca i1, i1 0
- %nop7151 = alloca i1, i1 0
- %nop7152 = alloca i1, i1 0
- %nop7153 = alloca i1, i1 0
- %nop7154 = alloca i1, i1 0
- %nop7155 = alloca i1, i1 0
- %nop7156 = alloca i1, i1 0
- %nop7157 = alloca i1, i1 0
- %nop7158 = alloca i1, i1 0
- %nop7159 = alloca i1, i1 0
- %nop7160 = alloca i1, i1 0
- %nop7161 = alloca i1, i1 0
- %nop7162 = alloca i1, i1 0
- %nop7163 = alloca i1, i1 0
- %nop7164 = alloca i1, i1 0
- %nop7165 = alloca i1, i1 0
- %nop7166 = alloca i1, i1 0
- %nop7167 = alloca i1, i1 0
- %nop7168 = alloca i1, i1 0
- %nop7169 = alloca i1, i1 0
- %nop7170 = alloca i1, i1 0
- %nop7171 = alloca i1, i1 0
- %nop7172 = alloca i1, i1 0
- %nop7173 = alloca i1, i1 0
- %nop7174 = alloca i1, i1 0
- %nop7175 = alloca i1, i1 0
- %nop7176 = alloca i1, i1 0
- %nop7177 = alloca i1, i1 0
- %nop7178 = alloca i1, i1 0
- %nop7179 = alloca i1, i1 0
- %nop7180 = alloca i1, i1 0
- %nop7181 = alloca i1, i1 0
- %nop7182 = alloca i1, i1 0
- %nop7183 = alloca i1, i1 0
- %nop7184 = alloca i1, i1 0
- %nop7185 = alloca i1, i1 0
- %nop7186 = alloca i1, i1 0
- %nop7187 = alloca i1, i1 0
- %nop7188 = alloca i1, i1 0
- %nop7189 = alloca i1, i1 0
- %nop7190 = alloca i1, i1 0
- %nop7191 = alloca i1, i1 0
- %nop7192 = alloca i1, i1 0
- %nop7193 = alloca i1, i1 0
- %nop7194 = alloca i1, i1 0
- %nop7195 = alloca i1, i1 0
- %nop7196 = alloca i1, i1 0
- %nop7197 = alloca i1, i1 0
- %nop7198 = alloca i1, i1 0
- %nop7199 = alloca i1, i1 0
- %nop7200 = alloca i1, i1 0
- %nop7201 = alloca i1, i1 0
- %nop7202 = alloca i1, i1 0
- %nop7203 = alloca i1, i1 0
- %nop7204 = alloca i1, i1 0
- %nop7205 = alloca i1, i1 0
- %nop7206 = alloca i1, i1 0
- %nop7207 = alloca i1, i1 0
- %nop7208 = alloca i1, i1 0
- %nop7209 = alloca i1, i1 0
- %nop7210 = alloca i1, i1 0
- %nop7211 = alloca i1, i1 0
- %nop7212 = alloca i1, i1 0
- %nop7213 = alloca i1, i1 0
- %nop7214 = alloca i1, i1 0
- %nop7215 = alloca i1, i1 0
- %nop7216 = alloca i1, i1 0
- %nop7217 = alloca i1, i1 0
- %nop7218 = alloca i1, i1 0
- %nop7219 = alloca i1, i1 0
- %nop7220 = alloca i1, i1 0
- %nop7221 = alloca i1, i1 0
- %nop7222 = alloca i1, i1 0
- %nop7223 = alloca i1, i1 0
- %nop7224 = alloca i1, i1 0
- %nop7225 = alloca i1, i1 0
- %nop7226 = alloca i1, i1 0
- %nop7227 = alloca i1, i1 0
- %nop7228 = alloca i1, i1 0
- %nop7229 = alloca i1, i1 0
- %nop7230 = alloca i1, i1 0
- %nop7231 = alloca i1, i1 0
- %nop7232 = alloca i1, i1 0
- %nop7233 = alloca i1, i1 0
- %nop7234 = alloca i1, i1 0
- %nop7235 = alloca i1, i1 0
- %nop7236 = alloca i1, i1 0
- %nop7237 = alloca i1, i1 0
- %nop7238 = alloca i1, i1 0
- %nop7239 = alloca i1, i1 0
- %nop7240 = alloca i1, i1 0
- %nop7241 = alloca i1, i1 0
- %nop7242 = alloca i1, i1 0
- %nop7243 = alloca i1, i1 0
- %nop7244 = alloca i1, i1 0
- %nop7245 = alloca i1, i1 0
- %nop7246 = alloca i1, i1 0
- %nop7247 = alloca i1, i1 0
- %nop7248 = alloca i1, i1 0
- %nop7249 = alloca i1, i1 0
- %nop7250 = alloca i1, i1 0
- %nop7251 = alloca i1, i1 0
- %nop7252 = alloca i1, i1 0
- %nop7253 = alloca i1, i1 0
- %nop7254 = alloca i1, i1 0
- %nop7255 = alloca i1, i1 0
- %nop7256 = alloca i1, i1 0
- %nop7257 = alloca i1, i1 0
- %nop7258 = alloca i1, i1 0
- %nop7259 = alloca i1, i1 0
- %nop7260 = alloca i1, i1 0
- %nop7261 = alloca i1, i1 0
- %nop7262 = alloca i1, i1 0
- %nop7263 = alloca i1, i1 0
- %nop7264 = alloca i1, i1 0
- %nop7265 = alloca i1, i1 0
- %nop7266 = alloca i1, i1 0
- %nop7267 = alloca i1, i1 0
- %nop7268 = alloca i1, i1 0
- %nop7269 = alloca i1, i1 0
- %nop7270 = alloca i1, i1 0
- %nop7271 = alloca i1, i1 0
- %nop7272 = alloca i1, i1 0
- %nop7273 = alloca i1, i1 0
- %nop7274 = alloca i1, i1 0
- %nop7275 = alloca i1, i1 0
- %nop7276 = alloca i1, i1 0
- %nop7277 = alloca i1, i1 0
- %nop7278 = alloca i1, i1 0
- %nop7279 = alloca i1, i1 0
- %nop7280 = alloca i1, i1 0
- %nop7281 = alloca i1, i1 0
- %nop7282 = alloca i1, i1 0
- %nop7283 = alloca i1, i1 0
- %nop7284 = alloca i1, i1 0
- %nop7285 = alloca i1, i1 0
- %nop7286 = alloca i1, i1 0
- %nop7287 = alloca i1, i1 0
- %nop7288 = alloca i1, i1 0
- %nop7289 = alloca i1, i1 0
- %nop7290 = alloca i1, i1 0
- %nop7291 = alloca i1, i1 0
- %nop7292 = alloca i1, i1 0
- %nop7293 = alloca i1, i1 0
- %nop7294 = alloca i1, i1 0
- %nop7295 = alloca i1, i1 0
- %nop7296 = alloca i1, i1 0
- %nop7297 = alloca i1, i1 0
- %nop7298 = alloca i1, i1 0
- %nop7299 = alloca i1, i1 0
- %nop7300 = alloca i1, i1 0
- %nop7301 = alloca i1, i1 0
- %nop7302 = alloca i1, i1 0
- %nop7303 = alloca i1, i1 0
- %nop7304 = alloca i1, i1 0
- %nop7305 = alloca i1, i1 0
- %nop7306 = alloca i1, i1 0
- %nop7307 = alloca i1, i1 0
- %nop7308 = alloca i1, i1 0
- %nop7309 = alloca i1, i1 0
- %nop7310 = alloca i1, i1 0
- %nop7311 = alloca i1, i1 0
- %nop7312 = alloca i1, i1 0
- %nop7313 = alloca i1, i1 0
- %nop7314 = alloca i1, i1 0
- %nop7315 = alloca i1, i1 0
- %nop7316 = alloca i1, i1 0
- %nop7317 = alloca i1, i1 0
- %nop7318 = alloca i1, i1 0
- %nop7319 = alloca i1, i1 0
- %nop7320 = alloca i1, i1 0
- %nop7321 = alloca i1, i1 0
- %nop7322 = alloca i1, i1 0
- %nop7323 = alloca i1, i1 0
- %nop7324 = alloca i1, i1 0
- %nop7325 = alloca i1, i1 0
- %nop7326 = alloca i1, i1 0
- %nop7327 = alloca i1, i1 0
- %nop7328 = alloca i1, i1 0
- %nop7329 = alloca i1, i1 0
- %nop7330 = alloca i1, i1 0
- %nop7331 = alloca i1, i1 0
- %nop7332 = alloca i1, i1 0
- %nop7333 = alloca i1, i1 0
- %nop7334 = alloca i1, i1 0
- %nop7335 = alloca i1, i1 0
- %nop7336 = alloca i1, i1 0
- %nop7337 = alloca i1, i1 0
- %nop7338 = alloca i1, i1 0
- %nop7339 = alloca i1, i1 0
- %nop7340 = alloca i1, i1 0
- %nop7341 = alloca i1, i1 0
- %nop7342 = alloca i1, i1 0
- %nop7343 = alloca i1, i1 0
- %nop7344 = alloca i1, i1 0
- %nop7345 = alloca i1, i1 0
- %nop7346 = alloca i1, i1 0
- %nop7347 = alloca i1, i1 0
- %nop7348 = alloca i1, i1 0
- %nop7349 = alloca i1, i1 0
- %nop7350 = alloca i1, i1 0
- %nop7351 = alloca i1, i1 0
- %nop7352 = alloca i1, i1 0
- %nop7353 = alloca i1, i1 0
- %nop7354 = alloca i1, i1 0
- %nop7355 = alloca i1, i1 0
- %nop7356 = alloca i1, i1 0
- %nop7357 = alloca i1, i1 0
- %nop7358 = alloca i1, i1 0
- %nop7359 = alloca i1, i1 0
- %nop7360 = alloca i1, i1 0
- %nop7361 = alloca i1, i1 0
- %nop7362 = alloca i1, i1 0
- %nop7363 = alloca i1, i1 0
- %nop7364 = alloca i1, i1 0
- %nop7365 = alloca i1, i1 0
- %nop7366 = alloca i1, i1 0
- %nop7367 = alloca i1, i1 0
- %nop7368 = alloca i1, i1 0
- %nop7369 = alloca i1, i1 0
- %nop7370 = alloca i1, i1 0
- %nop7371 = alloca i1, i1 0
- %nop7372 = alloca i1, i1 0
- %nop7373 = alloca i1, i1 0
- %nop7374 = alloca i1, i1 0
- %nop7375 = alloca i1, i1 0
- %nop7376 = alloca i1, i1 0
- %nop7377 = alloca i1, i1 0
- %nop7378 = alloca i1, i1 0
- %nop7379 = alloca i1, i1 0
- %nop7380 = alloca i1, i1 0
- %nop7381 = alloca i1, i1 0
- %nop7382 = alloca i1, i1 0
- %nop7383 = alloca i1, i1 0
- %nop7384 = alloca i1, i1 0
- %nop7385 = alloca i1, i1 0
- %nop7386 = alloca i1, i1 0
- %nop7387 = alloca i1, i1 0
- %nop7388 = alloca i1, i1 0
- %nop7389 = alloca i1, i1 0
- %nop7390 = alloca i1, i1 0
- %nop7391 = alloca i1, i1 0
- %nop7392 = alloca i1, i1 0
- %nop7393 = alloca i1, i1 0
- %nop7394 = alloca i1, i1 0
- %nop7395 = alloca i1, i1 0
- %nop7396 = alloca i1, i1 0
- %nop7397 = alloca i1, i1 0
- %nop7398 = alloca i1, i1 0
- %nop7399 = alloca i1, i1 0
- %nop7400 = alloca i1, i1 0
- %nop7401 = alloca i1, i1 0
- %nop7402 = alloca i1, i1 0
- %nop7403 = alloca i1, i1 0
- %nop7404 = alloca i1, i1 0
- %nop7405 = alloca i1, i1 0
- %nop7406 = alloca i1, i1 0
- %nop7407 = alloca i1, i1 0
- %nop7408 = alloca i1, i1 0
- %nop7409 = alloca i1, i1 0
- %nop7410 = alloca i1, i1 0
- %nop7411 = alloca i1, i1 0
- %nop7412 = alloca i1, i1 0
- %nop7413 = alloca i1, i1 0
- %nop7414 = alloca i1, i1 0
- %nop7415 = alloca i1, i1 0
- %nop7416 = alloca i1, i1 0
- %nop7417 = alloca i1, i1 0
- %nop7418 = alloca i1, i1 0
- %nop7419 = alloca i1, i1 0
- %nop7420 = alloca i1, i1 0
- %nop7421 = alloca i1, i1 0
- %nop7422 = alloca i1, i1 0
- %nop7423 = alloca i1, i1 0
- %nop7424 = alloca i1, i1 0
- %nop7425 = alloca i1, i1 0
- %nop7426 = alloca i1, i1 0
- %nop7427 = alloca i1, i1 0
- %nop7428 = alloca i1, i1 0
- %nop7429 = alloca i1, i1 0
- %nop7430 = alloca i1, i1 0
- %nop7431 = alloca i1, i1 0
- %nop7432 = alloca i1, i1 0
- %nop7433 = alloca i1, i1 0
- %nop7434 = alloca i1, i1 0
- %nop7435 = alloca i1, i1 0
- %nop7436 = alloca i1, i1 0
- %nop7437 = alloca i1, i1 0
- %nop7438 = alloca i1, i1 0
- %nop7439 = alloca i1, i1 0
- %nop7440 = alloca i1, i1 0
- %nop7441 = alloca i1, i1 0
- %nop7442 = alloca i1, i1 0
- %nop7443 = alloca i1, i1 0
- %nop7444 = alloca i1, i1 0
- %nop7445 = alloca i1, i1 0
- %nop7446 = alloca i1, i1 0
- %nop7447 = alloca i1, i1 0
- %nop7448 = alloca i1, i1 0
- %nop7449 = alloca i1, i1 0
- %nop7450 = alloca i1, i1 0
- %nop7451 = alloca i1, i1 0
- %nop7452 = alloca i1, i1 0
- %nop7453 = alloca i1, i1 0
- %nop7454 = alloca i1, i1 0
- %nop7455 = alloca i1, i1 0
- %nop7456 = alloca i1, i1 0
- %nop7457 = alloca i1, i1 0
- %nop7458 = alloca i1, i1 0
- %nop7459 = alloca i1, i1 0
- %nop7460 = alloca i1, i1 0
- %nop7461 = alloca i1, i1 0
- %nop7462 = alloca i1, i1 0
- %nop7463 = alloca i1, i1 0
- %nop7464 = alloca i1, i1 0
- %nop7465 = alloca i1, i1 0
- %nop7466 = alloca i1, i1 0
- %nop7467 = alloca i1, i1 0
- %nop7468 = alloca i1, i1 0
- %nop7469 = alloca i1, i1 0
- %nop7470 = alloca i1, i1 0
- %nop7471 = alloca i1, i1 0
- %nop7472 = alloca i1, i1 0
- %nop7473 = alloca i1, i1 0
- %nop7474 = alloca i1, i1 0
- %nop7475 = alloca i1, i1 0
- %nop7476 = alloca i1, i1 0
- %nop7477 = alloca i1, i1 0
- %nop7478 = alloca i1, i1 0
- %nop7479 = alloca i1, i1 0
- %nop7480 = alloca i1, i1 0
- %nop7481 = alloca i1, i1 0
- %nop7482 = alloca i1, i1 0
- %nop7483 = alloca i1, i1 0
- %nop7484 = alloca i1, i1 0
- %nop7485 = alloca i1, i1 0
- %nop7486 = alloca i1, i1 0
- %nop7487 = alloca i1, i1 0
- %nop7488 = alloca i1, i1 0
- %nop7489 = alloca i1, i1 0
- %nop7490 = alloca i1, i1 0
- %nop7491 = alloca i1, i1 0
- %nop7492 = alloca i1, i1 0
- %nop7493 = alloca i1, i1 0
- %nop7494 = alloca i1, i1 0
- %nop7495 = alloca i1, i1 0
- %nop7496 = alloca i1, i1 0
- %nop7497 = alloca i1, i1 0
- %nop7498 = alloca i1, i1 0
- %nop7499 = alloca i1, i1 0
- %nop7500 = alloca i1, i1 0
- %nop7501 = alloca i1, i1 0
- %nop7502 = alloca i1, i1 0
- %nop7503 = alloca i1, i1 0
- %nop7504 = alloca i1, i1 0
- %nop7505 = alloca i1, i1 0
- %nop7506 = alloca i1, i1 0
- %nop7507 = alloca i1, i1 0
- %nop7508 = alloca i1, i1 0
- %nop7509 = alloca i1, i1 0
- %nop7510 = alloca i1, i1 0
- %nop7511 = alloca i1, i1 0
- %nop7512 = alloca i1, i1 0
- %nop7513 = alloca i1, i1 0
- %nop7514 = alloca i1, i1 0
- %nop7515 = alloca i1, i1 0
- %nop7516 = alloca i1, i1 0
- %nop7517 = alloca i1, i1 0
- %nop7518 = alloca i1, i1 0
- %nop7519 = alloca i1, i1 0
- %nop7520 = alloca i1, i1 0
- %nop7521 = alloca i1, i1 0
- %nop7522 = alloca i1, i1 0
- %nop7523 = alloca i1, i1 0
- %nop7524 = alloca i1, i1 0
- %nop7525 = alloca i1, i1 0
- %nop7526 = alloca i1, i1 0
- %nop7527 = alloca i1, i1 0
- %nop7528 = alloca i1, i1 0
- %nop7529 = alloca i1, i1 0
- %nop7530 = alloca i1, i1 0
- %nop7531 = alloca i1, i1 0
- %nop7532 = alloca i1, i1 0
- %nop7533 = alloca i1, i1 0
- %nop7534 = alloca i1, i1 0
- %nop7535 = alloca i1, i1 0
- %nop7536 = alloca i1, i1 0
- %nop7537 = alloca i1, i1 0
- %nop7538 = alloca i1, i1 0
- %nop7539 = alloca i1, i1 0
- %nop7540 = alloca i1, i1 0
- %nop7541 = alloca i1, i1 0
- %nop7542 = alloca i1, i1 0
- %nop7543 = alloca i1, i1 0
- %nop7544 = alloca i1, i1 0
- %nop7545 = alloca i1, i1 0
- %nop7546 = alloca i1, i1 0
- %nop7547 = alloca i1, i1 0
- %nop7548 = alloca i1, i1 0
- %nop7549 = alloca i1, i1 0
- %nop7550 = alloca i1, i1 0
- %nop7551 = alloca i1, i1 0
- %nop7552 = alloca i1, i1 0
- %nop7553 = alloca i1, i1 0
- %nop7554 = alloca i1, i1 0
- %nop7555 = alloca i1, i1 0
- %nop7556 = alloca i1, i1 0
- %nop7557 = alloca i1, i1 0
- %nop7558 = alloca i1, i1 0
- %nop7559 = alloca i1, i1 0
- %nop7560 = alloca i1, i1 0
- %nop7561 = alloca i1, i1 0
- %nop7562 = alloca i1, i1 0
- %nop7563 = alloca i1, i1 0
- %nop7564 = alloca i1, i1 0
- %nop7565 = alloca i1, i1 0
- %nop7566 = alloca i1, i1 0
- %nop7567 = alloca i1, i1 0
- %nop7568 = alloca i1, i1 0
- %nop7569 = alloca i1, i1 0
- %nop7570 = alloca i1, i1 0
- %nop7571 = alloca i1, i1 0
- %nop7572 = alloca i1, i1 0
- %nop7573 = alloca i1, i1 0
- %nop7574 = alloca i1, i1 0
- %nop7575 = alloca i1, i1 0
- %nop7576 = alloca i1, i1 0
- %nop7577 = alloca i1, i1 0
- %nop7578 = alloca i1, i1 0
- %nop7579 = alloca i1, i1 0
- %nop7580 = alloca i1, i1 0
- %nop7581 = alloca i1, i1 0
- %nop7582 = alloca i1, i1 0
- %nop7583 = alloca i1, i1 0
- %nop7584 = alloca i1, i1 0
- %nop7585 = alloca i1, i1 0
- %nop7586 = alloca i1, i1 0
- %nop7587 = alloca i1, i1 0
- %nop7588 = alloca i1, i1 0
- %nop7589 = alloca i1, i1 0
- %nop7590 = alloca i1, i1 0
- %nop7591 = alloca i1, i1 0
- %nop7592 = alloca i1, i1 0
- %nop7593 = alloca i1, i1 0
- %nop7594 = alloca i1, i1 0
- %nop7595 = alloca i1, i1 0
- %nop7596 = alloca i1, i1 0
- %nop7597 = alloca i1, i1 0
- %nop7598 = alloca i1, i1 0
- %nop7599 = alloca i1, i1 0
- %nop7600 = alloca i1, i1 0
- %nop7601 = alloca i1, i1 0
- %nop7602 = alloca i1, i1 0
- %nop7603 = alloca i1, i1 0
- %nop7604 = alloca i1, i1 0
- %nop7605 = alloca i1, i1 0
- %nop7606 = alloca i1, i1 0
- %nop7607 = alloca i1, i1 0
- %nop7608 = alloca i1, i1 0
- %nop7609 = alloca i1, i1 0
- %nop7610 = alloca i1, i1 0
- %nop7611 = alloca i1, i1 0
- %nop7612 = alloca i1, i1 0
- %nop7613 = alloca i1, i1 0
- %nop7614 = alloca i1, i1 0
- %nop7615 = alloca i1, i1 0
- %nop7616 = alloca i1, i1 0
- %nop7617 = alloca i1, i1 0
- %nop7618 = alloca i1, i1 0
- %nop7619 = alloca i1, i1 0
- %nop7620 = alloca i1, i1 0
- %nop7621 = alloca i1, i1 0
- %nop7622 = alloca i1, i1 0
- %nop7623 = alloca i1, i1 0
- %nop7624 = alloca i1, i1 0
- %nop7625 = alloca i1, i1 0
- %nop7626 = alloca i1, i1 0
- %nop7627 = alloca i1, i1 0
- %nop7628 = alloca i1, i1 0
- %nop7629 = alloca i1, i1 0
- %nop7630 = alloca i1, i1 0
- %nop7631 = alloca i1, i1 0
- %nop7632 = alloca i1, i1 0
- %nop7633 = alloca i1, i1 0
- %nop7634 = alloca i1, i1 0
- %nop7635 = alloca i1, i1 0
- %nop7636 = alloca i1, i1 0
- %nop7637 = alloca i1, i1 0
- %nop7638 = alloca i1, i1 0
- %nop7639 = alloca i1, i1 0
- %nop7640 = alloca i1, i1 0
- %nop7641 = alloca i1, i1 0
- %nop7642 = alloca i1, i1 0
- %nop7643 = alloca i1, i1 0
- %nop7644 = alloca i1, i1 0
- %nop7645 = alloca i1, i1 0
- %nop7646 = alloca i1, i1 0
- %nop7647 = alloca i1, i1 0
- %nop7648 = alloca i1, i1 0
- %nop7649 = alloca i1, i1 0
- %nop7650 = alloca i1, i1 0
- %nop7651 = alloca i1, i1 0
- %nop7652 = alloca i1, i1 0
- %nop7653 = alloca i1, i1 0
- %nop7654 = alloca i1, i1 0
- %nop7655 = alloca i1, i1 0
- %nop7656 = alloca i1, i1 0
- %nop7657 = alloca i1, i1 0
- %nop7658 = alloca i1, i1 0
- %nop7659 = alloca i1, i1 0
- %nop7660 = alloca i1, i1 0
- %nop7661 = alloca i1, i1 0
- %nop7662 = alloca i1, i1 0
- %nop7663 = alloca i1, i1 0
- %nop7664 = alloca i1, i1 0
- %nop7665 = alloca i1, i1 0
- %nop7666 = alloca i1, i1 0
- %nop7667 = alloca i1, i1 0
- %nop7668 = alloca i1, i1 0
- %nop7669 = alloca i1, i1 0
- %nop7670 = alloca i1, i1 0
- %nop7671 = alloca i1, i1 0
- %nop7672 = alloca i1, i1 0
- %nop7673 = alloca i1, i1 0
- %nop7674 = alloca i1, i1 0
- %nop7675 = alloca i1, i1 0
- %nop7676 = alloca i1, i1 0
- %nop7677 = alloca i1, i1 0
- %nop7678 = alloca i1, i1 0
- %nop7679 = alloca i1, i1 0
- %nop7680 = alloca i1, i1 0
- %nop7681 = alloca i1, i1 0
- %nop7682 = alloca i1, i1 0
- %nop7683 = alloca i1, i1 0
- %nop7684 = alloca i1, i1 0
- %nop7685 = alloca i1, i1 0
- %nop7686 = alloca i1, i1 0
- %nop7687 = alloca i1, i1 0
- %nop7688 = alloca i1, i1 0
- %nop7689 = alloca i1, i1 0
- %nop7690 = alloca i1, i1 0
- %nop7691 = alloca i1, i1 0
- %nop7692 = alloca i1, i1 0
- %nop7693 = alloca i1, i1 0
- %nop7694 = alloca i1, i1 0
- %nop7695 = alloca i1, i1 0
- %nop7696 = alloca i1, i1 0
- %nop7697 = alloca i1, i1 0
- %nop7698 = alloca i1, i1 0
- %nop7699 = alloca i1, i1 0
- %nop7700 = alloca i1, i1 0
- %nop7701 = alloca i1, i1 0
- %nop7702 = alloca i1, i1 0
- %nop7703 = alloca i1, i1 0
- %nop7704 = alloca i1, i1 0
- %nop7705 = alloca i1, i1 0
- %nop7706 = alloca i1, i1 0
- %nop7707 = alloca i1, i1 0
- %nop7708 = alloca i1, i1 0
- %nop7709 = alloca i1, i1 0
- %nop7710 = alloca i1, i1 0
- %nop7711 = alloca i1, i1 0
- %nop7712 = alloca i1, i1 0
- %nop7713 = alloca i1, i1 0
- %nop7714 = alloca i1, i1 0
- %nop7715 = alloca i1, i1 0
- %nop7716 = alloca i1, i1 0
- %nop7717 = alloca i1, i1 0
- %nop7718 = alloca i1, i1 0
- %nop7719 = alloca i1, i1 0
- %nop7720 = alloca i1, i1 0
- %nop7721 = alloca i1, i1 0
- %nop7722 = alloca i1, i1 0
- %nop7723 = alloca i1, i1 0
- %nop7724 = alloca i1, i1 0
- %nop7725 = alloca i1, i1 0
- %nop7726 = alloca i1, i1 0
- %nop7727 = alloca i1, i1 0
- %nop7728 = alloca i1, i1 0
- %nop7729 = alloca i1, i1 0
- %nop7730 = alloca i1, i1 0
- %nop7731 = alloca i1, i1 0
- %nop7732 = alloca i1, i1 0
- %nop7733 = alloca i1, i1 0
- %nop7734 = alloca i1, i1 0
- %nop7735 = alloca i1, i1 0
- %nop7736 = alloca i1, i1 0
- %nop7737 = alloca i1, i1 0
- %nop7738 = alloca i1, i1 0
- %nop7739 = alloca i1, i1 0
- %nop7740 = alloca i1, i1 0
- %nop7741 = alloca i1, i1 0
- %nop7742 = alloca i1, i1 0
- %nop7743 = alloca i1, i1 0
- %nop7744 = alloca i1, i1 0
- %nop7745 = alloca i1, i1 0
- %nop7746 = alloca i1, i1 0
- %nop7747 = alloca i1, i1 0
- %nop7748 = alloca i1, i1 0
- %nop7749 = alloca i1, i1 0
- %nop7750 = alloca i1, i1 0
- %nop7751 = alloca i1, i1 0
- %nop7752 = alloca i1, i1 0
- %nop7753 = alloca i1, i1 0
- %nop7754 = alloca i1, i1 0
- %nop7755 = alloca i1, i1 0
- %nop7756 = alloca i1, i1 0
- %nop7757 = alloca i1, i1 0
- %nop7758 = alloca i1, i1 0
- %nop7759 = alloca i1, i1 0
- %nop7760 = alloca i1, i1 0
- %nop7761 = alloca i1, i1 0
- %nop7762 = alloca i1, i1 0
- %nop7763 = alloca i1, i1 0
- %nop7764 = alloca i1, i1 0
- %nop7765 = alloca i1, i1 0
- %nop7766 = alloca i1, i1 0
- %nop7767 = alloca i1, i1 0
- %nop7768 = alloca i1, i1 0
- %nop7769 = alloca i1, i1 0
- %nop7770 = alloca i1, i1 0
- %nop7771 = alloca i1, i1 0
- %nop7772 = alloca i1, i1 0
- %nop7773 = alloca i1, i1 0
- %nop7774 = alloca i1, i1 0
- %nop7775 = alloca i1, i1 0
- %nop7776 = alloca i1, i1 0
- %nop7777 = alloca i1, i1 0
- %nop7778 = alloca i1, i1 0
- %nop7779 = alloca i1, i1 0
- %nop7780 = alloca i1, i1 0
- %nop7781 = alloca i1, i1 0
- %nop7782 = alloca i1, i1 0
- %nop7783 = alloca i1, i1 0
- %nop7784 = alloca i1, i1 0
- %nop7785 = alloca i1, i1 0
- %nop7786 = alloca i1, i1 0
- %nop7787 = alloca i1, i1 0
- %nop7788 = alloca i1, i1 0
- %nop7789 = alloca i1, i1 0
- %nop7790 = alloca i1, i1 0
- %nop7791 = alloca i1, i1 0
- %nop7792 = alloca i1, i1 0
- %nop7793 = alloca i1, i1 0
- %nop7794 = alloca i1, i1 0
- %nop7795 = alloca i1, i1 0
- %nop7796 = alloca i1, i1 0
- %nop7797 = alloca i1, i1 0
- %nop7798 = alloca i1, i1 0
- %nop7799 = alloca i1, i1 0
- %nop7800 = alloca i1, i1 0
- %nop7801 = alloca i1, i1 0
- %nop7802 = alloca i1, i1 0
- %nop7803 = alloca i1, i1 0
- %nop7804 = alloca i1, i1 0
- %nop7805 = alloca i1, i1 0
- %nop7806 = alloca i1, i1 0
- %nop7807 = alloca i1, i1 0
- %nop7808 = alloca i1, i1 0
- %nop7809 = alloca i1, i1 0
- %nop7810 = alloca i1, i1 0
- %nop7811 = alloca i1, i1 0
- %nop7812 = alloca i1, i1 0
- %nop7813 = alloca i1, i1 0
- %nop7814 = alloca i1, i1 0
- %nop7815 = alloca i1, i1 0
- %nop7816 = alloca i1, i1 0
- %nop7817 = alloca i1, i1 0
- %nop7818 = alloca i1, i1 0
- %nop7819 = alloca i1, i1 0
- %nop7820 = alloca i1, i1 0
- %nop7821 = alloca i1, i1 0
- %nop7822 = alloca i1, i1 0
- %nop7823 = alloca i1, i1 0
- %nop7824 = alloca i1, i1 0
- %nop7825 = alloca i1, i1 0
- %nop7826 = alloca i1, i1 0
- %nop7827 = alloca i1, i1 0
- %nop7828 = alloca i1, i1 0
- %nop7829 = alloca i1, i1 0
- %nop7830 = alloca i1, i1 0
- %nop7831 = alloca i1, i1 0
- %nop7832 = alloca i1, i1 0
- %nop7833 = alloca i1, i1 0
- %nop7834 = alloca i1, i1 0
- %nop7835 = alloca i1, i1 0
- %nop7836 = alloca i1, i1 0
- %nop7837 = alloca i1, i1 0
- %nop7838 = alloca i1, i1 0
- %nop7839 = alloca i1, i1 0
- %nop7840 = alloca i1, i1 0
- %nop7841 = alloca i1, i1 0
- %nop7842 = alloca i1, i1 0
- %nop7843 = alloca i1, i1 0
- %nop7844 = alloca i1, i1 0
- %nop7845 = alloca i1, i1 0
- %nop7846 = alloca i1, i1 0
- %nop7847 = alloca i1, i1 0
- %nop7848 = alloca i1, i1 0
- %nop7849 = alloca i1, i1 0
- %nop7850 = alloca i1, i1 0
- %nop7851 = alloca i1, i1 0
- %nop7852 = alloca i1, i1 0
- %nop7853 = alloca i1, i1 0
- %nop7854 = alloca i1, i1 0
- %nop7855 = alloca i1, i1 0
- %nop7856 = alloca i1, i1 0
- %nop7857 = alloca i1, i1 0
- %nop7858 = alloca i1, i1 0
- %nop7859 = alloca i1, i1 0
- %nop7860 = alloca i1, i1 0
- %nop7861 = alloca i1, i1 0
- %nop7862 = alloca i1, i1 0
- %nop7863 = alloca i1, i1 0
- %nop7864 = alloca i1, i1 0
- %nop7865 = alloca i1, i1 0
- %nop7866 = alloca i1, i1 0
- %nop7867 = alloca i1, i1 0
- %nop7868 = alloca i1, i1 0
- %nop7869 = alloca i1, i1 0
- %nop7870 = alloca i1, i1 0
- %nop7871 = alloca i1, i1 0
- %nop7872 = alloca i1, i1 0
- %nop7873 = alloca i1, i1 0
- %nop7874 = alloca i1, i1 0
- %nop7875 = alloca i1, i1 0
- %nop7876 = alloca i1, i1 0
- %nop7877 = alloca i1, i1 0
- %nop7878 = alloca i1, i1 0
- %nop7879 = alloca i1, i1 0
- %nop7880 = alloca i1, i1 0
- %nop7881 = alloca i1, i1 0
- %nop7882 = alloca i1, i1 0
- %nop7883 = alloca i1, i1 0
- %nop7884 = alloca i1, i1 0
- %nop7885 = alloca i1, i1 0
- %nop7886 = alloca i1, i1 0
- %nop7887 = alloca i1, i1 0
- %nop7888 = alloca i1, i1 0
- %nop7889 = alloca i1, i1 0
- %nop7890 = alloca i1, i1 0
- %nop7891 = alloca i1, i1 0
- %nop7892 = alloca i1, i1 0
- %nop7893 = alloca i1, i1 0
- %nop7894 = alloca i1, i1 0
- %nop7895 = alloca i1, i1 0
- %nop7896 = alloca i1, i1 0
- %nop7897 = alloca i1, i1 0
- %nop7898 = alloca i1, i1 0
- %nop7899 = alloca i1, i1 0
- %nop7900 = alloca i1, i1 0
- %nop7901 = alloca i1, i1 0
- %nop7902 = alloca i1, i1 0
- %nop7903 = alloca i1, i1 0
- %nop7904 = alloca i1, i1 0
- %nop7905 = alloca i1, i1 0
- %nop7906 = alloca i1, i1 0
- %nop7907 = alloca i1, i1 0
- %nop7908 = alloca i1, i1 0
- %nop7909 = alloca i1, i1 0
- %nop7910 = alloca i1, i1 0
- %nop7911 = alloca i1, i1 0
- %nop7912 = alloca i1, i1 0
- %nop7913 = alloca i1, i1 0
- %nop7914 = alloca i1, i1 0
- %nop7915 = alloca i1, i1 0
- %nop7916 = alloca i1, i1 0
- %nop7917 = alloca i1, i1 0
- %nop7918 = alloca i1, i1 0
- %nop7919 = alloca i1, i1 0
- %nop7920 = alloca i1, i1 0
- %nop7921 = alloca i1, i1 0
- %nop7922 = alloca i1, i1 0
- %nop7923 = alloca i1, i1 0
- %nop7924 = alloca i1, i1 0
- %nop7925 = alloca i1, i1 0
- %nop7926 = alloca i1, i1 0
- %nop7927 = alloca i1, i1 0
- %nop7928 = alloca i1, i1 0
- %nop7929 = alloca i1, i1 0
- %nop7930 = alloca i1, i1 0
- %nop7931 = alloca i1, i1 0
- %nop7932 = alloca i1, i1 0
- %nop7933 = alloca i1, i1 0
- %nop7934 = alloca i1, i1 0
- %nop7935 = alloca i1, i1 0
- %nop7936 = alloca i1, i1 0
- %nop7937 = alloca i1, i1 0
- %nop7938 = alloca i1, i1 0
- %nop7939 = alloca i1, i1 0
- %nop7940 = alloca i1, i1 0
- %nop7941 = alloca i1, i1 0
- %nop7942 = alloca i1, i1 0
- %nop7943 = alloca i1, i1 0
- %nop7944 = alloca i1, i1 0
- %nop7945 = alloca i1, i1 0
- %nop7946 = alloca i1, i1 0
- %nop7947 = alloca i1, i1 0
- %nop7948 = alloca i1, i1 0
- %nop7949 = alloca i1, i1 0
- %nop7950 = alloca i1, i1 0
- %nop7951 = alloca i1, i1 0
- %nop7952 = alloca i1, i1 0
- %nop7953 = alloca i1, i1 0
- %nop7954 = alloca i1, i1 0
- %nop7955 = alloca i1, i1 0
- %nop7956 = alloca i1, i1 0
- %nop7957 = alloca i1, i1 0
- %nop7958 = alloca i1, i1 0
- %nop7959 = alloca i1, i1 0
- %nop7960 = alloca i1, i1 0
- %nop7961 = alloca i1, i1 0
- %nop7962 = alloca i1, i1 0
- %nop7963 = alloca i1, i1 0
- %nop7964 = alloca i1, i1 0
- %nop7965 = alloca i1, i1 0
- %nop7966 = alloca i1, i1 0
- %nop7967 = alloca i1, i1 0
- %nop7968 = alloca i1, i1 0
- %nop7969 = alloca i1, i1 0
- %nop7970 = alloca i1, i1 0
- %nop7971 = alloca i1, i1 0
- %nop7972 = alloca i1, i1 0
- %nop7973 = alloca i1, i1 0
- %nop7974 = alloca i1, i1 0
- %nop7975 = alloca i1, i1 0
- %nop7976 = alloca i1, i1 0
- %nop7977 = alloca i1, i1 0
- %nop7978 = alloca i1, i1 0
- %nop7979 = alloca i1, i1 0
- %nop7980 = alloca i1, i1 0
- %nop7981 = alloca i1, i1 0
- %nop7982 = alloca i1, i1 0
- %nop7983 = alloca i1, i1 0
- %nop7984 = alloca i1, i1 0
- %nop7985 = alloca i1, i1 0
- %nop7986 = alloca i1, i1 0
- %nop7987 = alloca i1, i1 0
- %nop7988 = alloca i1, i1 0
- %nop7989 = alloca i1, i1 0
- %nop7990 = alloca i1, i1 0
- %nop7991 = alloca i1, i1 0
- %nop7992 = alloca i1, i1 0
- %nop7993 = alloca i1, i1 0
- %nop7994 = alloca i1, i1 0
- %nop7995 = alloca i1, i1 0
- %nop7996 = alloca i1, i1 0
- %nop7997 = alloca i1, i1 0
- %nop7998 = alloca i1, i1 0
- %nop7999 = alloca i1, i1 0
- %nop8000 = alloca i1, i1 0
- %nop8001 = alloca i1, i1 0
- %nop8002 = alloca i1, i1 0
- %nop8003 = alloca i1, i1 0
- %nop8004 = alloca i1, i1 0
- %nop8005 = alloca i1, i1 0
- %nop8006 = alloca i1, i1 0
- %nop8007 = alloca i1, i1 0
- %nop8008 = alloca i1, i1 0
- %nop8009 = alloca i1, i1 0
- %nop8010 = alloca i1, i1 0
- %nop8011 = alloca i1, i1 0
- %nop8012 = alloca i1, i1 0
- %nop8013 = alloca i1, i1 0
- %nop8014 = alloca i1, i1 0
- %nop8015 = alloca i1, i1 0
- %nop8016 = alloca i1, i1 0
- %nop8017 = alloca i1, i1 0
- %nop8018 = alloca i1, i1 0
- %nop8019 = alloca i1, i1 0
- %nop8020 = alloca i1, i1 0
- %nop8021 = alloca i1, i1 0
- %nop8022 = alloca i1, i1 0
- %nop8023 = alloca i1, i1 0
- %nop8024 = alloca i1, i1 0
- %nop8025 = alloca i1, i1 0
- %nop8026 = alloca i1, i1 0
- %nop8027 = alloca i1, i1 0
- %nop8028 = alloca i1, i1 0
- %nop8029 = alloca i1, i1 0
- %nop8030 = alloca i1, i1 0
- %nop8031 = alloca i1, i1 0
- %nop8032 = alloca i1, i1 0
- %nop8033 = alloca i1, i1 0
- %nop8034 = alloca i1, i1 0
- %nop8035 = alloca i1, i1 0
- %nop8036 = alloca i1, i1 0
- %nop8037 = alloca i1, i1 0
- %nop8038 = alloca i1, i1 0
- %nop8039 = alloca i1, i1 0
- %nop8040 = alloca i1, i1 0
- %nop8041 = alloca i1, i1 0
- %nop8042 = alloca i1, i1 0
- %nop8043 = alloca i1, i1 0
- %nop8044 = alloca i1, i1 0
- %nop8045 = alloca i1, i1 0
- %nop8046 = alloca i1, i1 0
- %nop8047 = alloca i1, i1 0
- %nop8048 = alloca i1, i1 0
- %nop8049 = alloca i1, i1 0
- %nop8050 = alloca i1, i1 0
- %nop8051 = alloca i1, i1 0
- %nop8052 = alloca i1, i1 0
- %nop8053 = alloca i1, i1 0
- %nop8054 = alloca i1, i1 0
- %nop8055 = alloca i1, i1 0
- %nop8056 = alloca i1, i1 0
- %nop8057 = alloca i1, i1 0
- %nop8058 = alloca i1, i1 0
- %nop8059 = alloca i1, i1 0
- %nop8060 = alloca i1, i1 0
- %nop8061 = alloca i1, i1 0
- %nop8062 = alloca i1, i1 0
- %nop8063 = alloca i1, i1 0
- %nop8064 = alloca i1, i1 0
- %nop8065 = alloca i1, i1 0
- %nop8066 = alloca i1, i1 0
- %nop8067 = alloca i1, i1 0
- %nop8068 = alloca i1, i1 0
- %nop8069 = alloca i1, i1 0
- %nop8070 = alloca i1, i1 0
- %nop8071 = alloca i1, i1 0
- %nop8072 = alloca i1, i1 0
- %nop8073 = alloca i1, i1 0
- %nop8074 = alloca i1, i1 0
- %nop8075 = alloca i1, i1 0
- %nop8076 = alloca i1, i1 0
- %nop8077 = alloca i1, i1 0
- %nop8078 = alloca i1, i1 0
- %nop8079 = alloca i1, i1 0
- %nop8080 = alloca i1, i1 0
- %nop8081 = alloca i1, i1 0
- %nop8082 = alloca i1, i1 0
- %nop8083 = alloca i1, i1 0
- %nop8084 = alloca i1, i1 0
- %nop8085 = alloca i1, i1 0
- %nop8086 = alloca i1, i1 0
- %nop8087 = alloca i1, i1 0
- %nop8088 = alloca i1, i1 0
- %nop8089 = alloca i1, i1 0
- %nop8090 = alloca i1, i1 0
- %nop8091 = alloca i1, i1 0
- %nop8092 = alloca i1, i1 0
- %nop8093 = alloca i1, i1 0
- %nop8094 = alloca i1, i1 0
- %nop8095 = alloca i1, i1 0
- %nop8096 = alloca i1, i1 0
- %nop8097 = alloca i1, i1 0
- %nop8098 = alloca i1, i1 0
- %nop8099 = alloca i1, i1 0
- %nop8100 = alloca i1, i1 0
- %nop8101 = alloca i1, i1 0
- %nop8102 = alloca i1, i1 0
- %nop8103 = alloca i1, i1 0
- %nop8104 = alloca i1, i1 0
- %nop8105 = alloca i1, i1 0
- %nop8106 = alloca i1, i1 0
- %nop8107 = alloca i1, i1 0
- %nop8108 = alloca i1, i1 0
- %nop8109 = alloca i1, i1 0
- %nop8110 = alloca i1, i1 0
- %nop8111 = alloca i1, i1 0
- %nop8112 = alloca i1, i1 0
- %nop8113 = alloca i1, i1 0
- %nop8114 = alloca i1, i1 0
- %nop8115 = alloca i1, i1 0
- %nop8116 = alloca i1, i1 0
- %nop8117 = alloca i1, i1 0
- %nop8118 = alloca i1, i1 0
- %nop8119 = alloca i1, i1 0
- %nop8120 = alloca i1, i1 0
- %nop8121 = alloca i1, i1 0
- %nop8122 = alloca i1, i1 0
- %nop8123 = alloca i1, i1 0
- %nop8124 = alloca i1, i1 0
- %nop8125 = alloca i1, i1 0
- %nop8126 = alloca i1, i1 0
- %nop8127 = alloca i1, i1 0
- %nop8128 = alloca i1, i1 0
- %nop8129 = alloca i1, i1 0
- %nop8130 = alloca i1, i1 0
- %nop8131 = alloca i1, i1 0
- %nop8132 = alloca i1, i1 0
- %nop8133 = alloca i1, i1 0
- %nop8134 = alloca i1, i1 0
- %nop8135 = alloca i1, i1 0
- %nop8136 = alloca i1, i1 0
- %nop8137 = alloca i1, i1 0
- %nop8138 = alloca i1, i1 0
- %nop8139 = alloca i1, i1 0
- %nop8140 = alloca i1, i1 0
- %nop8141 = alloca i1, i1 0
- %nop8142 = alloca i1, i1 0
- %nop8143 = alloca i1, i1 0
- %nop8144 = alloca i1, i1 0
- %nop8145 = alloca i1, i1 0
- %nop8146 = alloca i1, i1 0
- %nop8147 = alloca i1, i1 0
- %nop8148 = alloca i1, i1 0
- %nop8149 = alloca i1, i1 0
- %nop8150 = alloca i1, i1 0
- %nop8151 = alloca i1, i1 0
- %nop8152 = alloca i1, i1 0
- %nop8153 = alloca i1, i1 0
- %nop8154 = alloca i1, i1 0
- %nop8155 = alloca i1, i1 0
- %nop8156 = alloca i1, i1 0
- %nop8157 = alloca i1, i1 0
- %nop8158 = alloca i1, i1 0
- %nop8159 = alloca i1, i1 0
- %nop8160 = alloca i1, i1 0
- %nop8161 = alloca i1, i1 0
- %nop8162 = alloca i1, i1 0
- %nop8163 = alloca i1, i1 0
- %nop8164 = alloca i1, i1 0
- %nop8165 = alloca i1, i1 0
- %nop8166 = alloca i1, i1 0
- %nop8167 = alloca i1, i1 0
- %nop8168 = alloca i1, i1 0
- %nop8169 = alloca i1, i1 0
- %nop8170 = alloca i1, i1 0
- %nop8171 = alloca i1, i1 0
- %nop8172 = alloca i1, i1 0
- %nop8173 = alloca i1, i1 0
- %nop8174 = alloca i1, i1 0
- %nop8175 = alloca i1, i1 0
- %nop8176 = alloca i1, i1 0
- %nop8177 = alloca i1, i1 0
- %nop8178 = alloca i1, i1 0
- %nop8179 = alloca i1, i1 0
- %nop8180 = alloca i1, i1 0
- %nop8181 = alloca i1, i1 0
- %nop8182 = alloca i1, i1 0
- %nop8183 = alloca i1, i1 0
- %nop8184 = alloca i1, i1 0
- %nop8185 = alloca i1, i1 0
- %nop8186 = alloca i1, i1 0
- %nop8187 = alloca i1, i1 0
- %nop8188 = alloca i1, i1 0
- %nop8189 = alloca i1, i1 0
- %nop8190 = alloca i1, i1 0
- %nop8191 = alloca i1, i1 0
- %nop8192 = alloca i1, i1 0
- %nop8193 = alloca i1, i1 0
- %nop8194 = alloca i1, i1 0
- %nop8195 = alloca i1, i1 0
- %nop8196 = alloca i1, i1 0
- %nop8197 = alloca i1, i1 0
- %nop8198 = alloca i1, i1 0
- %nop8199 = alloca i1, i1 0
- %nop8200 = alloca i1, i1 0
- %nop8201 = alloca i1, i1 0
- %nop8202 = alloca i1, i1 0
- %nop8203 = alloca i1, i1 0
- %nop8204 = alloca i1, i1 0
- %nop8205 = alloca i1, i1 0
- %nop8206 = alloca i1, i1 0
- %nop8207 = alloca i1, i1 0
- %nop8208 = alloca i1, i1 0
- %nop8209 = alloca i1, i1 0
- %nop8210 = alloca i1, i1 0
- %nop8211 = alloca i1, i1 0
- %nop8212 = alloca i1, i1 0
- %nop8213 = alloca i1, i1 0
- %nop8214 = alloca i1, i1 0
- %nop8215 = alloca i1, i1 0
- %nop8216 = alloca i1, i1 0
- %nop8217 = alloca i1, i1 0
- %nop8218 = alloca i1, i1 0
- %nop8219 = alloca i1, i1 0
- %nop8220 = alloca i1, i1 0
- %nop8221 = alloca i1, i1 0
- %nop8222 = alloca i1, i1 0
- %nop8223 = alloca i1, i1 0
- %nop8224 = alloca i1, i1 0
- %nop8225 = alloca i1, i1 0
- %nop8226 = alloca i1, i1 0
- %nop8227 = alloca i1, i1 0
- %nop8228 = alloca i1, i1 0
- %nop8229 = alloca i1, i1 0
- %nop8230 = alloca i1, i1 0
- %nop8231 = alloca i1, i1 0
- %nop8232 = alloca i1, i1 0
- %nop8233 = alloca i1, i1 0
- %nop8234 = alloca i1, i1 0
- %nop8235 = alloca i1, i1 0
- %nop8236 = alloca i1, i1 0
- %nop8237 = alloca i1, i1 0
- %nop8238 = alloca i1, i1 0
- %nop8239 = alloca i1, i1 0
- %nop8240 = alloca i1, i1 0
- %nop8241 = alloca i1, i1 0
- %nop8242 = alloca i1, i1 0
- %nop8243 = alloca i1, i1 0
- %nop8244 = alloca i1, i1 0
- %nop8245 = alloca i1, i1 0
- %nop8246 = alloca i1, i1 0
- %nop8247 = alloca i1, i1 0
- %nop8248 = alloca i1, i1 0
- %nop8249 = alloca i1, i1 0
- %nop8250 = alloca i1, i1 0
- %nop8251 = alloca i1, i1 0
- %nop8252 = alloca i1, i1 0
- %nop8253 = alloca i1, i1 0
- %nop8254 = alloca i1, i1 0
- %nop8255 = alloca i1, i1 0
- %nop8256 = alloca i1, i1 0
- %nop8257 = alloca i1, i1 0
- %nop8258 = alloca i1, i1 0
- %nop8259 = alloca i1, i1 0
- %nop8260 = alloca i1, i1 0
- %nop8261 = alloca i1, i1 0
- %nop8262 = alloca i1, i1 0
- %nop8263 = alloca i1, i1 0
- %nop8264 = alloca i1, i1 0
- %nop8265 = alloca i1, i1 0
- %nop8266 = alloca i1, i1 0
- %nop8267 = alloca i1, i1 0
- %nop8268 = alloca i1, i1 0
- %nop8269 = alloca i1, i1 0
- %nop8270 = alloca i1, i1 0
- %nop8271 = alloca i1, i1 0
- %nop8272 = alloca i1, i1 0
- %nop8273 = alloca i1, i1 0
- %nop8274 = alloca i1, i1 0
- %nop8275 = alloca i1, i1 0
- %nop8276 = alloca i1, i1 0
- %nop8277 = alloca i1, i1 0
- %nop8278 = alloca i1, i1 0
- %nop8279 = alloca i1, i1 0
- %nop8280 = alloca i1, i1 0
- %nop8281 = alloca i1, i1 0
- %nop8282 = alloca i1, i1 0
- %nop8283 = alloca i1, i1 0
- %nop8284 = alloca i1, i1 0
- %nop8285 = alloca i1, i1 0
- %nop8286 = alloca i1, i1 0
- %nop8287 = alloca i1, i1 0
- %nop8288 = alloca i1, i1 0
- %nop8289 = alloca i1, i1 0
- %nop8290 = alloca i1, i1 0
- %nop8291 = alloca i1, i1 0
- %nop8292 = alloca i1, i1 0
- %nop8293 = alloca i1, i1 0
- %nop8294 = alloca i1, i1 0
- %nop8295 = alloca i1, i1 0
- %nop8296 = alloca i1, i1 0
- %nop8297 = alloca i1, i1 0
- %nop8298 = alloca i1, i1 0
- %nop8299 = alloca i1, i1 0
- %nop8300 = alloca i1, i1 0
- %nop8301 = alloca i1, i1 0
- %nop8302 = alloca i1, i1 0
- %nop8303 = alloca i1, i1 0
- %nop8304 = alloca i1, i1 0
- %nop8305 = alloca i1, i1 0
- %nop8306 = alloca i1, i1 0
- %nop8307 = alloca i1, i1 0
- %nop8308 = alloca i1, i1 0
- %nop8309 = alloca i1, i1 0
- %nop8310 = alloca i1, i1 0
- %nop8311 = alloca i1, i1 0
- %nop8312 = alloca i1, i1 0
- %nop8313 = alloca i1, i1 0
- %nop8314 = alloca i1, i1 0
- %nop8315 = alloca i1, i1 0
- %nop8316 = alloca i1, i1 0
- %nop8317 = alloca i1, i1 0
- %nop8318 = alloca i1, i1 0
- %nop8319 = alloca i1, i1 0
- %nop8320 = alloca i1, i1 0
- %nop8321 = alloca i1, i1 0
- %nop8322 = alloca i1, i1 0
- %nop8323 = alloca i1, i1 0
- %nop8324 = alloca i1, i1 0
- %nop8325 = alloca i1, i1 0
- %nop8326 = alloca i1, i1 0
- %nop8327 = alloca i1, i1 0
- %nop8328 = alloca i1, i1 0
- %nop8329 = alloca i1, i1 0
- %nop8330 = alloca i1, i1 0
- %nop8331 = alloca i1, i1 0
- %nop8332 = alloca i1, i1 0
- %nop8333 = alloca i1, i1 0
- %nop8334 = alloca i1, i1 0
- %nop8335 = alloca i1, i1 0
- %nop8336 = alloca i1, i1 0
- %nop8337 = alloca i1, i1 0
- %nop8338 = alloca i1, i1 0
- %nop8339 = alloca i1, i1 0
- %nop8340 = alloca i1, i1 0
- %nop8341 = alloca i1, i1 0
- %nop8342 = alloca i1, i1 0
- %nop8343 = alloca i1, i1 0
- %nop8344 = alloca i1, i1 0
- %nop8345 = alloca i1, i1 0
- %nop8346 = alloca i1, i1 0
- %nop8347 = alloca i1, i1 0
- %nop8348 = alloca i1, i1 0
- %nop8349 = alloca i1, i1 0
- %nop8350 = alloca i1, i1 0
- %nop8351 = alloca i1, i1 0
- %nop8352 = alloca i1, i1 0
- %nop8353 = alloca i1, i1 0
- %nop8354 = alloca i1, i1 0
- %nop8355 = alloca i1, i1 0
- %nop8356 = alloca i1, i1 0
- %nop8357 = alloca i1, i1 0
- %nop8358 = alloca i1, i1 0
- %nop8359 = alloca i1, i1 0
- %nop8360 = alloca i1, i1 0
- %nop8361 = alloca i1, i1 0
- %nop8362 = alloca i1, i1 0
- %nop8363 = alloca i1, i1 0
- %nop8364 = alloca i1, i1 0
- %nop8365 = alloca i1, i1 0
- %nop8366 = alloca i1, i1 0
- %nop8367 = alloca i1, i1 0
- %nop8368 = alloca i1, i1 0
- %nop8369 = alloca i1, i1 0
- %nop8370 = alloca i1, i1 0
- %nop8371 = alloca i1, i1 0
- %nop8372 = alloca i1, i1 0
- %nop8373 = alloca i1, i1 0
- %nop8374 = alloca i1, i1 0
- %nop8375 = alloca i1, i1 0
- %nop8376 = alloca i1, i1 0
- %nop8377 = alloca i1, i1 0
- %nop8378 = alloca i1, i1 0
- %nop8379 = alloca i1, i1 0
- %nop8380 = alloca i1, i1 0
- %nop8381 = alloca i1, i1 0
- %nop8382 = alloca i1, i1 0
- %nop8383 = alloca i1, i1 0
- %nop8384 = alloca i1, i1 0
- %nop8385 = alloca i1, i1 0
- %nop8386 = alloca i1, i1 0
- %nop8387 = alloca i1, i1 0
- %nop8388 = alloca i1, i1 0
- %nop8389 = alloca i1, i1 0
- %nop8390 = alloca i1, i1 0
- %nop8391 = alloca i1, i1 0
- %nop8392 = alloca i1, i1 0
- %nop8393 = alloca i1, i1 0
- %nop8394 = alloca i1, i1 0
- %nop8395 = alloca i1, i1 0
- %nop8396 = alloca i1, i1 0
- %nop8397 = alloca i1, i1 0
- %nop8398 = alloca i1, i1 0
- %nop8399 = alloca i1, i1 0
- %nop8400 = alloca i1, i1 0
- %nop8401 = alloca i1, i1 0
- %nop8402 = alloca i1, i1 0
- %nop8403 = alloca i1, i1 0
- %nop8404 = alloca i1, i1 0
- %nop8405 = alloca i1, i1 0
- %nop8406 = alloca i1, i1 0
- %nop8407 = alloca i1, i1 0
- %nop8408 = alloca i1, i1 0
- %nop8409 = alloca i1, i1 0
- %nop8410 = alloca i1, i1 0
- %nop8411 = alloca i1, i1 0
- %nop8412 = alloca i1, i1 0
- %nop8413 = alloca i1, i1 0
- %nop8414 = alloca i1, i1 0
- %nop8415 = alloca i1, i1 0
- %nop8416 = alloca i1, i1 0
- %nop8417 = alloca i1, i1 0
- %nop8418 = alloca i1, i1 0
- %nop8419 = alloca i1, i1 0
- %nop8420 = alloca i1, i1 0
- %nop8421 = alloca i1, i1 0
- %nop8422 = alloca i1, i1 0
- %nop8423 = alloca i1, i1 0
- %nop8424 = alloca i1, i1 0
- %nop8425 = alloca i1, i1 0
- %nop8426 = alloca i1, i1 0
- %nop8427 = alloca i1, i1 0
- %nop8428 = alloca i1, i1 0
- %nop8429 = alloca i1, i1 0
- %nop8430 = alloca i1, i1 0
- %nop8431 = alloca i1, i1 0
- %nop8432 = alloca i1, i1 0
- %nop8433 = alloca i1, i1 0
- %nop8434 = alloca i1, i1 0
- %nop8435 = alloca i1, i1 0
- %nop8436 = alloca i1, i1 0
- %nop8437 = alloca i1, i1 0
- %nop8438 = alloca i1, i1 0
- %nop8439 = alloca i1, i1 0
- %nop8440 = alloca i1, i1 0
- %nop8441 = alloca i1, i1 0
- %nop8442 = alloca i1, i1 0
- %nop8443 = alloca i1, i1 0
- %nop8444 = alloca i1, i1 0
- %nop8445 = alloca i1, i1 0
- %nop8446 = alloca i1, i1 0
- %nop8447 = alloca i1, i1 0
- %nop8448 = alloca i1, i1 0
- %nop8449 = alloca i1, i1 0
- %nop8450 = alloca i1, i1 0
- %nop8451 = alloca i1, i1 0
- %nop8452 = alloca i1, i1 0
- %nop8453 = alloca i1, i1 0
- %nop8454 = alloca i1, i1 0
- %nop8455 = alloca i1, i1 0
- %nop8456 = alloca i1, i1 0
- %nop8457 = alloca i1, i1 0
- %nop8458 = alloca i1, i1 0
- %nop8459 = alloca i1, i1 0
- %nop8460 = alloca i1, i1 0
- %nop8461 = alloca i1, i1 0
- %nop8462 = alloca i1, i1 0
- %nop8463 = alloca i1, i1 0
- %nop8464 = alloca i1, i1 0
- %nop8465 = alloca i1, i1 0
- %nop8466 = alloca i1, i1 0
- %nop8467 = alloca i1, i1 0
- %nop8468 = alloca i1, i1 0
- %nop8469 = alloca i1, i1 0
- %nop8470 = alloca i1, i1 0
- %nop8471 = alloca i1, i1 0
- %nop8472 = alloca i1, i1 0
- %nop8473 = alloca i1, i1 0
- %nop8474 = alloca i1, i1 0
- %nop8475 = alloca i1, i1 0
- %nop8476 = alloca i1, i1 0
- %nop8477 = alloca i1, i1 0
- %nop8478 = alloca i1, i1 0
- %nop8479 = alloca i1, i1 0
- %nop8480 = alloca i1, i1 0
- %nop8481 = alloca i1, i1 0
- %nop8482 = alloca i1, i1 0
- %nop8483 = alloca i1, i1 0
- %nop8484 = alloca i1, i1 0
- %nop8485 = alloca i1, i1 0
- %nop8486 = alloca i1, i1 0
- %nop8487 = alloca i1, i1 0
- %nop8488 = alloca i1, i1 0
- %nop8489 = alloca i1, i1 0
- %nop8490 = alloca i1, i1 0
- %nop8491 = alloca i1, i1 0
- %nop8492 = alloca i1, i1 0
- %nop8493 = alloca i1, i1 0
- %nop8494 = alloca i1, i1 0
- %nop8495 = alloca i1, i1 0
- %nop8496 = alloca i1, i1 0
- %nop8497 = alloca i1, i1 0
- %nop8498 = alloca i1, i1 0
- %nop8499 = alloca i1, i1 0
- %nop8500 = alloca i1, i1 0
- %nop8501 = alloca i1, i1 0
- %nop8502 = alloca i1, i1 0
- %nop8503 = alloca i1, i1 0
- %nop8504 = alloca i1, i1 0
- %nop8505 = alloca i1, i1 0
- %nop8506 = alloca i1, i1 0
- %nop8507 = alloca i1, i1 0
- %nop8508 = alloca i1, i1 0
- %nop8509 = alloca i1, i1 0
- %nop8510 = alloca i1, i1 0
- %nop8511 = alloca i1, i1 0
- %nop8512 = alloca i1, i1 0
- %nop8513 = alloca i1, i1 0
- %nop8514 = alloca i1, i1 0
- %nop8515 = alloca i1, i1 0
- %nop8516 = alloca i1, i1 0
- %nop8517 = alloca i1, i1 0
- %nop8518 = alloca i1, i1 0
- %nop8519 = alloca i1, i1 0
- %nop8520 = alloca i1, i1 0
- %nop8521 = alloca i1, i1 0
- %nop8522 = alloca i1, i1 0
- %nop8523 = alloca i1, i1 0
- %nop8524 = alloca i1, i1 0
- %nop8525 = alloca i1, i1 0
- %nop8526 = alloca i1, i1 0
- %nop8527 = alloca i1, i1 0
- %nop8528 = alloca i1, i1 0
- %nop8529 = alloca i1, i1 0
- %nop8530 = alloca i1, i1 0
- %nop8531 = alloca i1, i1 0
- %nop8532 = alloca i1, i1 0
- %nop8533 = alloca i1, i1 0
- %nop8534 = alloca i1, i1 0
- %nop8535 = alloca i1, i1 0
- %nop8536 = alloca i1, i1 0
- %nop8537 = alloca i1, i1 0
- %nop8538 = alloca i1, i1 0
- %nop8539 = alloca i1, i1 0
- %nop8540 = alloca i1, i1 0
- %nop8541 = alloca i1, i1 0
- %nop8542 = alloca i1, i1 0
- %nop8543 = alloca i1, i1 0
- %nop8544 = alloca i1, i1 0
- %nop8545 = alloca i1, i1 0
- %nop8546 = alloca i1, i1 0
- %nop8547 = alloca i1, i1 0
- %nop8548 = alloca i1, i1 0
- %nop8549 = alloca i1, i1 0
- %nop8550 = alloca i1, i1 0
- %nop8551 = alloca i1, i1 0
- %nop8552 = alloca i1, i1 0
- %nop8553 = alloca i1, i1 0
- %nop8554 = alloca i1, i1 0
- %nop8555 = alloca i1, i1 0
- %nop8556 = alloca i1, i1 0
- %nop8557 = alloca i1, i1 0
- %nop8558 = alloca i1, i1 0
- %nop8559 = alloca i1, i1 0
- %nop8560 = alloca i1, i1 0
- %nop8561 = alloca i1, i1 0
- %nop8562 = alloca i1, i1 0
- %nop8563 = alloca i1, i1 0
- %nop8564 = alloca i1, i1 0
- %nop8565 = alloca i1, i1 0
- %nop8566 = alloca i1, i1 0
- %nop8567 = alloca i1, i1 0
- %nop8568 = alloca i1, i1 0
- %nop8569 = alloca i1, i1 0
- %nop8570 = alloca i1, i1 0
- %nop8571 = alloca i1, i1 0
- %nop8572 = alloca i1, i1 0
- %nop8573 = alloca i1, i1 0
- %nop8574 = alloca i1, i1 0
- %nop8575 = alloca i1, i1 0
- %nop8576 = alloca i1, i1 0
- %nop8577 = alloca i1, i1 0
- %nop8578 = alloca i1, i1 0
- %nop8579 = alloca i1, i1 0
- %nop8580 = alloca i1, i1 0
- %nop8581 = alloca i1, i1 0
- %nop8582 = alloca i1, i1 0
- %nop8583 = alloca i1, i1 0
- %nop8584 = alloca i1, i1 0
- %nop8585 = alloca i1, i1 0
- %nop8586 = alloca i1, i1 0
- %nop8587 = alloca i1, i1 0
- %nop8588 = alloca i1, i1 0
- %nop8589 = alloca i1, i1 0
- %nop8590 = alloca i1, i1 0
- %nop8591 = alloca i1, i1 0
- %nop8592 = alloca i1, i1 0
- %nop8593 = alloca i1, i1 0
- %nop8594 = alloca i1, i1 0
- %nop8595 = alloca i1, i1 0
- %nop8596 = alloca i1, i1 0
- %nop8597 = alloca i1, i1 0
- %nop8598 = alloca i1, i1 0
- %nop8599 = alloca i1, i1 0
- %nop8600 = alloca i1, i1 0
- %nop8601 = alloca i1, i1 0
- %nop8602 = alloca i1, i1 0
- %nop8603 = alloca i1, i1 0
- %nop8604 = alloca i1, i1 0
- %nop8605 = alloca i1, i1 0
- %nop8606 = alloca i1, i1 0
- %nop8607 = alloca i1, i1 0
- %nop8608 = alloca i1, i1 0
- %nop8609 = alloca i1, i1 0
- %nop8610 = alloca i1, i1 0
- %nop8611 = alloca i1, i1 0
- %nop8612 = alloca i1, i1 0
- %nop8613 = alloca i1, i1 0
- %nop8614 = alloca i1, i1 0
- %nop8615 = alloca i1, i1 0
- %nop8616 = alloca i1, i1 0
- %nop8617 = alloca i1, i1 0
- %nop8618 = alloca i1, i1 0
- %nop8619 = alloca i1, i1 0
- %nop8620 = alloca i1, i1 0
- %nop8621 = alloca i1, i1 0
- %nop8622 = alloca i1, i1 0
- %nop8623 = alloca i1, i1 0
- %nop8624 = alloca i1, i1 0
- %nop8625 = alloca i1, i1 0
- %nop8626 = alloca i1, i1 0
- %nop8627 = alloca i1, i1 0
- %nop8628 = alloca i1, i1 0
- %nop8629 = alloca i1, i1 0
- %nop8630 = alloca i1, i1 0
- %nop8631 = alloca i1, i1 0
- %nop8632 = alloca i1, i1 0
- %nop8633 = alloca i1, i1 0
- %nop8634 = alloca i1, i1 0
- %nop8635 = alloca i1, i1 0
- %nop8636 = alloca i1, i1 0
- %nop8637 = alloca i1, i1 0
- %nop8638 = alloca i1, i1 0
- %nop8639 = alloca i1, i1 0
- %nop8640 = alloca i1, i1 0
- %nop8641 = alloca i1, i1 0
- %nop8642 = alloca i1, i1 0
- %nop8643 = alloca i1, i1 0
- %nop8644 = alloca i1, i1 0
- %nop8645 = alloca i1, i1 0
- %nop8646 = alloca i1, i1 0
- %nop8647 = alloca i1, i1 0
- %nop8648 = alloca i1, i1 0
- %nop8649 = alloca i1, i1 0
- %nop8650 = alloca i1, i1 0
- %nop8651 = alloca i1, i1 0
- %nop8652 = alloca i1, i1 0
- %nop8653 = alloca i1, i1 0
- %nop8654 = alloca i1, i1 0
- %nop8655 = alloca i1, i1 0
- %nop8656 = alloca i1, i1 0
- %nop8657 = alloca i1, i1 0
- %nop8658 = alloca i1, i1 0
- %nop8659 = alloca i1, i1 0
- %nop8660 = alloca i1, i1 0
- %nop8661 = alloca i1, i1 0
- %nop8662 = alloca i1, i1 0
- %nop8663 = alloca i1, i1 0
- %nop8664 = alloca i1, i1 0
- %nop8665 = alloca i1, i1 0
- %nop8666 = alloca i1, i1 0
- %nop8667 = alloca i1, i1 0
- %nop8668 = alloca i1, i1 0
- %nop8669 = alloca i1, i1 0
- %nop8670 = alloca i1, i1 0
- %nop8671 = alloca i1, i1 0
- %nop8672 = alloca i1, i1 0
- %nop8673 = alloca i1, i1 0
- %nop8674 = alloca i1, i1 0
- %nop8675 = alloca i1, i1 0
- %nop8676 = alloca i1, i1 0
- %nop8677 = alloca i1, i1 0
- %nop8678 = alloca i1, i1 0
- %nop8679 = alloca i1, i1 0
- %nop8680 = alloca i1, i1 0
- %nop8681 = alloca i1, i1 0
- %nop8682 = alloca i1, i1 0
- %nop8683 = alloca i1, i1 0
- %nop8684 = alloca i1, i1 0
- %nop8685 = alloca i1, i1 0
- %nop8686 = alloca i1, i1 0
- %nop8687 = alloca i1, i1 0
- %nop8688 = alloca i1, i1 0
- %nop8689 = alloca i1, i1 0
- %nop8690 = alloca i1, i1 0
- %nop8691 = alloca i1, i1 0
- %nop8692 = alloca i1, i1 0
- %nop8693 = alloca i1, i1 0
- %nop8694 = alloca i1, i1 0
- %nop8695 = alloca i1, i1 0
- %nop8696 = alloca i1, i1 0
- %nop8697 = alloca i1, i1 0
- %nop8698 = alloca i1, i1 0
- %nop8699 = alloca i1, i1 0
- %nop8700 = alloca i1, i1 0
- %nop8701 = alloca i1, i1 0
- %nop8702 = alloca i1, i1 0
- %nop8703 = alloca i1, i1 0
- %nop8704 = alloca i1, i1 0
- %nop8705 = alloca i1, i1 0
- %nop8706 = alloca i1, i1 0
- %nop8707 = alloca i1, i1 0
- %nop8708 = alloca i1, i1 0
- %nop8709 = alloca i1, i1 0
- %nop8710 = alloca i1, i1 0
- %nop8711 = alloca i1, i1 0
- %nop8712 = alloca i1, i1 0
- %nop8713 = alloca i1, i1 0
- %nop8714 = alloca i1, i1 0
- %nop8715 = alloca i1, i1 0
- %nop8716 = alloca i1, i1 0
- %nop8717 = alloca i1, i1 0
- %nop8718 = alloca i1, i1 0
- %nop8719 = alloca i1, i1 0
- %nop8720 = alloca i1, i1 0
- %nop8721 = alloca i1, i1 0
- %nop8722 = alloca i1, i1 0
- %nop8723 = alloca i1, i1 0
- %nop8724 = alloca i1, i1 0
- %nop8725 = alloca i1, i1 0
- %nop8726 = alloca i1, i1 0
- %nop8727 = alloca i1, i1 0
- %nop8728 = alloca i1, i1 0
- %nop8729 = alloca i1, i1 0
- %nop8730 = alloca i1, i1 0
- %nop8731 = alloca i1, i1 0
- %nop8732 = alloca i1, i1 0
- %nop8733 = alloca i1, i1 0
- %nop8734 = alloca i1, i1 0
- %nop8735 = alloca i1, i1 0
- %nop8736 = alloca i1, i1 0
- %nop8737 = alloca i1, i1 0
- %nop8738 = alloca i1, i1 0
- %nop8739 = alloca i1, i1 0
- %nop8740 = alloca i1, i1 0
- %nop8741 = alloca i1, i1 0
- %nop8742 = alloca i1, i1 0
- %nop8743 = alloca i1, i1 0
- %nop8744 = alloca i1, i1 0
- %nop8745 = alloca i1, i1 0
- %nop8746 = alloca i1, i1 0
- %nop8747 = alloca i1, i1 0
- %nop8748 = alloca i1, i1 0
- %nop8749 = alloca i1, i1 0
- %nop8750 = alloca i1, i1 0
- %nop8751 = alloca i1, i1 0
- %nop8752 = alloca i1, i1 0
- %nop8753 = alloca i1, i1 0
- %nop8754 = alloca i1, i1 0
- %nop8755 = alloca i1, i1 0
- %nop8756 = alloca i1, i1 0
- %nop8757 = alloca i1, i1 0
- %nop8758 = alloca i1, i1 0
- %nop8759 = alloca i1, i1 0
- %nop8760 = alloca i1, i1 0
- %nop8761 = alloca i1, i1 0
- %nop8762 = alloca i1, i1 0
- %nop8763 = alloca i1, i1 0
- %nop8764 = alloca i1, i1 0
- %nop8765 = alloca i1, i1 0
- %nop8766 = alloca i1, i1 0
- %nop8767 = alloca i1, i1 0
- %nop8768 = alloca i1, i1 0
- %nop8769 = alloca i1, i1 0
- %nop8770 = alloca i1, i1 0
- %nop8771 = alloca i1, i1 0
- %nop8772 = alloca i1, i1 0
- %nop8773 = alloca i1, i1 0
- %nop8774 = alloca i1, i1 0
- %nop8775 = alloca i1, i1 0
- %nop8776 = alloca i1, i1 0
- %nop8777 = alloca i1, i1 0
- %nop8778 = alloca i1, i1 0
- %nop8779 = alloca i1, i1 0
- %nop8780 = alloca i1, i1 0
- %nop8781 = alloca i1, i1 0
- %nop8782 = alloca i1, i1 0
- %nop8783 = alloca i1, i1 0
- %nop8784 = alloca i1, i1 0
- %nop8785 = alloca i1, i1 0
- %nop8786 = alloca i1, i1 0
- %nop8787 = alloca i1, i1 0
- %nop8788 = alloca i1, i1 0
- %nop8789 = alloca i1, i1 0
- %nop8790 = alloca i1, i1 0
- %nop8791 = alloca i1, i1 0
- %nop8792 = alloca i1, i1 0
- %nop8793 = alloca i1, i1 0
- %nop8794 = alloca i1, i1 0
- %nop8795 = alloca i1, i1 0
- %nop8796 = alloca i1, i1 0
- %nop8797 = alloca i1, i1 0
- %nop8798 = alloca i1, i1 0
- %nop8799 = alloca i1, i1 0
- %nop8800 = alloca i1, i1 0
- %nop8801 = alloca i1, i1 0
- %nop8802 = alloca i1, i1 0
- %nop8803 = alloca i1, i1 0
- %nop8804 = alloca i1, i1 0
- %nop8805 = alloca i1, i1 0
- %nop8806 = alloca i1, i1 0
- %nop8807 = alloca i1, i1 0
- %nop8808 = alloca i1, i1 0
- %nop8809 = alloca i1, i1 0
- %nop8810 = alloca i1, i1 0
- %nop8811 = alloca i1, i1 0
- %nop8812 = alloca i1, i1 0
- %nop8813 = alloca i1, i1 0
- %nop8814 = alloca i1, i1 0
- %nop8815 = alloca i1, i1 0
- %nop8816 = alloca i1, i1 0
- %nop8817 = alloca i1, i1 0
- %nop8818 = alloca i1, i1 0
- %nop8819 = alloca i1, i1 0
- %nop8820 = alloca i1, i1 0
- %nop8821 = alloca i1, i1 0
- %nop8822 = alloca i1, i1 0
- %nop8823 = alloca i1, i1 0
- %nop8824 = alloca i1, i1 0
- %nop8825 = alloca i1, i1 0
- %nop8826 = alloca i1, i1 0
- %nop8827 = alloca i1, i1 0
- %nop8828 = alloca i1, i1 0
- %nop8829 = alloca i1, i1 0
- %nop8830 = alloca i1, i1 0
- %nop8831 = alloca i1, i1 0
- %nop8832 = alloca i1, i1 0
- %nop8833 = alloca i1, i1 0
- %nop8834 = alloca i1, i1 0
- %nop8835 = alloca i1, i1 0
- %nop8836 = alloca i1, i1 0
- %nop8837 = alloca i1, i1 0
- %nop8838 = alloca i1, i1 0
- %nop8839 = alloca i1, i1 0
- %nop8840 = alloca i1, i1 0
- %nop8841 = alloca i1, i1 0
- %nop8842 = alloca i1, i1 0
- %nop8843 = alloca i1, i1 0
- %nop8844 = alloca i1, i1 0
- %nop8845 = alloca i1, i1 0
- %nop8846 = alloca i1, i1 0
- %nop8847 = alloca i1, i1 0
- %nop8848 = alloca i1, i1 0
- %nop8849 = alloca i1, i1 0
- %nop8850 = alloca i1, i1 0
- %nop8851 = alloca i1, i1 0
- %nop8852 = alloca i1, i1 0
- %nop8853 = alloca i1, i1 0
- %nop8854 = alloca i1, i1 0
- %nop8855 = alloca i1, i1 0
- %nop8856 = alloca i1, i1 0
- %nop8857 = alloca i1, i1 0
- %nop8858 = alloca i1, i1 0
- %nop8859 = alloca i1, i1 0
- %nop8860 = alloca i1, i1 0
- %nop8861 = alloca i1, i1 0
- %nop8862 = alloca i1, i1 0
- %nop8863 = alloca i1, i1 0
- %nop8864 = alloca i1, i1 0
- %nop8865 = alloca i1, i1 0
- %nop8866 = alloca i1, i1 0
- %nop8867 = alloca i1, i1 0
- %nop8868 = alloca i1, i1 0
- %nop8869 = alloca i1, i1 0
- %nop8870 = alloca i1, i1 0
- %nop8871 = alloca i1, i1 0
- %nop8872 = alloca i1, i1 0
- %nop8873 = alloca i1, i1 0
- %nop8874 = alloca i1, i1 0
- %nop8875 = alloca i1, i1 0
- %nop8876 = alloca i1, i1 0
- %nop8877 = alloca i1, i1 0
- %nop8878 = alloca i1, i1 0
- %nop8879 = alloca i1, i1 0
- %nop8880 = alloca i1, i1 0
- %nop8881 = alloca i1, i1 0
- %nop8882 = alloca i1, i1 0
- %nop8883 = alloca i1, i1 0
- %nop8884 = alloca i1, i1 0
- %nop8885 = alloca i1, i1 0
- %nop8886 = alloca i1, i1 0
- %nop8887 = alloca i1, i1 0
- %nop8888 = alloca i1, i1 0
- %nop8889 = alloca i1, i1 0
- %nop8890 = alloca i1, i1 0
- %nop8891 = alloca i1, i1 0
- %nop8892 = alloca i1, i1 0
- %nop8893 = alloca i1, i1 0
- %nop8894 = alloca i1, i1 0
- %nop8895 = alloca i1, i1 0
- %nop8896 = alloca i1, i1 0
- %nop8897 = alloca i1, i1 0
- %nop8898 = alloca i1, i1 0
- %nop8899 = alloca i1, i1 0
- %nop8900 = alloca i1, i1 0
- %nop8901 = alloca i1, i1 0
- %nop8902 = alloca i1, i1 0
- %nop8903 = alloca i1, i1 0
- %nop8904 = alloca i1, i1 0
- %nop8905 = alloca i1, i1 0
- %nop8906 = alloca i1, i1 0
- %nop8907 = alloca i1, i1 0
- %nop8908 = alloca i1, i1 0
- %nop8909 = alloca i1, i1 0
- %nop8910 = alloca i1, i1 0
- %nop8911 = alloca i1, i1 0
- %nop8912 = alloca i1, i1 0
- %nop8913 = alloca i1, i1 0
- %nop8914 = alloca i1, i1 0
- %nop8915 = alloca i1, i1 0
- %nop8916 = alloca i1, i1 0
- %nop8917 = alloca i1, i1 0
- %nop8918 = alloca i1, i1 0
- %nop8919 = alloca i1, i1 0
- %nop8920 = alloca i1, i1 0
- %nop8921 = alloca i1, i1 0
- %nop8922 = alloca i1, i1 0
- %nop8923 = alloca i1, i1 0
- %nop8924 = alloca i1, i1 0
- %nop8925 = alloca i1, i1 0
- %nop8926 = alloca i1, i1 0
- %nop8927 = alloca i1, i1 0
- %nop8928 = alloca i1, i1 0
- %nop8929 = alloca i1, i1 0
- %nop8930 = alloca i1, i1 0
- %nop8931 = alloca i1, i1 0
- %nop8932 = alloca i1, i1 0
- %nop8933 = alloca i1, i1 0
- %nop8934 = alloca i1, i1 0
- %nop8935 = alloca i1, i1 0
- %nop8936 = alloca i1, i1 0
- %nop8937 = alloca i1, i1 0
- %nop8938 = alloca i1, i1 0
- %nop8939 = alloca i1, i1 0
- %nop8940 = alloca i1, i1 0
- %nop8941 = alloca i1, i1 0
- %nop8942 = alloca i1, i1 0
- %nop8943 = alloca i1, i1 0
- %nop8944 = alloca i1, i1 0
- %nop8945 = alloca i1, i1 0
- %nop8946 = alloca i1, i1 0
- %nop8947 = alloca i1, i1 0
- %nop8948 = alloca i1, i1 0
- %nop8949 = alloca i1, i1 0
- %nop8950 = alloca i1, i1 0
- %nop8951 = alloca i1, i1 0
- %nop8952 = alloca i1, i1 0
- %nop8953 = alloca i1, i1 0
- %nop8954 = alloca i1, i1 0
- %nop8955 = alloca i1, i1 0
- %nop8956 = alloca i1, i1 0
- %nop8957 = alloca i1, i1 0
- %nop8958 = alloca i1, i1 0
- %nop8959 = alloca i1, i1 0
- %nop8960 = alloca i1, i1 0
- %nop8961 = alloca i1, i1 0
- %nop8962 = alloca i1, i1 0
- %nop8963 = alloca i1, i1 0
- %nop8964 = alloca i1, i1 0
- %nop8965 = alloca i1, i1 0
- %nop8966 = alloca i1, i1 0
- %nop8967 = alloca i1, i1 0
- %nop8968 = alloca i1, i1 0
- %nop8969 = alloca i1, i1 0
- %nop8970 = alloca i1, i1 0
- %nop8971 = alloca i1, i1 0
- %nop8972 = alloca i1, i1 0
- %nop8973 = alloca i1, i1 0
- %nop8974 = alloca i1, i1 0
- %nop8975 = alloca i1, i1 0
- %nop8976 = alloca i1, i1 0
- %nop8977 = alloca i1, i1 0
- %nop8978 = alloca i1, i1 0
- %nop8979 = alloca i1, i1 0
- %nop8980 = alloca i1, i1 0
- %nop8981 = alloca i1, i1 0
- %nop8982 = alloca i1, i1 0
- %nop8983 = alloca i1, i1 0
- %nop8984 = alloca i1, i1 0
- %nop8985 = alloca i1, i1 0
- %nop8986 = alloca i1, i1 0
- %nop8987 = alloca i1, i1 0
- %nop8988 = alloca i1, i1 0
- %nop8989 = alloca i1, i1 0
- %nop8990 = alloca i1, i1 0
- %nop8991 = alloca i1, i1 0
- %nop8992 = alloca i1, i1 0
- %nop8993 = alloca i1, i1 0
- %nop8994 = alloca i1, i1 0
- %nop8995 = alloca i1, i1 0
- %nop8996 = alloca i1, i1 0
- %nop8997 = alloca i1, i1 0
- %nop8998 = alloca i1, i1 0
- %nop8999 = alloca i1, i1 0
- %nop9000 = alloca i1, i1 0
- %nop9001 = alloca i1, i1 0
- %nop9002 = alloca i1, i1 0
- %nop9003 = alloca i1, i1 0
- %nop9004 = alloca i1, i1 0
- %nop9005 = alloca i1, i1 0
- %nop9006 = alloca i1, i1 0
- %nop9007 = alloca i1, i1 0
- %nop9008 = alloca i1, i1 0
- %nop9009 = alloca i1, i1 0
- %nop9010 = alloca i1, i1 0
- %nop9011 = alloca i1, i1 0
- %nop9012 = alloca i1, i1 0
- %nop9013 = alloca i1, i1 0
- %nop9014 = alloca i1, i1 0
- %nop9015 = alloca i1, i1 0
- %nop9016 = alloca i1, i1 0
- %nop9017 = alloca i1, i1 0
- %nop9018 = alloca i1, i1 0
- %nop9019 = alloca i1, i1 0
- %nop9020 = alloca i1, i1 0
- %nop9021 = alloca i1, i1 0
- %nop9022 = alloca i1, i1 0
- %nop9023 = alloca i1, i1 0
- %nop9024 = alloca i1, i1 0
- %nop9025 = alloca i1, i1 0
- %nop9026 = alloca i1, i1 0
- %nop9027 = alloca i1, i1 0
- %nop9028 = alloca i1, i1 0
- %nop9029 = alloca i1, i1 0
- %nop9030 = alloca i1, i1 0
- %nop9031 = alloca i1, i1 0
- %nop9032 = alloca i1, i1 0
- %nop9033 = alloca i1, i1 0
- %nop9034 = alloca i1, i1 0
- %nop9035 = alloca i1, i1 0
- %nop9036 = alloca i1, i1 0
- %nop9037 = alloca i1, i1 0
- %nop9038 = alloca i1, i1 0
- %nop9039 = alloca i1, i1 0
- %nop9040 = alloca i1, i1 0
- %nop9041 = alloca i1, i1 0
- %nop9042 = alloca i1, i1 0
- %nop9043 = alloca i1, i1 0
- %nop9044 = alloca i1, i1 0
- %nop9045 = alloca i1, i1 0
- %nop9046 = alloca i1, i1 0
- %nop9047 = alloca i1, i1 0
- %nop9048 = alloca i1, i1 0
- %nop9049 = alloca i1, i1 0
- %nop9050 = alloca i1, i1 0
- %nop9051 = alloca i1, i1 0
- %nop9052 = alloca i1, i1 0
- %nop9053 = alloca i1, i1 0
- %nop9054 = alloca i1, i1 0
- %nop9055 = alloca i1, i1 0
- %nop9056 = alloca i1, i1 0
- %nop9057 = alloca i1, i1 0
- %nop9058 = alloca i1, i1 0
- %nop9059 = alloca i1, i1 0
- %nop9060 = alloca i1, i1 0
- %nop9061 = alloca i1, i1 0
- %nop9062 = alloca i1, i1 0
- %nop9063 = alloca i1, i1 0
- %nop9064 = alloca i1, i1 0
- %nop9065 = alloca i1, i1 0
- %nop9066 = alloca i1, i1 0
- %nop9067 = alloca i1, i1 0
- %nop9068 = alloca i1, i1 0
- %nop9069 = alloca i1, i1 0
- %nop9070 = alloca i1, i1 0
- %nop9071 = alloca i1, i1 0
- %nop9072 = alloca i1, i1 0
- %nop9073 = alloca i1, i1 0
- %nop9074 = alloca i1, i1 0
- %nop9075 = alloca i1, i1 0
- %nop9076 = alloca i1, i1 0
- %nop9077 = alloca i1, i1 0
- %nop9078 = alloca i1, i1 0
- %nop9079 = alloca i1, i1 0
- %nop9080 = alloca i1, i1 0
- %nop9081 = alloca i1, i1 0
- %nop9082 = alloca i1, i1 0
- %nop9083 = alloca i1, i1 0
- %nop9084 = alloca i1, i1 0
- %nop9085 = alloca i1, i1 0
- %nop9086 = alloca i1, i1 0
- %nop9087 = alloca i1, i1 0
- %nop9088 = alloca i1, i1 0
- %nop9089 = alloca i1, i1 0
- %nop9090 = alloca i1, i1 0
- %nop9091 = alloca i1, i1 0
- %nop9092 = alloca i1, i1 0
- %nop9093 = alloca i1, i1 0
- %nop9094 = alloca i1, i1 0
- %nop9095 = alloca i1, i1 0
- %nop9096 = alloca i1, i1 0
- %nop9097 = alloca i1, i1 0
- %nop9098 = alloca i1, i1 0
- %nop9099 = alloca i1, i1 0
- %nop9100 = alloca i1, i1 0
- %nop9101 = alloca i1, i1 0
- %nop9102 = alloca i1, i1 0
- %nop9103 = alloca i1, i1 0
- %nop9104 = alloca i1, i1 0
- %nop9105 = alloca i1, i1 0
- %nop9106 = alloca i1, i1 0
- %nop9107 = alloca i1, i1 0
- %nop9108 = alloca i1, i1 0
- %nop9109 = alloca i1, i1 0
- %nop9110 = alloca i1, i1 0
- %nop9111 = alloca i1, i1 0
- %nop9112 = alloca i1, i1 0
- %nop9113 = alloca i1, i1 0
- %nop9114 = alloca i1, i1 0
- %nop9115 = alloca i1, i1 0
- %nop9116 = alloca i1, i1 0
- %nop9117 = alloca i1, i1 0
- %nop9118 = alloca i1, i1 0
- %nop9119 = alloca i1, i1 0
- %nop9120 = alloca i1, i1 0
- %nop9121 = alloca i1, i1 0
- %nop9122 = alloca i1, i1 0
- %nop9123 = alloca i1, i1 0
- %nop9124 = alloca i1, i1 0
- %nop9125 = alloca i1, i1 0
- %nop9126 = alloca i1, i1 0
- %nop9127 = alloca i1, i1 0
- %nop9128 = alloca i1, i1 0
- %nop9129 = alloca i1, i1 0
- %nop9130 = alloca i1, i1 0
- %nop9131 = alloca i1, i1 0
- %nop9132 = alloca i1, i1 0
- %nop9133 = alloca i1, i1 0
- %nop9134 = alloca i1, i1 0
- %nop9135 = alloca i1, i1 0
- %nop9136 = alloca i1, i1 0
- %nop9137 = alloca i1, i1 0
- %nop9138 = alloca i1, i1 0
- %nop9139 = alloca i1, i1 0
- %nop9140 = alloca i1, i1 0
- %nop9141 = alloca i1, i1 0
- %nop9142 = alloca i1, i1 0
- %nop9143 = alloca i1, i1 0
- %nop9144 = alloca i1, i1 0
- %nop9145 = alloca i1, i1 0
- %nop9146 = alloca i1, i1 0
- %nop9147 = alloca i1, i1 0
- %nop9148 = alloca i1, i1 0
- %nop9149 = alloca i1, i1 0
- %nop9150 = alloca i1, i1 0
- %nop9151 = alloca i1, i1 0
- %nop9152 = alloca i1, i1 0
- %nop9153 = alloca i1, i1 0
- %nop9154 = alloca i1, i1 0
- %nop9155 = alloca i1, i1 0
- %nop9156 = alloca i1, i1 0
- %nop9157 = alloca i1, i1 0
- %nop9158 = alloca i1, i1 0
- %nop9159 = alloca i1, i1 0
- %nop9160 = alloca i1, i1 0
- %nop9161 = alloca i1, i1 0
- %nop9162 = alloca i1, i1 0
- %nop9163 = alloca i1, i1 0
- %nop9164 = alloca i1, i1 0
- %nop9165 = alloca i1, i1 0
- %nop9166 = alloca i1, i1 0
- %nop9167 = alloca i1, i1 0
- %nop9168 = alloca i1, i1 0
- %nop9169 = alloca i1, i1 0
- %nop9170 = alloca i1, i1 0
- %nop9171 = alloca i1, i1 0
- %nop9172 = alloca i1, i1 0
- %nop9173 = alloca i1, i1 0
- %nop9174 = alloca i1, i1 0
- %nop9175 = alloca i1, i1 0
- %nop9176 = alloca i1, i1 0
- %nop9177 = alloca i1, i1 0
- %nop9178 = alloca i1, i1 0
- %nop9179 = alloca i1, i1 0
- %nop9180 = alloca i1, i1 0
- %nop9181 = alloca i1, i1 0
- %nop9182 = alloca i1, i1 0
- %nop9183 = alloca i1, i1 0
- %nop9184 = alloca i1, i1 0
- %nop9185 = alloca i1, i1 0
- %nop9186 = alloca i1, i1 0
- %nop9187 = alloca i1, i1 0
- %nop9188 = alloca i1, i1 0
- %nop9189 = alloca i1, i1 0
- %nop9190 = alloca i1, i1 0
- %nop9191 = alloca i1, i1 0
- %nop9192 = alloca i1, i1 0
- %nop9193 = alloca i1, i1 0
- %nop9194 = alloca i1, i1 0
- %nop9195 = alloca i1, i1 0
- %nop9196 = alloca i1, i1 0
- %nop9197 = alloca i1, i1 0
- %nop9198 = alloca i1, i1 0
- %nop9199 = alloca i1, i1 0
- %nop9200 = alloca i1, i1 0
- %nop9201 = alloca i1, i1 0
- %nop9202 = alloca i1, i1 0
- %nop9203 = alloca i1, i1 0
- %nop9204 = alloca i1, i1 0
- %nop9205 = alloca i1, i1 0
- %nop9206 = alloca i1, i1 0
- %nop9207 = alloca i1, i1 0
- %nop9208 = alloca i1, i1 0
- %nop9209 = alloca i1, i1 0
- %nop9210 = alloca i1, i1 0
- %nop9211 = alloca i1, i1 0
- %nop9212 = alloca i1, i1 0
- %nop9213 = alloca i1, i1 0
- %nop9214 = alloca i1, i1 0
- %nop9215 = alloca i1, i1 0
- %nop9216 = alloca i1, i1 0
- %nop9217 = alloca i1, i1 0
- %nop9218 = alloca i1, i1 0
- %nop9219 = alloca i1, i1 0
- %nop9220 = alloca i1, i1 0
- %nop9221 = alloca i1, i1 0
- %nop9222 = alloca i1, i1 0
- %nop9223 = alloca i1, i1 0
- %nop9224 = alloca i1, i1 0
- %nop9225 = alloca i1, i1 0
- %nop9226 = alloca i1, i1 0
- %nop9227 = alloca i1, i1 0
- %nop9228 = alloca i1, i1 0
- %nop9229 = alloca i1, i1 0
- %nop9230 = alloca i1, i1 0
- %nop9231 = alloca i1, i1 0
- %nop9232 = alloca i1, i1 0
- %nop9233 = alloca i1, i1 0
- %nop9234 = alloca i1, i1 0
- %nop9235 = alloca i1, i1 0
- %nop9236 = alloca i1, i1 0
- %nop9237 = alloca i1, i1 0
- %nop9238 = alloca i1, i1 0
- %nop9239 = alloca i1, i1 0
- %nop9240 = alloca i1, i1 0
- %nop9241 = alloca i1, i1 0
- %nop9242 = alloca i1, i1 0
- %nop9243 = alloca i1, i1 0
- %nop9244 = alloca i1, i1 0
- %nop9245 = alloca i1, i1 0
- %nop9246 = alloca i1, i1 0
- %nop9247 = alloca i1, i1 0
- %nop9248 = alloca i1, i1 0
- %nop9249 = alloca i1, i1 0
- %nop9250 = alloca i1, i1 0
- %nop9251 = alloca i1, i1 0
- %nop9252 = alloca i1, i1 0
- %nop9253 = alloca i1, i1 0
- %nop9254 = alloca i1, i1 0
- %nop9255 = alloca i1, i1 0
- %nop9256 = alloca i1, i1 0
- %nop9257 = alloca i1, i1 0
- %nop9258 = alloca i1, i1 0
- %nop9259 = alloca i1, i1 0
- %nop9260 = alloca i1, i1 0
- %nop9261 = alloca i1, i1 0
- %nop9262 = alloca i1, i1 0
- %nop9263 = alloca i1, i1 0
- %nop9264 = alloca i1, i1 0
- %nop9265 = alloca i1, i1 0
- %nop9266 = alloca i1, i1 0
- %nop9267 = alloca i1, i1 0
- %nop9268 = alloca i1, i1 0
- %nop9269 = alloca i1, i1 0
- %nop9270 = alloca i1, i1 0
- %nop9271 = alloca i1, i1 0
- %nop9272 = alloca i1, i1 0
- %nop9273 = alloca i1, i1 0
- %nop9274 = alloca i1, i1 0
- %nop9275 = alloca i1, i1 0
- %nop9276 = alloca i1, i1 0
- %nop9277 = alloca i1, i1 0
- %nop9278 = alloca i1, i1 0
- %nop9279 = alloca i1, i1 0
- %nop9280 = alloca i1, i1 0
- %nop9281 = alloca i1, i1 0
- %nop9282 = alloca i1, i1 0
- %nop9283 = alloca i1, i1 0
- %nop9284 = alloca i1, i1 0
- %nop9285 = alloca i1, i1 0
- %nop9286 = alloca i1, i1 0
- %nop9287 = alloca i1, i1 0
- %nop9288 = alloca i1, i1 0
- %nop9289 = alloca i1, i1 0
- %nop9290 = alloca i1, i1 0
- %nop9291 = alloca i1, i1 0
- %nop9292 = alloca i1, i1 0
- %nop9293 = alloca i1, i1 0
- %nop9294 = alloca i1, i1 0
- %nop9295 = alloca i1, i1 0
- %nop9296 = alloca i1, i1 0
- %nop9297 = alloca i1, i1 0
- %nop9298 = alloca i1, i1 0
- %nop9299 = alloca i1, i1 0
- %nop9300 = alloca i1, i1 0
- %nop9301 = alloca i1, i1 0
- %nop9302 = alloca i1, i1 0
- %nop9303 = alloca i1, i1 0
- %nop9304 = alloca i1, i1 0
- %nop9305 = alloca i1, i1 0
- %nop9306 = alloca i1, i1 0
- %nop9307 = alloca i1, i1 0
- %nop9308 = alloca i1, i1 0
- %nop9309 = alloca i1, i1 0
- %nop9310 = alloca i1, i1 0
- %nop9311 = alloca i1, i1 0
- %nop9312 = alloca i1, i1 0
- %nop9313 = alloca i1, i1 0
- %nop9314 = alloca i1, i1 0
- %nop9315 = alloca i1, i1 0
- %nop9316 = alloca i1, i1 0
- %nop9317 = alloca i1, i1 0
- %nop9318 = alloca i1, i1 0
- %nop9319 = alloca i1, i1 0
- %nop9320 = alloca i1, i1 0
- %nop9321 = alloca i1, i1 0
- %nop9322 = alloca i1, i1 0
- %nop9323 = alloca i1, i1 0
- %nop9324 = alloca i1, i1 0
- %nop9325 = alloca i1, i1 0
- %nop9326 = alloca i1, i1 0
- %nop9327 = alloca i1, i1 0
- %nop9328 = alloca i1, i1 0
- %nop9329 = alloca i1, i1 0
- %nop9330 = alloca i1, i1 0
- %nop9331 = alloca i1, i1 0
- %nop9332 = alloca i1, i1 0
- %nop9333 = alloca i1, i1 0
- %nop9334 = alloca i1, i1 0
- %nop9335 = alloca i1, i1 0
- %nop9336 = alloca i1, i1 0
- %nop9337 = alloca i1, i1 0
- %nop9338 = alloca i1, i1 0
- %nop9339 = alloca i1, i1 0
- %nop9340 = alloca i1, i1 0
- %nop9341 = alloca i1, i1 0
- %nop9342 = alloca i1, i1 0
- %nop9343 = alloca i1, i1 0
- %nop9344 = alloca i1, i1 0
- %nop9345 = alloca i1, i1 0
- %nop9346 = alloca i1, i1 0
- %nop9347 = alloca i1, i1 0
- %nop9348 = alloca i1, i1 0
- %nop9349 = alloca i1, i1 0
- %nop9350 = alloca i1, i1 0
- %nop9351 = alloca i1, i1 0
- %nop9352 = alloca i1, i1 0
- %nop9353 = alloca i1, i1 0
- %nop9354 = alloca i1, i1 0
- %nop9355 = alloca i1, i1 0
- %nop9356 = alloca i1, i1 0
- %nop9357 = alloca i1, i1 0
- %nop9358 = alloca i1, i1 0
- %nop9359 = alloca i1, i1 0
- %nop9360 = alloca i1, i1 0
- %nop9361 = alloca i1, i1 0
- %nop9362 = alloca i1, i1 0
- %nop9363 = alloca i1, i1 0
- %nop9364 = alloca i1, i1 0
- %nop9365 = alloca i1, i1 0
- %nop9366 = alloca i1, i1 0
- %nop9367 = alloca i1, i1 0
- %nop9368 = alloca i1, i1 0
- %nop9369 = alloca i1, i1 0
- %nop9370 = alloca i1, i1 0
- %nop9371 = alloca i1, i1 0
- %nop9372 = alloca i1, i1 0
- %nop9373 = alloca i1, i1 0
- %nop9374 = alloca i1, i1 0
- %nop9375 = alloca i1, i1 0
- %nop9376 = alloca i1, i1 0
- %nop9377 = alloca i1, i1 0
- %nop9378 = alloca i1, i1 0
- %nop9379 = alloca i1, i1 0
- %nop9380 = alloca i1, i1 0
- %nop9381 = alloca i1, i1 0
- %nop9382 = alloca i1, i1 0
- %nop9383 = alloca i1, i1 0
- %nop9384 = alloca i1, i1 0
- %nop9385 = alloca i1, i1 0
- %nop9386 = alloca i1, i1 0
- %nop9387 = alloca i1, i1 0
- %nop9388 = alloca i1, i1 0
- %nop9389 = alloca i1, i1 0
- %nop9390 = alloca i1, i1 0
- %nop9391 = alloca i1, i1 0
- %nop9392 = alloca i1, i1 0
- %nop9393 = alloca i1, i1 0
- %nop9394 = alloca i1, i1 0
- %nop9395 = alloca i1, i1 0
- %nop9396 = alloca i1, i1 0
- %nop9397 = alloca i1, i1 0
- %nop9398 = alloca i1, i1 0
- %nop9399 = alloca i1, i1 0
- %nop9400 = alloca i1, i1 0
- %nop9401 = alloca i1, i1 0
- %nop9402 = alloca i1, i1 0
- %nop9403 = alloca i1, i1 0
- %nop9404 = alloca i1, i1 0
- %nop9405 = alloca i1, i1 0
- %nop9406 = alloca i1, i1 0
- %nop9407 = alloca i1, i1 0
- %nop9408 = alloca i1, i1 0
- %nop9409 = alloca i1, i1 0
- %nop9410 = alloca i1, i1 0
- %nop9411 = alloca i1, i1 0
- %nop9412 = alloca i1, i1 0
- %nop9413 = alloca i1, i1 0
- %nop9414 = alloca i1, i1 0
- %nop9415 = alloca i1, i1 0
- %nop9416 = alloca i1, i1 0
- %nop9417 = alloca i1, i1 0
- %nop9418 = alloca i1, i1 0
- %nop9419 = alloca i1, i1 0
- %nop9420 = alloca i1, i1 0
- %nop9421 = alloca i1, i1 0
- %nop9422 = alloca i1, i1 0
- %nop9423 = alloca i1, i1 0
- %nop9424 = alloca i1, i1 0
- %nop9425 = alloca i1, i1 0
- %nop9426 = alloca i1, i1 0
- %nop9427 = alloca i1, i1 0
- %nop9428 = alloca i1, i1 0
- %nop9429 = alloca i1, i1 0
- %nop9430 = alloca i1, i1 0
- %nop9431 = alloca i1, i1 0
- %nop9432 = alloca i1, i1 0
- %nop9433 = alloca i1, i1 0
- %nop9434 = alloca i1, i1 0
- %nop9435 = alloca i1, i1 0
- %nop9436 = alloca i1, i1 0
- %nop9437 = alloca i1, i1 0
- %nop9438 = alloca i1, i1 0
- %nop9439 = alloca i1, i1 0
- %nop9440 = alloca i1, i1 0
- %nop9441 = alloca i1, i1 0
- %nop9442 = alloca i1, i1 0
- %nop9443 = alloca i1, i1 0
- %nop9444 = alloca i1, i1 0
- %nop9445 = alloca i1, i1 0
- %nop9446 = alloca i1, i1 0
- %nop9447 = alloca i1, i1 0
- %nop9448 = alloca i1, i1 0
- %nop9449 = alloca i1, i1 0
- %nop9450 = alloca i1, i1 0
- %nop9451 = alloca i1, i1 0
- %nop9452 = alloca i1, i1 0
- %nop9453 = alloca i1, i1 0
- %nop9454 = alloca i1, i1 0
- %nop9455 = alloca i1, i1 0
- %nop9456 = alloca i1, i1 0
- %nop9457 = alloca i1, i1 0
- %nop9458 = alloca i1, i1 0
- %nop9459 = alloca i1, i1 0
- %nop9460 = alloca i1, i1 0
- %nop9461 = alloca i1, i1 0
- %nop9462 = alloca i1, i1 0
- %nop9463 = alloca i1, i1 0
- %nop9464 = alloca i1, i1 0
- %nop9465 = alloca i1, i1 0
- %nop9466 = alloca i1, i1 0
- %nop9467 = alloca i1, i1 0
- %nop9468 = alloca i1, i1 0
- %nop9469 = alloca i1, i1 0
- %nop9470 = alloca i1, i1 0
- %nop9471 = alloca i1, i1 0
- %nop9472 = alloca i1, i1 0
- %nop9473 = alloca i1, i1 0
- %nop9474 = alloca i1, i1 0
- %nop9475 = alloca i1, i1 0
- %nop9476 = alloca i1, i1 0
- %nop9477 = alloca i1, i1 0
- %nop9478 = alloca i1, i1 0
- %nop9479 = alloca i1, i1 0
- %nop9480 = alloca i1, i1 0
- %nop9481 = alloca i1, i1 0
- %nop9482 = alloca i1, i1 0
- %nop9483 = alloca i1, i1 0
- %nop9484 = alloca i1, i1 0
- %nop9485 = alloca i1, i1 0
- %nop9486 = alloca i1, i1 0
- %nop9487 = alloca i1, i1 0
- %nop9488 = alloca i1, i1 0
- %nop9489 = alloca i1, i1 0
- %nop9490 = alloca i1, i1 0
- %nop9491 = alloca i1, i1 0
- %nop9492 = alloca i1, i1 0
- %nop9493 = alloca i1, i1 0
- %nop9494 = alloca i1, i1 0
- %nop9495 = alloca i1, i1 0
- %nop9496 = alloca i1, i1 0
- %nop9497 = alloca i1, i1 0
- %nop9498 = alloca i1, i1 0
- %nop9499 = alloca i1, i1 0
- %nop9500 = alloca i1, i1 0
- %nop9501 = alloca i1, i1 0
- %nop9502 = alloca i1, i1 0
- %nop9503 = alloca i1, i1 0
- %nop9504 = alloca i1, i1 0
- %nop9505 = alloca i1, i1 0
- %nop9506 = alloca i1, i1 0
- %nop9507 = alloca i1, i1 0
- %nop9508 = alloca i1, i1 0
- %nop9509 = alloca i1, i1 0
- %nop9510 = alloca i1, i1 0
- %nop9511 = alloca i1, i1 0
- %nop9512 = alloca i1, i1 0
- %nop9513 = alloca i1, i1 0
- %nop9514 = alloca i1, i1 0
- %nop9515 = alloca i1, i1 0
- %nop9516 = alloca i1, i1 0
- %nop9517 = alloca i1, i1 0
- %nop9518 = alloca i1, i1 0
- %nop9519 = alloca i1, i1 0
- %nop9520 = alloca i1, i1 0
- %nop9521 = alloca i1, i1 0
- %nop9522 = alloca i1, i1 0
- %nop9523 = alloca i1, i1 0
- %nop9524 = alloca i1, i1 0
- %nop9525 = alloca i1, i1 0
- %nop9526 = alloca i1, i1 0
- %nop9527 = alloca i1, i1 0
- %nop9528 = alloca i1, i1 0
- %nop9529 = alloca i1, i1 0
- %nop9530 = alloca i1, i1 0
- %nop9531 = alloca i1, i1 0
- %nop9532 = alloca i1, i1 0
- %nop9533 = alloca i1, i1 0
- %nop9534 = alloca i1, i1 0
- %nop9535 = alloca i1, i1 0
- %nop9536 = alloca i1, i1 0
- %nop9537 = alloca i1, i1 0
- %nop9538 = alloca i1, i1 0
- %nop9539 = alloca i1, i1 0
- %nop9540 = alloca i1, i1 0
- %nop9541 = alloca i1, i1 0
- %nop9542 = alloca i1, i1 0
- %nop9543 = alloca i1, i1 0
- %nop9544 = alloca i1, i1 0
- %nop9545 = alloca i1, i1 0
- %nop9546 = alloca i1, i1 0
- %nop9547 = alloca i1, i1 0
- %nop9548 = alloca i1, i1 0
- %nop9549 = alloca i1, i1 0
- %nop9550 = alloca i1, i1 0
- %nop9551 = alloca i1, i1 0
- %nop9552 = alloca i1, i1 0
- %nop9553 = alloca i1, i1 0
- %nop9554 = alloca i1, i1 0
- %nop9555 = alloca i1, i1 0
- %nop9556 = alloca i1, i1 0
- %nop9557 = alloca i1, i1 0
- %nop9558 = alloca i1, i1 0
- %nop9559 = alloca i1, i1 0
- %nop9560 = alloca i1, i1 0
- %nop9561 = alloca i1, i1 0
- %nop9562 = alloca i1, i1 0
- %nop9563 = alloca i1, i1 0
- %nop9564 = alloca i1, i1 0
- %nop9565 = alloca i1, i1 0
- %nop9566 = alloca i1, i1 0
- %nop9567 = alloca i1, i1 0
- %nop9568 = alloca i1, i1 0
- %nop9569 = alloca i1, i1 0
- %nop9570 = alloca i1, i1 0
- %nop9571 = alloca i1, i1 0
- %nop9572 = alloca i1, i1 0
- %nop9573 = alloca i1, i1 0
- %nop9574 = alloca i1, i1 0
- %nop9575 = alloca i1, i1 0
- %nop9576 = alloca i1, i1 0
- %nop9577 = alloca i1, i1 0
- %nop9578 = alloca i1, i1 0
- %nop9579 = alloca i1, i1 0
- %nop9580 = alloca i1, i1 0
- %nop9581 = alloca i1, i1 0
- %nop9582 = alloca i1, i1 0
- %nop9583 = alloca i1, i1 0
- %nop9584 = alloca i1, i1 0
- %nop9585 = alloca i1, i1 0
- %nop9586 = alloca i1, i1 0
- %nop9587 = alloca i1, i1 0
- %nop9588 = alloca i1, i1 0
- %nop9589 = alloca i1, i1 0
- %nop9590 = alloca i1, i1 0
- %nop9591 = alloca i1, i1 0
- %nop9592 = alloca i1, i1 0
- %nop9593 = alloca i1, i1 0
- %nop9594 = alloca i1, i1 0
- %nop9595 = alloca i1, i1 0
- %nop9596 = alloca i1, i1 0
- %nop9597 = alloca i1, i1 0
- %nop9598 = alloca i1, i1 0
- %nop9599 = alloca i1, i1 0
- %nop9600 = alloca i1, i1 0
- %nop9601 = alloca i1, i1 0
- %nop9602 = alloca i1, i1 0
- %nop9603 = alloca i1, i1 0
- %nop9604 = alloca i1, i1 0
- %nop9605 = alloca i1, i1 0
- %nop9606 = alloca i1, i1 0
- %nop9607 = alloca i1, i1 0
- %nop9608 = alloca i1, i1 0
- %nop9609 = alloca i1, i1 0
- %nop9610 = alloca i1, i1 0
- %nop9611 = alloca i1, i1 0
- %nop9612 = alloca i1, i1 0
- %nop9613 = alloca i1, i1 0
- %nop9614 = alloca i1, i1 0
- %nop9615 = alloca i1, i1 0
- %nop9616 = alloca i1, i1 0
- %nop9617 = alloca i1, i1 0
- %nop9618 = alloca i1, i1 0
- %nop9619 = alloca i1, i1 0
- %nop9620 = alloca i1, i1 0
- %nop9621 = alloca i1, i1 0
- %nop9622 = alloca i1, i1 0
- %nop9623 = alloca i1, i1 0
- %nop9624 = alloca i1, i1 0
- %nop9625 = alloca i1, i1 0
- %nop9626 = alloca i1, i1 0
- %nop9627 = alloca i1, i1 0
- %nop9628 = alloca i1, i1 0
- %nop9629 = alloca i1, i1 0
- %nop9630 = alloca i1, i1 0
- %nop9631 = alloca i1, i1 0
- %nop9632 = alloca i1, i1 0
- %nop9633 = alloca i1, i1 0
- %nop9634 = alloca i1, i1 0
- %nop9635 = alloca i1, i1 0
- %nop9636 = alloca i1, i1 0
- %nop9637 = alloca i1, i1 0
- %nop9638 = alloca i1, i1 0
- %nop9639 = alloca i1, i1 0
- %nop9640 = alloca i1, i1 0
- %nop9641 = alloca i1, i1 0
- %nop9642 = alloca i1, i1 0
- %nop9643 = alloca i1, i1 0
- %nop9644 = alloca i1, i1 0
- %nop9645 = alloca i1, i1 0
- %nop9646 = alloca i1, i1 0
- %nop9647 = alloca i1, i1 0
- %nop9648 = alloca i1, i1 0
- %nop9649 = alloca i1, i1 0
- %nop9650 = alloca i1, i1 0
- %nop9651 = alloca i1, i1 0
- %nop9652 = alloca i1, i1 0
- %nop9653 = alloca i1, i1 0
- %nop9654 = alloca i1, i1 0
- %nop9655 = alloca i1, i1 0
- %nop9656 = alloca i1, i1 0
- %nop9657 = alloca i1, i1 0
- %nop9658 = alloca i1, i1 0
- %nop9659 = alloca i1, i1 0
- %nop9660 = alloca i1, i1 0
- %nop9661 = alloca i1, i1 0
- %nop9662 = alloca i1, i1 0
- %nop9663 = alloca i1, i1 0
- %nop9664 = alloca i1, i1 0
- %nop9665 = alloca i1, i1 0
- %nop9666 = alloca i1, i1 0
- %nop9667 = alloca i1, i1 0
- %nop9668 = alloca i1, i1 0
- %nop9669 = alloca i1, i1 0
- %nop9670 = alloca i1, i1 0
- %nop9671 = alloca i1, i1 0
- %nop9672 = alloca i1, i1 0
- %nop9673 = alloca i1, i1 0
- %nop9674 = alloca i1, i1 0
- %nop9675 = alloca i1, i1 0
- %nop9676 = alloca i1, i1 0
- %nop9677 = alloca i1, i1 0
- %nop9678 = alloca i1, i1 0
- %nop9679 = alloca i1, i1 0
- %nop9680 = alloca i1, i1 0
- %nop9681 = alloca i1, i1 0
- %nop9682 = alloca i1, i1 0
- %nop9683 = alloca i1, i1 0
- %nop9684 = alloca i1, i1 0
- %nop9685 = alloca i1, i1 0
- %nop9686 = alloca i1, i1 0
- %nop9687 = alloca i1, i1 0
- %nop9688 = alloca i1, i1 0
- %nop9689 = alloca i1, i1 0
- %nop9690 = alloca i1, i1 0
- %nop9691 = alloca i1, i1 0
- %nop9692 = alloca i1, i1 0
- %nop9693 = alloca i1, i1 0
- %nop9694 = alloca i1, i1 0
- %nop9695 = alloca i1, i1 0
- %nop9696 = alloca i1, i1 0
- %nop9697 = alloca i1, i1 0
- %nop9698 = alloca i1, i1 0
- %nop9699 = alloca i1, i1 0
- %nop9700 = alloca i1, i1 0
- %nop9701 = alloca i1, i1 0
- %nop9702 = alloca i1, i1 0
- %nop9703 = alloca i1, i1 0
- %nop9704 = alloca i1, i1 0
- %nop9705 = alloca i1, i1 0
- %nop9706 = alloca i1, i1 0
- %nop9707 = alloca i1, i1 0
- %nop9708 = alloca i1, i1 0
- %nop9709 = alloca i1, i1 0
- %nop9710 = alloca i1, i1 0
- %nop9711 = alloca i1, i1 0
- %nop9712 = alloca i1, i1 0
- %nop9713 = alloca i1, i1 0
- %nop9714 = alloca i1, i1 0
- %nop9715 = alloca i1, i1 0
- %nop9716 = alloca i1, i1 0
- %nop9717 = alloca i1, i1 0
- %nop9718 = alloca i1, i1 0
- %nop9719 = alloca i1, i1 0
- %nop9720 = alloca i1, i1 0
- %nop9721 = alloca i1, i1 0
- %nop9722 = alloca i1, i1 0
- %nop9723 = alloca i1, i1 0
- %nop9724 = alloca i1, i1 0
- %nop9725 = alloca i1, i1 0
- %nop9726 = alloca i1, i1 0
- %nop9727 = alloca i1, i1 0
- %nop9728 = alloca i1, i1 0
- %nop9729 = alloca i1, i1 0
- %nop9730 = alloca i1, i1 0
- %nop9731 = alloca i1, i1 0
- %nop9732 = alloca i1, i1 0
- %nop9733 = alloca i1, i1 0
- %nop9734 = alloca i1, i1 0
- %nop9735 = alloca i1, i1 0
- %nop9736 = alloca i1, i1 0
- %nop9737 = alloca i1, i1 0
- %nop9738 = alloca i1, i1 0
- %nop9739 = alloca i1, i1 0
- %nop9740 = alloca i1, i1 0
- %nop9741 = alloca i1, i1 0
- %nop9742 = alloca i1, i1 0
- %nop9743 = alloca i1, i1 0
- %nop9744 = alloca i1, i1 0
- %nop9745 = alloca i1, i1 0
- %nop9746 = alloca i1, i1 0
- %nop9747 = alloca i1, i1 0
- %nop9748 = alloca i1, i1 0
- %nop9749 = alloca i1, i1 0
- %nop9750 = alloca i1, i1 0
- %nop9751 = alloca i1, i1 0
- %nop9752 = alloca i1, i1 0
- %nop9753 = alloca i1, i1 0
- %nop9754 = alloca i1, i1 0
- %nop9755 = alloca i1, i1 0
- %nop9756 = alloca i1, i1 0
- %nop9757 = alloca i1, i1 0
- %nop9758 = alloca i1, i1 0
- %nop9759 = alloca i1, i1 0
- %nop9760 = alloca i1, i1 0
- %nop9761 = alloca i1, i1 0
- %nop9762 = alloca i1, i1 0
- %nop9763 = alloca i1, i1 0
- %nop9764 = alloca i1, i1 0
- %nop9765 = alloca i1, i1 0
- %nop9766 = alloca i1, i1 0
- %nop9767 = alloca i1, i1 0
- %nop9768 = alloca i1, i1 0
- %nop9769 = alloca i1, i1 0
- %nop9770 = alloca i1, i1 0
- %nop9771 = alloca i1, i1 0
- %nop9772 = alloca i1, i1 0
- %nop9773 = alloca i1, i1 0
- %nop9774 = alloca i1, i1 0
- %nop9775 = alloca i1, i1 0
- %nop9776 = alloca i1, i1 0
- %nop9777 = alloca i1, i1 0
- %nop9778 = alloca i1, i1 0
- %nop9779 = alloca i1, i1 0
- %nop9780 = alloca i1, i1 0
- %nop9781 = alloca i1, i1 0
- %nop9782 = alloca i1, i1 0
- %nop9783 = alloca i1, i1 0
- %nop9784 = alloca i1, i1 0
- %nop9785 = alloca i1, i1 0
- %nop9786 = alloca i1, i1 0
- %nop9787 = alloca i1, i1 0
- %nop9788 = alloca i1, i1 0
- %nop9789 = alloca i1, i1 0
- %nop9790 = alloca i1, i1 0
- %nop9791 = alloca i1, i1 0
- %nop9792 = alloca i1, i1 0
- %nop9793 = alloca i1, i1 0
- %nop9794 = alloca i1, i1 0
- %nop9795 = alloca i1, i1 0
- %nop9796 = alloca i1, i1 0
- %nop9797 = alloca i1, i1 0
- %nop9798 = alloca i1, i1 0
- %nop9799 = alloca i1, i1 0
- %nop9800 = alloca i1, i1 0
- %nop9801 = alloca i1, i1 0
- %nop9802 = alloca i1, i1 0
- %nop9803 = alloca i1, i1 0
- %nop9804 = alloca i1, i1 0
- %nop9805 = alloca i1, i1 0
- %nop9806 = alloca i1, i1 0
- %nop9807 = alloca i1, i1 0
- %nop9808 = alloca i1, i1 0
- %nop9809 = alloca i1, i1 0
- %nop9810 = alloca i1, i1 0
- %nop9811 = alloca i1, i1 0
- %nop9812 = alloca i1, i1 0
- %nop9813 = alloca i1, i1 0
- %nop9814 = alloca i1, i1 0
- %nop9815 = alloca i1, i1 0
- %nop9816 = alloca i1, i1 0
- %nop9817 = alloca i1, i1 0
- %nop9818 = alloca i1, i1 0
- %nop9819 = alloca i1, i1 0
- %nop9820 = alloca i1, i1 0
- %nop9821 = alloca i1, i1 0
- %nop9822 = alloca i1, i1 0
- %nop9823 = alloca i1, i1 0
- %nop9824 = alloca i1, i1 0
- %nop9825 = alloca i1, i1 0
- %nop9826 = alloca i1, i1 0
- %nop9827 = alloca i1, i1 0
- %nop9828 = alloca i1, i1 0
- %nop9829 = alloca i1, i1 0
- %nop9830 = alloca i1, i1 0
- %nop9831 = alloca i1, i1 0
- %nop9832 = alloca i1, i1 0
- %nop9833 = alloca i1, i1 0
- %nop9834 = alloca i1, i1 0
- %nop9835 = alloca i1, i1 0
- %nop9836 = alloca i1, i1 0
- %nop9837 = alloca i1, i1 0
- %nop9838 = alloca i1, i1 0
- %nop9839 = alloca i1, i1 0
- %nop9840 = alloca i1, i1 0
- %nop9841 = alloca i1, i1 0
- %nop9842 = alloca i1, i1 0
- %nop9843 = alloca i1, i1 0
- %nop9844 = alloca i1, i1 0
- %nop9845 = alloca i1, i1 0
- %nop9846 = alloca i1, i1 0
- %nop9847 = alloca i1, i1 0
- %nop9848 = alloca i1, i1 0
- %nop9849 = alloca i1, i1 0
- %nop9850 = alloca i1, i1 0
- %nop9851 = alloca i1, i1 0
- %nop9852 = alloca i1, i1 0
- %nop9853 = alloca i1, i1 0
- %nop9854 = alloca i1, i1 0
- %nop9855 = alloca i1, i1 0
- %nop9856 = alloca i1, i1 0
- %nop9857 = alloca i1, i1 0
- %nop9858 = alloca i1, i1 0
- %nop9859 = alloca i1, i1 0
- %nop9860 = alloca i1, i1 0
- %nop9861 = alloca i1, i1 0
- %nop9862 = alloca i1, i1 0
- %nop9863 = alloca i1, i1 0
- %nop9864 = alloca i1, i1 0
- %nop9865 = alloca i1, i1 0
- %nop9866 = alloca i1, i1 0
- %nop9867 = alloca i1, i1 0
- %nop9868 = alloca i1, i1 0
- %nop9869 = alloca i1, i1 0
- %nop9870 = alloca i1, i1 0
- %nop9871 = alloca i1, i1 0
- %nop9872 = alloca i1, i1 0
- %nop9873 = alloca i1, i1 0
- %nop9874 = alloca i1, i1 0
- %nop9875 = alloca i1, i1 0
- %nop9876 = alloca i1, i1 0
- %nop9877 = alloca i1, i1 0
- %nop9878 = alloca i1, i1 0
- %nop9879 = alloca i1, i1 0
- %nop9880 = alloca i1, i1 0
- %nop9881 = alloca i1, i1 0
- %nop9882 = alloca i1, i1 0
- %nop9883 = alloca i1, i1 0
- %nop9884 = alloca i1, i1 0
- %nop9885 = alloca i1, i1 0
- %nop9886 = alloca i1, i1 0
- %nop9887 = alloca i1, i1 0
- %nop9888 = alloca i1, i1 0
- %nop9889 = alloca i1, i1 0
- %nop9890 = alloca i1, i1 0
- %nop9891 = alloca i1, i1 0
- %nop9892 = alloca i1, i1 0
- %nop9893 = alloca i1, i1 0
- %nop9894 = alloca i1, i1 0
- %nop9895 = alloca i1, i1 0
- %nop9896 = alloca i1, i1 0
- %nop9897 = alloca i1, i1 0
- %nop9898 = alloca i1, i1 0
- %nop9899 = alloca i1, i1 0
- %nop9900 = alloca i1, i1 0
- %nop9901 = alloca i1, i1 0
- %nop9902 = alloca i1, i1 0
- %nop9903 = alloca i1, i1 0
- %nop9904 = alloca i1, i1 0
- %nop9905 = alloca i1, i1 0
- %nop9906 = alloca i1, i1 0
- %nop9907 = alloca i1, i1 0
- %nop9908 = alloca i1, i1 0
- %nop9909 = alloca i1, i1 0
- %nop9910 = alloca i1, i1 0
- %nop9911 = alloca i1, i1 0
- %nop9912 = alloca i1, i1 0
- %nop9913 = alloca i1, i1 0
- %nop9914 = alloca i1, i1 0
- %nop9915 = alloca i1, i1 0
- %nop9916 = alloca i1, i1 0
- %nop9917 = alloca i1, i1 0
- %nop9918 = alloca i1, i1 0
- %nop9919 = alloca i1, i1 0
- %nop9920 = alloca i1, i1 0
- %nop9921 = alloca i1, i1 0
- %nop9922 = alloca i1, i1 0
- %nop9923 = alloca i1, i1 0
- %nop9924 = alloca i1, i1 0
- %nop9925 = alloca i1, i1 0
- %nop9926 = alloca i1, i1 0
- %nop9927 = alloca i1, i1 0
- %nop9928 = alloca i1, i1 0
- %nop9929 = alloca i1, i1 0
- %nop9930 = alloca i1, i1 0
- %nop9931 = alloca i1, i1 0
- %nop9932 = alloca i1, i1 0
- %nop9933 = alloca i1, i1 0
- %nop9934 = alloca i1, i1 0
- %nop9935 = alloca i1, i1 0
- %nop9936 = alloca i1, i1 0
- %nop9937 = alloca i1, i1 0
- %nop9938 = alloca i1, i1 0
- %nop9939 = alloca i1, i1 0
- %nop9940 = alloca i1, i1 0
- %nop9941 = alloca i1, i1 0
- %nop9942 = alloca i1, i1 0
- %nop9943 = alloca i1, i1 0
- %nop9944 = alloca i1, i1 0
- %nop9945 = alloca i1, i1 0
- %nop9946 = alloca i1, i1 0
- %nop9947 = alloca i1, i1 0
- %nop9948 = alloca i1, i1 0
- %nop9949 = alloca i1, i1 0
- %nop9950 = alloca i1, i1 0
- %nop9951 = alloca i1, i1 0
- %nop9952 = alloca i1, i1 0
- %nop9953 = alloca i1, i1 0
- %nop9954 = alloca i1, i1 0
- %nop9955 = alloca i1, i1 0
- %nop9956 = alloca i1, i1 0
- %nop9957 = alloca i1, i1 0
- %nop9958 = alloca i1, i1 0
- %nop9959 = alloca i1, i1 0
- %nop9960 = alloca i1, i1 0
- %nop9961 = alloca i1, i1 0
- %nop9962 = alloca i1, i1 0
- %nop9963 = alloca i1, i1 0
- %nop9964 = alloca i1, i1 0
- %nop9965 = alloca i1, i1 0
- %nop9966 = alloca i1, i1 0
- %nop9967 = alloca i1, i1 0
- %nop9968 = alloca i1, i1 0
- %nop9969 = alloca i1, i1 0
- %nop9970 = alloca i1, i1 0
- %nop9971 = alloca i1, i1 0
- %nop9972 = alloca i1, i1 0
- %nop9973 = alloca i1, i1 0
- %nop9974 = alloca i1, i1 0
- %nop9975 = alloca i1, i1 0
- %nop9976 = alloca i1, i1 0
- %nop9977 = alloca i1, i1 0
- %nop9978 = alloca i1, i1 0
- %nop9979 = alloca i1, i1 0
- %nop9980 = alloca i1, i1 0
- %nop9981 = alloca i1, i1 0
- %nop9982 = alloca i1, i1 0
- %nop9983 = alloca i1, i1 0
- %nop9984 = alloca i1, i1 0
- %nop9985 = alloca i1, i1 0
- %nop9986 = alloca i1, i1 0
- %nop9987 = alloca i1, i1 0
- %nop9988 = alloca i1, i1 0
- %nop9989 = alloca i1, i1 0
- %nop9990 = alloca i1, i1 0
- %nop9991 = alloca i1, i1 0
- %nop9992 = alloca i1, i1 0
- %nop9993 = alloca i1, i1 0
- %nop9994 = alloca i1, i1 0
- %nop9995 = alloca i1, i1 0
- %nop9996 = alloca i1, i1 0
- %nop9997 = alloca i1, i1 0
- %nop9998 = alloca i1, i1 0
- %nop9999 = alloca i1, i1 0
- %nop10000 = alloca i1, i1 0
- %nop10001 = alloca i1, i1 0
- %nop10002 = alloca i1, i1 0
- %nop10003 = alloca i1, i1 0
- %nop10004 = alloca i1, i1 0
- %nop10005 = alloca i1, i1 0
- %nop10006 = alloca i1, i1 0
- %nop10007 = alloca i1, i1 0
- %nop10008 = alloca i1, i1 0
- %nop10009 = alloca i1, i1 0
- %nop10010 = alloca i1, i1 0
- %nop10011 = alloca i1, i1 0
- %nop10012 = alloca i1, i1 0
- %nop10013 = alloca i1, i1 0
- %nop10014 = alloca i1, i1 0
- %nop10015 = alloca i1, i1 0
- %nop10016 = alloca i1, i1 0
- %nop10017 = alloca i1, i1 0
- %nop10018 = alloca i1, i1 0
- %nop10019 = alloca i1, i1 0
- %nop10020 = alloca i1, i1 0
- %nop10021 = alloca i1, i1 0
- %nop10022 = alloca i1, i1 0
- %nop10023 = alloca i1, i1 0
- %nop10024 = alloca i1, i1 0
- %nop10025 = alloca i1, i1 0
- %nop10026 = alloca i1, i1 0
- %nop10027 = alloca i1, i1 0
- %nop10028 = alloca i1, i1 0
- %nop10029 = alloca i1, i1 0
- %nop10030 = alloca i1, i1 0
- %nop10031 = alloca i1, i1 0
- %nop10032 = alloca i1, i1 0
- %nop10033 = alloca i1, i1 0
- %nop10034 = alloca i1, i1 0
- %nop10035 = alloca i1, i1 0
- %nop10036 = alloca i1, i1 0
- %nop10037 = alloca i1, i1 0
- %nop10038 = alloca i1, i1 0
- %nop10039 = alloca i1, i1 0
- %nop10040 = alloca i1, i1 0
- %nop10041 = alloca i1, i1 0
- %nop10042 = alloca i1, i1 0
- %nop10043 = alloca i1, i1 0
- %nop10044 = alloca i1, i1 0
- %nop10045 = alloca i1, i1 0
- %nop10046 = alloca i1, i1 0
- %nop10047 = alloca i1, i1 0
- %nop10048 = alloca i1, i1 0
- %nop10049 = alloca i1, i1 0
- %nop10050 = alloca i1, i1 0
- %nop10051 = alloca i1, i1 0
- %nop10052 = alloca i1, i1 0
- %nop10053 = alloca i1, i1 0
- %nop10054 = alloca i1, i1 0
- %nop10055 = alloca i1, i1 0
- %nop10056 = alloca i1, i1 0
- %nop10057 = alloca i1, i1 0
- %nop10058 = alloca i1, i1 0
- %nop10059 = alloca i1, i1 0
- %nop10060 = alloca i1, i1 0
- %nop10061 = alloca i1, i1 0
- %nop10062 = alloca i1, i1 0
- %nop10063 = alloca i1, i1 0
- %nop10064 = alloca i1, i1 0
- %nop10065 = alloca i1, i1 0
- %nop10066 = alloca i1, i1 0
- %nop10067 = alloca i1, i1 0
- %nop10068 = alloca i1, i1 0
- %nop10069 = alloca i1, i1 0
- %nop10070 = alloca i1, i1 0
- %nop10071 = alloca i1, i1 0
- %nop10072 = alloca i1, i1 0
- %nop10073 = alloca i1, i1 0
- %nop10074 = alloca i1, i1 0
- %nop10075 = alloca i1, i1 0
- %nop10076 = alloca i1, i1 0
- %nop10077 = alloca i1, i1 0
- %nop10078 = alloca i1, i1 0
- %nop10079 = alloca i1, i1 0
- %nop10080 = alloca i1, i1 0
- %nop10081 = alloca i1, i1 0
- %nop10082 = alloca i1, i1 0
- %nop10083 = alloca i1, i1 0
- %nop10084 = alloca i1, i1 0
- %nop10085 = alloca i1, i1 0
- %nop10086 = alloca i1, i1 0
- %nop10087 = alloca i1, i1 0
- %nop10088 = alloca i1, i1 0
- %nop10089 = alloca i1, i1 0
- %nop10090 = alloca i1, i1 0
- %nop10091 = alloca i1, i1 0
- %nop10092 = alloca i1, i1 0
- %nop10093 = alloca i1, i1 0
- %nop10094 = alloca i1, i1 0
- %nop10095 = alloca i1, i1 0
- %nop10096 = alloca i1, i1 0
- %nop10097 = alloca i1, i1 0
- %nop10098 = alloca i1, i1 0
- %nop10099 = alloca i1, i1 0
- %nop10100 = alloca i1, i1 0
- %nop10101 = alloca i1, i1 0
- %nop10102 = alloca i1, i1 0
- %nop10103 = alloca i1, i1 0
- %nop10104 = alloca i1, i1 0
- %nop10105 = alloca i1, i1 0
- %nop10106 = alloca i1, i1 0
- %nop10107 = alloca i1, i1 0
- %nop10108 = alloca i1, i1 0
- %nop10109 = alloca i1, i1 0
- %nop10110 = alloca i1, i1 0
- %nop10111 = alloca i1, i1 0
- %nop10112 = alloca i1, i1 0
- %nop10113 = alloca i1, i1 0
- %nop10114 = alloca i1, i1 0
- %nop10115 = alloca i1, i1 0
- %nop10116 = alloca i1, i1 0
- %nop10117 = alloca i1, i1 0
- %nop10118 = alloca i1, i1 0
- %nop10119 = alloca i1, i1 0
- %nop10120 = alloca i1, i1 0
- %nop10121 = alloca i1, i1 0
- %nop10122 = alloca i1, i1 0
- %nop10123 = alloca i1, i1 0
- %nop10124 = alloca i1, i1 0
- %nop10125 = alloca i1, i1 0
- %nop10126 = alloca i1, i1 0
- %nop10127 = alloca i1, i1 0
- %nop10128 = alloca i1, i1 0
- %nop10129 = alloca i1, i1 0
- %nop10130 = alloca i1, i1 0
- %nop10131 = alloca i1, i1 0
- %nop10132 = alloca i1, i1 0
- %nop10133 = alloca i1, i1 0
- %nop10134 = alloca i1, i1 0
- %nop10135 = alloca i1, i1 0
- %nop10136 = alloca i1, i1 0
- %nop10137 = alloca i1, i1 0
- %nop10138 = alloca i1, i1 0
- %nop10139 = alloca i1, i1 0
- %nop10140 = alloca i1, i1 0
- %nop10141 = alloca i1, i1 0
- %nop10142 = alloca i1, i1 0
- %nop10143 = alloca i1, i1 0
- %nop10144 = alloca i1, i1 0
- %nop10145 = alloca i1, i1 0
- %nop10146 = alloca i1, i1 0
- %nop10147 = alloca i1, i1 0
- %nop10148 = alloca i1, i1 0
- %nop10149 = alloca i1, i1 0
- %nop10150 = alloca i1, i1 0
- %nop10151 = alloca i1, i1 0
- %nop10152 = alloca i1, i1 0
- %nop10153 = alloca i1, i1 0
- %nop10154 = alloca i1, i1 0
- %nop10155 = alloca i1, i1 0
- %nop10156 = alloca i1, i1 0
- %nop10157 = alloca i1, i1 0
- %nop10158 = alloca i1, i1 0
- %nop10159 = alloca i1, i1 0
- %nop10160 = alloca i1, i1 0
- %nop10161 = alloca i1, i1 0
- %nop10162 = alloca i1, i1 0
- %nop10163 = alloca i1, i1 0
- %nop10164 = alloca i1, i1 0
- %nop10165 = alloca i1, i1 0
- %nop10166 = alloca i1, i1 0
- %nop10167 = alloca i1, i1 0
- %nop10168 = alloca i1, i1 0
- %nop10169 = alloca i1, i1 0
- %nop10170 = alloca i1, i1 0
- %nop10171 = alloca i1, i1 0
- %nop10172 = alloca i1, i1 0
- %nop10173 = alloca i1, i1 0
- %nop10174 = alloca i1, i1 0
- %nop10175 = alloca i1, i1 0
- %nop10176 = alloca i1, i1 0
- %nop10177 = alloca i1, i1 0
- %nop10178 = alloca i1, i1 0
- %nop10179 = alloca i1, i1 0
- %nop10180 = alloca i1, i1 0
- %nop10181 = alloca i1, i1 0
- %nop10182 = alloca i1, i1 0
- %nop10183 = alloca i1, i1 0
- %nop10184 = alloca i1, i1 0
- %nop10185 = alloca i1, i1 0
- %nop10186 = alloca i1, i1 0
- %nop10187 = alloca i1, i1 0
- %nop10188 = alloca i1, i1 0
- %nop10189 = alloca i1, i1 0
- %nop10190 = alloca i1, i1 0
- %nop10191 = alloca i1, i1 0
- %nop10192 = alloca i1, i1 0
- %nop10193 = alloca i1, i1 0
- %nop10194 = alloca i1, i1 0
- %nop10195 = alloca i1, i1 0
- %nop10196 = alloca i1, i1 0
- %nop10197 = alloca i1, i1 0
- %nop10198 = alloca i1, i1 0
- %nop10199 = alloca i1, i1 0
- %nop10200 = alloca i1, i1 0
- %nop10201 = alloca i1, i1 0
- %nop10202 = alloca i1, i1 0
- %nop10203 = alloca i1, i1 0
- %nop10204 = alloca i1, i1 0
- %nop10205 = alloca i1, i1 0
- %nop10206 = alloca i1, i1 0
- %nop10207 = alloca i1, i1 0
- %nop10208 = alloca i1, i1 0
- %nop10209 = alloca i1, i1 0
- %nop10210 = alloca i1, i1 0
- %nop10211 = alloca i1, i1 0
- %nop10212 = alloca i1, i1 0
- %nop10213 = alloca i1, i1 0
- %nop10214 = alloca i1, i1 0
- %nop10215 = alloca i1, i1 0
- %nop10216 = alloca i1, i1 0
- %nop10217 = alloca i1, i1 0
- %nop10218 = alloca i1, i1 0
- %nop10219 = alloca i1, i1 0
- %nop10220 = alloca i1, i1 0
- %nop10221 = alloca i1, i1 0
- %nop10222 = alloca i1, i1 0
- %nop10223 = alloca i1, i1 0
- %nop10224 = alloca i1, i1 0
- %nop10225 = alloca i1, i1 0
- %nop10226 = alloca i1, i1 0
- %nop10227 = alloca i1, i1 0
- %nop10228 = alloca i1, i1 0
- %nop10229 = alloca i1, i1 0
- %nop10230 = alloca i1, i1 0
- %nop10231 = alloca i1, i1 0
- %nop10232 = alloca i1, i1 0
- %nop10233 = alloca i1, i1 0
- %nop10234 = alloca i1, i1 0
- %nop10235 = alloca i1, i1 0
- %nop10236 = alloca i1, i1 0
- %nop10237 = alloca i1, i1 0
- %nop10238 = alloca i1, i1 0
- %nop10239 = alloca i1, i1 0
- %nop10240 = alloca i1, i1 0
- %nop10241 = alloca i1, i1 0
- %nop10242 = alloca i1, i1 0
- %nop10243 = alloca i1, i1 0
- %nop10244 = alloca i1, i1 0
- %nop10245 = alloca i1, i1 0
- %nop10246 = alloca i1, i1 0
- %nop10247 = alloca i1, i1 0
- %nop10248 = alloca i1, i1 0
- %nop10249 = alloca i1, i1 0
- %nop10250 = alloca i1, i1 0
- %nop10251 = alloca i1, i1 0
- %nop10252 = alloca i1, i1 0
- %nop10253 = alloca i1, i1 0
- %nop10254 = alloca i1, i1 0
- %nop10255 = alloca i1, i1 0
- %nop10256 = alloca i1, i1 0
- %nop10257 = alloca i1, i1 0
- %nop10258 = alloca i1, i1 0
- %nop10259 = alloca i1, i1 0
- %nop10260 = alloca i1, i1 0
- %nop10261 = alloca i1, i1 0
- %nop10262 = alloca i1, i1 0
- %nop10263 = alloca i1, i1 0
- %nop10264 = alloca i1, i1 0
- %nop10265 = alloca i1, i1 0
- %nop10266 = alloca i1, i1 0
- %nop10267 = alloca i1, i1 0
- %nop10268 = alloca i1, i1 0
- %nop10269 = alloca i1, i1 0
- %nop10270 = alloca i1, i1 0
- %nop10271 = alloca i1, i1 0
- %nop10272 = alloca i1, i1 0
- %nop10273 = alloca i1, i1 0
- %nop10274 = alloca i1, i1 0
- %nop10275 = alloca i1, i1 0
- %nop10276 = alloca i1, i1 0
- %nop10277 = alloca i1, i1 0
- %nop10278 = alloca i1, i1 0
- %nop10279 = alloca i1, i1 0
- %nop10280 = alloca i1, i1 0
- %nop10281 = alloca i1, i1 0
- %nop10282 = alloca i1, i1 0
- %nop10283 = alloca i1, i1 0
- %nop10284 = alloca i1, i1 0
- %nop10285 = alloca i1, i1 0
- %nop10286 = alloca i1, i1 0
- %nop10287 = alloca i1, i1 0
- %nop10288 = alloca i1, i1 0
- %nop10289 = alloca i1, i1 0
- %nop10290 = alloca i1, i1 0
- %nop10291 = alloca i1, i1 0
- %nop10292 = alloca i1, i1 0
- %nop10293 = alloca i1, i1 0
- %nop10294 = alloca i1, i1 0
- %nop10295 = alloca i1, i1 0
- %nop10296 = alloca i1, i1 0
- %nop10297 = alloca i1, i1 0
- %nop10298 = alloca i1, i1 0
- %nop10299 = alloca i1, i1 0
- %nop10300 = alloca i1, i1 0
- %nop10301 = alloca i1, i1 0
- %nop10302 = alloca i1, i1 0
- %nop10303 = alloca i1, i1 0
- %nop10304 = alloca i1, i1 0
- %nop10305 = alloca i1, i1 0
- %nop10306 = alloca i1, i1 0
- %nop10307 = alloca i1, i1 0
- %nop10308 = alloca i1, i1 0
- %nop10309 = alloca i1, i1 0
- %nop10310 = alloca i1, i1 0
- %nop10311 = alloca i1, i1 0
- %nop10312 = alloca i1, i1 0
- %nop10313 = alloca i1, i1 0
- %nop10314 = alloca i1, i1 0
- %nop10315 = alloca i1, i1 0
- %nop10316 = alloca i1, i1 0
- %nop10317 = alloca i1, i1 0
- %nop10318 = alloca i1, i1 0
- %nop10319 = alloca i1, i1 0
- %nop10320 = alloca i1, i1 0
- %nop10321 = alloca i1, i1 0
- %nop10322 = alloca i1, i1 0
- %nop10323 = alloca i1, i1 0
- %nop10324 = alloca i1, i1 0
- %nop10325 = alloca i1, i1 0
- %nop10326 = alloca i1, i1 0
- %nop10327 = alloca i1, i1 0
- %nop10328 = alloca i1, i1 0
- %nop10329 = alloca i1, i1 0
- %nop10330 = alloca i1, i1 0
- %nop10331 = alloca i1, i1 0
- %nop10332 = alloca i1, i1 0
- %nop10333 = alloca i1, i1 0
- %nop10334 = alloca i1, i1 0
- %nop10335 = alloca i1, i1 0
- %nop10336 = alloca i1, i1 0
- %nop10337 = alloca i1, i1 0
- %nop10338 = alloca i1, i1 0
- %nop10339 = alloca i1, i1 0
- %nop10340 = alloca i1, i1 0
- %nop10341 = alloca i1, i1 0
- %nop10342 = alloca i1, i1 0
- %nop10343 = alloca i1, i1 0
- %nop10344 = alloca i1, i1 0
- %nop10345 = alloca i1, i1 0
- %nop10346 = alloca i1, i1 0
- %nop10347 = alloca i1, i1 0
- %nop10348 = alloca i1, i1 0
- %nop10349 = alloca i1, i1 0
- %nop10350 = alloca i1, i1 0
- %nop10351 = alloca i1, i1 0
- %nop10352 = alloca i1, i1 0
- %nop10353 = alloca i1, i1 0
- %nop10354 = alloca i1, i1 0
- %nop10355 = alloca i1, i1 0
- %nop10356 = alloca i1, i1 0
- %nop10357 = alloca i1, i1 0
- %nop10358 = alloca i1, i1 0
- %nop10359 = alloca i1, i1 0
- %nop10360 = alloca i1, i1 0
- %nop10361 = alloca i1, i1 0
- %nop10362 = alloca i1, i1 0
- %nop10363 = alloca i1, i1 0
- %nop10364 = alloca i1, i1 0
- %nop10365 = alloca i1, i1 0
- %nop10366 = alloca i1, i1 0
- %nop10367 = alloca i1, i1 0
- %nop10368 = alloca i1, i1 0
- %nop10369 = alloca i1, i1 0
- %nop10370 = alloca i1, i1 0
- %nop10371 = alloca i1, i1 0
- %nop10372 = alloca i1, i1 0
- %nop10373 = alloca i1, i1 0
- %nop10374 = alloca i1, i1 0
- %nop10375 = alloca i1, i1 0
- %nop10376 = alloca i1, i1 0
- %nop10377 = alloca i1, i1 0
- %nop10378 = alloca i1, i1 0
- %nop10379 = alloca i1, i1 0
- %nop10380 = alloca i1, i1 0
- %nop10381 = alloca i1, i1 0
- %nop10382 = alloca i1, i1 0
- %nop10383 = alloca i1, i1 0
- %nop10384 = alloca i1, i1 0
- %nop10385 = alloca i1, i1 0
- %nop10386 = alloca i1, i1 0
- %nop10387 = alloca i1, i1 0
- %nop10388 = alloca i1, i1 0
- %nop10389 = alloca i1, i1 0
- %nop10390 = alloca i1, i1 0
- %nop10391 = alloca i1, i1 0
- %nop10392 = alloca i1, i1 0
- %nop10393 = alloca i1, i1 0
- %nop10394 = alloca i1, i1 0
- %nop10395 = alloca i1, i1 0
- %nop10396 = alloca i1, i1 0
- %nop10397 = alloca i1, i1 0
- %nop10398 = alloca i1, i1 0
- %nop10399 = alloca i1, i1 0
- %nop10400 = alloca i1, i1 0
- %nop10401 = alloca i1, i1 0
- %nop10402 = alloca i1, i1 0
- %nop10403 = alloca i1, i1 0
- %nop10404 = alloca i1, i1 0
- %nop10405 = alloca i1, i1 0
- %nop10406 = alloca i1, i1 0
- %nop10407 = alloca i1, i1 0
- %nop10408 = alloca i1, i1 0
- %nop10409 = alloca i1, i1 0
- %nop10410 = alloca i1, i1 0
- %nop10411 = alloca i1, i1 0
- %nop10412 = alloca i1, i1 0
- %nop10413 = alloca i1, i1 0
- %nop10414 = alloca i1, i1 0
- %nop10415 = alloca i1, i1 0
- %nop10416 = alloca i1, i1 0
- %nop10417 = alloca i1, i1 0
- %nop10418 = alloca i1, i1 0
- %nop10419 = alloca i1, i1 0
- %nop10420 = alloca i1, i1 0
- %nop10421 = alloca i1, i1 0
- %nop10422 = alloca i1, i1 0
- %nop10423 = alloca i1, i1 0
- %nop10424 = alloca i1, i1 0
- %nop10425 = alloca i1, i1 0
- %nop10426 = alloca i1, i1 0
- %nop10427 = alloca i1, i1 0
- %nop10428 = alloca i1, i1 0
- %nop10429 = alloca i1, i1 0
- %nop10430 = alloca i1, i1 0
- %nop10431 = alloca i1, i1 0
- %nop10432 = alloca i1, i1 0
- %nop10433 = alloca i1, i1 0
- %nop10434 = alloca i1, i1 0
- %nop10435 = alloca i1, i1 0
- %nop10436 = alloca i1, i1 0
- %nop10437 = alloca i1, i1 0
- %nop10438 = alloca i1, i1 0
- %nop10439 = alloca i1, i1 0
- %nop10440 = alloca i1, i1 0
- %nop10441 = alloca i1, i1 0
- %nop10442 = alloca i1, i1 0
- %nop10443 = alloca i1, i1 0
- %nop10444 = alloca i1, i1 0
- %nop10445 = alloca i1, i1 0
- %nop10446 = alloca i1, i1 0
- %nop10447 = alloca i1, i1 0
- %nop10448 = alloca i1, i1 0
- %nop10449 = alloca i1, i1 0
- %nop10450 = alloca i1, i1 0
- %nop10451 = alloca i1, i1 0
- %nop10452 = alloca i1, i1 0
- %nop10453 = alloca i1, i1 0
- %nop10454 = alloca i1, i1 0
- %nop10455 = alloca i1, i1 0
- %nop10456 = alloca i1, i1 0
- %nop10457 = alloca i1, i1 0
- %nop10458 = alloca i1, i1 0
- %nop10459 = alloca i1, i1 0
- %nop10460 = alloca i1, i1 0
- %nop10461 = alloca i1, i1 0
- %nop10462 = alloca i1, i1 0
- %nop10463 = alloca i1, i1 0
- %nop10464 = alloca i1, i1 0
- %nop10465 = alloca i1, i1 0
- %nop10466 = alloca i1, i1 0
- %nop10467 = alloca i1, i1 0
- %nop10468 = alloca i1, i1 0
- %nop10469 = alloca i1, i1 0
- %nop10470 = alloca i1, i1 0
- %nop10471 = alloca i1, i1 0
- %nop10472 = alloca i1, i1 0
- %nop10473 = alloca i1, i1 0
- %nop10474 = alloca i1, i1 0
- %nop10475 = alloca i1, i1 0
- %nop10476 = alloca i1, i1 0
- %nop10477 = alloca i1, i1 0
- %nop10478 = alloca i1, i1 0
- %nop10479 = alloca i1, i1 0
- %nop10480 = alloca i1, i1 0
- %nop10481 = alloca i1, i1 0
- %nop10482 = alloca i1, i1 0
- %nop10483 = alloca i1, i1 0
- %nop10484 = alloca i1, i1 0
- %nop10485 = alloca i1, i1 0
- %nop10486 = alloca i1, i1 0
- %nop10487 = alloca i1, i1 0
- %nop10488 = alloca i1, i1 0
- %nop10489 = alloca i1, i1 0
- %nop10490 = alloca i1, i1 0
- %nop10491 = alloca i1, i1 0
- %nop10492 = alloca i1, i1 0
- %nop10493 = alloca i1, i1 0
- %nop10494 = alloca i1, i1 0
- %nop10495 = alloca i1, i1 0
- %nop10496 = alloca i1, i1 0
- %nop10497 = alloca i1, i1 0
- %nop10498 = alloca i1, i1 0
- %nop10499 = alloca i1, i1 0
- %nop10500 = alloca i1, i1 0
- %nop10501 = alloca i1, i1 0
- %nop10502 = alloca i1, i1 0
- %nop10503 = alloca i1, i1 0
- %nop10504 = alloca i1, i1 0
- %nop10505 = alloca i1, i1 0
- %nop10506 = alloca i1, i1 0
- %nop10507 = alloca i1, i1 0
- %nop10508 = alloca i1, i1 0
- %nop10509 = alloca i1, i1 0
- %nop10510 = alloca i1, i1 0
- %nop10511 = alloca i1, i1 0
- %nop10512 = alloca i1, i1 0
- %nop10513 = alloca i1, i1 0
- %nop10514 = alloca i1, i1 0
- %nop10515 = alloca i1, i1 0
- %nop10516 = alloca i1, i1 0
- %nop10517 = alloca i1, i1 0
- %nop10518 = alloca i1, i1 0
- %nop10519 = alloca i1, i1 0
- %nop10520 = alloca i1, i1 0
- %nop10521 = alloca i1, i1 0
- %nop10522 = alloca i1, i1 0
- %nop10523 = alloca i1, i1 0
- %nop10524 = alloca i1, i1 0
- %nop10525 = alloca i1, i1 0
- %nop10526 = alloca i1, i1 0
- %nop10527 = alloca i1, i1 0
- %nop10528 = alloca i1, i1 0
- %nop10529 = alloca i1, i1 0
- %nop10530 = alloca i1, i1 0
- %nop10531 = alloca i1, i1 0
- %nop10532 = alloca i1, i1 0
- %nop10533 = alloca i1, i1 0
- %nop10534 = alloca i1, i1 0
- %nop10535 = alloca i1, i1 0
- %nop10536 = alloca i1, i1 0
- %nop10537 = alloca i1, i1 0
- %nop10538 = alloca i1, i1 0
- %nop10539 = alloca i1, i1 0
- %nop10540 = alloca i1, i1 0
- %nop10541 = alloca i1, i1 0
- %nop10542 = alloca i1, i1 0
- %nop10543 = alloca i1, i1 0
- %nop10544 = alloca i1, i1 0
- %nop10545 = alloca i1, i1 0
- %nop10546 = alloca i1, i1 0
- %nop10547 = alloca i1, i1 0
- %nop10548 = alloca i1, i1 0
- %nop10549 = alloca i1, i1 0
- %nop10550 = alloca i1, i1 0
- %nop10551 = alloca i1, i1 0
- %nop10552 = alloca i1, i1 0
- %nop10553 = alloca i1, i1 0
- %nop10554 = alloca i1, i1 0
- %nop10555 = alloca i1, i1 0
- %nop10556 = alloca i1, i1 0
- %nop10557 = alloca i1, i1 0
- %nop10558 = alloca i1, i1 0
- %nop10559 = alloca i1, i1 0
- %nop10560 = alloca i1, i1 0
- %nop10561 = alloca i1, i1 0
- %nop10562 = alloca i1, i1 0
- %nop10563 = alloca i1, i1 0
- %nop10564 = alloca i1, i1 0
- %nop10565 = alloca i1, i1 0
- %nop10566 = alloca i1, i1 0
- %nop10567 = alloca i1, i1 0
- %nop10568 = alloca i1, i1 0
- %nop10569 = alloca i1, i1 0
- %nop10570 = alloca i1, i1 0
- %nop10571 = alloca i1, i1 0
- %nop10572 = alloca i1, i1 0
- %nop10573 = alloca i1, i1 0
- %nop10574 = alloca i1, i1 0
- %nop10575 = alloca i1, i1 0
- %nop10576 = alloca i1, i1 0
- %nop10577 = alloca i1, i1 0
- %nop10578 = alloca i1, i1 0
- %nop10579 = alloca i1, i1 0
- %nop10580 = alloca i1, i1 0
- %nop10581 = alloca i1, i1 0
- %nop10582 = alloca i1, i1 0
- %nop10583 = alloca i1, i1 0
- %nop10584 = alloca i1, i1 0
- %nop10585 = alloca i1, i1 0
- %nop10586 = alloca i1, i1 0
- %nop10587 = alloca i1, i1 0
- %nop10588 = alloca i1, i1 0
- %nop10589 = alloca i1, i1 0
- %nop10590 = alloca i1, i1 0
- %nop10591 = alloca i1, i1 0
- %nop10592 = alloca i1, i1 0
- %nop10593 = alloca i1, i1 0
- %nop10594 = alloca i1, i1 0
- %nop10595 = alloca i1, i1 0
- %nop10596 = alloca i1, i1 0
- %nop10597 = alloca i1, i1 0
- %nop10598 = alloca i1, i1 0
- %nop10599 = alloca i1, i1 0
- %nop10600 = alloca i1, i1 0
- %nop10601 = alloca i1, i1 0
- %nop10602 = alloca i1, i1 0
- %nop10603 = alloca i1, i1 0
- %nop10604 = alloca i1, i1 0
- %nop10605 = alloca i1, i1 0
- %nop10606 = alloca i1, i1 0
- %nop10607 = alloca i1, i1 0
- %nop10608 = alloca i1, i1 0
- %nop10609 = alloca i1, i1 0
- %nop10610 = alloca i1, i1 0
- %nop10611 = alloca i1, i1 0
- %nop10612 = alloca i1, i1 0
- %nop10613 = alloca i1, i1 0
- %nop10614 = alloca i1, i1 0
- %nop10615 = alloca i1, i1 0
- %nop10616 = alloca i1, i1 0
- %nop10617 = alloca i1, i1 0
- %nop10618 = alloca i1, i1 0
- %nop10619 = alloca i1, i1 0
- %nop10620 = alloca i1, i1 0
- %nop10621 = alloca i1, i1 0
- %nop10622 = alloca i1, i1 0
- %nop10623 = alloca i1, i1 0
- %nop10624 = alloca i1, i1 0
- %nop10625 = alloca i1, i1 0
- %nop10626 = alloca i1, i1 0
- %nop10627 = alloca i1, i1 0
- %nop10628 = alloca i1, i1 0
- %nop10629 = alloca i1, i1 0
- %nop10630 = alloca i1, i1 0
- %nop10631 = alloca i1, i1 0
- %nop10632 = alloca i1, i1 0
- %nop10633 = alloca i1, i1 0
- %nop10634 = alloca i1, i1 0
- %nop10635 = alloca i1, i1 0
- %nop10636 = alloca i1, i1 0
- %nop10637 = alloca i1, i1 0
- %nop10638 = alloca i1, i1 0
- %nop10639 = alloca i1, i1 0
- %nop10640 = alloca i1, i1 0
- %nop10641 = alloca i1, i1 0
- %nop10642 = alloca i1, i1 0
- %nop10643 = alloca i1, i1 0
- %nop10644 = alloca i1, i1 0
- %nop10645 = alloca i1, i1 0
- %nop10646 = alloca i1, i1 0
- %nop10647 = alloca i1, i1 0
- %nop10648 = alloca i1, i1 0
- %nop10649 = alloca i1, i1 0
- %nop10650 = alloca i1, i1 0
- %nop10651 = alloca i1, i1 0
- %nop10652 = alloca i1, i1 0
- %nop10653 = alloca i1, i1 0
- %nop10654 = alloca i1, i1 0
- %nop10655 = alloca i1, i1 0
- %nop10656 = alloca i1, i1 0
- %nop10657 = alloca i1, i1 0
- %nop10658 = alloca i1, i1 0
- %nop10659 = alloca i1, i1 0
- %nop10660 = alloca i1, i1 0
- %nop10661 = alloca i1, i1 0
- %nop10662 = alloca i1, i1 0
- %nop10663 = alloca i1, i1 0
- %nop10664 = alloca i1, i1 0
- %nop10665 = alloca i1, i1 0
- %nop10666 = alloca i1, i1 0
- %nop10667 = alloca i1, i1 0
- %nop10668 = alloca i1, i1 0
- %nop10669 = alloca i1, i1 0
- %nop10670 = alloca i1, i1 0
- %nop10671 = alloca i1, i1 0
- %nop10672 = alloca i1, i1 0
- %nop10673 = alloca i1, i1 0
- %nop10674 = alloca i1, i1 0
- %nop10675 = alloca i1, i1 0
- %nop10676 = alloca i1, i1 0
- %nop10677 = alloca i1, i1 0
- %nop10678 = alloca i1, i1 0
- %nop10679 = alloca i1, i1 0
- %nop10680 = alloca i1, i1 0
- %nop10681 = alloca i1, i1 0
- %nop10682 = alloca i1, i1 0
- %nop10683 = alloca i1, i1 0
- %nop10684 = alloca i1, i1 0
- %nop10685 = alloca i1, i1 0
- %nop10686 = alloca i1, i1 0
- %nop10687 = alloca i1, i1 0
- %nop10688 = alloca i1, i1 0
- %nop10689 = alloca i1, i1 0
- %nop10690 = alloca i1, i1 0
- %nop10691 = alloca i1, i1 0
- %nop10692 = alloca i1, i1 0
- %nop10693 = alloca i1, i1 0
- %nop10694 = alloca i1, i1 0
- %nop10695 = alloca i1, i1 0
- %nop10696 = alloca i1, i1 0
- %nop10697 = alloca i1, i1 0
- %nop10698 = alloca i1, i1 0
- %nop10699 = alloca i1, i1 0
- %nop10700 = alloca i1, i1 0
- %nop10701 = alloca i1, i1 0
- %nop10702 = alloca i1, i1 0
- %nop10703 = alloca i1, i1 0
- %nop10704 = alloca i1, i1 0
- %nop10705 = alloca i1, i1 0
- %nop10706 = alloca i1, i1 0
- %nop10707 = alloca i1, i1 0
- %nop10708 = alloca i1, i1 0
- %nop10709 = alloca i1, i1 0
- %nop10710 = alloca i1, i1 0
- %nop10711 = alloca i1, i1 0
- %nop10712 = alloca i1, i1 0
- %nop10713 = alloca i1, i1 0
- %nop10714 = alloca i1, i1 0
- %nop10715 = alloca i1, i1 0
- %nop10716 = alloca i1, i1 0
- %nop10717 = alloca i1, i1 0
- %nop10718 = alloca i1, i1 0
- %nop10719 = alloca i1, i1 0
- %nop10720 = alloca i1, i1 0
- %nop10721 = alloca i1, i1 0
- %nop10722 = alloca i1, i1 0
- %nop10723 = alloca i1, i1 0
- %nop10724 = alloca i1, i1 0
- %nop10725 = alloca i1, i1 0
- %nop10726 = alloca i1, i1 0
- %nop10727 = alloca i1, i1 0
- %nop10728 = alloca i1, i1 0
- %nop10729 = alloca i1, i1 0
- %nop10730 = alloca i1, i1 0
- %nop10731 = alloca i1, i1 0
- %nop10732 = alloca i1, i1 0
- %nop10733 = alloca i1, i1 0
- %nop10734 = alloca i1, i1 0
- %nop10735 = alloca i1, i1 0
- %nop10736 = alloca i1, i1 0
- %nop10737 = alloca i1, i1 0
- %nop10738 = alloca i1, i1 0
- %nop10739 = alloca i1, i1 0
- %nop10740 = alloca i1, i1 0
- %nop10741 = alloca i1, i1 0
- %nop10742 = alloca i1, i1 0
- %nop10743 = alloca i1, i1 0
- %nop10744 = alloca i1, i1 0
- %nop10745 = alloca i1, i1 0
- %nop10746 = alloca i1, i1 0
- %nop10747 = alloca i1, i1 0
- %nop10748 = alloca i1, i1 0
- %nop10749 = alloca i1, i1 0
- %nop10750 = alloca i1, i1 0
- %nop10751 = alloca i1, i1 0
- %nop10752 = alloca i1, i1 0
- %nop10753 = alloca i1, i1 0
- %nop10754 = alloca i1, i1 0
- %nop10755 = alloca i1, i1 0
- %nop10756 = alloca i1, i1 0
- %nop10757 = alloca i1, i1 0
- %nop10758 = alloca i1, i1 0
- %nop10759 = alloca i1, i1 0
- %nop10760 = alloca i1, i1 0
- %nop10761 = alloca i1, i1 0
- %nop10762 = alloca i1, i1 0
- %nop10763 = alloca i1, i1 0
- %nop10764 = alloca i1, i1 0
- %nop10765 = alloca i1, i1 0
- %nop10766 = alloca i1, i1 0
- %nop10767 = alloca i1, i1 0
- %nop10768 = alloca i1, i1 0
- %nop10769 = alloca i1, i1 0
- %nop10770 = alloca i1, i1 0
- %nop10771 = alloca i1, i1 0
- %nop10772 = alloca i1, i1 0
- %nop10773 = alloca i1, i1 0
- %nop10774 = alloca i1, i1 0
- %nop10775 = alloca i1, i1 0
- %nop10776 = alloca i1, i1 0
- %nop10777 = alloca i1, i1 0
- %nop10778 = alloca i1, i1 0
- %nop10779 = alloca i1, i1 0
- %nop10780 = alloca i1, i1 0
- %nop10781 = alloca i1, i1 0
- %nop10782 = alloca i1, i1 0
- %nop10783 = alloca i1, i1 0
- %nop10784 = alloca i1, i1 0
- %nop10785 = alloca i1, i1 0
- %nop10786 = alloca i1, i1 0
- %nop10787 = alloca i1, i1 0
- %nop10788 = alloca i1, i1 0
- %nop10789 = alloca i1, i1 0
- %nop10790 = alloca i1, i1 0
- %nop10791 = alloca i1, i1 0
- %nop10792 = alloca i1, i1 0
- %nop10793 = alloca i1, i1 0
- %nop10794 = alloca i1, i1 0
- %nop10795 = alloca i1, i1 0
- %nop10796 = alloca i1, i1 0
- %nop10797 = alloca i1, i1 0
- %nop10798 = alloca i1, i1 0
- %nop10799 = alloca i1, i1 0
- %nop10800 = alloca i1, i1 0
- %nop10801 = alloca i1, i1 0
- %nop10802 = alloca i1, i1 0
- %nop10803 = alloca i1, i1 0
- %nop10804 = alloca i1, i1 0
- %nop10805 = alloca i1, i1 0
- %nop10806 = alloca i1, i1 0
- %nop10807 = alloca i1, i1 0
- %nop10808 = alloca i1, i1 0
- %nop10809 = alloca i1, i1 0
- %nop10810 = alloca i1, i1 0
- %nop10811 = alloca i1, i1 0
- %nop10812 = alloca i1, i1 0
- %nop10813 = alloca i1, i1 0
- %nop10814 = alloca i1, i1 0
- %nop10815 = alloca i1, i1 0
- %nop10816 = alloca i1, i1 0
- %nop10817 = alloca i1, i1 0
- %nop10818 = alloca i1, i1 0
- %nop10819 = alloca i1, i1 0
- %nop10820 = alloca i1, i1 0
- %nop10821 = alloca i1, i1 0
- %nop10822 = alloca i1, i1 0
- %nop10823 = alloca i1, i1 0
- %nop10824 = alloca i1, i1 0
- %nop10825 = alloca i1, i1 0
- %nop10826 = alloca i1, i1 0
- %nop10827 = alloca i1, i1 0
- %nop10828 = alloca i1, i1 0
- %nop10829 = alloca i1, i1 0
- %nop10830 = alloca i1, i1 0
- %nop10831 = alloca i1, i1 0
- %nop10832 = alloca i1, i1 0
- %nop10833 = alloca i1, i1 0
- %nop10834 = alloca i1, i1 0
- %nop10835 = alloca i1, i1 0
- %nop10836 = alloca i1, i1 0
- %nop10837 = alloca i1, i1 0
- %nop10838 = alloca i1, i1 0
- %nop10839 = alloca i1, i1 0
- %nop10840 = alloca i1, i1 0
- %nop10841 = alloca i1, i1 0
- %nop10842 = alloca i1, i1 0
- %nop10843 = alloca i1, i1 0
- %nop10844 = alloca i1, i1 0
- %nop10845 = alloca i1, i1 0
- %nop10846 = alloca i1, i1 0
- %nop10847 = alloca i1, i1 0
- %nop10848 = alloca i1, i1 0
- %nop10849 = alloca i1, i1 0
- %nop10850 = alloca i1, i1 0
- %nop10851 = alloca i1, i1 0
- %nop10852 = alloca i1, i1 0
- %nop10853 = alloca i1, i1 0
- %nop10854 = alloca i1, i1 0
- %nop10855 = alloca i1, i1 0
- %nop10856 = alloca i1, i1 0
- %nop10857 = alloca i1, i1 0
- %nop10858 = alloca i1, i1 0
- %nop10859 = alloca i1, i1 0
- %nop10860 = alloca i1, i1 0
- %nop10861 = alloca i1, i1 0
- %nop10862 = alloca i1, i1 0
- %nop10863 = alloca i1, i1 0
- %nop10864 = alloca i1, i1 0
- %nop10865 = alloca i1, i1 0
- %nop10866 = alloca i1, i1 0
- %nop10867 = alloca i1, i1 0
- %nop10868 = alloca i1, i1 0
- %nop10869 = alloca i1, i1 0
- %nop10870 = alloca i1, i1 0
- %nop10871 = alloca i1, i1 0
- %nop10872 = alloca i1, i1 0
- %nop10873 = alloca i1, i1 0
- %nop10874 = alloca i1, i1 0
- %nop10875 = alloca i1, i1 0
- %nop10876 = alloca i1, i1 0
- %nop10877 = alloca i1, i1 0
- %nop10878 = alloca i1, i1 0
- %nop10879 = alloca i1, i1 0
- %nop10880 = alloca i1, i1 0
- %nop10881 = alloca i1, i1 0
- %nop10882 = alloca i1, i1 0
- %nop10883 = alloca i1, i1 0
- %nop10884 = alloca i1, i1 0
- %nop10885 = alloca i1, i1 0
- %nop10886 = alloca i1, i1 0
- %nop10887 = alloca i1, i1 0
- %nop10888 = alloca i1, i1 0
- %nop10889 = alloca i1, i1 0
- %nop10890 = alloca i1, i1 0
- %nop10891 = alloca i1, i1 0
- %nop10892 = alloca i1, i1 0
- %nop10893 = alloca i1, i1 0
- %nop10894 = alloca i1, i1 0
- %nop10895 = alloca i1, i1 0
- %nop10896 = alloca i1, i1 0
- %nop10897 = alloca i1, i1 0
- %nop10898 = alloca i1, i1 0
- %nop10899 = alloca i1, i1 0
- %nop10900 = alloca i1, i1 0
- %nop10901 = alloca i1, i1 0
- %nop10902 = alloca i1, i1 0
- %nop10903 = alloca i1, i1 0
- %nop10904 = alloca i1, i1 0
- %nop10905 = alloca i1, i1 0
- %nop10906 = alloca i1, i1 0
- %nop10907 = alloca i1, i1 0
- %nop10908 = alloca i1, i1 0
- %nop10909 = alloca i1, i1 0
- %nop10910 = alloca i1, i1 0
- %nop10911 = alloca i1, i1 0
- %nop10912 = alloca i1, i1 0
- %nop10913 = alloca i1, i1 0
- %nop10914 = alloca i1, i1 0
- %nop10915 = alloca i1, i1 0
- %nop10916 = alloca i1, i1 0
- %nop10917 = alloca i1, i1 0
- %nop10918 = alloca i1, i1 0
- %nop10919 = alloca i1, i1 0
- %nop10920 = alloca i1, i1 0
- %nop10921 = alloca i1, i1 0
- %nop10922 = alloca i1, i1 0
- %nop10923 = alloca i1, i1 0
- %nop10924 = alloca i1, i1 0
- %nop10925 = alloca i1, i1 0
- %nop10926 = alloca i1, i1 0
- %nop10927 = alloca i1, i1 0
- %nop10928 = alloca i1, i1 0
- %nop10929 = alloca i1, i1 0
- %nop10930 = alloca i1, i1 0
- %nop10931 = alloca i1, i1 0
- %nop10932 = alloca i1, i1 0
- %nop10933 = alloca i1, i1 0
- %nop10934 = alloca i1, i1 0
- %nop10935 = alloca i1, i1 0
- %nop10936 = alloca i1, i1 0
- %nop10937 = alloca i1, i1 0
- %nop10938 = alloca i1, i1 0
- %nop10939 = alloca i1, i1 0
- %nop10940 = alloca i1, i1 0
- %nop10941 = alloca i1, i1 0
- %nop10942 = alloca i1, i1 0
- %nop10943 = alloca i1, i1 0
- %nop10944 = alloca i1, i1 0
- %nop10945 = alloca i1, i1 0
- %nop10946 = alloca i1, i1 0
- %nop10947 = alloca i1, i1 0
- %nop10948 = alloca i1, i1 0
- %nop10949 = alloca i1, i1 0
- %nop10950 = alloca i1, i1 0
- %nop10951 = alloca i1, i1 0
- %nop10952 = alloca i1, i1 0
- %nop10953 = alloca i1, i1 0
- %nop10954 = alloca i1, i1 0
- %nop10955 = alloca i1, i1 0
- %nop10956 = alloca i1, i1 0
- %nop10957 = alloca i1, i1 0
- %nop10958 = alloca i1, i1 0
- %nop10959 = alloca i1, i1 0
- %nop10960 = alloca i1, i1 0
- %nop10961 = alloca i1, i1 0
- %nop10962 = alloca i1, i1 0
- %nop10963 = alloca i1, i1 0
- %nop10964 = alloca i1, i1 0
- %nop10965 = alloca i1, i1 0
- %nop10966 = alloca i1, i1 0
- %nop10967 = alloca i1, i1 0
- %nop10968 = alloca i1, i1 0
- %nop10969 = alloca i1, i1 0
- %nop10970 = alloca i1, i1 0
- %nop10971 = alloca i1, i1 0
- %nop10972 = alloca i1, i1 0
- %nop10973 = alloca i1, i1 0
- %nop10974 = alloca i1, i1 0
- %nop10975 = alloca i1, i1 0
- %nop10976 = alloca i1, i1 0
- %nop10977 = alloca i1, i1 0
- %nop10978 = alloca i1, i1 0
- %nop10979 = alloca i1, i1 0
- %nop10980 = alloca i1, i1 0
- %nop10981 = alloca i1, i1 0
- %nop10982 = alloca i1, i1 0
- %nop10983 = alloca i1, i1 0
- %nop10984 = alloca i1, i1 0
- %nop10985 = alloca i1, i1 0
- %nop10986 = alloca i1, i1 0
- %nop10987 = alloca i1, i1 0
- %nop10988 = alloca i1, i1 0
- %nop10989 = alloca i1, i1 0
- %nop10990 = alloca i1, i1 0
- %nop10991 = alloca i1, i1 0
- %nop10992 = alloca i1, i1 0
- %nop10993 = alloca i1, i1 0
- %nop10994 = alloca i1, i1 0
- %nop10995 = alloca i1, i1 0
- %nop10996 = alloca i1, i1 0
- %nop10997 = alloca i1, i1 0
- %nop10998 = alloca i1, i1 0
- %nop10999 = alloca i1, i1 0
- %nop11000 = alloca i1, i1 0
- %nop11001 = alloca i1, i1 0
- %nop11002 = alloca i1, i1 0
- %nop11003 = alloca i1, i1 0
- %nop11004 = alloca i1, i1 0
- %nop11005 = alloca i1, i1 0
- %nop11006 = alloca i1, i1 0
- %nop11007 = alloca i1, i1 0
- %nop11008 = alloca i1, i1 0
- %nop11009 = alloca i1, i1 0
- %nop11010 = alloca i1, i1 0
- %nop11011 = alloca i1, i1 0
- %nop11012 = alloca i1, i1 0
- %nop11013 = alloca i1, i1 0
- %nop11014 = alloca i1, i1 0
- %nop11015 = alloca i1, i1 0
- %nop11016 = alloca i1, i1 0
- %nop11017 = alloca i1, i1 0
- %nop11018 = alloca i1, i1 0
- %nop11019 = alloca i1, i1 0
- %nop11020 = alloca i1, i1 0
- %nop11021 = alloca i1, i1 0
- %nop11022 = alloca i1, i1 0
- %nop11023 = alloca i1, i1 0
- %nop11024 = alloca i1, i1 0
- %nop11025 = alloca i1, i1 0
- %nop11026 = alloca i1, i1 0
- %nop11027 = alloca i1, i1 0
- %nop11028 = alloca i1, i1 0
- %nop11029 = alloca i1, i1 0
- %nop11030 = alloca i1, i1 0
- %nop11031 = alloca i1, i1 0
- %nop11032 = alloca i1, i1 0
- %nop11033 = alloca i1, i1 0
- %nop11034 = alloca i1, i1 0
- %nop11035 = alloca i1, i1 0
- %nop11036 = alloca i1, i1 0
- %nop11037 = alloca i1, i1 0
- %nop11038 = alloca i1, i1 0
- %nop11039 = alloca i1, i1 0
- %nop11040 = alloca i1, i1 0
- %nop11041 = alloca i1, i1 0
- %nop11042 = alloca i1, i1 0
- %nop11043 = alloca i1, i1 0
- %nop11044 = alloca i1, i1 0
- %nop11045 = alloca i1, i1 0
- %nop11046 = alloca i1, i1 0
- %nop11047 = alloca i1, i1 0
- %nop11048 = alloca i1, i1 0
- %nop11049 = alloca i1, i1 0
- %nop11050 = alloca i1, i1 0
- %nop11051 = alloca i1, i1 0
- %nop11052 = alloca i1, i1 0
- %nop11053 = alloca i1, i1 0
- %nop11054 = alloca i1, i1 0
- %nop11055 = alloca i1, i1 0
- %nop11056 = alloca i1, i1 0
- %nop11057 = alloca i1, i1 0
- %nop11058 = alloca i1, i1 0
- %nop11059 = alloca i1, i1 0
- %nop11060 = alloca i1, i1 0
- %nop11061 = alloca i1, i1 0
- %nop11062 = alloca i1, i1 0
- %nop11063 = alloca i1, i1 0
- %nop11064 = alloca i1, i1 0
- %nop11065 = alloca i1, i1 0
- %nop11066 = alloca i1, i1 0
- %nop11067 = alloca i1, i1 0
- %nop11068 = alloca i1, i1 0
- %nop11069 = alloca i1, i1 0
- %nop11070 = alloca i1, i1 0
- %nop11071 = alloca i1, i1 0
- %nop11072 = alloca i1, i1 0
- %nop11073 = alloca i1, i1 0
- %nop11074 = alloca i1, i1 0
- %nop11075 = alloca i1, i1 0
- %nop11076 = alloca i1, i1 0
- %nop11077 = alloca i1, i1 0
- %nop11078 = alloca i1, i1 0
- %nop11079 = alloca i1, i1 0
- %nop11080 = alloca i1, i1 0
- %nop11081 = alloca i1, i1 0
- %nop11082 = alloca i1, i1 0
- %nop11083 = alloca i1, i1 0
- %nop11084 = alloca i1, i1 0
- %nop11085 = alloca i1, i1 0
- %nop11086 = alloca i1, i1 0
- %nop11087 = alloca i1, i1 0
- %nop11088 = alloca i1, i1 0
- %nop11089 = alloca i1, i1 0
- %nop11090 = alloca i1, i1 0
- %nop11091 = alloca i1, i1 0
- %nop11092 = alloca i1, i1 0
- %nop11093 = alloca i1, i1 0
- %nop11094 = alloca i1, i1 0
- %nop11095 = alloca i1, i1 0
- %nop11096 = alloca i1, i1 0
- %nop11097 = alloca i1, i1 0
- %nop11098 = alloca i1, i1 0
- %nop11099 = alloca i1, i1 0
- %nop11100 = alloca i1, i1 0
- %nop11101 = alloca i1, i1 0
- %nop11102 = alloca i1, i1 0
- %nop11103 = alloca i1, i1 0
- %nop11104 = alloca i1, i1 0
- %nop11105 = alloca i1, i1 0
- %nop11106 = alloca i1, i1 0
- %nop11107 = alloca i1, i1 0
- %nop11108 = alloca i1, i1 0
- %nop11109 = alloca i1, i1 0
- %nop11110 = alloca i1, i1 0
- %nop11111 = alloca i1, i1 0
- %nop11112 = alloca i1, i1 0
- %nop11113 = alloca i1, i1 0
- %nop11114 = alloca i1, i1 0
- %nop11115 = alloca i1, i1 0
- %nop11116 = alloca i1, i1 0
- %nop11117 = alloca i1, i1 0
- %nop11118 = alloca i1, i1 0
- %nop11119 = alloca i1, i1 0
- %nop11120 = alloca i1, i1 0
- %nop11121 = alloca i1, i1 0
- %nop11122 = alloca i1, i1 0
- %nop11123 = alloca i1, i1 0
- %nop11124 = alloca i1, i1 0
- %nop11125 = alloca i1, i1 0
- %nop11126 = alloca i1, i1 0
- %nop11127 = alloca i1, i1 0
- %nop11128 = alloca i1, i1 0
- %nop11129 = alloca i1, i1 0
- %nop11130 = alloca i1, i1 0
- %nop11131 = alloca i1, i1 0
- %nop11132 = alloca i1, i1 0
- %nop11133 = alloca i1, i1 0
- %nop11134 = alloca i1, i1 0
- %nop11135 = alloca i1, i1 0
- %nop11136 = alloca i1, i1 0
- %nop11137 = alloca i1, i1 0
- %nop11138 = alloca i1, i1 0
- %nop11139 = alloca i1, i1 0
- %nop11140 = alloca i1, i1 0
- %nop11141 = alloca i1, i1 0
- %nop11142 = alloca i1, i1 0
- %nop11143 = alloca i1, i1 0
- %nop11144 = alloca i1, i1 0
- %nop11145 = alloca i1, i1 0
- %nop11146 = alloca i1, i1 0
- %nop11147 = alloca i1, i1 0
- %nop11148 = alloca i1, i1 0
- %nop11149 = alloca i1, i1 0
- %nop11150 = alloca i1, i1 0
- %nop11151 = alloca i1, i1 0
- %nop11152 = alloca i1, i1 0
- %nop11153 = alloca i1, i1 0
- %nop11154 = alloca i1, i1 0
- %nop11155 = alloca i1, i1 0
- %nop11156 = alloca i1, i1 0
- %nop11157 = alloca i1, i1 0
- %nop11158 = alloca i1, i1 0
- %nop11159 = alloca i1, i1 0
- %nop11160 = alloca i1, i1 0
- %nop11161 = alloca i1, i1 0
- %nop11162 = alloca i1, i1 0
- %nop11163 = alloca i1, i1 0
- %nop11164 = alloca i1, i1 0
- %nop11165 = alloca i1, i1 0
- %nop11166 = alloca i1, i1 0
- %nop11167 = alloca i1, i1 0
- %nop11168 = alloca i1, i1 0
- %nop11169 = alloca i1, i1 0
- %nop11170 = alloca i1, i1 0
- %nop11171 = alloca i1, i1 0
- %nop11172 = alloca i1, i1 0
- %nop11173 = alloca i1, i1 0
- %nop11174 = alloca i1, i1 0
- %nop11175 = alloca i1, i1 0
- %nop11176 = alloca i1, i1 0
- %nop11177 = alloca i1, i1 0
- %nop11178 = alloca i1, i1 0
- %nop11179 = alloca i1, i1 0
- %nop11180 = alloca i1, i1 0
- %nop11181 = alloca i1, i1 0
- %nop11182 = alloca i1, i1 0
- %nop11183 = alloca i1, i1 0
- %nop11184 = alloca i1, i1 0
- %nop11185 = alloca i1, i1 0
- %nop11186 = alloca i1, i1 0
- %nop11187 = alloca i1, i1 0
- %nop11188 = alloca i1, i1 0
- %nop11189 = alloca i1, i1 0
- %nop11190 = alloca i1, i1 0
- %nop11191 = alloca i1, i1 0
- %nop11192 = alloca i1, i1 0
- %nop11193 = alloca i1, i1 0
- %nop11194 = alloca i1, i1 0
- %nop11195 = alloca i1, i1 0
- %nop11196 = alloca i1, i1 0
- %nop11197 = alloca i1, i1 0
- %nop11198 = alloca i1, i1 0
- %nop11199 = alloca i1, i1 0
- %nop11200 = alloca i1, i1 0
- %nop11201 = alloca i1, i1 0
- %nop11202 = alloca i1, i1 0
- %nop11203 = alloca i1, i1 0
- %nop11204 = alloca i1, i1 0
- %nop11205 = alloca i1, i1 0
- %nop11206 = alloca i1, i1 0
- %nop11207 = alloca i1, i1 0
- %nop11208 = alloca i1, i1 0
- %nop11209 = alloca i1, i1 0
- %nop11210 = alloca i1, i1 0
- %nop11211 = alloca i1, i1 0
- %nop11212 = alloca i1, i1 0
- %nop11213 = alloca i1, i1 0
- %nop11214 = alloca i1, i1 0
- %nop11215 = alloca i1, i1 0
- %nop11216 = alloca i1, i1 0
- %nop11217 = alloca i1, i1 0
- %nop11218 = alloca i1, i1 0
- %nop11219 = alloca i1, i1 0
- %nop11220 = alloca i1, i1 0
- %nop11221 = alloca i1, i1 0
- %nop11222 = alloca i1, i1 0
- %nop11223 = alloca i1, i1 0
- %nop11224 = alloca i1, i1 0
- %nop11225 = alloca i1, i1 0
- %nop11226 = alloca i1, i1 0
- %nop11227 = alloca i1, i1 0
- %nop11228 = alloca i1, i1 0
- %nop11229 = alloca i1, i1 0
- %nop11230 = alloca i1, i1 0
- %nop11231 = alloca i1, i1 0
- %nop11232 = alloca i1, i1 0
- %nop11233 = alloca i1, i1 0
- %nop11234 = alloca i1, i1 0
- %nop11235 = alloca i1, i1 0
- %nop11236 = alloca i1, i1 0
- %nop11237 = alloca i1, i1 0
- %nop11238 = alloca i1, i1 0
- %nop11239 = alloca i1, i1 0
- %nop11240 = alloca i1, i1 0
- %nop11241 = alloca i1, i1 0
- %nop11242 = alloca i1, i1 0
- %nop11243 = alloca i1, i1 0
- %nop11244 = alloca i1, i1 0
- %nop11245 = alloca i1, i1 0
- %nop11246 = alloca i1, i1 0
- %nop11247 = alloca i1, i1 0
- %nop11248 = alloca i1, i1 0
- %nop11249 = alloca i1, i1 0
- %nop11250 = alloca i1, i1 0
- %nop11251 = alloca i1, i1 0
- %nop11252 = alloca i1, i1 0
- %nop11253 = alloca i1, i1 0
- %nop11254 = alloca i1, i1 0
- %nop11255 = alloca i1, i1 0
- %nop11256 = alloca i1, i1 0
- %nop11257 = alloca i1, i1 0
- %nop11258 = alloca i1, i1 0
- %nop11259 = alloca i1, i1 0
- %nop11260 = alloca i1, i1 0
- %nop11261 = alloca i1, i1 0
- %nop11262 = alloca i1, i1 0
- %nop11263 = alloca i1, i1 0
- %nop11264 = alloca i1, i1 0
- %nop11265 = alloca i1, i1 0
- %nop11266 = alloca i1, i1 0
- %nop11267 = alloca i1, i1 0
- %nop11268 = alloca i1, i1 0
- %nop11269 = alloca i1, i1 0
- %nop11270 = alloca i1, i1 0
- %nop11271 = alloca i1, i1 0
- %nop11272 = alloca i1, i1 0
- %nop11273 = alloca i1, i1 0
- %nop11274 = alloca i1, i1 0
- %nop11275 = alloca i1, i1 0
- %nop11276 = alloca i1, i1 0
- %nop11277 = alloca i1, i1 0
- %nop11278 = alloca i1, i1 0
- %nop11279 = alloca i1, i1 0
- %nop11280 = alloca i1, i1 0
- %nop11281 = alloca i1, i1 0
- %nop11282 = alloca i1, i1 0
- %nop11283 = alloca i1, i1 0
- %nop11284 = alloca i1, i1 0
- %nop11285 = alloca i1, i1 0
- %nop11286 = alloca i1, i1 0
- %nop11287 = alloca i1, i1 0
- %nop11288 = alloca i1, i1 0
- %nop11289 = alloca i1, i1 0
- %nop11290 = alloca i1, i1 0
- %nop11291 = alloca i1, i1 0
- %nop11292 = alloca i1, i1 0
- %nop11293 = alloca i1, i1 0
- %nop11294 = alloca i1, i1 0
- %nop11295 = alloca i1, i1 0
- %nop11296 = alloca i1, i1 0
- %nop11297 = alloca i1, i1 0
- %nop11298 = alloca i1, i1 0
- %nop11299 = alloca i1, i1 0
- %nop11300 = alloca i1, i1 0
- %nop11301 = alloca i1, i1 0
- %nop11302 = alloca i1, i1 0
- %nop11303 = alloca i1, i1 0
- %nop11304 = alloca i1, i1 0
- %nop11305 = alloca i1, i1 0
- %nop11306 = alloca i1, i1 0
- %nop11307 = alloca i1, i1 0
- %nop11308 = alloca i1, i1 0
- %nop11309 = alloca i1, i1 0
- %nop11310 = alloca i1, i1 0
- %nop11311 = alloca i1, i1 0
- %nop11312 = alloca i1, i1 0
- %nop11313 = alloca i1, i1 0
- %nop11314 = alloca i1, i1 0
- %nop11315 = alloca i1, i1 0
- %nop11316 = alloca i1, i1 0
- %nop11317 = alloca i1, i1 0
- %nop11318 = alloca i1, i1 0
- %nop11319 = alloca i1, i1 0
- %nop11320 = alloca i1, i1 0
- %nop11321 = alloca i1, i1 0
- %nop11322 = alloca i1, i1 0
- %nop11323 = alloca i1, i1 0
- %nop11324 = alloca i1, i1 0
- %nop11325 = alloca i1, i1 0
- %nop11326 = alloca i1, i1 0
- %nop11327 = alloca i1, i1 0
- %nop11328 = alloca i1, i1 0
- %nop11329 = alloca i1, i1 0
- %nop11330 = alloca i1, i1 0
- %nop11331 = alloca i1, i1 0
- %nop11332 = alloca i1, i1 0
- %nop11333 = alloca i1, i1 0
- %nop11334 = alloca i1, i1 0
- %nop11335 = alloca i1, i1 0
- %nop11336 = alloca i1, i1 0
- %nop11337 = alloca i1, i1 0
- %nop11338 = alloca i1, i1 0
- %nop11339 = alloca i1, i1 0
- %nop11340 = alloca i1, i1 0
- %nop11341 = alloca i1, i1 0
- %nop11342 = alloca i1, i1 0
- %nop11343 = alloca i1, i1 0
- %nop11344 = alloca i1, i1 0
- %nop11345 = alloca i1, i1 0
- %nop11346 = alloca i1, i1 0
- %nop11347 = alloca i1, i1 0
- %nop11348 = alloca i1, i1 0
- %nop11349 = alloca i1, i1 0
- %nop11350 = alloca i1, i1 0
- %nop11351 = alloca i1, i1 0
- %nop11352 = alloca i1, i1 0
- %nop11353 = alloca i1, i1 0
- %nop11354 = alloca i1, i1 0
- %nop11355 = alloca i1, i1 0
- %nop11356 = alloca i1, i1 0
- %nop11357 = alloca i1, i1 0
- %nop11358 = alloca i1, i1 0
- %nop11359 = alloca i1, i1 0
- %nop11360 = alloca i1, i1 0
- %nop11361 = alloca i1, i1 0
- %nop11362 = alloca i1, i1 0
- %nop11363 = alloca i1, i1 0
- %nop11364 = alloca i1, i1 0
- %nop11365 = alloca i1, i1 0
- %nop11366 = alloca i1, i1 0
- %nop11367 = alloca i1, i1 0
- %nop11368 = alloca i1, i1 0
- %nop11369 = alloca i1, i1 0
- %nop11370 = alloca i1, i1 0
- %nop11371 = alloca i1, i1 0
- %nop11372 = alloca i1, i1 0
- %nop11373 = alloca i1, i1 0
- %nop11374 = alloca i1, i1 0
- %nop11375 = alloca i1, i1 0
- %nop11376 = alloca i1, i1 0
- %nop11377 = alloca i1, i1 0
- %nop11378 = alloca i1, i1 0
- %nop11379 = alloca i1, i1 0
- %nop11380 = alloca i1, i1 0
- %nop11381 = alloca i1, i1 0
- %nop11382 = alloca i1, i1 0
- %nop11383 = alloca i1, i1 0
- %nop11384 = alloca i1, i1 0
- %nop11385 = alloca i1, i1 0
- %nop11386 = alloca i1, i1 0
- %nop11387 = alloca i1, i1 0
- %nop11388 = alloca i1, i1 0
- %nop11389 = alloca i1, i1 0
- %nop11390 = alloca i1, i1 0
- %nop11391 = alloca i1, i1 0
- %nop11392 = alloca i1, i1 0
- %nop11393 = alloca i1, i1 0
- %nop11394 = alloca i1, i1 0
- %nop11395 = alloca i1, i1 0
- %nop11396 = alloca i1, i1 0
- %nop11397 = alloca i1, i1 0
- %nop11398 = alloca i1, i1 0
- %nop11399 = alloca i1, i1 0
- %nop11400 = alloca i1, i1 0
- %nop11401 = alloca i1, i1 0
- %nop11402 = alloca i1, i1 0
- %nop11403 = alloca i1, i1 0
- %nop11404 = alloca i1, i1 0
- %nop11405 = alloca i1, i1 0
- %nop11406 = alloca i1, i1 0
- %nop11407 = alloca i1, i1 0
- %nop11408 = alloca i1, i1 0
- %nop11409 = alloca i1, i1 0
- %nop11410 = alloca i1, i1 0
- %nop11411 = alloca i1, i1 0
- %nop11412 = alloca i1, i1 0
- %nop11413 = alloca i1, i1 0
- %nop11414 = alloca i1, i1 0
- %nop11415 = alloca i1, i1 0
- %nop11416 = alloca i1, i1 0
- %nop11417 = alloca i1, i1 0
- %nop11418 = alloca i1, i1 0
- %nop11419 = alloca i1, i1 0
- %nop11420 = alloca i1, i1 0
- %nop11421 = alloca i1, i1 0
- %nop11422 = alloca i1, i1 0
- %nop11423 = alloca i1, i1 0
- %nop11424 = alloca i1, i1 0
- %nop11425 = alloca i1, i1 0
- %nop11426 = alloca i1, i1 0
- %nop11427 = alloca i1, i1 0
- %nop11428 = alloca i1, i1 0
- %nop11429 = alloca i1, i1 0
- %nop11430 = alloca i1, i1 0
- %nop11431 = alloca i1, i1 0
- %nop11432 = alloca i1, i1 0
- %nop11433 = alloca i1, i1 0
- %nop11434 = alloca i1, i1 0
- %nop11435 = alloca i1, i1 0
- %nop11436 = alloca i1, i1 0
- %nop11437 = alloca i1, i1 0
- %nop11438 = alloca i1, i1 0
- %nop11439 = alloca i1, i1 0
- %nop11440 = alloca i1, i1 0
- %nop11441 = alloca i1, i1 0
- %nop11442 = alloca i1, i1 0
- %nop11443 = alloca i1, i1 0
- %nop11444 = alloca i1, i1 0
- %nop11445 = alloca i1, i1 0
- %nop11446 = alloca i1, i1 0
- %nop11447 = alloca i1, i1 0
- %nop11448 = alloca i1, i1 0
- %nop11449 = alloca i1, i1 0
- %nop11450 = alloca i1, i1 0
- %nop11451 = alloca i1, i1 0
- %nop11452 = alloca i1, i1 0
- %nop11453 = alloca i1, i1 0
- %nop11454 = alloca i1, i1 0
- %nop11455 = alloca i1, i1 0
- %nop11456 = alloca i1, i1 0
- %nop11457 = alloca i1, i1 0
- %nop11458 = alloca i1, i1 0
- %nop11459 = alloca i1, i1 0
- %nop11460 = alloca i1, i1 0
- %nop11461 = alloca i1, i1 0
- %nop11462 = alloca i1, i1 0
- %nop11463 = alloca i1, i1 0
- %nop11464 = alloca i1, i1 0
- %nop11465 = alloca i1, i1 0
- %nop11466 = alloca i1, i1 0
- %nop11467 = alloca i1, i1 0
- %nop11468 = alloca i1, i1 0
- %nop11469 = alloca i1, i1 0
- %nop11470 = alloca i1, i1 0
- %nop11471 = alloca i1, i1 0
- %nop11472 = alloca i1, i1 0
- %nop11473 = alloca i1, i1 0
- %nop11474 = alloca i1, i1 0
- %nop11475 = alloca i1, i1 0
- %nop11476 = alloca i1, i1 0
- %nop11477 = alloca i1, i1 0
- %nop11478 = alloca i1, i1 0
- %nop11479 = alloca i1, i1 0
- %nop11480 = alloca i1, i1 0
- %nop11481 = alloca i1, i1 0
- %nop11482 = alloca i1, i1 0
- %nop11483 = alloca i1, i1 0
- %nop11484 = alloca i1, i1 0
- %nop11485 = alloca i1, i1 0
- %nop11486 = alloca i1, i1 0
- %nop11487 = alloca i1, i1 0
- %nop11488 = alloca i1, i1 0
- %nop11489 = alloca i1, i1 0
- %nop11490 = alloca i1, i1 0
- %nop11491 = alloca i1, i1 0
- %nop11492 = alloca i1, i1 0
- %nop11493 = alloca i1, i1 0
- %nop11494 = alloca i1, i1 0
- %nop11495 = alloca i1, i1 0
- %nop11496 = alloca i1, i1 0
- %nop11497 = alloca i1, i1 0
- %nop11498 = alloca i1, i1 0
- %nop11499 = alloca i1, i1 0
- %nop11500 = alloca i1, i1 0
- %nop11501 = alloca i1, i1 0
- %nop11502 = alloca i1, i1 0
- %nop11503 = alloca i1, i1 0
- %nop11504 = alloca i1, i1 0
- %nop11505 = alloca i1, i1 0
- %nop11506 = alloca i1, i1 0
- %nop11507 = alloca i1, i1 0
- %nop11508 = alloca i1, i1 0
- %nop11509 = alloca i1, i1 0
- %nop11510 = alloca i1, i1 0
- %nop11511 = alloca i1, i1 0
- %nop11512 = alloca i1, i1 0
- %nop11513 = alloca i1, i1 0
- %nop11514 = alloca i1, i1 0
- %nop11515 = alloca i1, i1 0
- %nop11516 = alloca i1, i1 0
- %nop11517 = alloca i1, i1 0
- %nop11518 = alloca i1, i1 0
- %nop11519 = alloca i1, i1 0
- %nop11520 = alloca i1, i1 0
- %nop11521 = alloca i1, i1 0
- %nop11522 = alloca i1, i1 0
- %nop11523 = alloca i1, i1 0
- %nop11524 = alloca i1, i1 0
- %nop11525 = alloca i1, i1 0
- %nop11526 = alloca i1, i1 0
- %nop11527 = alloca i1, i1 0
- %nop11528 = alloca i1, i1 0
- %nop11529 = alloca i1, i1 0
- %nop11530 = alloca i1, i1 0
- %nop11531 = alloca i1, i1 0
- %nop11532 = alloca i1, i1 0
- %nop11533 = alloca i1, i1 0
- %nop11534 = alloca i1, i1 0
- %nop11535 = alloca i1, i1 0
- %nop11536 = alloca i1, i1 0
- %nop11537 = alloca i1, i1 0
- %nop11538 = alloca i1, i1 0
- %nop11539 = alloca i1, i1 0
- %nop11540 = alloca i1, i1 0
- %nop11541 = alloca i1, i1 0
- %nop11542 = alloca i1, i1 0
- %nop11543 = alloca i1, i1 0
- %nop11544 = alloca i1, i1 0
- %nop11545 = alloca i1, i1 0
- %nop11546 = alloca i1, i1 0
- %nop11547 = alloca i1, i1 0
- %nop11548 = alloca i1, i1 0
- %nop11549 = alloca i1, i1 0
- %nop11550 = alloca i1, i1 0
- %nop11551 = alloca i1, i1 0
- %nop11552 = alloca i1, i1 0
- %nop11553 = alloca i1, i1 0
- %nop11554 = alloca i1, i1 0
- %nop11555 = alloca i1, i1 0
- %nop11556 = alloca i1, i1 0
- %nop11557 = alloca i1, i1 0
- %nop11558 = alloca i1, i1 0
- %nop11559 = alloca i1, i1 0
- %nop11560 = alloca i1, i1 0
- %nop11561 = alloca i1, i1 0
- %nop11562 = alloca i1, i1 0
- %nop11563 = alloca i1, i1 0
- %nop11564 = alloca i1, i1 0
- %nop11565 = alloca i1, i1 0
- %nop11566 = alloca i1, i1 0
- %nop11567 = alloca i1, i1 0
- %nop11568 = alloca i1, i1 0
- %nop11569 = alloca i1, i1 0
- %nop11570 = alloca i1, i1 0
- %nop11571 = alloca i1, i1 0
- %nop11572 = alloca i1, i1 0
- %nop11573 = alloca i1, i1 0
- %nop11574 = alloca i1, i1 0
- %nop11575 = alloca i1, i1 0
- %nop11576 = alloca i1, i1 0
- %nop11577 = alloca i1, i1 0
- %nop11578 = alloca i1, i1 0
- %nop11579 = alloca i1, i1 0
- %nop11580 = alloca i1, i1 0
- %nop11581 = alloca i1, i1 0
- %nop11582 = alloca i1, i1 0
- %nop11583 = alloca i1, i1 0
- %nop11584 = alloca i1, i1 0
- %nop11585 = alloca i1, i1 0
- %nop11586 = alloca i1, i1 0
- %nop11587 = alloca i1, i1 0
- %nop11588 = alloca i1, i1 0
- %nop11589 = alloca i1, i1 0
- %nop11590 = alloca i1, i1 0
- %nop11591 = alloca i1, i1 0
- %nop11592 = alloca i1, i1 0
- %nop11593 = alloca i1, i1 0
- %nop11594 = alloca i1, i1 0
- %nop11595 = alloca i1, i1 0
- %nop11596 = alloca i1, i1 0
- %nop11597 = alloca i1, i1 0
- %nop11598 = alloca i1, i1 0
- %nop11599 = alloca i1, i1 0
- %nop11600 = alloca i1, i1 0
- %nop11601 = alloca i1, i1 0
- %nop11602 = alloca i1, i1 0
- %nop11603 = alloca i1, i1 0
- %nop11604 = alloca i1, i1 0
- %nop11605 = alloca i1, i1 0
- %nop11606 = alloca i1, i1 0
- %nop11607 = alloca i1, i1 0
- %nop11608 = alloca i1, i1 0
- %nop11609 = alloca i1, i1 0
- %nop11610 = alloca i1, i1 0
- %nop11611 = alloca i1, i1 0
- %nop11612 = alloca i1, i1 0
- %nop11613 = alloca i1, i1 0
- %nop11614 = alloca i1, i1 0
- %nop11615 = alloca i1, i1 0
- %nop11616 = alloca i1, i1 0
- %nop11617 = alloca i1, i1 0
- %nop11618 = alloca i1, i1 0
- %nop11619 = alloca i1, i1 0
- %nop11620 = alloca i1, i1 0
- %nop11621 = alloca i1, i1 0
- %nop11622 = alloca i1, i1 0
- %nop11623 = alloca i1, i1 0
- %nop11624 = alloca i1, i1 0
- %nop11625 = alloca i1, i1 0
- %nop11626 = alloca i1, i1 0
- %nop11627 = alloca i1, i1 0
- %nop11628 = alloca i1, i1 0
- %nop11629 = alloca i1, i1 0
- %nop11630 = alloca i1, i1 0
- %nop11631 = alloca i1, i1 0
- %nop11632 = alloca i1, i1 0
- %nop11633 = alloca i1, i1 0
- %nop11634 = alloca i1, i1 0
- %nop11635 = alloca i1, i1 0
- %nop11636 = alloca i1, i1 0
- %nop11637 = alloca i1, i1 0
- %nop11638 = alloca i1, i1 0
- %nop11639 = alloca i1, i1 0
- %nop11640 = alloca i1, i1 0
- %nop11641 = alloca i1, i1 0
- %nop11642 = alloca i1, i1 0
- %nop11643 = alloca i1, i1 0
- %nop11644 = alloca i1, i1 0
- %nop11645 = alloca i1, i1 0
- %nop11646 = alloca i1, i1 0
- %nop11647 = alloca i1, i1 0
- %nop11648 = alloca i1, i1 0
- %nop11649 = alloca i1, i1 0
- %nop11650 = alloca i1, i1 0
- %nop11651 = alloca i1, i1 0
- %nop11652 = alloca i1, i1 0
- %nop11653 = alloca i1, i1 0
- %nop11654 = alloca i1, i1 0
- %nop11655 = alloca i1, i1 0
- %nop11656 = alloca i1, i1 0
- %nop11657 = alloca i1, i1 0
- %nop11658 = alloca i1, i1 0
- %nop11659 = alloca i1, i1 0
- %nop11660 = alloca i1, i1 0
- %nop11661 = alloca i1, i1 0
- %nop11662 = alloca i1, i1 0
- %nop11663 = alloca i1, i1 0
- %nop11664 = alloca i1, i1 0
- %nop11665 = alloca i1, i1 0
- %nop11666 = alloca i1, i1 0
- %nop11667 = alloca i1, i1 0
- %nop11668 = alloca i1, i1 0
- %nop11669 = alloca i1, i1 0
- %nop11670 = alloca i1, i1 0
- %nop11671 = alloca i1, i1 0
- %nop11672 = alloca i1, i1 0
- %nop11673 = alloca i1, i1 0
- %nop11674 = alloca i1, i1 0
- %nop11675 = alloca i1, i1 0
- %nop11676 = alloca i1, i1 0
- %nop11677 = alloca i1, i1 0
- %nop11678 = alloca i1, i1 0
- %nop11679 = alloca i1, i1 0
- %nop11680 = alloca i1, i1 0
- %nop11681 = alloca i1, i1 0
- %nop11682 = alloca i1, i1 0
- %nop11683 = alloca i1, i1 0
- %nop11684 = alloca i1, i1 0
- %nop11685 = alloca i1, i1 0
- %nop11686 = alloca i1, i1 0
- %nop11687 = alloca i1, i1 0
- %nop11688 = alloca i1, i1 0
- %nop11689 = alloca i1, i1 0
- %nop11690 = alloca i1, i1 0
- %nop11691 = alloca i1, i1 0
- %nop11692 = alloca i1, i1 0
- %nop11693 = alloca i1, i1 0
- %nop11694 = alloca i1, i1 0
- %nop11695 = alloca i1, i1 0
- %nop11696 = alloca i1, i1 0
- %nop11697 = alloca i1, i1 0
- %nop11698 = alloca i1, i1 0
- %nop11699 = alloca i1, i1 0
- %nop11700 = alloca i1, i1 0
- %nop11701 = alloca i1, i1 0
- %nop11702 = alloca i1, i1 0
- %nop11703 = alloca i1, i1 0
- %nop11704 = alloca i1, i1 0
- %nop11705 = alloca i1, i1 0
- %nop11706 = alloca i1, i1 0
- %nop11707 = alloca i1, i1 0
- %nop11708 = alloca i1, i1 0
- %nop11709 = alloca i1, i1 0
- %nop11710 = alloca i1, i1 0
- %nop11711 = alloca i1, i1 0
- %nop11712 = alloca i1, i1 0
- %nop11713 = alloca i1, i1 0
- %nop11714 = alloca i1, i1 0
- %nop11715 = alloca i1, i1 0
- %nop11716 = alloca i1, i1 0
- %nop11717 = alloca i1, i1 0
- %nop11718 = alloca i1, i1 0
- %nop11719 = alloca i1, i1 0
- %nop11720 = alloca i1, i1 0
- %nop11721 = alloca i1, i1 0
- %nop11722 = alloca i1, i1 0
- %nop11723 = alloca i1, i1 0
- %nop11724 = alloca i1, i1 0
- %nop11725 = alloca i1, i1 0
- %nop11726 = alloca i1, i1 0
- %nop11727 = alloca i1, i1 0
- %nop11728 = alloca i1, i1 0
- %nop11729 = alloca i1, i1 0
- %nop11730 = alloca i1, i1 0
- %nop11731 = alloca i1, i1 0
- %nop11732 = alloca i1, i1 0
- %nop11733 = alloca i1, i1 0
- %nop11734 = alloca i1, i1 0
- %nop11735 = alloca i1, i1 0
- %nop11736 = alloca i1, i1 0
- %nop11737 = alloca i1, i1 0
- %nop11738 = alloca i1, i1 0
- %nop11739 = alloca i1, i1 0
- %nop11740 = alloca i1, i1 0
- %nop11741 = alloca i1, i1 0
- %nop11742 = alloca i1, i1 0
- %nop11743 = alloca i1, i1 0
- %nop11744 = alloca i1, i1 0
- %nop11745 = alloca i1, i1 0
- %nop11746 = alloca i1, i1 0
- %nop11747 = alloca i1, i1 0
- %nop11748 = alloca i1, i1 0
- %nop11749 = alloca i1, i1 0
- %nop11750 = alloca i1, i1 0
- %nop11751 = alloca i1, i1 0
- %nop11752 = alloca i1, i1 0
- %nop11753 = alloca i1, i1 0
- %nop11754 = alloca i1, i1 0
- %nop11755 = alloca i1, i1 0
- %nop11756 = alloca i1, i1 0
- %nop11757 = alloca i1, i1 0
- %nop11758 = alloca i1, i1 0
- %nop11759 = alloca i1, i1 0
- %nop11760 = alloca i1, i1 0
- %nop11761 = alloca i1, i1 0
- %nop11762 = alloca i1, i1 0
- %nop11763 = alloca i1, i1 0
- %nop11764 = alloca i1, i1 0
- %nop11765 = alloca i1, i1 0
- %nop11766 = alloca i1, i1 0
- %nop11767 = alloca i1, i1 0
- %nop11768 = alloca i1, i1 0
- %nop11769 = alloca i1, i1 0
- %nop11770 = alloca i1, i1 0
- %nop11771 = alloca i1, i1 0
- %nop11772 = alloca i1, i1 0
- %nop11773 = alloca i1, i1 0
- %nop11774 = alloca i1, i1 0
- %nop11775 = alloca i1, i1 0
- %nop11776 = alloca i1, i1 0
- %nop11777 = alloca i1, i1 0
- %nop11778 = alloca i1, i1 0
- %nop11779 = alloca i1, i1 0
- %nop11780 = alloca i1, i1 0
- %nop11781 = alloca i1, i1 0
- %nop11782 = alloca i1, i1 0
- %nop11783 = alloca i1, i1 0
- %nop11784 = alloca i1, i1 0
- %nop11785 = alloca i1, i1 0
- %nop11786 = alloca i1, i1 0
- %nop11787 = alloca i1, i1 0
- %nop11788 = alloca i1, i1 0
- %nop11789 = alloca i1, i1 0
- %nop11790 = alloca i1, i1 0
- %nop11791 = alloca i1, i1 0
- %nop11792 = alloca i1, i1 0
- %nop11793 = alloca i1, i1 0
- %nop11794 = alloca i1, i1 0
- %nop11795 = alloca i1, i1 0
- %nop11796 = alloca i1, i1 0
- %nop11797 = alloca i1, i1 0
- %nop11798 = alloca i1, i1 0
- %nop11799 = alloca i1, i1 0
- %nop11800 = alloca i1, i1 0
- %nop11801 = alloca i1, i1 0
- %nop11802 = alloca i1, i1 0
- %nop11803 = alloca i1, i1 0
- %nop11804 = alloca i1, i1 0
- %nop11805 = alloca i1, i1 0
- %nop11806 = alloca i1, i1 0
- %nop11807 = alloca i1, i1 0
- %nop11808 = alloca i1, i1 0
- %nop11809 = alloca i1, i1 0
- %nop11810 = alloca i1, i1 0
- %nop11811 = alloca i1, i1 0
- %nop11812 = alloca i1, i1 0
- %nop11813 = alloca i1, i1 0
- %nop11814 = alloca i1, i1 0
- %nop11815 = alloca i1, i1 0
- %nop11816 = alloca i1, i1 0
- %nop11817 = alloca i1, i1 0
- %nop11818 = alloca i1, i1 0
- %nop11819 = alloca i1, i1 0
- %nop11820 = alloca i1, i1 0
- %nop11821 = alloca i1, i1 0
- %nop11822 = alloca i1, i1 0
- %nop11823 = alloca i1, i1 0
- %nop11824 = alloca i1, i1 0
- %nop11825 = alloca i1, i1 0
- %nop11826 = alloca i1, i1 0
- %nop11827 = alloca i1, i1 0
- %nop11828 = alloca i1, i1 0
- %nop11829 = alloca i1, i1 0
- %nop11830 = alloca i1, i1 0
- %nop11831 = alloca i1, i1 0
- %nop11832 = alloca i1, i1 0
- %nop11833 = alloca i1, i1 0
- %nop11834 = alloca i1, i1 0
- %nop11835 = alloca i1, i1 0
- %nop11836 = alloca i1, i1 0
- %nop11837 = alloca i1, i1 0
- %nop11838 = alloca i1, i1 0
- %nop11839 = alloca i1, i1 0
- %nop11840 = alloca i1, i1 0
- %nop11841 = alloca i1, i1 0
- %nop11842 = alloca i1, i1 0
- %nop11843 = alloca i1, i1 0
- %nop11844 = alloca i1, i1 0
- %nop11845 = alloca i1, i1 0
- %nop11846 = alloca i1, i1 0
- %nop11847 = alloca i1, i1 0
- %nop11848 = alloca i1, i1 0
- %nop11849 = alloca i1, i1 0
- %nop11850 = alloca i1, i1 0
- %nop11851 = alloca i1, i1 0
- %nop11852 = alloca i1, i1 0
- %nop11853 = alloca i1, i1 0
- %nop11854 = alloca i1, i1 0
- %nop11855 = alloca i1, i1 0
- %nop11856 = alloca i1, i1 0
- %nop11857 = alloca i1, i1 0
- %nop11858 = alloca i1, i1 0
- %nop11859 = alloca i1, i1 0
- %nop11860 = alloca i1, i1 0
- %nop11861 = alloca i1, i1 0
- %nop11862 = alloca i1, i1 0
- %nop11863 = alloca i1, i1 0
- %nop11864 = alloca i1, i1 0
- %nop11865 = alloca i1, i1 0
- %nop11866 = alloca i1, i1 0
- %nop11867 = alloca i1, i1 0
- %nop11868 = alloca i1, i1 0
- %nop11869 = alloca i1, i1 0
- %nop11870 = alloca i1, i1 0
- %nop11871 = alloca i1, i1 0
- %nop11872 = alloca i1, i1 0
- %nop11873 = alloca i1, i1 0
- %nop11874 = alloca i1, i1 0
- %nop11875 = alloca i1, i1 0
- %nop11876 = alloca i1, i1 0
- %nop11877 = alloca i1, i1 0
- %nop11878 = alloca i1, i1 0
- %nop11879 = alloca i1, i1 0
- %nop11880 = alloca i1, i1 0
- %nop11881 = alloca i1, i1 0
- %nop11882 = alloca i1, i1 0
- %nop11883 = alloca i1, i1 0
- %nop11884 = alloca i1, i1 0
- %nop11885 = alloca i1, i1 0
- %nop11886 = alloca i1, i1 0
- %nop11887 = alloca i1, i1 0
- %nop11888 = alloca i1, i1 0
- %nop11889 = alloca i1, i1 0
- %nop11890 = alloca i1, i1 0
- %nop11891 = alloca i1, i1 0
- %nop11892 = alloca i1, i1 0
- %nop11893 = alloca i1, i1 0
- %nop11894 = alloca i1, i1 0
- %nop11895 = alloca i1, i1 0
- %nop11896 = alloca i1, i1 0
- %nop11897 = alloca i1, i1 0
- %nop11898 = alloca i1, i1 0
- %nop11899 = alloca i1, i1 0
- %nop11900 = alloca i1, i1 0
- %nop11901 = alloca i1, i1 0
- %nop11902 = alloca i1, i1 0
- %nop11903 = alloca i1, i1 0
- %nop11904 = alloca i1, i1 0
- %nop11905 = alloca i1, i1 0
- %nop11906 = alloca i1, i1 0
- %nop11907 = alloca i1, i1 0
- %nop11908 = alloca i1, i1 0
- %nop11909 = alloca i1, i1 0
- %nop11910 = alloca i1, i1 0
- %nop11911 = alloca i1, i1 0
- %nop11912 = alloca i1, i1 0
- %nop11913 = alloca i1, i1 0
- %nop11914 = alloca i1, i1 0
- %nop11915 = alloca i1, i1 0
- %nop11916 = alloca i1, i1 0
- %nop11917 = alloca i1, i1 0
- %nop11918 = alloca i1, i1 0
- %nop11919 = alloca i1, i1 0
- %nop11920 = alloca i1, i1 0
- %nop11921 = alloca i1, i1 0
- %nop11922 = alloca i1, i1 0
- %nop11923 = alloca i1, i1 0
- %nop11924 = alloca i1, i1 0
- %nop11925 = alloca i1, i1 0
- %nop11926 = alloca i1, i1 0
- %nop11927 = alloca i1, i1 0
- %nop11928 = alloca i1, i1 0
- %nop11929 = alloca i1, i1 0
- %nop11930 = alloca i1, i1 0
- %nop11931 = alloca i1, i1 0
- %nop11932 = alloca i1, i1 0
- %nop11933 = alloca i1, i1 0
- %nop11934 = alloca i1, i1 0
- %nop11935 = alloca i1, i1 0
- %nop11936 = alloca i1, i1 0
- %nop11937 = alloca i1, i1 0
- %nop11938 = alloca i1, i1 0
- %nop11939 = alloca i1, i1 0
- %nop11940 = alloca i1, i1 0
- %nop11941 = alloca i1, i1 0
- %nop11942 = alloca i1, i1 0
- %nop11943 = alloca i1, i1 0
- %nop11944 = alloca i1, i1 0
- %nop11945 = alloca i1, i1 0
- %nop11946 = alloca i1, i1 0
- %nop11947 = alloca i1, i1 0
- %nop11948 = alloca i1, i1 0
- %nop11949 = alloca i1, i1 0
- %nop11950 = alloca i1, i1 0
- %nop11951 = alloca i1, i1 0
- %nop11952 = alloca i1, i1 0
- %nop11953 = alloca i1, i1 0
- %nop11954 = alloca i1, i1 0
- %nop11955 = alloca i1, i1 0
- %nop11956 = alloca i1, i1 0
- %nop11957 = alloca i1, i1 0
- %nop11958 = alloca i1, i1 0
- %nop11959 = alloca i1, i1 0
- %nop11960 = alloca i1, i1 0
- %nop11961 = alloca i1, i1 0
- %nop11962 = alloca i1, i1 0
- %nop11963 = alloca i1, i1 0
- %nop11964 = alloca i1, i1 0
- %nop11965 = alloca i1, i1 0
- %nop11966 = alloca i1, i1 0
- %nop11967 = alloca i1, i1 0
- %nop11968 = alloca i1, i1 0
- %nop11969 = alloca i1, i1 0
- %nop11970 = alloca i1, i1 0
- %nop11971 = alloca i1, i1 0
- %nop11972 = alloca i1, i1 0
- %nop11973 = alloca i1, i1 0
- %nop11974 = alloca i1, i1 0
- %nop11975 = alloca i1, i1 0
- %nop11976 = alloca i1, i1 0
- %nop11977 = alloca i1, i1 0
- %nop11978 = alloca i1, i1 0
- %nop11979 = alloca i1, i1 0
- %nop11980 = alloca i1, i1 0
- %nop11981 = alloca i1, i1 0
- %nop11982 = alloca i1, i1 0
- %nop11983 = alloca i1, i1 0
- %nop11984 = alloca i1, i1 0
- %nop11985 = alloca i1, i1 0
- %nop11986 = alloca i1, i1 0
- %nop11987 = alloca i1, i1 0
- %nop11988 = alloca i1, i1 0
- %nop11989 = alloca i1, i1 0
- %nop11990 = alloca i1, i1 0
- %nop11991 = alloca i1, i1 0
- %nop11992 = alloca i1, i1 0
- %nop11993 = alloca i1, i1 0
- %nop11994 = alloca i1, i1 0
- %nop11995 = alloca i1, i1 0
- %nop11996 = alloca i1, i1 0
- %nop11997 = alloca i1, i1 0
- %nop11998 = alloca i1, i1 0
- %nop11999 = alloca i1, i1 0
- %nop12000 = alloca i1, i1 0
- %nop12001 = alloca i1, i1 0
- %nop12002 = alloca i1, i1 0
- %nop12003 = alloca i1, i1 0
- %nop12004 = alloca i1, i1 0
- %nop12005 = alloca i1, i1 0
- %nop12006 = alloca i1, i1 0
- %nop12007 = alloca i1, i1 0
- %nop12008 = alloca i1, i1 0
- %nop12009 = alloca i1, i1 0
- %nop12010 = alloca i1, i1 0
- %nop12011 = alloca i1, i1 0
- %nop12012 = alloca i1, i1 0
- %nop12013 = alloca i1, i1 0
- %nop12014 = alloca i1, i1 0
- %nop12015 = alloca i1, i1 0
- %nop12016 = alloca i1, i1 0
- %nop12017 = alloca i1, i1 0
- %nop12018 = alloca i1, i1 0
- %nop12019 = alloca i1, i1 0
- %nop12020 = alloca i1, i1 0
- %nop12021 = alloca i1, i1 0
- %nop12022 = alloca i1, i1 0
- %nop12023 = alloca i1, i1 0
- %nop12024 = alloca i1, i1 0
- %nop12025 = alloca i1, i1 0
- %nop12026 = alloca i1, i1 0
- %nop12027 = alloca i1, i1 0
- %nop12028 = alloca i1, i1 0
- %nop12029 = alloca i1, i1 0
- %nop12030 = alloca i1, i1 0
- %nop12031 = alloca i1, i1 0
- %nop12032 = alloca i1, i1 0
- %nop12033 = alloca i1, i1 0
- %nop12034 = alloca i1, i1 0
- %nop12035 = alloca i1, i1 0
- %nop12036 = alloca i1, i1 0
- %nop12037 = alloca i1, i1 0
- %nop12038 = alloca i1, i1 0
- %nop12039 = alloca i1, i1 0
- %nop12040 = alloca i1, i1 0
- %nop12041 = alloca i1, i1 0
- %nop12042 = alloca i1, i1 0
- %nop12043 = alloca i1, i1 0
- %nop12044 = alloca i1, i1 0
- %nop12045 = alloca i1, i1 0
- %nop12046 = alloca i1, i1 0
- %nop12047 = alloca i1, i1 0
- %nop12048 = alloca i1, i1 0
- %nop12049 = alloca i1, i1 0
- %nop12050 = alloca i1, i1 0
- %nop12051 = alloca i1, i1 0
- %nop12052 = alloca i1, i1 0
- %nop12053 = alloca i1, i1 0
- %nop12054 = alloca i1, i1 0
- %nop12055 = alloca i1, i1 0
- %nop12056 = alloca i1, i1 0
- %nop12057 = alloca i1, i1 0
- %nop12058 = alloca i1, i1 0
- %nop12059 = alloca i1, i1 0
- %nop12060 = alloca i1, i1 0
- %nop12061 = alloca i1, i1 0
- %nop12062 = alloca i1, i1 0
- %nop12063 = alloca i1, i1 0
- %nop12064 = alloca i1, i1 0
- %nop12065 = alloca i1, i1 0
- %nop12066 = alloca i1, i1 0
- %nop12067 = alloca i1, i1 0
- %nop12068 = alloca i1, i1 0
- %nop12069 = alloca i1, i1 0
- %nop12070 = alloca i1, i1 0
- %nop12071 = alloca i1, i1 0
- %nop12072 = alloca i1, i1 0
- %nop12073 = alloca i1, i1 0
- %nop12074 = alloca i1, i1 0
- %nop12075 = alloca i1, i1 0
- %nop12076 = alloca i1, i1 0
- %nop12077 = alloca i1, i1 0
- %nop12078 = alloca i1, i1 0
- %nop12079 = alloca i1, i1 0
- %nop12080 = alloca i1, i1 0
- %nop12081 = alloca i1, i1 0
- %nop12082 = alloca i1, i1 0
- %nop12083 = alloca i1, i1 0
- %nop12084 = alloca i1, i1 0
- %nop12085 = alloca i1, i1 0
- %nop12086 = alloca i1, i1 0
- %nop12087 = alloca i1, i1 0
- %nop12088 = alloca i1, i1 0
- %nop12089 = alloca i1, i1 0
- %nop12090 = alloca i1, i1 0
- %nop12091 = alloca i1, i1 0
- %nop12092 = alloca i1, i1 0
- %nop12093 = alloca i1, i1 0
- %nop12094 = alloca i1, i1 0
- %nop12095 = alloca i1, i1 0
- %nop12096 = alloca i1, i1 0
- %nop12097 = alloca i1, i1 0
- %nop12098 = alloca i1, i1 0
- %nop12099 = alloca i1, i1 0
- %nop12100 = alloca i1, i1 0
- %nop12101 = alloca i1, i1 0
- %nop12102 = alloca i1, i1 0
- %nop12103 = alloca i1, i1 0
- %nop12104 = alloca i1, i1 0
- %nop12105 = alloca i1, i1 0
- %nop12106 = alloca i1, i1 0
- %nop12107 = alloca i1, i1 0
- %nop12108 = alloca i1, i1 0
- %nop12109 = alloca i1, i1 0
- %nop12110 = alloca i1, i1 0
- %nop12111 = alloca i1, i1 0
- %nop12112 = alloca i1, i1 0
- %nop12113 = alloca i1, i1 0
- %nop12114 = alloca i1, i1 0
- %nop12115 = alloca i1, i1 0
- %nop12116 = alloca i1, i1 0
- %nop12117 = alloca i1, i1 0
- %nop12118 = alloca i1, i1 0
- %nop12119 = alloca i1, i1 0
- %nop12120 = alloca i1, i1 0
- %nop12121 = alloca i1, i1 0
- %nop12122 = alloca i1, i1 0
- %nop12123 = alloca i1, i1 0
- %nop12124 = alloca i1, i1 0
- %nop12125 = alloca i1, i1 0
- %nop12126 = alloca i1, i1 0
- %nop12127 = alloca i1, i1 0
- %nop12128 = alloca i1, i1 0
- %nop12129 = alloca i1, i1 0
- %nop12130 = alloca i1, i1 0
- %nop12131 = alloca i1, i1 0
- %nop12132 = alloca i1, i1 0
- %nop12133 = alloca i1, i1 0
- %nop12134 = alloca i1, i1 0
- %nop12135 = alloca i1, i1 0
- %nop12136 = alloca i1, i1 0
- %nop12137 = alloca i1, i1 0
- %nop12138 = alloca i1, i1 0
- %nop12139 = alloca i1, i1 0
- %nop12140 = alloca i1, i1 0
- %nop12141 = alloca i1, i1 0
- %nop12142 = alloca i1, i1 0
- %nop12143 = alloca i1, i1 0
- %nop12144 = alloca i1, i1 0
- %nop12145 = alloca i1, i1 0
- %nop12146 = alloca i1, i1 0
- %nop12147 = alloca i1, i1 0
- %nop12148 = alloca i1, i1 0
- %nop12149 = alloca i1, i1 0
- %nop12150 = alloca i1, i1 0
- %nop12151 = alloca i1, i1 0
- %nop12152 = alloca i1, i1 0
- %nop12153 = alloca i1, i1 0
- %nop12154 = alloca i1, i1 0
- %nop12155 = alloca i1, i1 0
- %nop12156 = alloca i1, i1 0
- %nop12157 = alloca i1, i1 0
- %nop12158 = alloca i1, i1 0
- %nop12159 = alloca i1, i1 0
- %nop12160 = alloca i1, i1 0
- %nop12161 = alloca i1, i1 0
- %nop12162 = alloca i1, i1 0
- %nop12163 = alloca i1, i1 0
- %nop12164 = alloca i1, i1 0
- %nop12165 = alloca i1, i1 0
- %nop12166 = alloca i1, i1 0
- %nop12167 = alloca i1, i1 0
- %nop12168 = alloca i1, i1 0
- %nop12169 = alloca i1, i1 0
- %nop12170 = alloca i1, i1 0
- %nop12171 = alloca i1, i1 0
- %nop12172 = alloca i1, i1 0
- %nop12173 = alloca i1, i1 0
- %nop12174 = alloca i1, i1 0
- %nop12175 = alloca i1, i1 0
- %nop12176 = alloca i1, i1 0
- %nop12177 = alloca i1, i1 0
- %nop12178 = alloca i1, i1 0
- %nop12179 = alloca i1, i1 0
- %nop12180 = alloca i1, i1 0
- %nop12181 = alloca i1, i1 0
- %nop12182 = alloca i1, i1 0
- %nop12183 = alloca i1, i1 0
- %nop12184 = alloca i1, i1 0
- %nop12185 = alloca i1, i1 0
- %nop12186 = alloca i1, i1 0
- %nop12187 = alloca i1, i1 0
- %nop12188 = alloca i1, i1 0
- %nop12189 = alloca i1, i1 0
- %nop12190 = alloca i1, i1 0
- %nop12191 = alloca i1, i1 0
- %nop12192 = alloca i1, i1 0
- %nop12193 = alloca i1, i1 0
- %nop12194 = alloca i1, i1 0
- %nop12195 = alloca i1, i1 0
- %nop12196 = alloca i1, i1 0
- %nop12197 = alloca i1, i1 0
- %nop12198 = alloca i1, i1 0
- %nop12199 = alloca i1, i1 0
- %nop12200 = alloca i1, i1 0
- %nop12201 = alloca i1, i1 0
- %nop12202 = alloca i1, i1 0
- %nop12203 = alloca i1, i1 0
- %nop12204 = alloca i1, i1 0
- %nop12205 = alloca i1, i1 0
- %nop12206 = alloca i1, i1 0
- %nop12207 = alloca i1, i1 0
- %nop12208 = alloca i1, i1 0
- %nop12209 = alloca i1, i1 0
- %nop12210 = alloca i1, i1 0
- %nop12211 = alloca i1, i1 0
- %nop12212 = alloca i1, i1 0
- %nop12213 = alloca i1, i1 0
- %nop12214 = alloca i1, i1 0
- %nop12215 = alloca i1, i1 0
- %nop12216 = alloca i1, i1 0
- %nop12217 = alloca i1, i1 0
- %nop12218 = alloca i1, i1 0
- %nop12219 = alloca i1, i1 0
- %nop12220 = alloca i1, i1 0
- %nop12221 = alloca i1, i1 0
- %nop12222 = alloca i1, i1 0
- %nop12223 = alloca i1, i1 0
- %nop12224 = alloca i1, i1 0
- %nop12225 = alloca i1, i1 0
- %nop12226 = alloca i1, i1 0
- %nop12227 = alloca i1, i1 0
- %nop12228 = alloca i1, i1 0
- %nop12229 = alloca i1, i1 0
- %nop12230 = alloca i1, i1 0
- %nop12231 = alloca i1, i1 0
- %nop12232 = alloca i1, i1 0
- %nop12233 = alloca i1, i1 0
- %nop12234 = alloca i1, i1 0
- %nop12235 = alloca i1, i1 0
- %nop12236 = alloca i1, i1 0
- %nop12237 = alloca i1, i1 0
- %nop12238 = alloca i1, i1 0
- %nop12239 = alloca i1, i1 0
- %nop12240 = alloca i1, i1 0
- %nop12241 = alloca i1, i1 0
- %nop12242 = alloca i1, i1 0
- %nop12243 = alloca i1, i1 0
- %nop12244 = alloca i1, i1 0
- %nop12245 = alloca i1, i1 0
- %nop12246 = alloca i1, i1 0
- %nop12247 = alloca i1, i1 0
- %nop12248 = alloca i1, i1 0
- %nop12249 = alloca i1, i1 0
- %nop12250 = alloca i1, i1 0
- %nop12251 = alloca i1, i1 0
- %nop12252 = alloca i1, i1 0
- %nop12253 = alloca i1, i1 0
- %nop12254 = alloca i1, i1 0
- %nop12255 = alloca i1, i1 0
- %nop12256 = alloca i1, i1 0
- %nop12257 = alloca i1, i1 0
- %nop12258 = alloca i1, i1 0
- %nop12259 = alloca i1, i1 0
- %nop12260 = alloca i1, i1 0
- %nop12261 = alloca i1, i1 0
- %nop12262 = alloca i1, i1 0
- %nop12263 = alloca i1, i1 0
- %nop12264 = alloca i1, i1 0
- %nop12265 = alloca i1, i1 0
- %nop12266 = alloca i1, i1 0
- %nop12267 = alloca i1, i1 0
- %nop12268 = alloca i1, i1 0
- %nop12269 = alloca i1, i1 0
- %nop12270 = alloca i1, i1 0
- %nop12271 = alloca i1, i1 0
- %nop12272 = alloca i1, i1 0
- %nop12273 = alloca i1, i1 0
- %nop12274 = alloca i1, i1 0
- %nop12275 = alloca i1, i1 0
- %nop12276 = alloca i1, i1 0
- %nop12277 = alloca i1, i1 0
- %nop12278 = alloca i1, i1 0
- %nop12279 = alloca i1, i1 0
- %nop12280 = alloca i1, i1 0
- %nop12281 = alloca i1, i1 0
- %nop12282 = alloca i1, i1 0
- %nop12283 = alloca i1, i1 0
- %nop12284 = alloca i1, i1 0
- %nop12285 = alloca i1, i1 0
- %nop12286 = alloca i1, i1 0
- %nop12287 = alloca i1, i1 0
- %nop12288 = alloca i1, i1 0
- %nop12289 = alloca i1, i1 0
- %nop12290 = alloca i1, i1 0
- %nop12291 = alloca i1, i1 0
- %nop12292 = alloca i1, i1 0
- %nop12293 = alloca i1, i1 0
- %nop12294 = alloca i1, i1 0
- %nop12295 = alloca i1, i1 0
- %nop12296 = alloca i1, i1 0
- %nop12297 = alloca i1, i1 0
- %nop12298 = alloca i1, i1 0
- %nop12299 = alloca i1, i1 0
- %nop12300 = alloca i1, i1 0
- %nop12301 = alloca i1, i1 0
- %nop12302 = alloca i1, i1 0
- %nop12303 = alloca i1, i1 0
- %nop12304 = alloca i1, i1 0
- %nop12305 = alloca i1, i1 0
- %nop12306 = alloca i1, i1 0
- %nop12307 = alloca i1, i1 0
- %nop12308 = alloca i1, i1 0
- %nop12309 = alloca i1, i1 0
- %nop12310 = alloca i1, i1 0
- %nop12311 = alloca i1, i1 0
- %nop12312 = alloca i1, i1 0
- %nop12313 = alloca i1, i1 0
- %nop12314 = alloca i1, i1 0
- %nop12315 = alloca i1, i1 0
- %nop12316 = alloca i1, i1 0
- %nop12317 = alloca i1, i1 0
- %nop12318 = alloca i1, i1 0
- %nop12319 = alloca i1, i1 0
- %nop12320 = alloca i1, i1 0
- %nop12321 = alloca i1, i1 0
- %nop12322 = alloca i1, i1 0
- %nop12323 = alloca i1, i1 0
- %nop12324 = alloca i1, i1 0
- %nop12325 = alloca i1, i1 0
- %nop12326 = alloca i1, i1 0
- %nop12327 = alloca i1, i1 0
- %nop12328 = alloca i1, i1 0
- %nop12329 = alloca i1, i1 0
- %nop12330 = alloca i1, i1 0
- %nop12331 = alloca i1, i1 0
- %nop12332 = alloca i1, i1 0
- %nop12333 = alloca i1, i1 0
- %nop12334 = alloca i1, i1 0
- %nop12335 = alloca i1, i1 0
- %nop12336 = alloca i1, i1 0
- %nop12337 = alloca i1, i1 0
- %nop12338 = alloca i1, i1 0
- %nop12339 = alloca i1, i1 0
- %nop12340 = alloca i1, i1 0
- %nop12341 = alloca i1, i1 0
- %nop12342 = alloca i1, i1 0
- %nop12343 = alloca i1, i1 0
- %nop12344 = alloca i1, i1 0
- %nop12345 = alloca i1, i1 0
- %nop12346 = alloca i1, i1 0
- %nop12347 = alloca i1, i1 0
- %nop12348 = alloca i1, i1 0
- %nop12349 = alloca i1, i1 0
- %nop12350 = alloca i1, i1 0
- %nop12351 = alloca i1, i1 0
- %nop12352 = alloca i1, i1 0
- %nop12353 = alloca i1, i1 0
- %nop12354 = alloca i1, i1 0
- %nop12355 = alloca i1, i1 0
- %nop12356 = alloca i1, i1 0
- %nop12357 = alloca i1, i1 0
- %nop12358 = alloca i1, i1 0
- %nop12359 = alloca i1, i1 0
- %nop12360 = alloca i1, i1 0
- %nop12361 = alloca i1, i1 0
- %nop12362 = alloca i1, i1 0
- %nop12363 = alloca i1, i1 0
- %nop12364 = alloca i1, i1 0
- %nop12365 = alloca i1, i1 0
- %nop12366 = alloca i1, i1 0
- %nop12367 = alloca i1, i1 0
- %nop12368 = alloca i1, i1 0
- %nop12369 = alloca i1, i1 0
- %nop12370 = alloca i1, i1 0
- %nop12371 = alloca i1, i1 0
- %nop12372 = alloca i1, i1 0
- %nop12373 = alloca i1, i1 0
- %nop12374 = alloca i1, i1 0
- %nop12375 = alloca i1, i1 0
- %nop12376 = alloca i1, i1 0
- %nop12377 = alloca i1, i1 0
- %nop12378 = alloca i1, i1 0
- %nop12379 = alloca i1, i1 0
- %nop12380 = alloca i1, i1 0
- %nop12381 = alloca i1, i1 0
- %nop12382 = alloca i1, i1 0
- %nop12383 = alloca i1, i1 0
- %nop12384 = alloca i1, i1 0
- %nop12385 = alloca i1, i1 0
- %nop12386 = alloca i1, i1 0
- %nop12387 = alloca i1, i1 0
- %nop12388 = alloca i1, i1 0
- %nop12389 = alloca i1, i1 0
- %nop12390 = alloca i1, i1 0
- %nop12391 = alloca i1, i1 0
- %nop12392 = alloca i1, i1 0
- %nop12393 = alloca i1, i1 0
- %nop12394 = alloca i1, i1 0
- %nop12395 = alloca i1, i1 0
- %nop12396 = alloca i1, i1 0
- %nop12397 = alloca i1, i1 0
- %nop12398 = alloca i1, i1 0
- %nop12399 = alloca i1, i1 0
- %nop12400 = alloca i1, i1 0
- %nop12401 = alloca i1, i1 0
- %nop12402 = alloca i1, i1 0
- %nop12403 = alloca i1, i1 0
- %nop12404 = alloca i1, i1 0
- %nop12405 = alloca i1, i1 0
- %nop12406 = alloca i1, i1 0
- %nop12407 = alloca i1, i1 0
- %nop12408 = alloca i1, i1 0
- %nop12409 = alloca i1, i1 0
- %nop12410 = alloca i1, i1 0
- %nop12411 = alloca i1, i1 0
- %nop12412 = alloca i1, i1 0
- %nop12413 = alloca i1, i1 0
- %nop12414 = alloca i1, i1 0
- %nop12415 = alloca i1, i1 0
- %nop12416 = alloca i1, i1 0
- %nop12417 = alloca i1, i1 0
- %nop12418 = alloca i1, i1 0
- %nop12419 = alloca i1, i1 0
- %nop12420 = alloca i1, i1 0
- %nop12421 = alloca i1, i1 0
- %nop12422 = alloca i1, i1 0
- %nop12423 = alloca i1, i1 0
- %nop12424 = alloca i1, i1 0
- %nop12425 = alloca i1, i1 0
- %nop12426 = alloca i1, i1 0
- %nop12427 = alloca i1, i1 0
- %nop12428 = alloca i1, i1 0
- %nop12429 = alloca i1, i1 0
- %nop12430 = alloca i1, i1 0
- %nop12431 = alloca i1, i1 0
- %nop12432 = alloca i1, i1 0
- %nop12433 = alloca i1, i1 0
- %nop12434 = alloca i1, i1 0
- %nop12435 = alloca i1, i1 0
- %nop12436 = alloca i1, i1 0
- %nop12437 = alloca i1, i1 0
- %nop12438 = alloca i1, i1 0
- %nop12439 = alloca i1, i1 0
- %nop12440 = alloca i1, i1 0
- %nop12441 = alloca i1, i1 0
- %nop12442 = alloca i1, i1 0
- %nop12443 = alloca i1, i1 0
- %nop12444 = alloca i1, i1 0
- %nop12445 = alloca i1, i1 0
- %nop12446 = alloca i1, i1 0
- %nop12447 = alloca i1, i1 0
- %nop12448 = alloca i1, i1 0
- %nop12449 = alloca i1, i1 0
- %nop12450 = alloca i1, i1 0
- %nop12451 = alloca i1, i1 0
- %nop12452 = alloca i1, i1 0
- %nop12453 = alloca i1, i1 0
- %nop12454 = alloca i1, i1 0
- %nop12455 = alloca i1, i1 0
- %nop12456 = alloca i1, i1 0
- %nop12457 = alloca i1, i1 0
- %nop12458 = alloca i1, i1 0
- %nop12459 = alloca i1, i1 0
- %nop12460 = alloca i1, i1 0
- %nop12461 = alloca i1, i1 0
- %nop12462 = alloca i1, i1 0
- %nop12463 = alloca i1, i1 0
- %nop12464 = alloca i1, i1 0
- %nop12465 = alloca i1, i1 0
- %nop12466 = alloca i1, i1 0
- %nop12467 = alloca i1, i1 0
- %nop12468 = alloca i1, i1 0
- %nop12469 = alloca i1, i1 0
- %nop12470 = alloca i1, i1 0
- %nop12471 = alloca i1, i1 0
- %nop12472 = alloca i1, i1 0
- %nop12473 = alloca i1, i1 0
- %nop12474 = alloca i1, i1 0
- %nop12475 = alloca i1, i1 0
- %nop12476 = alloca i1, i1 0
- %nop12477 = alloca i1, i1 0
- %nop12478 = alloca i1, i1 0
- %nop12479 = alloca i1, i1 0
- %nop12480 = alloca i1, i1 0
- %nop12481 = alloca i1, i1 0
- %nop12482 = alloca i1, i1 0
- %nop12483 = alloca i1, i1 0
- %nop12484 = alloca i1, i1 0
- %nop12485 = alloca i1, i1 0
- %nop12486 = alloca i1, i1 0
- %nop12487 = alloca i1, i1 0
- %nop12488 = alloca i1, i1 0
- %nop12489 = alloca i1, i1 0
- %nop12490 = alloca i1, i1 0
- %nop12491 = alloca i1, i1 0
- %nop12492 = alloca i1, i1 0
- %nop12493 = alloca i1, i1 0
- %nop12494 = alloca i1, i1 0
- %nop12495 = alloca i1, i1 0
- %nop12496 = alloca i1, i1 0
- %nop12497 = alloca i1, i1 0
- %nop12498 = alloca i1, i1 0
- %nop12499 = alloca i1, i1 0
- %nop12500 = alloca i1, i1 0
- %nop12501 = alloca i1, i1 0
- %nop12502 = alloca i1, i1 0
- %nop12503 = alloca i1, i1 0
- %nop12504 = alloca i1, i1 0
- %nop12505 = alloca i1, i1 0
- %nop12506 = alloca i1, i1 0
- %nop12507 = alloca i1, i1 0
- %nop12508 = alloca i1, i1 0
- %nop12509 = alloca i1, i1 0
- %nop12510 = alloca i1, i1 0
- %nop12511 = alloca i1, i1 0
- %nop12512 = alloca i1, i1 0
- %nop12513 = alloca i1, i1 0
- %nop12514 = alloca i1, i1 0
- %nop12515 = alloca i1, i1 0
- %nop12516 = alloca i1, i1 0
- %nop12517 = alloca i1, i1 0
- %nop12518 = alloca i1, i1 0
- %nop12519 = alloca i1, i1 0
- %nop12520 = alloca i1, i1 0
- %nop12521 = alloca i1, i1 0
- %nop12522 = alloca i1, i1 0
- %nop12523 = alloca i1, i1 0
- %nop12524 = alloca i1, i1 0
- %nop12525 = alloca i1, i1 0
- %nop12526 = alloca i1, i1 0
- %nop12527 = alloca i1, i1 0
- %nop12528 = alloca i1, i1 0
- %nop12529 = alloca i1, i1 0
- %nop12530 = alloca i1, i1 0
- %nop12531 = alloca i1, i1 0
- %nop12532 = alloca i1, i1 0
- %nop12533 = alloca i1, i1 0
- %nop12534 = alloca i1, i1 0
- %nop12535 = alloca i1, i1 0
- %nop12536 = alloca i1, i1 0
- %nop12537 = alloca i1, i1 0
- %nop12538 = alloca i1, i1 0
- %nop12539 = alloca i1, i1 0
- %nop12540 = alloca i1, i1 0
- %nop12541 = alloca i1, i1 0
- %nop12542 = alloca i1, i1 0
- %nop12543 = alloca i1, i1 0
- %nop12544 = alloca i1, i1 0
- %nop12545 = alloca i1, i1 0
- %nop12546 = alloca i1, i1 0
- %nop12547 = alloca i1, i1 0
- %nop12548 = alloca i1, i1 0
- %nop12549 = alloca i1, i1 0
- %nop12550 = alloca i1, i1 0
- %nop12551 = alloca i1, i1 0
- %nop12552 = alloca i1, i1 0
- %nop12553 = alloca i1, i1 0
- %nop12554 = alloca i1, i1 0
- %nop12555 = alloca i1, i1 0
- %nop12556 = alloca i1, i1 0
- %nop12557 = alloca i1, i1 0
- %nop12558 = alloca i1, i1 0
- %nop12559 = alloca i1, i1 0
- %nop12560 = alloca i1, i1 0
- %nop12561 = alloca i1, i1 0
- %nop12562 = alloca i1, i1 0
- %nop12563 = alloca i1, i1 0
- %nop12564 = alloca i1, i1 0
- %nop12565 = alloca i1, i1 0
- %nop12566 = alloca i1, i1 0
- %nop12567 = alloca i1, i1 0
- %nop12568 = alloca i1, i1 0
- %nop12569 = alloca i1, i1 0
- %nop12570 = alloca i1, i1 0
- %nop12571 = alloca i1, i1 0
- %nop12572 = alloca i1, i1 0
- %nop12573 = alloca i1, i1 0
- %nop12574 = alloca i1, i1 0
- %nop12575 = alloca i1, i1 0
- %nop12576 = alloca i1, i1 0
- %nop12577 = alloca i1, i1 0
- %nop12578 = alloca i1, i1 0
- %nop12579 = alloca i1, i1 0
- %nop12580 = alloca i1, i1 0
- %nop12581 = alloca i1, i1 0
- %nop12582 = alloca i1, i1 0
- %nop12583 = alloca i1, i1 0
- %nop12584 = alloca i1, i1 0
- %nop12585 = alloca i1, i1 0
- %nop12586 = alloca i1, i1 0
- %nop12587 = alloca i1, i1 0
- %nop12588 = alloca i1, i1 0
- %nop12589 = alloca i1, i1 0
- %nop12590 = alloca i1, i1 0
- %nop12591 = alloca i1, i1 0
- %nop12592 = alloca i1, i1 0
- %nop12593 = alloca i1, i1 0
- %nop12594 = alloca i1, i1 0
- %nop12595 = alloca i1, i1 0
- %nop12596 = alloca i1, i1 0
- %nop12597 = alloca i1, i1 0
- %nop12598 = alloca i1, i1 0
- %nop12599 = alloca i1, i1 0
- %nop12600 = alloca i1, i1 0
- %nop12601 = alloca i1, i1 0
- %nop12602 = alloca i1, i1 0
- %nop12603 = alloca i1, i1 0
- %nop12604 = alloca i1, i1 0
- %nop12605 = alloca i1, i1 0
- %nop12606 = alloca i1, i1 0
- %nop12607 = alloca i1, i1 0
- %nop12608 = alloca i1, i1 0
- %nop12609 = alloca i1, i1 0
- %nop12610 = alloca i1, i1 0
- %nop12611 = alloca i1, i1 0
- %nop12612 = alloca i1, i1 0
- %nop12613 = alloca i1, i1 0
- %nop12614 = alloca i1, i1 0
- %nop12615 = alloca i1, i1 0
- %nop12616 = alloca i1, i1 0
- %nop12617 = alloca i1, i1 0
- %nop12618 = alloca i1, i1 0
- %nop12619 = alloca i1, i1 0
- %nop12620 = alloca i1, i1 0
- %nop12621 = alloca i1, i1 0
- %nop12622 = alloca i1, i1 0
- %nop12623 = alloca i1, i1 0
- %nop12624 = alloca i1, i1 0
- %nop12625 = alloca i1, i1 0
- %nop12626 = alloca i1, i1 0
- %nop12627 = alloca i1, i1 0
- %nop12628 = alloca i1, i1 0
- %nop12629 = alloca i1, i1 0
- %nop12630 = alloca i1, i1 0
- %nop12631 = alloca i1, i1 0
- %nop12632 = alloca i1, i1 0
- %nop12633 = alloca i1, i1 0
- %nop12634 = alloca i1, i1 0
- %nop12635 = alloca i1, i1 0
- %nop12636 = alloca i1, i1 0
- %nop12637 = alloca i1, i1 0
- %nop12638 = alloca i1, i1 0
- %nop12639 = alloca i1, i1 0
- %nop12640 = alloca i1, i1 0
- %nop12641 = alloca i1, i1 0
- %nop12642 = alloca i1, i1 0
- %nop12643 = alloca i1, i1 0
- %nop12644 = alloca i1, i1 0
- %nop12645 = alloca i1, i1 0
- %nop12646 = alloca i1, i1 0
- %nop12647 = alloca i1, i1 0
- %nop12648 = alloca i1, i1 0
- %nop12649 = alloca i1, i1 0
- %nop12650 = alloca i1, i1 0
- %nop12651 = alloca i1, i1 0
- %nop12652 = alloca i1, i1 0
- %nop12653 = alloca i1, i1 0
- %nop12654 = alloca i1, i1 0
- %nop12655 = alloca i1, i1 0
- %nop12656 = alloca i1, i1 0
- %nop12657 = alloca i1, i1 0
- %nop12658 = alloca i1, i1 0
- %nop12659 = alloca i1, i1 0
- %nop12660 = alloca i1, i1 0
- %nop12661 = alloca i1, i1 0
- %nop12662 = alloca i1, i1 0
- %nop12663 = alloca i1, i1 0
- %nop12664 = alloca i1, i1 0
- %nop12665 = alloca i1, i1 0
- %nop12666 = alloca i1, i1 0
- %nop12667 = alloca i1, i1 0
- %nop12668 = alloca i1, i1 0
- %nop12669 = alloca i1, i1 0
- %nop12670 = alloca i1, i1 0
- %nop12671 = alloca i1, i1 0
- %nop12672 = alloca i1, i1 0
- %nop12673 = alloca i1, i1 0
- %nop12674 = alloca i1, i1 0
- %nop12675 = alloca i1, i1 0
- %nop12676 = alloca i1, i1 0
- %nop12677 = alloca i1, i1 0
- %nop12678 = alloca i1, i1 0
- %nop12679 = alloca i1, i1 0
- %nop12680 = alloca i1, i1 0
- %nop12681 = alloca i1, i1 0
- %nop12682 = alloca i1, i1 0
- %nop12683 = alloca i1, i1 0
- %nop12684 = alloca i1, i1 0
- %nop12685 = alloca i1, i1 0
- %nop12686 = alloca i1, i1 0
- %nop12687 = alloca i1, i1 0
- %nop12688 = alloca i1, i1 0
- %nop12689 = alloca i1, i1 0
- %nop12690 = alloca i1, i1 0
- %nop12691 = alloca i1, i1 0
- %nop12692 = alloca i1, i1 0
- %nop12693 = alloca i1, i1 0
- %nop12694 = alloca i1, i1 0
- %nop12695 = alloca i1, i1 0
- %nop12696 = alloca i1, i1 0
- %nop12697 = alloca i1, i1 0
- %nop12698 = alloca i1, i1 0
- %nop12699 = alloca i1, i1 0
- %nop12700 = alloca i1, i1 0
- %nop12701 = alloca i1, i1 0
- %nop12702 = alloca i1, i1 0
- %nop12703 = alloca i1, i1 0
- %nop12704 = alloca i1, i1 0
- %nop12705 = alloca i1, i1 0
- %nop12706 = alloca i1, i1 0
- %nop12707 = alloca i1, i1 0
- %nop12708 = alloca i1, i1 0
- %nop12709 = alloca i1, i1 0
- %nop12710 = alloca i1, i1 0
- %nop12711 = alloca i1, i1 0
- %nop12712 = alloca i1, i1 0
- %nop12713 = alloca i1, i1 0
- %nop12714 = alloca i1, i1 0
- %nop12715 = alloca i1, i1 0
- %nop12716 = alloca i1, i1 0
- %nop12717 = alloca i1, i1 0
- %nop12718 = alloca i1, i1 0
- %nop12719 = alloca i1, i1 0
- %nop12720 = alloca i1, i1 0
- %nop12721 = alloca i1, i1 0
- %nop12722 = alloca i1, i1 0
- %nop12723 = alloca i1, i1 0
- %nop12724 = alloca i1, i1 0
- %nop12725 = alloca i1, i1 0
- %nop12726 = alloca i1, i1 0
- %nop12727 = alloca i1, i1 0
- %nop12728 = alloca i1, i1 0
- %nop12729 = alloca i1, i1 0
- %nop12730 = alloca i1, i1 0
- %nop12731 = alloca i1, i1 0
- %nop12732 = alloca i1, i1 0
- %nop12733 = alloca i1, i1 0
- %nop12734 = alloca i1, i1 0
- %nop12735 = alloca i1, i1 0
- %nop12736 = alloca i1, i1 0
- %nop12737 = alloca i1, i1 0
- %nop12738 = alloca i1, i1 0
- %nop12739 = alloca i1, i1 0
- %nop12740 = alloca i1, i1 0
- %nop12741 = alloca i1, i1 0
- %nop12742 = alloca i1, i1 0
- %nop12743 = alloca i1, i1 0
- %nop12744 = alloca i1, i1 0
- %nop12745 = alloca i1, i1 0
- %nop12746 = alloca i1, i1 0
- %nop12747 = alloca i1, i1 0
- %nop12748 = alloca i1, i1 0
- %nop12749 = alloca i1, i1 0
- %nop12750 = alloca i1, i1 0
- %nop12751 = alloca i1, i1 0
- %nop12752 = alloca i1, i1 0
- %nop12753 = alloca i1, i1 0
- %nop12754 = alloca i1, i1 0
- %nop12755 = alloca i1, i1 0
- %nop12756 = alloca i1, i1 0
- %nop12757 = alloca i1, i1 0
- %nop12758 = alloca i1, i1 0
- %nop12759 = alloca i1, i1 0
- %nop12760 = alloca i1, i1 0
- %nop12761 = alloca i1, i1 0
- %nop12762 = alloca i1, i1 0
- %nop12763 = alloca i1, i1 0
- %nop12764 = alloca i1, i1 0
- %nop12765 = alloca i1, i1 0
- %nop12766 = alloca i1, i1 0
- %nop12767 = alloca i1, i1 0
- %nop12768 = alloca i1, i1 0
- %nop12769 = alloca i1, i1 0
- %nop12770 = alloca i1, i1 0
- %nop12771 = alloca i1, i1 0
- %nop12772 = alloca i1, i1 0
- %nop12773 = alloca i1, i1 0
- %nop12774 = alloca i1, i1 0
- %nop12775 = alloca i1, i1 0
- %nop12776 = alloca i1, i1 0
- %nop12777 = alloca i1, i1 0
- %nop12778 = alloca i1, i1 0
- %nop12779 = alloca i1, i1 0
- %nop12780 = alloca i1, i1 0
- %nop12781 = alloca i1, i1 0
- %nop12782 = alloca i1, i1 0
- %nop12783 = alloca i1, i1 0
- %nop12784 = alloca i1, i1 0
- %nop12785 = alloca i1, i1 0
- %nop12786 = alloca i1, i1 0
- %nop12787 = alloca i1, i1 0
- %nop12788 = alloca i1, i1 0
- %nop12789 = alloca i1, i1 0
- %nop12790 = alloca i1, i1 0
- %nop12791 = alloca i1, i1 0
- %nop12792 = alloca i1, i1 0
- %nop12793 = alloca i1, i1 0
- %nop12794 = alloca i1, i1 0
- %nop12795 = alloca i1, i1 0
- %nop12796 = alloca i1, i1 0
- %nop12797 = alloca i1, i1 0
- %nop12798 = alloca i1, i1 0
- %nop12799 = alloca i1, i1 0
- %nop12800 = alloca i1, i1 0
- %nop12801 = alloca i1, i1 0
- %nop12802 = alloca i1, i1 0
- %nop12803 = alloca i1, i1 0
- %nop12804 = alloca i1, i1 0
- %nop12805 = alloca i1, i1 0
- %nop12806 = alloca i1, i1 0
- %nop12807 = alloca i1, i1 0
- %nop12808 = alloca i1, i1 0
- %nop12809 = alloca i1, i1 0
- %nop12810 = alloca i1, i1 0
- %nop12811 = alloca i1, i1 0
- %nop12812 = alloca i1, i1 0
- %nop12813 = alloca i1, i1 0
- %nop12814 = alloca i1, i1 0
- %nop12815 = alloca i1, i1 0
- %nop12816 = alloca i1, i1 0
- %nop12817 = alloca i1, i1 0
- %nop12818 = alloca i1, i1 0
- %nop12819 = alloca i1, i1 0
- %nop12820 = alloca i1, i1 0
- %nop12821 = alloca i1, i1 0
- %nop12822 = alloca i1, i1 0
- %nop12823 = alloca i1, i1 0
- %nop12824 = alloca i1, i1 0
- %nop12825 = alloca i1, i1 0
- %nop12826 = alloca i1, i1 0
- %nop12827 = alloca i1, i1 0
- %nop12828 = alloca i1, i1 0
- %nop12829 = alloca i1, i1 0
- %nop12830 = alloca i1, i1 0
- %nop12831 = alloca i1, i1 0
- %nop12832 = alloca i1, i1 0
- %nop12833 = alloca i1, i1 0
- %nop12834 = alloca i1, i1 0
- %nop12835 = alloca i1, i1 0
- %nop12836 = alloca i1, i1 0
- %nop12837 = alloca i1, i1 0
- %nop12838 = alloca i1, i1 0
- %nop12839 = alloca i1, i1 0
- %nop12840 = alloca i1, i1 0
- %nop12841 = alloca i1, i1 0
- %nop12842 = alloca i1, i1 0
- %nop12843 = alloca i1, i1 0
- %nop12844 = alloca i1, i1 0
- %nop12845 = alloca i1, i1 0
- %nop12846 = alloca i1, i1 0
- %nop12847 = alloca i1, i1 0
- %nop12848 = alloca i1, i1 0
- %nop12849 = alloca i1, i1 0
- %nop12850 = alloca i1, i1 0
- %nop12851 = alloca i1, i1 0
- %nop12852 = alloca i1, i1 0
- %nop12853 = alloca i1, i1 0
- %nop12854 = alloca i1, i1 0
- %nop12855 = alloca i1, i1 0
- %nop12856 = alloca i1, i1 0
- %nop12857 = alloca i1, i1 0
- %nop12858 = alloca i1, i1 0
- %nop12859 = alloca i1, i1 0
- %nop12860 = alloca i1, i1 0
- %nop12861 = alloca i1, i1 0
- %nop12862 = alloca i1, i1 0
- %nop12863 = alloca i1, i1 0
- %nop12864 = alloca i1, i1 0
- %nop12865 = alloca i1, i1 0
- %nop12866 = alloca i1, i1 0
- %nop12867 = alloca i1, i1 0
- %nop12868 = alloca i1, i1 0
- %nop12869 = alloca i1, i1 0
- %nop12870 = alloca i1, i1 0
- %nop12871 = alloca i1, i1 0
- %nop12872 = alloca i1, i1 0
- %nop12873 = alloca i1, i1 0
- %nop12874 = alloca i1, i1 0
- %nop12875 = alloca i1, i1 0
- %nop12876 = alloca i1, i1 0
- %nop12877 = alloca i1, i1 0
- %nop12878 = alloca i1, i1 0
- %nop12879 = alloca i1, i1 0
- %nop12880 = alloca i1, i1 0
- %nop12881 = alloca i1, i1 0
- %nop12882 = alloca i1, i1 0
- %nop12883 = alloca i1, i1 0
- %nop12884 = alloca i1, i1 0
- %nop12885 = alloca i1, i1 0
- %nop12886 = alloca i1, i1 0
- %nop12887 = alloca i1, i1 0
- %nop12888 = alloca i1, i1 0
- %nop12889 = alloca i1, i1 0
- %nop12890 = alloca i1, i1 0
- %nop12891 = alloca i1, i1 0
- %nop12892 = alloca i1, i1 0
- %nop12893 = alloca i1, i1 0
- %nop12894 = alloca i1, i1 0
- %nop12895 = alloca i1, i1 0
- %nop12896 = alloca i1, i1 0
- %nop12897 = alloca i1, i1 0
- %nop12898 = alloca i1, i1 0
- %nop12899 = alloca i1, i1 0
- %nop12900 = alloca i1, i1 0
- %nop12901 = alloca i1, i1 0
- %nop12902 = alloca i1, i1 0
- %nop12903 = alloca i1, i1 0
- %nop12904 = alloca i1, i1 0
- %nop12905 = alloca i1, i1 0
- %nop12906 = alloca i1, i1 0
- %nop12907 = alloca i1, i1 0
- %nop12908 = alloca i1, i1 0
- %nop12909 = alloca i1, i1 0
- %nop12910 = alloca i1, i1 0
- %nop12911 = alloca i1, i1 0
- %nop12912 = alloca i1, i1 0
- %nop12913 = alloca i1, i1 0
- %nop12914 = alloca i1, i1 0
- %nop12915 = alloca i1, i1 0
- %nop12916 = alloca i1, i1 0
- %nop12917 = alloca i1, i1 0
- %nop12918 = alloca i1, i1 0
- %nop12919 = alloca i1, i1 0
- %nop12920 = alloca i1, i1 0
- %nop12921 = alloca i1, i1 0
- %nop12922 = alloca i1, i1 0
- %nop12923 = alloca i1, i1 0
- %nop12924 = alloca i1, i1 0
- %nop12925 = alloca i1, i1 0
- %nop12926 = alloca i1, i1 0
- %nop12927 = alloca i1, i1 0
- %nop12928 = alloca i1, i1 0
- %nop12929 = alloca i1, i1 0
- %nop12930 = alloca i1, i1 0
- %nop12931 = alloca i1, i1 0
- %nop12932 = alloca i1, i1 0
- %nop12933 = alloca i1, i1 0
- %nop12934 = alloca i1, i1 0
- %nop12935 = alloca i1, i1 0
- %nop12936 = alloca i1, i1 0
- %nop12937 = alloca i1, i1 0
- %nop12938 = alloca i1, i1 0
- %nop12939 = alloca i1, i1 0
- %nop12940 = alloca i1, i1 0
- %nop12941 = alloca i1, i1 0
- %nop12942 = alloca i1, i1 0
- %nop12943 = alloca i1, i1 0
- %nop12944 = alloca i1, i1 0
- %nop12945 = alloca i1, i1 0
- %nop12946 = alloca i1, i1 0
- %nop12947 = alloca i1, i1 0
- %nop12948 = alloca i1, i1 0
- %nop12949 = alloca i1, i1 0
- %nop12950 = alloca i1, i1 0
- %nop12951 = alloca i1, i1 0
- %nop12952 = alloca i1, i1 0
- %nop12953 = alloca i1, i1 0
- %nop12954 = alloca i1, i1 0
- %nop12955 = alloca i1, i1 0
- %nop12956 = alloca i1, i1 0
- %nop12957 = alloca i1, i1 0
- %nop12958 = alloca i1, i1 0
- %nop12959 = alloca i1, i1 0
- %nop12960 = alloca i1, i1 0
- %nop12961 = alloca i1, i1 0
- %nop12962 = alloca i1, i1 0
- %nop12963 = alloca i1, i1 0
- %nop12964 = alloca i1, i1 0
- %nop12965 = alloca i1, i1 0
- %nop12966 = alloca i1, i1 0
- %nop12967 = alloca i1, i1 0
- %nop12968 = alloca i1, i1 0
- %nop12969 = alloca i1, i1 0
- %nop12970 = alloca i1, i1 0
- %nop12971 = alloca i1, i1 0
- %nop12972 = alloca i1, i1 0
- %nop12973 = alloca i1, i1 0
- %nop12974 = alloca i1, i1 0
- %nop12975 = alloca i1, i1 0
- %nop12976 = alloca i1, i1 0
- %nop12977 = alloca i1, i1 0
- %nop12978 = alloca i1, i1 0
- %nop12979 = alloca i1, i1 0
- %nop12980 = alloca i1, i1 0
- %nop12981 = alloca i1, i1 0
- %nop12982 = alloca i1, i1 0
- %nop12983 = alloca i1, i1 0
- %nop12984 = alloca i1, i1 0
- %nop12985 = alloca i1, i1 0
- %nop12986 = alloca i1, i1 0
- %nop12987 = alloca i1, i1 0
- %nop12988 = alloca i1, i1 0
- %nop12989 = alloca i1, i1 0
- %nop12990 = alloca i1, i1 0
- %nop12991 = alloca i1, i1 0
- %nop12992 = alloca i1, i1 0
- %nop12993 = alloca i1, i1 0
- %nop12994 = alloca i1, i1 0
- %nop12995 = alloca i1, i1 0
- %nop12996 = alloca i1, i1 0
- %nop12997 = alloca i1, i1 0
- %nop12998 = alloca i1, i1 0
- %nop12999 = alloca i1, i1 0
- %nop13000 = alloca i1, i1 0
- %nop13001 = alloca i1, i1 0
- %nop13002 = alloca i1, i1 0
- %nop13003 = alloca i1, i1 0
- %nop13004 = alloca i1, i1 0
- %nop13005 = alloca i1, i1 0
- %nop13006 = alloca i1, i1 0
- %nop13007 = alloca i1, i1 0
- %nop13008 = alloca i1, i1 0
- %nop13009 = alloca i1, i1 0
- %nop13010 = alloca i1, i1 0
- %nop13011 = alloca i1, i1 0
- %nop13012 = alloca i1, i1 0
- %nop13013 = alloca i1, i1 0
- %nop13014 = alloca i1, i1 0
- %nop13015 = alloca i1, i1 0
- %nop13016 = alloca i1, i1 0
- %nop13017 = alloca i1, i1 0
- %nop13018 = alloca i1, i1 0
- %nop13019 = alloca i1, i1 0
- %nop13020 = alloca i1, i1 0
- %nop13021 = alloca i1, i1 0
- %nop13022 = alloca i1, i1 0
- %nop13023 = alloca i1, i1 0
- %nop13024 = alloca i1, i1 0
- %nop13025 = alloca i1, i1 0
- %nop13026 = alloca i1, i1 0
- %nop13027 = alloca i1, i1 0
- %nop13028 = alloca i1, i1 0
- %nop13029 = alloca i1, i1 0
- %nop13030 = alloca i1, i1 0
- %nop13031 = alloca i1, i1 0
- %nop13032 = alloca i1, i1 0
- %nop13033 = alloca i1, i1 0
- %nop13034 = alloca i1, i1 0
- %nop13035 = alloca i1, i1 0
- %nop13036 = alloca i1, i1 0
- %nop13037 = alloca i1, i1 0
- %nop13038 = alloca i1, i1 0
- %nop13039 = alloca i1, i1 0
- %nop13040 = alloca i1, i1 0
- %nop13041 = alloca i1, i1 0
- %nop13042 = alloca i1, i1 0
- %nop13043 = alloca i1, i1 0
- %nop13044 = alloca i1, i1 0
- %nop13045 = alloca i1, i1 0
- %nop13046 = alloca i1, i1 0
- %nop13047 = alloca i1, i1 0
- %nop13048 = alloca i1, i1 0
- %nop13049 = alloca i1, i1 0
- %nop13050 = alloca i1, i1 0
- %nop13051 = alloca i1, i1 0
- %nop13052 = alloca i1, i1 0
- %nop13053 = alloca i1, i1 0
- %nop13054 = alloca i1, i1 0
- %nop13055 = alloca i1, i1 0
- %nop13056 = alloca i1, i1 0
- %nop13057 = alloca i1, i1 0
- %nop13058 = alloca i1, i1 0
- %nop13059 = alloca i1, i1 0
- %nop13060 = alloca i1, i1 0
- %nop13061 = alloca i1, i1 0
- %nop13062 = alloca i1, i1 0
- %nop13063 = alloca i1, i1 0
- %nop13064 = alloca i1, i1 0
- %nop13065 = alloca i1, i1 0
- %nop13066 = alloca i1, i1 0
- %nop13067 = alloca i1, i1 0
- %nop13068 = alloca i1, i1 0
- %nop13069 = alloca i1, i1 0
- %nop13070 = alloca i1, i1 0
- %nop13071 = alloca i1, i1 0
- %nop13072 = alloca i1, i1 0
- %nop13073 = alloca i1, i1 0
- %nop13074 = alloca i1, i1 0
- %nop13075 = alloca i1, i1 0
- %nop13076 = alloca i1, i1 0
- %nop13077 = alloca i1, i1 0
- %nop13078 = alloca i1, i1 0
- %nop13079 = alloca i1, i1 0
- %nop13080 = alloca i1, i1 0
- %nop13081 = alloca i1, i1 0
- %nop13082 = alloca i1, i1 0
- %nop13083 = alloca i1, i1 0
- %nop13084 = alloca i1, i1 0
- %nop13085 = alloca i1, i1 0
- %nop13086 = alloca i1, i1 0
- %nop13087 = alloca i1, i1 0
- %nop13088 = alloca i1, i1 0
- %nop13089 = alloca i1, i1 0
- %nop13090 = alloca i1, i1 0
- %nop13091 = alloca i1, i1 0
- %nop13092 = alloca i1, i1 0
- %nop13093 = alloca i1, i1 0
- %nop13094 = alloca i1, i1 0
- %nop13095 = alloca i1, i1 0
- %nop13096 = alloca i1, i1 0
- %nop13097 = alloca i1, i1 0
- %nop13098 = alloca i1, i1 0
- %nop13099 = alloca i1, i1 0
- %nop13100 = alloca i1, i1 0
- %nop13101 = alloca i1, i1 0
- %nop13102 = alloca i1, i1 0
- %nop13103 = alloca i1, i1 0
- %nop13104 = alloca i1, i1 0
- %nop13105 = alloca i1, i1 0
- %nop13106 = alloca i1, i1 0
- %nop13107 = alloca i1, i1 0
- %nop13108 = alloca i1, i1 0
- %nop13109 = alloca i1, i1 0
- %nop13110 = alloca i1, i1 0
- %nop13111 = alloca i1, i1 0
- %nop13112 = alloca i1, i1 0
- %nop13113 = alloca i1, i1 0
- %nop13114 = alloca i1, i1 0
- %nop13115 = alloca i1, i1 0
- %nop13116 = alloca i1, i1 0
- %nop13117 = alloca i1, i1 0
- %nop13118 = alloca i1, i1 0
- %nop13119 = alloca i1, i1 0
- %nop13120 = alloca i1, i1 0
- %nop13121 = alloca i1, i1 0
- %nop13122 = alloca i1, i1 0
- %nop13123 = alloca i1, i1 0
- %nop13124 = alloca i1, i1 0
- %nop13125 = alloca i1, i1 0
- %nop13126 = alloca i1, i1 0
- %nop13127 = alloca i1, i1 0
- %nop13128 = alloca i1, i1 0
- %nop13129 = alloca i1, i1 0
- %nop13130 = alloca i1, i1 0
- %nop13131 = alloca i1, i1 0
- %nop13132 = alloca i1, i1 0
- %nop13133 = alloca i1, i1 0
- %nop13134 = alloca i1, i1 0
- %nop13135 = alloca i1, i1 0
- %nop13136 = alloca i1, i1 0
- %nop13137 = alloca i1, i1 0
- %nop13138 = alloca i1, i1 0
- %nop13139 = alloca i1, i1 0
- %nop13140 = alloca i1, i1 0
- %nop13141 = alloca i1, i1 0
- %nop13142 = alloca i1, i1 0
- %nop13143 = alloca i1, i1 0
- %nop13144 = alloca i1, i1 0
- %nop13145 = alloca i1, i1 0
- %nop13146 = alloca i1, i1 0
- %nop13147 = alloca i1, i1 0
- %nop13148 = alloca i1, i1 0
- %nop13149 = alloca i1, i1 0
- %nop13150 = alloca i1, i1 0
- %nop13151 = alloca i1, i1 0
- %nop13152 = alloca i1, i1 0
- %nop13153 = alloca i1, i1 0
- %nop13154 = alloca i1, i1 0
- %nop13155 = alloca i1, i1 0
- %nop13156 = alloca i1, i1 0
- %nop13157 = alloca i1, i1 0
- %nop13158 = alloca i1, i1 0
- %nop13159 = alloca i1, i1 0
- %nop13160 = alloca i1, i1 0
- %nop13161 = alloca i1, i1 0
- %nop13162 = alloca i1, i1 0
- %nop13163 = alloca i1, i1 0
- %nop13164 = alloca i1, i1 0
- %nop13165 = alloca i1, i1 0
- %nop13166 = alloca i1, i1 0
- %nop13167 = alloca i1, i1 0
- %nop13168 = alloca i1, i1 0
- %nop13169 = alloca i1, i1 0
- %nop13170 = alloca i1, i1 0
- %nop13171 = alloca i1, i1 0
- %nop13172 = alloca i1, i1 0
- %nop13173 = alloca i1, i1 0
- %nop13174 = alloca i1, i1 0
- %nop13175 = alloca i1, i1 0
- %nop13176 = alloca i1, i1 0
- %nop13177 = alloca i1, i1 0
- %nop13178 = alloca i1, i1 0
- %nop13179 = alloca i1, i1 0
- %nop13180 = alloca i1, i1 0
- %nop13181 = alloca i1, i1 0
- %nop13182 = alloca i1, i1 0
- %nop13183 = alloca i1, i1 0
- %nop13184 = alloca i1, i1 0
- %nop13185 = alloca i1, i1 0
- %nop13186 = alloca i1, i1 0
- %nop13187 = alloca i1, i1 0
- %nop13188 = alloca i1, i1 0
- %nop13189 = alloca i1, i1 0
- %nop13190 = alloca i1, i1 0
- %nop13191 = alloca i1, i1 0
- %nop13192 = alloca i1, i1 0
- %nop13193 = alloca i1, i1 0
- %nop13194 = alloca i1, i1 0
- %nop13195 = alloca i1, i1 0
- %nop13196 = alloca i1, i1 0
- %nop13197 = alloca i1, i1 0
- %nop13198 = alloca i1, i1 0
- %nop13199 = alloca i1, i1 0
- %nop13200 = alloca i1, i1 0
- %nop13201 = alloca i1, i1 0
- %nop13202 = alloca i1, i1 0
- %nop13203 = alloca i1, i1 0
- %nop13204 = alloca i1, i1 0
- %nop13205 = alloca i1, i1 0
- %nop13206 = alloca i1, i1 0
- %nop13207 = alloca i1, i1 0
- %nop13208 = alloca i1, i1 0
- %nop13209 = alloca i1, i1 0
- %nop13210 = alloca i1, i1 0
- %nop13211 = alloca i1, i1 0
- %nop13212 = alloca i1, i1 0
- %nop13213 = alloca i1, i1 0
- %nop13214 = alloca i1, i1 0
- %nop13215 = alloca i1, i1 0
- %nop13216 = alloca i1, i1 0
- %nop13217 = alloca i1, i1 0
- %nop13218 = alloca i1, i1 0
- %nop13219 = alloca i1, i1 0
- %nop13220 = alloca i1, i1 0
- %nop13221 = alloca i1, i1 0
- %nop13222 = alloca i1, i1 0
- %nop13223 = alloca i1, i1 0
- %nop13224 = alloca i1, i1 0
- %nop13225 = alloca i1, i1 0
- %nop13226 = alloca i1, i1 0
- %nop13227 = alloca i1, i1 0
- %nop13228 = alloca i1, i1 0
- %nop13229 = alloca i1, i1 0
- %nop13230 = alloca i1, i1 0
- %nop13231 = alloca i1, i1 0
- %nop13232 = alloca i1, i1 0
- %nop13233 = alloca i1, i1 0
- %nop13234 = alloca i1, i1 0
- %nop13235 = alloca i1, i1 0
- %nop13236 = alloca i1, i1 0
- %nop13237 = alloca i1, i1 0
- %nop13238 = alloca i1, i1 0
- %nop13239 = alloca i1, i1 0
- %nop13240 = alloca i1, i1 0
- %nop13241 = alloca i1, i1 0
- %nop13242 = alloca i1, i1 0
- %nop13243 = alloca i1, i1 0
- %nop13244 = alloca i1, i1 0
- %nop13245 = alloca i1, i1 0
- %nop13246 = alloca i1, i1 0
- %nop13247 = alloca i1, i1 0
- %nop13248 = alloca i1, i1 0
- %nop13249 = alloca i1, i1 0
- %nop13250 = alloca i1, i1 0
- %nop13251 = alloca i1, i1 0
- %nop13252 = alloca i1, i1 0
- %nop13253 = alloca i1, i1 0
- %nop13254 = alloca i1, i1 0
- %nop13255 = alloca i1, i1 0
- %nop13256 = alloca i1, i1 0
- %nop13257 = alloca i1, i1 0
- %nop13258 = alloca i1, i1 0
- %nop13259 = alloca i1, i1 0
- %nop13260 = alloca i1, i1 0
- %nop13261 = alloca i1, i1 0
- %nop13262 = alloca i1, i1 0
- %nop13263 = alloca i1, i1 0
- %nop13264 = alloca i1, i1 0
- %nop13265 = alloca i1, i1 0
- %nop13266 = alloca i1, i1 0
- %nop13267 = alloca i1, i1 0
- %nop13268 = alloca i1, i1 0
- %nop13269 = alloca i1, i1 0
- %nop13270 = alloca i1, i1 0
- %nop13271 = alloca i1, i1 0
- %nop13272 = alloca i1, i1 0
- %nop13273 = alloca i1, i1 0
- %nop13274 = alloca i1, i1 0
- %nop13275 = alloca i1, i1 0
- %nop13276 = alloca i1, i1 0
- %nop13277 = alloca i1, i1 0
- %nop13278 = alloca i1, i1 0
- %nop13279 = alloca i1, i1 0
- %nop13280 = alloca i1, i1 0
- %nop13281 = alloca i1, i1 0
- %nop13282 = alloca i1, i1 0
- %nop13283 = alloca i1, i1 0
- %nop13284 = alloca i1, i1 0
- %nop13285 = alloca i1, i1 0
- %nop13286 = alloca i1, i1 0
- %nop13287 = alloca i1, i1 0
- %nop13288 = alloca i1, i1 0
- %nop13289 = alloca i1, i1 0
- %nop13290 = alloca i1, i1 0
- %nop13291 = alloca i1, i1 0
- %nop13292 = alloca i1, i1 0
- %nop13293 = alloca i1, i1 0
- %nop13294 = alloca i1, i1 0
- %nop13295 = alloca i1, i1 0
- %nop13296 = alloca i1, i1 0
- %nop13297 = alloca i1, i1 0
- %nop13298 = alloca i1, i1 0
- %nop13299 = alloca i1, i1 0
- %nop13300 = alloca i1, i1 0
- %nop13301 = alloca i1, i1 0
- %nop13302 = alloca i1, i1 0
- %nop13303 = alloca i1, i1 0
- %nop13304 = alloca i1, i1 0
- %nop13305 = alloca i1, i1 0
- %nop13306 = alloca i1, i1 0
- %nop13307 = alloca i1, i1 0
- %nop13308 = alloca i1, i1 0
- %nop13309 = alloca i1, i1 0
- %nop13310 = alloca i1, i1 0
- %nop13311 = alloca i1, i1 0
- %nop13312 = alloca i1, i1 0
- %nop13313 = alloca i1, i1 0
- %nop13314 = alloca i1, i1 0
- %nop13315 = alloca i1, i1 0
- %nop13316 = alloca i1, i1 0
- %nop13317 = alloca i1, i1 0
- %nop13318 = alloca i1, i1 0
- %nop13319 = alloca i1, i1 0
- %nop13320 = alloca i1, i1 0
- %nop13321 = alloca i1, i1 0
- %nop13322 = alloca i1, i1 0
- %nop13323 = alloca i1, i1 0
- %nop13324 = alloca i1, i1 0
- %nop13325 = alloca i1, i1 0
- %nop13326 = alloca i1, i1 0
- %nop13327 = alloca i1, i1 0
- %nop13328 = alloca i1, i1 0
- %nop13329 = alloca i1, i1 0
- %nop13330 = alloca i1, i1 0
- %nop13331 = alloca i1, i1 0
- %nop13332 = alloca i1, i1 0
- %nop13333 = alloca i1, i1 0
- %nop13334 = alloca i1, i1 0
- %nop13335 = alloca i1, i1 0
- %nop13336 = alloca i1, i1 0
- %nop13337 = alloca i1, i1 0
- %nop13338 = alloca i1, i1 0
- %nop13339 = alloca i1, i1 0
- %nop13340 = alloca i1, i1 0
- %nop13341 = alloca i1, i1 0
- %nop13342 = alloca i1, i1 0
- %nop13343 = alloca i1, i1 0
- %nop13344 = alloca i1, i1 0
- %nop13345 = alloca i1, i1 0
- %nop13346 = alloca i1, i1 0
- %nop13347 = alloca i1, i1 0
- %nop13348 = alloca i1, i1 0
- %nop13349 = alloca i1, i1 0
- %nop13350 = alloca i1, i1 0
- %nop13351 = alloca i1, i1 0
- %nop13352 = alloca i1, i1 0
- %nop13353 = alloca i1, i1 0
- %nop13354 = alloca i1, i1 0
- %nop13355 = alloca i1, i1 0
- %nop13356 = alloca i1, i1 0
- %nop13357 = alloca i1, i1 0
- %nop13358 = alloca i1, i1 0
- %nop13359 = alloca i1, i1 0
- %nop13360 = alloca i1, i1 0
- %nop13361 = alloca i1, i1 0
- %nop13362 = alloca i1, i1 0
- %nop13363 = alloca i1, i1 0
- %nop13364 = alloca i1, i1 0
- %nop13365 = alloca i1, i1 0
- %nop13366 = alloca i1, i1 0
- %nop13367 = alloca i1, i1 0
- %nop13368 = alloca i1, i1 0
- %nop13369 = alloca i1, i1 0
- %nop13370 = alloca i1, i1 0
- %nop13371 = alloca i1, i1 0
- %nop13372 = alloca i1, i1 0
- %nop13373 = alloca i1, i1 0
- %nop13374 = alloca i1, i1 0
- %nop13375 = alloca i1, i1 0
- %nop13376 = alloca i1, i1 0
- %nop13377 = alloca i1, i1 0
- %nop13378 = alloca i1, i1 0
- %nop13379 = alloca i1, i1 0
- %nop13380 = alloca i1, i1 0
- %nop13381 = alloca i1, i1 0
- %nop13382 = alloca i1, i1 0
- %nop13383 = alloca i1, i1 0
- %nop13384 = alloca i1, i1 0
- %nop13385 = alloca i1, i1 0
- %nop13386 = alloca i1, i1 0
- %nop13387 = alloca i1, i1 0
- %nop13388 = alloca i1, i1 0
- %nop13389 = alloca i1, i1 0
- %nop13390 = alloca i1, i1 0
- %nop13391 = alloca i1, i1 0
- %nop13392 = alloca i1, i1 0
- %nop13393 = alloca i1, i1 0
- %nop13394 = alloca i1, i1 0
- %nop13395 = alloca i1, i1 0
- %nop13396 = alloca i1, i1 0
- %nop13397 = alloca i1, i1 0
- %nop13398 = alloca i1, i1 0
- %nop13399 = alloca i1, i1 0
- %nop13400 = alloca i1, i1 0
- %nop13401 = alloca i1, i1 0
- %nop13402 = alloca i1, i1 0
- %nop13403 = alloca i1, i1 0
- %nop13404 = alloca i1, i1 0
- %nop13405 = alloca i1, i1 0
- %nop13406 = alloca i1, i1 0
- %nop13407 = alloca i1, i1 0
- %nop13408 = alloca i1, i1 0
- %nop13409 = alloca i1, i1 0
- %nop13410 = alloca i1, i1 0
- %nop13411 = alloca i1, i1 0
- %nop13412 = alloca i1, i1 0
- %nop13413 = alloca i1, i1 0
- %nop13414 = alloca i1, i1 0
- %nop13415 = alloca i1, i1 0
- %nop13416 = alloca i1, i1 0
- %nop13417 = alloca i1, i1 0
- %nop13418 = alloca i1, i1 0
- %nop13419 = alloca i1, i1 0
- %nop13420 = alloca i1, i1 0
- %nop13421 = alloca i1, i1 0
- %nop13422 = alloca i1, i1 0
- %nop13423 = alloca i1, i1 0
- %nop13424 = alloca i1, i1 0
- %nop13425 = alloca i1, i1 0
- %nop13426 = alloca i1, i1 0
- %nop13427 = alloca i1, i1 0
- %nop13428 = alloca i1, i1 0
- %nop13429 = alloca i1, i1 0
- %nop13430 = alloca i1, i1 0
- %nop13431 = alloca i1, i1 0
- %nop13432 = alloca i1, i1 0
- %nop13433 = alloca i1, i1 0
- %nop13434 = alloca i1, i1 0
- %nop13435 = alloca i1, i1 0
- %nop13436 = alloca i1, i1 0
- %nop13437 = alloca i1, i1 0
- %nop13438 = alloca i1, i1 0
- %nop13439 = alloca i1, i1 0
- %nop13440 = alloca i1, i1 0
- %nop13441 = alloca i1, i1 0
- %nop13442 = alloca i1, i1 0
- %nop13443 = alloca i1, i1 0
- %nop13444 = alloca i1, i1 0
- %nop13445 = alloca i1, i1 0
- %nop13446 = alloca i1, i1 0
- %nop13447 = alloca i1, i1 0
- %nop13448 = alloca i1, i1 0
- %nop13449 = alloca i1, i1 0
- %nop13450 = alloca i1, i1 0
- %nop13451 = alloca i1, i1 0
- %nop13452 = alloca i1, i1 0
- %nop13453 = alloca i1, i1 0
- %nop13454 = alloca i1, i1 0
- %nop13455 = alloca i1, i1 0
- %nop13456 = alloca i1, i1 0
- %nop13457 = alloca i1, i1 0
- %nop13458 = alloca i1, i1 0
- %nop13459 = alloca i1, i1 0
- %nop13460 = alloca i1, i1 0
- %nop13461 = alloca i1, i1 0
- %nop13462 = alloca i1, i1 0
- %nop13463 = alloca i1, i1 0
- %nop13464 = alloca i1, i1 0
- %nop13465 = alloca i1, i1 0
- %nop13466 = alloca i1, i1 0
- %nop13467 = alloca i1, i1 0
- %nop13468 = alloca i1, i1 0
- %nop13469 = alloca i1, i1 0
- %nop13470 = alloca i1, i1 0
- %nop13471 = alloca i1, i1 0
- %nop13472 = alloca i1, i1 0
- %nop13473 = alloca i1, i1 0
- %nop13474 = alloca i1, i1 0
- %nop13475 = alloca i1, i1 0
- %nop13476 = alloca i1, i1 0
- %nop13477 = alloca i1, i1 0
- %nop13478 = alloca i1, i1 0
- %nop13479 = alloca i1, i1 0
- %nop13480 = alloca i1, i1 0
- %nop13481 = alloca i1, i1 0
- %nop13482 = alloca i1, i1 0
- %nop13483 = alloca i1, i1 0
- %nop13484 = alloca i1, i1 0
- %nop13485 = alloca i1, i1 0
- %nop13486 = alloca i1, i1 0
- %nop13487 = alloca i1, i1 0
- %nop13488 = alloca i1, i1 0
- %nop13489 = alloca i1, i1 0
- %nop13490 = alloca i1, i1 0
- %nop13491 = alloca i1, i1 0
- %nop13492 = alloca i1, i1 0
- %nop13493 = alloca i1, i1 0
- %nop13494 = alloca i1, i1 0
- %nop13495 = alloca i1, i1 0
- %nop13496 = alloca i1, i1 0
- %nop13497 = alloca i1, i1 0
- %nop13498 = alloca i1, i1 0
- %nop13499 = alloca i1, i1 0
- %nop13500 = alloca i1, i1 0
- %nop13501 = alloca i1, i1 0
- %nop13502 = alloca i1, i1 0
- %nop13503 = alloca i1, i1 0
- %nop13504 = alloca i1, i1 0
- %nop13505 = alloca i1, i1 0
- %nop13506 = alloca i1, i1 0
- %nop13507 = alloca i1, i1 0
- %nop13508 = alloca i1, i1 0
- %nop13509 = alloca i1, i1 0
- %nop13510 = alloca i1, i1 0
- %nop13511 = alloca i1, i1 0
- %nop13512 = alloca i1, i1 0
- %nop13513 = alloca i1, i1 0
- %nop13514 = alloca i1, i1 0
- %nop13515 = alloca i1, i1 0
- %nop13516 = alloca i1, i1 0
- %nop13517 = alloca i1, i1 0
- %nop13518 = alloca i1, i1 0
- %nop13519 = alloca i1, i1 0
- %nop13520 = alloca i1, i1 0
- %nop13521 = alloca i1, i1 0
- %nop13522 = alloca i1, i1 0
- %nop13523 = alloca i1, i1 0
- %nop13524 = alloca i1, i1 0
- %nop13525 = alloca i1, i1 0
- %nop13526 = alloca i1, i1 0
- %nop13527 = alloca i1, i1 0
- %nop13528 = alloca i1, i1 0
- %nop13529 = alloca i1, i1 0
- %nop13530 = alloca i1, i1 0
- %nop13531 = alloca i1, i1 0
- %nop13532 = alloca i1, i1 0
- %nop13533 = alloca i1, i1 0
- %nop13534 = alloca i1, i1 0
- %nop13535 = alloca i1, i1 0
- %nop13536 = alloca i1, i1 0
- %nop13537 = alloca i1, i1 0
- %nop13538 = alloca i1, i1 0
- %nop13539 = alloca i1, i1 0
- %nop13540 = alloca i1, i1 0
- %nop13541 = alloca i1, i1 0
- %nop13542 = alloca i1, i1 0
- %nop13543 = alloca i1, i1 0
- %nop13544 = alloca i1, i1 0
- %nop13545 = alloca i1, i1 0
- %nop13546 = alloca i1, i1 0
- %nop13547 = alloca i1, i1 0
- %nop13548 = alloca i1, i1 0
- %nop13549 = alloca i1, i1 0
- %nop13550 = alloca i1, i1 0
- %nop13551 = alloca i1, i1 0
- %nop13552 = alloca i1, i1 0
- %nop13553 = alloca i1, i1 0
- %nop13554 = alloca i1, i1 0
- %nop13555 = alloca i1, i1 0
- %nop13556 = alloca i1, i1 0
- %nop13557 = alloca i1, i1 0
- %nop13558 = alloca i1, i1 0
- %nop13559 = alloca i1, i1 0
- %nop13560 = alloca i1, i1 0
- %nop13561 = alloca i1, i1 0
- %nop13562 = alloca i1, i1 0
- %nop13563 = alloca i1, i1 0
- %nop13564 = alloca i1, i1 0
- %nop13565 = alloca i1, i1 0
- %nop13566 = alloca i1, i1 0
- %nop13567 = alloca i1, i1 0
- %nop13568 = alloca i1, i1 0
- %nop13569 = alloca i1, i1 0
- %nop13570 = alloca i1, i1 0
- %nop13571 = alloca i1, i1 0
- %nop13572 = alloca i1, i1 0
- %nop13573 = alloca i1, i1 0
- %nop13574 = alloca i1, i1 0
- %nop13575 = alloca i1, i1 0
- %nop13576 = alloca i1, i1 0
- %nop13577 = alloca i1, i1 0
- %nop13578 = alloca i1, i1 0
- %nop13579 = alloca i1, i1 0
- %nop13580 = alloca i1, i1 0
- %nop13581 = alloca i1, i1 0
- %nop13582 = alloca i1, i1 0
- %nop13583 = alloca i1, i1 0
- %nop13584 = alloca i1, i1 0
- %nop13585 = alloca i1, i1 0
- %nop13586 = alloca i1, i1 0
- %nop13587 = alloca i1, i1 0
- %nop13588 = alloca i1, i1 0
- %nop13589 = alloca i1, i1 0
- %nop13590 = alloca i1, i1 0
- %nop13591 = alloca i1, i1 0
- %nop13592 = alloca i1, i1 0
- %nop13593 = alloca i1, i1 0
- %nop13594 = alloca i1, i1 0
- %nop13595 = alloca i1, i1 0
- %nop13596 = alloca i1, i1 0
- %nop13597 = alloca i1, i1 0
- %nop13598 = alloca i1, i1 0
- %nop13599 = alloca i1, i1 0
- %nop13600 = alloca i1, i1 0
- %nop13601 = alloca i1, i1 0
- %nop13602 = alloca i1, i1 0
- %nop13603 = alloca i1, i1 0
- %nop13604 = alloca i1, i1 0
- %nop13605 = alloca i1, i1 0
- %nop13606 = alloca i1, i1 0
- %nop13607 = alloca i1, i1 0
- %nop13608 = alloca i1, i1 0
- %nop13609 = alloca i1, i1 0
- %nop13610 = alloca i1, i1 0
- %nop13611 = alloca i1, i1 0
- %nop13612 = alloca i1, i1 0
- %nop13613 = alloca i1, i1 0
- %nop13614 = alloca i1, i1 0
- %nop13615 = alloca i1, i1 0
- %nop13616 = alloca i1, i1 0
- %nop13617 = alloca i1, i1 0
- %nop13618 = alloca i1, i1 0
- %nop13619 = alloca i1, i1 0
- %nop13620 = alloca i1, i1 0
- %nop13621 = alloca i1, i1 0
- %nop13622 = alloca i1, i1 0
- %nop13623 = alloca i1, i1 0
- %nop13624 = alloca i1, i1 0
- %nop13625 = alloca i1, i1 0
- %nop13626 = alloca i1, i1 0
- %nop13627 = alloca i1, i1 0
- %nop13628 = alloca i1, i1 0
- %nop13629 = alloca i1, i1 0
- %nop13630 = alloca i1, i1 0
- %nop13631 = alloca i1, i1 0
- %nop13632 = alloca i1, i1 0
- %nop13633 = alloca i1, i1 0
- %nop13634 = alloca i1, i1 0
- %nop13635 = alloca i1, i1 0
- %nop13636 = alloca i1, i1 0
- %nop13637 = alloca i1, i1 0
- %nop13638 = alloca i1, i1 0
- %nop13639 = alloca i1, i1 0
- %nop13640 = alloca i1, i1 0
- %nop13641 = alloca i1, i1 0
- %nop13642 = alloca i1, i1 0
- %nop13643 = alloca i1, i1 0
- %nop13644 = alloca i1, i1 0
- %nop13645 = alloca i1, i1 0
- %nop13646 = alloca i1, i1 0
- %nop13647 = alloca i1, i1 0
- %nop13648 = alloca i1, i1 0
- %nop13649 = alloca i1, i1 0
- %nop13650 = alloca i1, i1 0
- %nop13651 = alloca i1, i1 0
- %nop13652 = alloca i1, i1 0
- %nop13653 = alloca i1, i1 0
- %nop13654 = alloca i1, i1 0
- %nop13655 = alloca i1, i1 0
- %nop13656 = alloca i1, i1 0
- %nop13657 = alloca i1, i1 0
- %nop13658 = alloca i1, i1 0
- %nop13659 = alloca i1, i1 0
- %nop13660 = alloca i1, i1 0
- %nop13661 = alloca i1, i1 0
- %nop13662 = alloca i1, i1 0
- %nop13663 = alloca i1, i1 0
- %nop13664 = alloca i1, i1 0
- %nop13665 = alloca i1, i1 0
- %nop13666 = alloca i1, i1 0
- %nop13667 = alloca i1, i1 0
- %nop13668 = alloca i1, i1 0
- %nop13669 = alloca i1, i1 0
- %nop13670 = alloca i1, i1 0
- %nop13671 = alloca i1, i1 0
- %nop13672 = alloca i1, i1 0
- %nop13673 = alloca i1, i1 0
- %nop13674 = alloca i1, i1 0
- %nop13675 = alloca i1, i1 0
- %nop13676 = alloca i1, i1 0
- %nop13677 = alloca i1, i1 0
- %nop13678 = alloca i1, i1 0
- %nop13679 = alloca i1, i1 0
- %nop13680 = alloca i1, i1 0
- %nop13681 = alloca i1, i1 0
- %nop13682 = alloca i1, i1 0
- %nop13683 = alloca i1, i1 0
- %nop13684 = alloca i1, i1 0
- %nop13685 = alloca i1, i1 0
- %nop13686 = alloca i1, i1 0
- %nop13687 = alloca i1, i1 0
- %nop13688 = alloca i1, i1 0
- %nop13689 = alloca i1, i1 0
- %nop13690 = alloca i1, i1 0
- %nop13691 = alloca i1, i1 0
- %nop13692 = alloca i1, i1 0
- %nop13693 = alloca i1, i1 0
- %nop13694 = alloca i1, i1 0
- %nop13695 = alloca i1, i1 0
- %nop13696 = alloca i1, i1 0
- %nop13697 = alloca i1, i1 0
- %nop13698 = alloca i1, i1 0
- %nop13699 = alloca i1, i1 0
- %nop13700 = alloca i1, i1 0
- %nop13701 = alloca i1, i1 0
- %nop13702 = alloca i1, i1 0
- %nop13703 = alloca i1, i1 0
- %nop13704 = alloca i1, i1 0
- %nop13705 = alloca i1, i1 0
- %nop13706 = alloca i1, i1 0
- %nop13707 = alloca i1, i1 0
- %nop13708 = alloca i1, i1 0
- %nop13709 = alloca i1, i1 0
- %nop13710 = alloca i1, i1 0
- %nop13711 = alloca i1, i1 0
- %nop13712 = alloca i1, i1 0
- %nop13713 = alloca i1, i1 0
- %nop13714 = alloca i1, i1 0
- %nop13715 = alloca i1, i1 0
- %nop13716 = alloca i1, i1 0
- %nop13717 = alloca i1, i1 0
- %nop13718 = alloca i1, i1 0
- %nop13719 = alloca i1, i1 0
- %nop13720 = alloca i1, i1 0
- %nop13721 = alloca i1, i1 0
- %nop13722 = alloca i1, i1 0
- %nop13723 = alloca i1, i1 0
- %nop13724 = alloca i1, i1 0
- %nop13725 = alloca i1, i1 0
- %nop13726 = alloca i1, i1 0
- %nop13727 = alloca i1, i1 0
- %nop13728 = alloca i1, i1 0
- %nop13729 = alloca i1, i1 0
- %nop13730 = alloca i1, i1 0
- %nop13731 = alloca i1, i1 0
- %nop13732 = alloca i1, i1 0
- %nop13733 = alloca i1, i1 0
- %nop13734 = alloca i1, i1 0
- %nop13735 = alloca i1, i1 0
- %nop13736 = alloca i1, i1 0
- %nop13737 = alloca i1, i1 0
- %nop13738 = alloca i1, i1 0
- %nop13739 = alloca i1, i1 0
- %nop13740 = alloca i1, i1 0
- %nop13741 = alloca i1, i1 0
- %nop13742 = alloca i1, i1 0
- %nop13743 = alloca i1, i1 0
- %nop13744 = alloca i1, i1 0
- %nop13745 = alloca i1, i1 0
- %nop13746 = alloca i1, i1 0
- %nop13747 = alloca i1, i1 0
- %nop13748 = alloca i1, i1 0
- %nop13749 = alloca i1, i1 0
- %nop13750 = alloca i1, i1 0
- %nop13751 = alloca i1, i1 0
- %nop13752 = alloca i1, i1 0
- %nop13753 = alloca i1, i1 0
- %nop13754 = alloca i1, i1 0
- %nop13755 = alloca i1, i1 0
- %nop13756 = alloca i1, i1 0
- %nop13757 = alloca i1, i1 0
- %nop13758 = alloca i1, i1 0
- %nop13759 = alloca i1, i1 0
- %nop13760 = alloca i1, i1 0
- %nop13761 = alloca i1, i1 0
- %nop13762 = alloca i1, i1 0
- %nop13763 = alloca i1, i1 0
- %nop13764 = alloca i1, i1 0
- %nop13765 = alloca i1, i1 0
- %nop13766 = alloca i1, i1 0
- %nop13767 = alloca i1, i1 0
- %nop13768 = alloca i1, i1 0
- %nop13769 = alloca i1, i1 0
- %nop13770 = alloca i1, i1 0
- %nop13771 = alloca i1, i1 0
- %nop13772 = alloca i1, i1 0
- %nop13773 = alloca i1, i1 0
- %nop13774 = alloca i1, i1 0
- %nop13775 = alloca i1, i1 0
- %nop13776 = alloca i1, i1 0
- %nop13777 = alloca i1, i1 0
- %nop13778 = alloca i1, i1 0
- %nop13779 = alloca i1, i1 0
- %nop13780 = alloca i1, i1 0
- %nop13781 = alloca i1, i1 0
- %nop13782 = alloca i1, i1 0
- %nop13783 = alloca i1, i1 0
- %nop13784 = alloca i1, i1 0
- %nop13785 = alloca i1, i1 0
- %nop13786 = alloca i1, i1 0
- %nop13787 = alloca i1, i1 0
- %nop13788 = alloca i1, i1 0
- %nop13789 = alloca i1, i1 0
- %nop13790 = alloca i1, i1 0
- %nop13791 = alloca i1, i1 0
- %nop13792 = alloca i1, i1 0
- %nop13793 = alloca i1, i1 0
- %nop13794 = alloca i1, i1 0
- %nop13795 = alloca i1, i1 0
- %nop13796 = alloca i1, i1 0
- %nop13797 = alloca i1, i1 0
- %nop13798 = alloca i1, i1 0
- %nop13799 = alloca i1, i1 0
- %nop13800 = alloca i1, i1 0
- %nop13801 = alloca i1, i1 0
- %nop13802 = alloca i1, i1 0
- %nop13803 = alloca i1, i1 0
- %nop13804 = alloca i1, i1 0
- %nop13805 = alloca i1, i1 0
- %nop13806 = alloca i1, i1 0
- %nop13807 = alloca i1, i1 0
- %nop13808 = alloca i1, i1 0
- %nop13809 = alloca i1, i1 0
- %nop13810 = alloca i1, i1 0
- %nop13811 = alloca i1, i1 0
- %nop13812 = alloca i1, i1 0
- %nop13813 = alloca i1, i1 0
- %nop13814 = alloca i1, i1 0
- %nop13815 = alloca i1, i1 0
- %nop13816 = alloca i1, i1 0
- %nop13817 = alloca i1, i1 0
- %nop13818 = alloca i1, i1 0
- %nop13819 = alloca i1, i1 0
- %nop13820 = alloca i1, i1 0
- %nop13821 = alloca i1, i1 0
- %nop13822 = alloca i1, i1 0
- %nop13823 = alloca i1, i1 0
- %nop13824 = alloca i1, i1 0
- %nop13825 = alloca i1, i1 0
- %nop13826 = alloca i1, i1 0
- %nop13827 = alloca i1, i1 0
- %nop13828 = alloca i1, i1 0
- %nop13829 = alloca i1, i1 0
- %nop13830 = alloca i1, i1 0
- %nop13831 = alloca i1, i1 0
- %nop13832 = alloca i1, i1 0
- %nop13833 = alloca i1, i1 0
- %nop13834 = alloca i1, i1 0
- %nop13835 = alloca i1, i1 0
- %nop13836 = alloca i1, i1 0
- %nop13837 = alloca i1, i1 0
- %nop13838 = alloca i1, i1 0
- %nop13839 = alloca i1, i1 0
- %nop13840 = alloca i1, i1 0
- %nop13841 = alloca i1, i1 0
- %nop13842 = alloca i1, i1 0
- %nop13843 = alloca i1, i1 0
- %nop13844 = alloca i1, i1 0
- %nop13845 = alloca i1, i1 0
- %nop13846 = alloca i1, i1 0
- %nop13847 = alloca i1, i1 0
- %nop13848 = alloca i1, i1 0
- %nop13849 = alloca i1, i1 0
- %nop13850 = alloca i1, i1 0
- %nop13851 = alloca i1, i1 0
- %nop13852 = alloca i1, i1 0
- %nop13853 = alloca i1, i1 0
- %nop13854 = alloca i1, i1 0
- %nop13855 = alloca i1, i1 0
- %nop13856 = alloca i1, i1 0
- %nop13857 = alloca i1, i1 0
- %nop13858 = alloca i1, i1 0
- %nop13859 = alloca i1, i1 0
- %nop13860 = alloca i1, i1 0
- %nop13861 = alloca i1, i1 0
- %nop13862 = alloca i1, i1 0
- %nop13863 = alloca i1, i1 0
- %nop13864 = alloca i1, i1 0
- %nop13865 = alloca i1, i1 0
- %nop13866 = alloca i1, i1 0
- %nop13867 = alloca i1, i1 0
- %nop13868 = alloca i1, i1 0
- %nop13869 = alloca i1, i1 0
- %nop13870 = alloca i1, i1 0
- %nop13871 = alloca i1, i1 0
- %nop13872 = alloca i1, i1 0
- %nop13873 = alloca i1, i1 0
- %nop13874 = alloca i1, i1 0
- %nop13875 = alloca i1, i1 0
- %nop13876 = alloca i1, i1 0
- %nop13877 = alloca i1, i1 0
- %nop13878 = alloca i1, i1 0
- %nop13879 = alloca i1, i1 0
- %nop13880 = alloca i1, i1 0
- %nop13881 = alloca i1, i1 0
- %nop13882 = alloca i1, i1 0
- %nop13883 = alloca i1, i1 0
- %nop13884 = alloca i1, i1 0
- %nop13885 = alloca i1, i1 0
- %nop13886 = alloca i1, i1 0
- %nop13887 = alloca i1, i1 0
- %nop13888 = alloca i1, i1 0
- %nop13889 = alloca i1, i1 0
- %nop13890 = alloca i1, i1 0
- %nop13891 = alloca i1, i1 0
- %nop13892 = alloca i1, i1 0
- %nop13893 = alloca i1, i1 0
- %nop13894 = alloca i1, i1 0
- %nop13895 = alloca i1, i1 0
- %nop13896 = alloca i1, i1 0
- %nop13897 = alloca i1, i1 0
- %nop13898 = alloca i1, i1 0
- %nop13899 = alloca i1, i1 0
- %nop13900 = alloca i1, i1 0
- %nop13901 = alloca i1, i1 0
- %nop13902 = alloca i1, i1 0
- %nop13903 = alloca i1, i1 0
- %nop13904 = alloca i1, i1 0
- %nop13905 = alloca i1, i1 0
- %nop13906 = alloca i1, i1 0
- %nop13907 = alloca i1, i1 0
- %nop13908 = alloca i1, i1 0
- %nop13909 = alloca i1, i1 0
- %nop13910 = alloca i1, i1 0
- %nop13911 = alloca i1, i1 0
- %nop13912 = alloca i1, i1 0
- %nop13913 = alloca i1, i1 0
- %nop13914 = alloca i1, i1 0
- %nop13915 = alloca i1, i1 0
- %nop13916 = alloca i1, i1 0
- %nop13917 = alloca i1, i1 0
- %nop13918 = alloca i1, i1 0
- %nop13919 = alloca i1, i1 0
- %nop13920 = alloca i1, i1 0
- %nop13921 = alloca i1, i1 0
- %nop13922 = alloca i1, i1 0
- %nop13923 = alloca i1, i1 0
- %nop13924 = alloca i1, i1 0
- %nop13925 = alloca i1, i1 0
- %nop13926 = alloca i1, i1 0
- %nop13927 = alloca i1, i1 0
- %nop13928 = alloca i1, i1 0
- %nop13929 = alloca i1, i1 0
- %nop13930 = alloca i1, i1 0
- %nop13931 = alloca i1, i1 0
- %nop13932 = alloca i1, i1 0
- %nop13933 = alloca i1, i1 0
- %nop13934 = alloca i1, i1 0
- %nop13935 = alloca i1, i1 0
- %nop13936 = alloca i1, i1 0
- %nop13937 = alloca i1, i1 0
- %nop13938 = alloca i1, i1 0
- %nop13939 = alloca i1, i1 0
- %nop13940 = alloca i1, i1 0
- %nop13941 = alloca i1, i1 0
- %nop13942 = alloca i1, i1 0
- %nop13943 = alloca i1, i1 0
- %nop13944 = alloca i1, i1 0
- %nop13945 = alloca i1, i1 0
- %nop13946 = alloca i1, i1 0
- %nop13947 = alloca i1, i1 0
- %nop13948 = alloca i1, i1 0
- %nop13949 = alloca i1, i1 0
- %nop13950 = alloca i1, i1 0
- %nop13951 = alloca i1, i1 0
- %nop13952 = alloca i1, i1 0
- %nop13953 = alloca i1, i1 0
- %nop13954 = alloca i1, i1 0
- %nop13955 = alloca i1, i1 0
- %nop13956 = alloca i1, i1 0
- %nop13957 = alloca i1, i1 0
- %nop13958 = alloca i1, i1 0
- %nop13959 = alloca i1, i1 0
- %nop13960 = alloca i1, i1 0
- %nop13961 = alloca i1, i1 0
- %nop13962 = alloca i1, i1 0
- %nop13963 = alloca i1, i1 0
- %nop13964 = alloca i1, i1 0
- %nop13965 = alloca i1, i1 0
- %nop13966 = alloca i1, i1 0
- %nop13967 = alloca i1, i1 0
- %nop13968 = alloca i1, i1 0
- %nop13969 = alloca i1, i1 0
- %nop13970 = alloca i1, i1 0
- %nop13971 = alloca i1, i1 0
- %nop13972 = alloca i1, i1 0
- %nop13973 = alloca i1, i1 0
- %nop13974 = alloca i1, i1 0
- %nop13975 = alloca i1, i1 0
- %nop13976 = alloca i1, i1 0
- %nop13977 = alloca i1, i1 0
- %nop13978 = alloca i1, i1 0
- %nop13979 = alloca i1, i1 0
- %nop13980 = alloca i1, i1 0
- %nop13981 = alloca i1, i1 0
- %nop13982 = alloca i1, i1 0
- %nop13983 = alloca i1, i1 0
- %nop13984 = alloca i1, i1 0
- %nop13985 = alloca i1, i1 0
- %nop13986 = alloca i1, i1 0
- %nop13987 = alloca i1, i1 0
- %nop13988 = alloca i1, i1 0
- %nop13989 = alloca i1, i1 0
- %nop13990 = alloca i1, i1 0
- %nop13991 = alloca i1, i1 0
- %nop13992 = alloca i1, i1 0
- %nop13993 = alloca i1, i1 0
- %nop13994 = alloca i1, i1 0
- %nop13995 = alloca i1, i1 0
- %nop13996 = alloca i1, i1 0
- %nop13997 = alloca i1, i1 0
- %nop13998 = alloca i1, i1 0
- %nop13999 = alloca i1, i1 0
- %nop14000 = alloca i1, i1 0
- %nop14001 = alloca i1, i1 0
- %nop14002 = alloca i1, i1 0
- %nop14003 = alloca i1, i1 0
- %nop14004 = alloca i1, i1 0
- %nop14005 = alloca i1, i1 0
- %nop14006 = alloca i1, i1 0
- %nop14007 = alloca i1, i1 0
- %nop14008 = alloca i1, i1 0
- %nop14009 = alloca i1, i1 0
- %nop14010 = alloca i1, i1 0
- %nop14011 = alloca i1, i1 0
- %nop14012 = alloca i1, i1 0
- %nop14013 = alloca i1, i1 0
- %nop14014 = alloca i1, i1 0
- %nop14015 = alloca i1, i1 0
- %nop14016 = alloca i1, i1 0
- %nop14017 = alloca i1, i1 0
- %nop14018 = alloca i1, i1 0
- %nop14019 = alloca i1, i1 0
- %nop14020 = alloca i1, i1 0
- %nop14021 = alloca i1, i1 0
- %nop14022 = alloca i1, i1 0
- %nop14023 = alloca i1, i1 0
- %nop14024 = alloca i1, i1 0
- %nop14025 = alloca i1, i1 0
- %nop14026 = alloca i1, i1 0
- %nop14027 = alloca i1, i1 0
- %nop14028 = alloca i1, i1 0
- %nop14029 = alloca i1, i1 0
- %nop14030 = alloca i1, i1 0
- %nop14031 = alloca i1, i1 0
- %nop14032 = alloca i1, i1 0
- %nop14033 = alloca i1, i1 0
- %nop14034 = alloca i1, i1 0
- %nop14035 = alloca i1, i1 0
- %nop14036 = alloca i1, i1 0
- %nop14037 = alloca i1, i1 0
- %nop14038 = alloca i1, i1 0
- %nop14039 = alloca i1, i1 0
- %nop14040 = alloca i1, i1 0
- %nop14041 = alloca i1, i1 0
- %nop14042 = alloca i1, i1 0
- %nop14043 = alloca i1, i1 0
- %nop14044 = alloca i1, i1 0
- %nop14045 = alloca i1, i1 0
- %nop14046 = alloca i1, i1 0
- %nop14047 = alloca i1, i1 0
- %nop14048 = alloca i1, i1 0
- %nop14049 = alloca i1, i1 0
- %nop14050 = alloca i1, i1 0
- %nop14051 = alloca i1, i1 0
- %nop14052 = alloca i1, i1 0
- %nop14053 = alloca i1, i1 0
- %nop14054 = alloca i1, i1 0
- %nop14055 = alloca i1, i1 0
- %nop14056 = alloca i1, i1 0
- %nop14057 = alloca i1, i1 0
- %nop14058 = alloca i1, i1 0
- %nop14059 = alloca i1, i1 0
- %nop14060 = alloca i1, i1 0
- %nop14061 = alloca i1, i1 0
- %nop14062 = alloca i1, i1 0
- %nop14063 = alloca i1, i1 0
- %nop14064 = alloca i1, i1 0
- %nop14065 = alloca i1, i1 0
- %nop14066 = alloca i1, i1 0
- %nop14067 = alloca i1, i1 0
- %nop14068 = alloca i1, i1 0
- %nop14069 = alloca i1, i1 0
- %nop14070 = alloca i1, i1 0
- %nop14071 = alloca i1, i1 0
- %nop14072 = alloca i1, i1 0
- %nop14073 = alloca i1, i1 0
- %nop14074 = alloca i1, i1 0
- %nop14075 = alloca i1, i1 0
- %nop14076 = alloca i1, i1 0
- %nop14077 = alloca i1, i1 0
- %nop14078 = alloca i1, i1 0
- %nop14079 = alloca i1, i1 0
- %nop14080 = alloca i1, i1 0
- %nop14081 = alloca i1, i1 0
- %nop14082 = alloca i1, i1 0
- %nop14083 = alloca i1, i1 0
- %nop14084 = alloca i1, i1 0
- %nop14085 = alloca i1, i1 0
- %nop14086 = alloca i1, i1 0
- %nop14087 = alloca i1, i1 0
- %nop14088 = alloca i1, i1 0
- %nop14089 = alloca i1, i1 0
- %nop14090 = alloca i1, i1 0
- %nop14091 = alloca i1, i1 0
- %nop14092 = alloca i1, i1 0
- %nop14093 = alloca i1, i1 0
- %nop14094 = alloca i1, i1 0
- %nop14095 = alloca i1, i1 0
- %nop14096 = alloca i1, i1 0
- %nop14097 = alloca i1, i1 0
- %nop14098 = alloca i1, i1 0
- %nop14099 = alloca i1, i1 0
- %nop14100 = alloca i1, i1 0
- %nop14101 = alloca i1, i1 0
- %nop14102 = alloca i1, i1 0
- %nop14103 = alloca i1, i1 0
- %nop14104 = alloca i1, i1 0
- %nop14105 = alloca i1, i1 0
- %nop14106 = alloca i1, i1 0
- %nop14107 = alloca i1, i1 0
- %nop14108 = alloca i1, i1 0
- %nop14109 = alloca i1, i1 0
- %nop14110 = alloca i1, i1 0
- %nop14111 = alloca i1, i1 0
- %nop14112 = alloca i1, i1 0
- %nop14113 = alloca i1, i1 0
- %nop14114 = alloca i1, i1 0
- %nop14115 = alloca i1, i1 0
- %nop14116 = alloca i1, i1 0
- %nop14117 = alloca i1, i1 0
- %nop14118 = alloca i1, i1 0
- %nop14119 = alloca i1, i1 0
- %nop14120 = alloca i1, i1 0
- %nop14121 = alloca i1, i1 0
- %nop14122 = alloca i1, i1 0
- %nop14123 = alloca i1, i1 0
- %nop14124 = alloca i1, i1 0
- %nop14125 = alloca i1, i1 0
- %nop14126 = alloca i1, i1 0
- %nop14127 = alloca i1, i1 0
- %nop14128 = alloca i1, i1 0
- %nop14129 = alloca i1, i1 0
- %nop14130 = alloca i1, i1 0
- %nop14131 = alloca i1, i1 0
- %nop14132 = alloca i1, i1 0
- %nop14133 = alloca i1, i1 0
- %nop14134 = alloca i1, i1 0
- %nop14135 = alloca i1, i1 0
- %nop14136 = alloca i1, i1 0
- %nop14137 = alloca i1, i1 0
- %nop14138 = alloca i1, i1 0
- %nop14139 = alloca i1, i1 0
- %nop14140 = alloca i1, i1 0
- %nop14141 = alloca i1, i1 0
- %nop14142 = alloca i1, i1 0
- %nop14143 = alloca i1, i1 0
- %nop14144 = alloca i1, i1 0
- %nop14145 = alloca i1, i1 0
- %nop14146 = alloca i1, i1 0
- %nop14147 = alloca i1, i1 0
- %nop14148 = alloca i1, i1 0
- %nop14149 = alloca i1, i1 0
- %nop14150 = alloca i1, i1 0
- %nop14151 = alloca i1, i1 0
- %nop14152 = alloca i1, i1 0
- %nop14153 = alloca i1, i1 0
- %nop14154 = alloca i1, i1 0
- %nop14155 = alloca i1, i1 0
- %nop14156 = alloca i1, i1 0
- %nop14157 = alloca i1, i1 0
- %nop14158 = alloca i1, i1 0
- %nop14159 = alloca i1, i1 0
- %nop14160 = alloca i1, i1 0
- %nop14161 = alloca i1, i1 0
- %nop14162 = alloca i1, i1 0
- %nop14163 = alloca i1, i1 0
- %nop14164 = alloca i1, i1 0
- %nop14165 = alloca i1, i1 0
- %nop14166 = alloca i1, i1 0
- %nop14167 = alloca i1, i1 0
- %nop14168 = alloca i1, i1 0
- %nop14169 = alloca i1, i1 0
- %nop14170 = alloca i1, i1 0
- %nop14171 = alloca i1, i1 0
- %nop14172 = alloca i1, i1 0
- %nop14173 = alloca i1, i1 0
- %nop14174 = alloca i1, i1 0
- %nop14175 = alloca i1, i1 0
- %nop14176 = alloca i1, i1 0
- %nop14177 = alloca i1, i1 0
- %nop14178 = alloca i1, i1 0
- %nop14179 = alloca i1, i1 0
- %nop14180 = alloca i1, i1 0
- %nop14181 = alloca i1, i1 0
- %nop14182 = alloca i1, i1 0
- %nop14183 = alloca i1, i1 0
- %nop14184 = alloca i1, i1 0
- %nop14185 = alloca i1, i1 0
- %nop14186 = alloca i1, i1 0
- %nop14187 = alloca i1, i1 0
- %nop14188 = alloca i1, i1 0
- %nop14189 = alloca i1, i1 0
- %nop14190 = alloca i1, i1 0
- %nop14191 = alloca i1, i1 0
- %nop14192 = alloca i1, i1 0
- %nop14193 = alloca i1, i1 0
- %nop14194 = alloca i1, i1 0
- %nop14195 = alloca i1, i1 0
- %nop14196 = alloca i1, i1 0
- %nop14197 = alloca i1, i1 0
- %nop14198 = alloca i1, i1 0
- %nop14199 = alloca i1, i1 0
- %nop14200 = alloca i1, i1 0
- %nop14201 = alloca i1, i1 0
- %nop14202 = alloca i1, i1 0
- %nop14203 = alloca i1, i1 0
- %nop14204 = alloca i1, i1 0
- %nop14205 = alloca i1, i1 0
- %nop14206 = alloca i1, i1 0
- %nop14207 = alloca i1, i1 0
- %nop14208 = alloca i1, i1 0
- %nop14209 = alloca i1, i1 0
- %nop14210 = alloca i1, i1 0
- %nop14211 = alloca i1, i1 0
- %nop14212 = alloca i1, i1 0
- %nop14213 = alloca i1, i1 0
- %nop14214 = alloca i1, i1 0
- %nop14215 = alloca i1, i1 0
- %nop14216 = alloca i1, i1 0
- %nop14217 = alloca i1, i1 0
- %nop14218 = alloca i1, i1 0
- %nop14219 = alloca i1, i1 0
- %nop14220 = alloca i1, i1 0
- %nop14221 = alloca i1, i1 0
- %nop14222 = alloca i1, i1 0
- %nop14223 = alloca i1, i1 0
- %nop14224 = alloca i1, i1 0
- %nop14225 = alloca i1, i1 0
- %nop14226 = alloca i1, i1 0
- %nop14227 = alloca i1, i1 0
- %nop14228 = alloca i1, i1 0
- %nop14229 = alloca i1, i1 0
- %nop14230 = alloca i1, i1 0
- %nop14231 = alloca i1, i1 0
- %nop14232 = alloca i1, i1 0
- %nop14233 = alloca i1, i1 0
- %nop14234 = alloca i1, i1 0
- %nop14235 = alloca i1, i1 0
- %nop14236 = alloca i1, i1 0
- %nop14237 = alloca i1, i1 0
- %nop14238 = alloca i1, i1 0
- %nop14239 = alloca i1, i1 0
- %nop14240 = alloca i1, i1 0
- %nop14241 = alloca i1, i1 0
- %nop14242 = alloca i1, i1 0
- %nop14243 = alloca i1, i1 0
- %nop14244 = alloca i1, i1 0
- %nop14245 = alloca i1, i1 0
- %nop14246 = alloca i1, i1 0
- %nop14247 = alloca i1, i1 0
- %nop14248 = alloca i1, i1 0
- %nop14249 = alloca i1, i1 0
- %nop14250 = alloca i1, i1 0
- %nop14251 = alloca i1, i1 0
- %nop14252 = alloca i1, i1 0
- %nop14253 = alloca i1, i1 0
- %nop14254 = alloca i1, i1 0
- %nop14255 = alloca i1, i1 0
- %nop14256 = alloca i1, i1 0
- %nop14257 = alloca i1, i1 0
- %nop14258 = alloca i1, i1 0
- %nop14259 = alloca i1, i1 0
- %nop14260 = alloca i1, i1 0
- %nop14261 = alloca i1, i1 0
- %nop14262 = alloca i1, i1 0
- %nop14263 = alloca i1, i1 0
- %nop14264 = alloca i1, i1 0
- %nop14265 = alloca i1, i1 0
- %nop14266 = alloca i1, i1 0
- %nop14267 = alloca i1, i1 0
- %nop14268 = alloca i1, i1 0
- %nop14269 = alloca i1, i1 0
- %nop14270 = alloca i1, i1 0
- %nop14271 = alloca i1, i1 0
- %nop14272 = alloca i1, i1 0
- %nop14273 = alloca i1, i1 0
- %nop14274 = alloca i1, i1 0
- %nop14275 = alloca i1, i1 0
- %nop14276 = alloca i1, i1 0
- %nop14277 = alloca i1, i1 0
- %nop14278 = alloca i1, i1 0
- %nop14279 = alloca i1, i1 0
- %nop14280 = alloca i1, i1 0
- %nop14281 = alloca i1, i1 0
- %nop14282 = alloca i1, i1 0
- %nop14283 = alloca i1, i1 0
- %nop14284 = alloca i1, i1 0
- %nop14285 = alloca i1, i1 0
- %nop14286 = alloca i1, i1 0
- %nop14287 = alloca i1, i1 0
- %nop14288 = alloca i1, i1 0
- %nop14289 = alloca i1, i1 0
- %nop14290 = alloca i1, i1 0
- %nop14291 = alloca i1, i1 0
- %nop14292 = alloca i1, i1 0
- %nop14293 = alloca i1, i1 0
- %nop14294 = alloca i1, i1 0
- %nop14295 = alloca i1, i1 0
- %nop14296 = alloca i1, i1 0
- %nop14297 = alloca i1, i1 0
- %nop14298 = alloca i1, i1 0
- %nop14299 = alloca i1, i1 0
- %nop14300 = alloca i1, i1 0
- %nop14301 = alloca i1, i1 0
- %nop14302 = alloca i1, i1 0
- %nop14303 = alloca i1, i1 0
- %nop14304 = alloca i1, i1 0
- %nop14305 = alloca i1, i1 0
- %nop14306 = alloca i1, i1 0
- %nop14307 = alloca i1, i1 0
- %nop14308 = alloca i1, i1 0
- %nop14309 = alloca i1, i1 0
- %nop14310 = alloca i1, i1 0
- %nop14311 = alloca i1, i1 0
- %nop14312 = alloca i1, i1 0
- %nop14313 = alloca i1, i1 0
- %nop14314 = alloca i1, i1 0
- %nop14315 = alloca i1, i1 0
- %nop14316 = alloca i1, i1 0
- %nop14317 = alloca i1, i1 0
- %nop14318 = alloca i1, i1 0
- %nop14319 = alloca i1, i1 0
- %nop14320 = alloca i1, i1 0
- %nop14321 = alloca i1, i1 0
- %nop14322 = alloca i1, i1 0
- %nop14323 = alloca i1, i1 0
- %nop14324 = alloca i1, i1 0
- %nop14325 = alloca i1, i1 0
- %nop14326 = alloca i1, i1 0
- %nop14327 = alloca i1, i1 0
- %nop14328 = alloca i1, i1 0
- %nop14329 = alloca i1, i1 0
- %nop14330 = alloca i1, i1 0
- %nop14331 = alloca i1, i1 0
- %nop14332 = alloca i1, i1 0
- %nop14333 = alloca i1, i1 0
- %nop14334 = alloca i1, i1 0
- %nop14335 = alloca i1, i1 0
- %nop14336 = alloca i1, i1 0
- %nop14337 = alloca i1, i1 0
- %nop14338 = alloca i1, i1 0
- %nop14339 = alloca i1, i1 0
- %nop14340 = alloca i1, i1 0
- %nop14341 = alloca i1, i1 0
- %nop14342 = alloca i1, i1 0
- %nop14343 = alloca i1, i1 0
- %nop14344 = alloca i1, i1 0
- %nop14345 = alloca i1, i1 0
- %nop14346 = alloca i1, i1 0
- %nop14347 = alloca i1, i1 0
- %nop14348 = alloca i1, i1 0
- %nop14349 = alloca i1, i1 0
- %nop14350 = alloca i1, i1 0
- %nop14351 = alloca i1, i1 0
- %nop14352 = alloca i1, i1 0
- %nop14353 = alloca i1, i1 0
- %nop14354 = alloca i1, i1 0
- %nop14355 = alloca i1, i1 0
- %nop14356 = alloca i1, i1 0
- %nop14357 = alloca i1, i1 0
- %nop14358 = alloca i1, i1 0
- %nop14359 = alloca i1, i1 0
- %nop14360 = alloca i1, i1 0
- %nop14361 = alloca i1, i1 0
- %nop14362 = alloca i1, i1 0
- %nop14363 = alloca i1, i1 0
- %nop14364 = alloca i1, i1 0
- %nop14365 = alloca i1, i1 0
- %nop14366 = alloca i1, i1 0
- %nop14367 = alloca i1, i1 0
- %nop14368 = alloca i1, i1 0
- %nop14369 = alloca i1, i1 0
- %nop14370 = alloca i1, i1 0
- %nop14371 = alloca i1, i1 0
- %nop14372 = alloca i1, i1 0
- %nop14373 = alloca i1, i1 0
- %nop14374 = alloca i1, i1 0
- %nop14375 = alloca i1, i1 0
- %nop14376 = alloca i1, i1 0
- %nop14377 = alloca i1, i1 0
- %nop14378 = alloca i1, i1 0
- %nop14379 = alloca i1, i1 0
- %nop14380 = alloca i1, i1 0
- %nop14381 = alloca i1, i1 0
- %nop14382 = alloca i1, i1 0
- %nop14383 = alloca i1, i1 0
- %nop14384 = alloca i1, i1 0
- %nop14385 = alloca i1, i1 0
- %nop14386 = alloca i1, i1 0
- %nop14387 = alloca i1, i1 0
- %nop14388 = alloca i1, i1 0
- %nop14389 = alloca i1, i1 0
- %nop14390 = alloca i1, i1 0
- %nop14391 = alloca i1, i1 0
- %nop14392 = alloca i1, i1 0
- %nop14393 = alloca i1, i1 0
- %nop14394 = alloca i1, i1 0
- %nop14395 = alloca i1, i1 0
- %nop14396 = alloca i1, i1 0
- %nop14397 = alloca i1, i1 0
- %nop14398 = alloca i1, i1 0
- %nop14399 = alloca i1, i1 0
- %nop14400 = alloca i1, i1 0
- %nop14401 = alloca i1, i1 0
- %nop14402 = alloca i1, i1 0
- %nop14403 = alloca i1, i1 0
- %nop14404 = alloca i1, i1 0
- %nop14405 = alloca i1, i1 0
- %nop14406 = alloca i1, i1 0
- %nop14407 = alloca i1, i1 0
- %nop14408 = alloca i1, i1 0
- %nop14409 = alloca i1, i1 0
- %nop14410 = alloca i1, i1 0
- %nop14411 = alloca i1, i1 0
- %nop14412 = alloca i1, i1 0
- %nop14413 = alloca i1, i1 0
- %nop14414 = alloca i1, i1 0
- %nop14415 = alloca i1, i1 0
- %nop14416 = alloca i1, i1 0
- %nop14417 = alloca i1, i1 0
- %nop14418 = alloca i1, i1 0
- %nop14419 = alloca i1, i1 0
- %nop14420 = alloca i1, i1 0
- %nop14421 = alloca i1, i1 0
- %nop14422 = alloca i1, i1 0
- %nop14423 = alloca i1, i1 0
- %nop14424 = alloca i1, i1 0
- %nop14425 = alloca i1, i1 0
- %nop14426 = alloca i1, i1 0
- %nop14427 = alloca i1, i1 0
- %nop14428 = alloca i1, i1 0
- %nop14429 = alloca i1, i1 0
- %nop14430 = alloca i1, i1 0
- %nop14431 = alloca i1, i1 0
- %nop14432 = alloca i1, i1 0
- %nop14433 = alloca i1, i1 0
- %nop14434 = alloca i1, i1 0
- %nop14435 = alloca i1, i1 0
- %nop14436 = alloca i1, i1 0
- %nop14437 = alloca i1, i1 0
- %nop14438 = alloca i1, i1 0
- %nop14439 = alloca i1, i1 0
- %nop14440 = alloca i1, i1 0
- %nop14441 = alloca i1, i1 0
- %nop14442 = alloca i1, i1 0
- %nop14443 = alloca i1, i1 0
- %nop14444 = alloca i1, i1 0
- %nop14445 = alloca i1, i1 0
- %nop14446 = alloca i1, i1 0
- %nop14447 = alloca i1, i1 0
- %nop14448 = alloca i1, i1 0
- %nop14449 = alloca i1, i1 0
- %nop14450 = alloca i1, i1 0
- %nop14451 = alloca i1, i1 0
- %nop14452 = alloca i1, i1 0
- %nop14453 = alloca i1, i1 0
- %nop14454 = alloca i1, i1 0
- %nop14455 = alloca i1, i1 0
- %nop14456 = alloca i1, i1 0
- %nop14457 = alloca i1, i1 0
- %nop14458 = alloca i1, i1 0
- %nop14459 = alloca i1, i1 0
- %nop14460 = alloca i1, i1 0
- %nop14461 = alloca i1, i1 0
- %nop14462 = alloca i1, i1 0
- %nop14463 = alloca i1, i1 0
- %nop14464 = alloca i1, i1 0
- %nop14465 = alloca i1, i1 0
- %nop14466 = alloca i1, i1 0
- %nop14467 = alloca i1, i1 0
- %nop14468 = alloca i1, i1 0
- %nop14469 = alloca i1, i1 0
- %nop14470 = alloca i1, i1 0
- %nop14471 = alloca i1, i1 0
- %nop14472 = alloca i1, i1 0
- %nop14473 = alloca i1, i1 0
- %nop14474 = alloca i1, i1 0
- %nop14475 = alloca i1, i1 0
- %nop14476 = alloca i1, i1 0
- %nop14477 = alloca i1, i1 0
- %nop14478 = alloca i1, i1 0
- %nop14479 = alloca i1, i1 0
- %nop14480 = alloca i1, i1 0
- %nop14481 = alloca i1, i1 0
- %nop14482 = alloca i1, i1 0
- %nop14483 = alloca i1, i1 0
- %nop14484 = alloca i1, i1 0
- %nop14485 = alloca i1, i1 0
- %nop14486 = alloca i1, i1 0
- %nop14487 = alloca i1, i1 0
- %nop14488 = alloca i1, i1 0
- %nop14489 = alloca i1, i1 0
- %nop14490 = alloca i1, i1 0
- %nop14491 = alloca i1, i1 0
- %nop14492 = alloca i1, i1 0
- %nop14493 = alloca i1, i1 0
- %nop14494 = alloca i1, i1 0
- %nop14495 = alloca i1, i1 0
- %nop14496 = alloca i1, i1 0
- %nop14497 = alloca i1, i1 0
- %nop14498 = alloca i1, i1 0
- %nop14499 = alloca i1, i1 0
- %nop14500 = alloca i1, i1 0
- %nop14501 = alloca i1, i1 0
- %nop14502 = alloca i1, i1 0
- %nop14503 = alloca i1, i1 0
- %nop14504 = alloca i1, i1 0
- %nop14505 = alloca i1, i1 0
- %nop14506 = alloca i1, i1 0
- %nop14507 = alloca i1, i1 0
- %nop14508 = alloca i1, i1 0
- %nop14509 = alloca i1, i1 0
- %nop14510 = alloca i1, i1 0
- %nop14511 = alloca i1, i1 0
- %nop14512 = alloca i1, i1 0
- %nop14513 = alloca i1, i1 0
- %nop14514 = alloca i1, i1 0
- %nop14515 = alloca i1, i1 0
- %nop14516 = alloca i1, i1 0
- %nop14517 = alloca i1, i1 0
- %nop14518 = alloca i1, i1 0
- %nop14519 = alloca i1, i1 0
- %nop14520 = alloca i1, i1 0
- %nop14521 = alloca i1, i1 0
- %nop14522 = alloca i1, i1 0
- %nop14523 = alloca i1, i1 0
- %nop14524 = alloca i1, i1 0
- %nop14525 = alloca i1, i1 0
- %nop14526 = alloca i1, i1 0
- %nop14527 = alloca i1, i1 0
- %nop14528 = alloca i1, i1 0
- %nop14529 = alloca i1, i1 0
- %nop14530 = alloca i1, i1 0
- %nop14531 = alloca i1, i1 0
- %nop14532 = alloca i1, i1 0
- %nop14533 = alloca i1, i1 0
- %nop14534 = alloca i1, i1 0
- %nop14535 = alloca i1, i1 0
- %nop14536 = alloca i1, i1 0
- %nop14537 = alloca i1, i1 0
- %nop14538 = alloca i1, i1 0
- %nop14539 = alloca i1, i1 0
- %nop14540 = alloca i1, i1 0
- %nop14541 = alloca i1, i1 0
- %nop14542 = alloca i1, i1 0
- %nop14543 = alloca i1, i1 0
- %nop14544 = alloca i1, i1 0
- %nop14545 = alloca i1, i1 0
- %nop14546 = alloca i1, i1 0
- %nop14547 = alloca i1, i1 0
- %nop14548 = alloca i1, i1 0
- %nop14549 = alloca i1, i1 0
- %nop14550 = alloca i1, i1 0
- %nop14551 = alloca i1, i1 0
- %nop14552 = alloca i1, i1 0
- %nop14553 = alloca i1, i1 0
- %nop14554 = alloca i1, i1 0
- %nop14555 = alloca i1, i1 0
- %nop14556 = alloca i1, i1 0
- %nop14557 = alloca i1, i1 0
- %nop14558 = alloca i1, i1 0
- %nop14559 = alloca i1, i1 0
- %nop14560 = alloca i1, i1 0
- %nop14561 = alloca i1, i1 0
- %nop14562 = alloca i1, i1 0
- %nop14563 = alloca i1, i1 0
- %nop14564 = alloca i1, i1 0
- %nop14565 = alloca i1, i1 0
- %nop14566 = alloca i1, i1 0
- %nop14567 = alloca i1, i1 0
- %nop14568 = alloca i1, i1 0
- %nop14569 = alloca i1, i1 0
- %nop14570 = alloca i1, i1 0
- %nop14571 = alloca i1, i1 0
- %nop14572 = alloca i1, i1 0
- %nop14573 = alloca i1, i1 0
- %nop14574 = alloca i1, i1 0
- %nop14575 = alloca i1, i1 0
- %nop14576 = alloca i1, i1 0
- %nop14577 = alloca i1, i1 0
- %nop14578 = alloca i1, i1 0
- %nop14579 = alloca i1, i1 0
- %nop14580 = alloca i1, i1 0
- %nop14581 = alloca i1, i1 0
- %nop14582 = alloca i1, i1 0
- %nop14583 = alloca i1, i1 0
- %nop14584 = alloca i1, i1 0
- %nop14585 = alloca i1, i1 0
- %nop14586 = alloca i1, i1 0
- %nop14587 = alloca i1, i1 0
- %nop14588 = alloca i1, i1 0
- %nop14589 = alloca i1, i1 0
- %nop14590 = alloca i1, i1 0
- %nop14591 = alloca i1, i1 0
- %nop14592 = alloca i1, i1 0
- %nop14593 = alloca i1, i1 0
- %nop14594 = alloca i1, i1 0
- %nop14595 = alloca i1, i1 0
- %nop14596 = alloca i1, i1 0
- %nop14597 = alloca i1, i1 0
- %nop14598 = alloca i1, i1 0
- %nop14599 = alloca i1, i1 0
- %nop14600 = alloca i1, i1 0
- %nop14601 = alloca i1, i1 0
- %nop14602 = alloca i1, i1 0
- %nop14603 = alloca i1, i1 0
- %nop14604 = alloca i1, i1 0
- %nop14605 = alloca i1, i1 0
- %nop14606 = alloca i1, i1 0
- %nop14607 = alloca i1, i1 0
- %nop14608 = alloca i1, i1 0
- %nop14609 = alloca i1, i1 0
- %nop14610 = alloca i1, i1 0
- %nop14611 = alloca i1, i1 0
- %nop14612 = alloca i1, i1 0
- %nop14613 = alloca i1, i1 0
- %nop14614 = alloca i1, i1 0
- %nop14615 = alloca i1, i1 0
- %nop14616 = alloca i1, i1 0
- %nop14617 = alloca i1, i1 0
- %nop14618 = alloca i1, i1 0
- %nop14619 = alloca i1, i1 0
- %nop14620 = alloca i1, i1 0
- %nop14621 = alloca i1, i1 0
- %nop14622 = alloca i1, i1 0
- %nop14623 = alloca i1, i1 0
- %nop14624 = alloca i1, i1 0
- %nop14625 = alloca i1, i1 0
- %nop14626 = alloca i1, i1 0
- %nop14627 = alloca i1, i1 0
- %nop14628 = alloca i1, i1 0
- %nop14629 = alloca i1, i1 0
- %nop14630 = alloca i1, i1 0
- %nop14631 = alloca i1, i1 0
- %nop14632 = alloca i1, i1 0
- %nop14633 = alloca i1, i1 0
- %nop14634 = alloca i1, i1 0
- %nop14635 = alloca i1, i1 0
- %nop14636 = alloca i1, i1 0
- %nop14637 = alloca i1, i1 0
- %nop14638 = alloca i1, i1 0
- %nop14639 = alloca i1, i1 0
- %nop14640 = alloca i1, i1 0
- %nop14641 = alloca i1, i1 0
- %nop14642 = alloca i1, i1 0
- %nop14643 = alloca i1, i1 0
- %nop14644 = alloca i1, i1 0
- %nop14645 = alloca i1, i1 0
- %nop14646 = alloca i1, i1 0
- %nop14647 = alloca i1, i1 0
- %nop14648 = alloca i1, i1 0
- %nop14649 = alloca i1, i1 0
- %nop14650 = alloca i1, i1 0
- %nop14651 = alloca i1, i1 0
- %nop14652 = alloca i1, i1 0
- %nop14653 = alloca i1, i1 0
- %nop14654 = alloca i1, i1 0
- %nop14655 = alloca i1, i1 0
- %nop14656 = alloca i1, i1 0
- %nop14657 = alloca i1, i1 0
- %nop14658 = alloca i1, i1 0
- %nop14659 = alloca i1, i1 0
- %nop14660 = alloca i1, i1 0
- %nop14661 = alloca i1, i1 0
- %nop14662 = alloca i1, i1 0
- %nop14663 = alloca i1, i1 0
- %nop14664 = alloca i1, i1 0
- %nop14665 = alloca i1, i1 0
- %nop14666 = alloca i1, i1 0
- %nop14667 = alloca i1, i1 0
- %nop14668 = alloca i1, i1 0
- %nop14669 = alloca i1, i1 0
- %nop14670 = alloca i1, i1 0
- %nop14671 = alloca i1, i1 0
- %nop14672 = alloca i1, i1 0
- %nop14673 = alloca i1, i1 0
- %nop14674 = alloca i1, i1 0
- %nop14675 = alloca i1, i1 0
- %nop14676 = alloca i1, i1 0
- %nop14677 = alloca i1, i1 0
- %nop14678 = alloca i1, i1 0
- %nop14679 = alloca i1, i1 0
- %nop14680 = alloca i1, i1 0
- %nop14681 = alloca i1, i1 0
- %nop14682 = alloca i1, i1 0
- %nop14683 = alloca i1, i1 0
- %nop14684 = alloca i1, i1 0
- %nop14685 = alloca i1, i1 0
- %nop14686 = alloca i1, i1 0
- %nop14687 = alloca i1, i1 0
- %nop14688 = alloca i1, i1 0
- %nop14689 = alloca i1, i1 0
- %nop14690 = alloca i1, i1 0
- %nop14691 = alloca i1, i1 0
- %nop14692 = alloca i1, i1 0
- %nop14693 = alloca i1, i1 0
- %nop14694 = alloca i1, i1 0
- %nop14695 = alloca i1, i1 0
- %nop14696 = alloca i1, i1 0
- %nop14697 = alloca i1, i1 0
- %nop14698 = alloca i1, i1 0
- %nop14699 = alloca i1, i1 0
- %nop14700 = alloca i1, i1 0
- %nop14701 = alloca i1, i1 0
- %nop14702 = alloca i1, i1 0
- %nop14703 = alloca i1, i1 0
- %nop14704 = alloca i1, i1 0
- %nop14705 = alloca i1, i1 0
- %nop14706 = alloca i1, i1 0
- %nop14707 = alloca i1, i1 0
- %nop14708 = alloca i1, i1 0
- %nop14709 = alloca i1, i1 0
- %nop14710 = alloca i1, i1 0
- %nop14711 = alloca i1, i1 0
- %nop14712 = alloca i1, i1 0
- %nop14713 = alloca i1, i1 0
- %nop14714 = alloca i1, i1 0
- %nop14715 = alloca i1, i1 0
- %nop14716 = alloca i1, i1 0
- %nop14717 = alloca i1, i1 0
- %nop14718 = alloca i1, i1 0
- %nop14719 = alloca i1, i1 0
- %nop14720 = alloca i1, i1 0
- %nop14721 = alloca i1, i1 0
- %nop14722 = alloca i1, i1 0
- %nop14723 = alloca i1, i1 0
- %nop14724 = alloca i1, i1 0
- %nop14725 = alloca i1, i1 0
- %nop14726 = alloca i1, i1 0
- %nop14727 = alloca i1, i1 0
- %nop14728 = alloca i1, i1 0
- %nop14729 = alloca i1, i1 0
- %nop14730 = alloca i1, i1 0
- %nop14731 = alloca i1, i1 0
- %nop14732 = alloca i1, i1 0
- %nop14733 = alloca i1, i1 0
- %nop14734 = alloca i1, i1 0
- %nop14735 = alloca i1, i1 0
- %nop14736 = alloca i1, i1 0
- %nop14737 = alloca i1, i1 0
- %nop14738 = alloca i1, i1 0
- %nop14739 = alloca i1, i1 0
- %nop14740 = alloca i1, i1 0
- %nop14741 = alloca i1, i1 0
- %nop14742 = alloca i1, i1 0
- %nop14743 = alloca i1, i1 0
- %nop14744 = alloca i1, i1 0
- %nop14745 = alloca i1, i1 0
- %nop14746 = alloca i1, i1 0
- %nop14747 = alloca i1, i1 0
- %nop14748 = alloca i1, i1 0
- %nop14749 = alloca i1, i1 0
- %nop14750 = alloca i1, i1 0
- %nop14751 = alloca i1, i1 0
- %nop14752 = alloca i1, i1 0
- %nop14753 = alloca i1, i1 0
- %nop14754 = alloca i1, i1 0
- %nop14755 = alloca i1, i1 0
- %nop14756 = alloca i1, i1 0
- %nop14757 = alloca i1, i1 0
- %nop14758 = alloca i1, i1 0
- %nop14759 = alloca i1, i1 0
- %nop14760 = alloca i1, i1 0
- %nop14761 = alloca i1, i1 0
- %nop14762 = alloca i1, i1 0
- %nop14763 = alloca i1, i1 0
- %nop14764 = alloca i1, i1 0
- %nop14765 = alloca i1, i1 0
- %nop14766 = alloca i1, i1 0
- %nop14767 = alloca i1, i1 0
- %nop14768 = alloca i1, i1 0
- %nop14769 = alloca i1, i1 0
- %nop14770 = alloca i1, i1 0
- %nop14771 = alloca i1, i1 0
- %nop14772 = alloca i1, i1 0
- %nop14773 = alloca i1, i1 0
- %nop14774 = alloca i1, i1 0
- %nop14775 = alloca i1, i1 0
- %nop14776 = alloca i1, i1 0
- %nop14777 = alloca i1, i1 0
- %nop14778 = alloca i1, i1 0
- %nop14779 = alloca i1, i1 0
- %nop14780 = alloca i1, i1 0
- %nop14781 = alloca i1, i1 0
- %nop14782 = alloca i1, i1 0
- %nop14783 = alloca i1, i1 0
- %nop14784 = alloca i1, i1 0
- %nop14785 = alloca i1, i1 0
- %nop14786 = alloca i1, i1 0
- %nop14787 = alloca i1, i1 0
- %nop14788 = alloca i1, i1 0
- %nop14789 = alloca i1, i1 0
- %nop14790 = alloca i1, i1 0
- %nop14791 = alloca i1, i1 0
- %nop14792 = alloca i1, i1 0
- %nop14793 = alloca i1, i1 0
- %nop14794 = alloca i1, i1 0
- %nop14795 = alloca i1, i1 0
- %nop14796 = alloca i1, i1 0
- %nop14797 = alloca i1, i1 0
- %nop14798 = alloca i1, i1 0
- %nop14799 = alloca i1, i1 0
- %nop14800 = alloca i1, i1 0
- %nop14801 = alloca i1, i1 0
- %nop14802 = alloca i1, i1 0
- %nop14803 = alloca i1, i1 0
- %nop14804 = alloca i1, i1 0
- %nop14805 = alloca i1, i1 0
- %nop14806 = alloca i1, i1 0
- %nop14807 = alloca i1, i1 0
- %nop14808 = alloca i1, i1 0
- %nop14809 = alloca i1, i1 0
- %nop14810 = alloca i1, i1 0
- %nop14811 = alloca i1, i1 0
- %nop14812 = alloca i1, i1 0
- %nop14813 = alloca i1, i1 0
- %nop14814 = alloca i1, i1 0
- %nop14815 = alloca i1, i1 0
- %nop14816 = alloca i1, i1 0
- %nop14817 = alloca i1, i1 0
- %nop14818 = alloca i1, i1 0
- %nop14819 = alloca i1, i1 0
- %nop14820 = alloca i1, i1 0
- %nop14821 = alloca i1, i1 0
- %nop14822 = alloca i1, i1 0
- %nop14823 = alloca i1, i1 0
- %nop14824 = alloca i1, i1 0
- %nop14825 = alloca i1, i1 0
- %nop14826 = alloca i1, i1 0
- %nop14827 = alloca i1, i1 0
- %nop14828 = alloca i1, i1 0
- %nop14829 = alloca i1, i1 0
- %nop14830 = alloca i1, i1 0
- %nop14831 = alloca i1, i1 0
- %nop14832 = alloca i1, i1 0
- %nop14833 = alloca i1, i1 0
- %nop14834 = alloca i1, i1 0
- %nop14835 = alloca i1, i1 0
- %nop14836 = alloca i1, i1 0
- %nop14837 = alloca i1, i1 0
- %nop14838 = alloca i1, i1 0
- %nop14839 = alloca i1, i1 0
- %nop14840 = alloca i1, i1 0
- %nop14841 = alloca i1, i1 0
- %nop14842 = alloca i1, i1 0
- %nop14843 = alloca i1, i1 0
- %nop14844 = alloca i1, i1 0
- %nop14845 = alloca i1, i1 0
- %nop14846 = alloca i1, i1 0
- %nop14847 = alloca i1, i1 0
- %nop14848 = alloca i1, i1 0
- %nop14849 = alloca i1, i1 0
- %nop14850 = alloca i1, i1 0
- %nop14851 = alloca i1, i1 0
- %nop14852 = alloca i1, i1 0
- %nop14853 = alloca i1, i1 0
- %nop14854 = alloca i1, i1 0
- %nop14855 = alloca i1, i1 0
- %nop14856 = alloca i1, i1 0
- %nop14857 = alloca i1, i1 0
- %nop14858 = alloca i1, i1 0
- %nop14859 = alloca i1, i1 0
- %nop14860 = alloca i1, i1 0
- %nop14861 = alloca i1, i1 0
- %nop14862 = alloca i1, i1 0
- %nop14863 = alloca i1, i1 0
- %nop14864 = alloca i1, i1 0
- %nop14865 = alloca i1, i1 0
- %nop14866 = alloca i1, i1 0
- %nop14867 = alloca i1, i1 0
- %nop14868 = alloca i1, i1 0
- %nop14869 = alloca i1, i1 0
- %nop14870 = alloca i1, i1 0
- %nop14871 = alloca i1, i1 0
- %nop14872 = alloca i1, i1 0
- %nop14873 = alloca i1, i1 0
- %nop14874 = alloca i1, i1 0
- %nop14875 = alloca i1, i1 0
- %nop14876 = alloca i1, i1 0
- %nop14877 = alloca i1, i1 0
- %nop14878 = alloca i1, i1 0
- %nop14879 = alloca i1, i1 0
- %nop14880 = alloca i1, i1 0
- %nop14881 = alloca i1, i1 0
- %nop14882 = alloca i1, i1 0
- %nop14883 = alloca i1, i1 0
- %nop14884 = alloca i1, i1 0
- %nop14885 = alloca i1, i1 0
- %nop14886 = alloca i1, i1 0
- %nop14887 = alloca i1, i1 0
- %nop14888 = alloca i1, i1 0
- %nop14889 = alloca i1, i1 0
- %nop14890 = alloca i1, i1 0
- %nop14891 = alloca i1, i1 0
- %nop14892 = alloca i1, i1 0
- %nop14893 = alloca i1, i1 0
- %nop14894 = alloca i1, i1 0
- %nop14895 = alloca i1, i1 0
- %nop14896 = alloca i1, i1 0
- %nop14897 = alloca i1, i1 0
- %nop14898 = alloca i1, i1 0
- %nop14899 = alloca i1, i1 0
- %nop14900 = alloca i1, i1 0
- %nop14901 = alloca i1, i1 0
- %nop14902 = alloca i1, i1 0
- %nop14903 = alloca i1, i1 0
- %nop14904 = alloca i1, i1 0
- %nop14905 = alloca i1, i1 0
- %nop14906 = alloca i1, i1 0
- %nop14907 = alloca i1, i1 0
- %nop14908 = alloca i1, i1 0
- %nop14909 = alloca i1, i1 0
- %nop14910 = alloca i1, i1 0
- %nop14911 = alloca i1, i1 0
- %nop14912 = alloca i1, i1 0
- %nop14913 = alloca i1, i1 0
- %nop14914 = alloca i1, i1 0
- %nop14915 = alloca i1, i1 0
- %nop14916 = alloca i1, i1 0
- %nop14917 = alloca i1, i1 0
- %nop14918 = alloca i1, i1 0
- %nop14919 = alloca i1, i1 0
- %nop14920 = alloca i1, i1 0
- %nop14921 = alloca i1, i1 0
- %nop14922 = alloca i1, i1 0
- %nop14923 = alloca i1, i1 0
- %nop14924 = alloca i1, i1 0
- %nop14925 = alloca i1, i1 0
- %nop14926 = alloca i1, i1 0
- %nop14927 = alloca i1, i1 0
- %nop14928 = alloca i1, i1 0
- %nop14929 = alloca i1, i1 0
- %nop14930 = alloca i1, i1 0
- %nop14931 = alloca i1, i1 0
- %nop14932 = alloca i1, i1 0
- %nop14933 = alloca i1, i1 0
- %nop14934 = alloca i1, i1 0
- %nop14935 = alloca i1, i1 0
- %nop14936 = alloca i1, i1 0
- %nop14937 = alloca i1, i1 0
- %nop14938 = alloca i1, i1 0
- %nop14939 = alloca i1, i1 0
- %nop14940 = alloca i1, i1 0
- %nop14941 = alloca i1, i1 0
- %nop14942 = alloca i1, i1 0
- %nop14943 = alloca i1, i1 0
- %nop14944 = alloca i1, i1 0
- %nop14945 = alloca i1, i1 0
- %nop14946 = alloca i1, i1 0
- %nop14947 = alloca i1, i1 0
- %nop14948 = alloca i1, i1 0
- %nop14949 = alloca i1, i1 0
- %nop14950 = alloca i1, i1 0
- %nop14951 = alloca i1, i1 0
- %nop14952 = alloca i1, i1 0
- %nop14953 = alloca i1, i1 0
- %nop14954 = alloca i1, i1 0
- %nop14955 = alloca i1, i1 0
- %nop14956 = alloca i1, i1 0
- %nop14957 = alloca i1, i1 0
- %nop14958 = alloca i1, i1 0
- %nop14959 = alloca i1, i1 0
- %nop14960 = alloca i1, i1 0
- %nop14961 = alloca i1, i1 0
- %nop14962 = alloca i1, i1 0
- %nop14963 = alloca i1, i1 0
- %nop14964 = alloca i1, i1 0
- %nop14965 = alloca i1, i1 0
- %nop14966 = alloca i1, i1 0
- %nop14967 = alloca i1, i1 0
- %nop14968 = alloca i1, i1 0
- %nop14969 = alloca i1, i1 0
- %nop14970 = alloca i1, i1 0
- %nop14971 = alloca i1, i1 0
- %nop14972 = alloca i1, i1 0
- %nop14973 = alloca i1, i1 0
- %nop14974 = alloca i1, i1 0
- %nop14975 = alloca i1, i1 0
- %nop14976 = alloca i1, i1 0
- %nop14977 = alloca i1, i1 0
- %nop14978 = alloca i1, i1 0
- %nop14979 = alloca i1, i1 0
- %nop14980 = alloca i1, i1 0
- %nop14981 = alloca i1, i1 0
- %nop14982 = alloca i1, i1 0
- %nop14983 = alloca i1, i1 0
- %nop14984 = alloca i1, i1 0
- %nop14985 = alloca i1, i1 0
- %nop14986 = alloca i1, i1 0
- %nop14987 = alloca i1, i1 0
- %nop14988 = alloca i1, i1 0
- %nop14989 = alloca i1, i1 0
- %nop14990 = alloca i1, i1 0
- %nop14991 = alloca i1, i1 0
- %nop14992 = alloca i1, i1 0
- %nop14993 = alloca i1, i1 0
- %nop14994 = alloca i1, i1 0
- %nop14995 = alloca i1, i1 0
- %nop14996 = alloca i1, i1 0
- %nop14997 = alloca i1, i1 0
- %nop14998 = alloca i1, i1 0
- %nop14999 = alloca i1, i1 0
- %nop15000 = alloca i1, i1 0
- %nop15001 = alloca i1, i1 0
- %nop15002 = alloca i1, i1 0
- %nop15003 = alloca i1, i1 0
- %nop15004 = alloca i1, i1 0
- %nop15005 = alloca i1, i1 0
- %nop15006 = alloca i1, i1 0
- %nop15007 = alloca i1, i1 0
- %nop15008 = alloca i1, i1 0
- %nop15009 = alloca i1, i1 0
- %nop15010 = alloca i1, i1 0
- %nop15011 = alloca i1, i1 0
- %nop15012 = alloca i1, i1 0
- %nop15013 = alloca i1, i1 0
- %nop15014 = alloca i1, i1 0
- %nop15015 = alloca i1, i1 0
- %nop15016 = alloca i1, i1 0
- %nop15017 = alloca i1, i1 0
- %nop15018 = alloca i1, i1 0
- %nop15019 = alloca i1, i1 0
- %nop15020 = alloca i1, i1 0
- %nop15021 = alloca i1, i1 0
- %nop15022 = alloca i1, i1 0
- %nop15023 = alloca i1, i1 0
- %nop15024 = alloca i1, i1 0
- %nop15025 = alloca i1, i1 0
- %nop15026 = alloca i1, i1 0
- %nop15027 = alloca i1, i1 0
- %nop15028 = alloca i1, i1 0
- %nop15029 = alloca i1, i1 0
- %nop15030 = alloca i1, i1 0
- %nop15031 = alloca i1, i1 0
- %nop15032 = alloca i1, i1 0
- %nop15033 = alloca i1, i1 0
- %nop15034 = alloca i1, i1 0
- %nop15035 = alloca i1, i1 0
- %nop15036 = alloca i1, i1 0
- %nop15037 = alloca i1, i1 0
- %nop15038 = alloca i1, i1 0
- %nop15039 = alloca i1, i1 0
- %nop15040 = alloca i1, i1 0
- %nop15041 = alloca i1, i1 0
- %nop15042 = alloca i1, i1 0
- %nop15043 = alloca i1, i1 0
- %nop15044 = alloca i1, i1 0
- %nop15045 = alloca i1, i1 0
- %nop15046 = alloca i1, i1 0
- %nop15047 = alloca i1, i1 0
- %nop15048 = alloca i1, i1 0
- %nop15049 = alloca i1, i1 0
- %nop15050 = alloca i1, i1 0
- %nop15051 = alloca i1, i1 0
- %nop15052 = alloca i1, i1 0
- %nop15053 = alloca i1, i1 0
- %nop15054 = alloca i1, i1 0
- %nop15055 = alloca i1, i1 0
- %nop15056 = alloca i1, i1 0
- %nop15057 = alloca i1, i1 0
- %nop15058 = alloca i1, i1 0
- %nop15059 = alloca i1, i1 0
- %nop15060 = alloca i1, i1 0
- %nop15061 = alloca i1, i1 0
- %nop15062 = alloca i1, i1 0
- %nop15063 = alloca i1, i1 0
- %nop15064 = alloca i1, i1 0
- %nop15065 = alloca i1, i1 0
- %nop15066 = alloca i1, i1 0
- %nop15067 = alloca i1, i1 0
- %nop15068 = alloca i1, i1 0
- %nop15069 = alloca i1, i1 0
- %nop15070 = alloca i1, i1 0
- %nop15071 = alloca i1, i1 0
- %nop15072 = alloca i1, i1 0
- %nop15073 = alloca i1, i1 0
- %nop15074 = alloca i1, i1 0
- %nop15075 = alloca i1, i1 0
- %nop15076 = alloca i1, i1 0
- %nop15077 = alloca i1, i1 0
- %nop15078 = alloca i1, i1 0
- %nop15079 = alloca i1, i1 0
- %nop15080 = alloca i1, i1 0
- %nop15081 = alloca i1, i1 0
- %nop15082 = alloca i1, i1 0
- %nop15083 = alloca i1, i1 0
- %nop15084 = alloca i1, i1 0
- %nop15085 = alloca i1, i1 0
- %nop15086 = alloca i1, i1 0
- %nop15087 = alloca i1, i1 0
- %nop15088 = alloca i1, i1 0
- %nop15089 = alloca i1, i1 0
- %nop15090 = alloca i1, i1 0
- %nop15091 = alloca i1, i1 0
- %nop15092 = alloca i1, i1 0
- %nop15093 = alloca i1, i1 0
- %nop15094 = alloca i1, i1 0
- %nop15095 = alloca i1, i1 0
- %nop15096 = alloca i1, i1 0
- %nop15097 = alloca i1, i1 0
- %nop15098 = alloca i1, i1 0
- %nop15099 = alloca i1, i1 0
- %nop15100 = alloca i1, i1 0
- %nop15101 = alloca i1, i1 0
- %nop15102 = alloca i1, i1 0
- %nop15103 = alloca i1, i1 0
- %nop15104 = alloca i1, i1 0
- %nop15105 = alloca i1, i1 0
- %nop15106 = alloca i1, i1 0
- %nop15107 = alloca i1, i1 0
- %nop15108 = alloca i1, i1 0
- %nop15109 = alloca i1, i1 0
- %nop15110 = alloca i1, i1 0
- %nop15111 = alloca i1, i1 0
- %nop15112 = alloca i1, i1 0
- %nop15113 = alloca i1, i1 0
- %nop15114 = alloca i1, i1 0
- %nop15115 = alloca i1, i1 0
- %nop15116 = alloca i1, i1 0
- %nop15117 = alloca i1, i1 0
- %nop15118 = alloca i1, i1 0
- %nop15119 = alloca i1, i1 0
- %nop15120 = alloca i1, i1 0
- %nop15121 = alloca i1, i1 0
- %nop15122 = alloca i1, i1 0
- %nop15123 = alloca i1, i1 0
- %nop15124 = alloca i1, i1 0
- %nop15125 = alloca i1, i1 0
- %nop15126 = alloca i1, i1 0
- %nop15127 = alloca i1, i1 0
- %nop15128 = alloca i1, i1 0
- %nop15129 = alloca i1, i1 0
- %nop15130 = alloca i1, i1 0
- %nop15131 = alloca i1, i1 0
- %nop15132 = alloca i1, i1 0
- %nop15133 = alloca i1, i1 0
- %nop15134 = alloca i1, i1 0
- %nop15135 = alloca i1, i1 0
- %nop15136 = alloca i1, i1 0
- %nop15137 = alloca i1, i1 0
- %nop15138 = alloca i1, i1 0
- %nop15139 = alloca i1, i1 0
- %nop15140 = alloca i1, i1 0
- %nop15141 = alloca i1, i1 0
- %nop15142 = alloca i1, i1 0
- %nop15143 = alloca i1, i1 0
- %nop15144 = alloca i1, i1 0
- %nop15145 = alloca i1, i1 0
- %nop15146 = alloca i1, i1 0
- %nop15147 = alloca i1, i1 0
- %nop15148 = alloca i1, i1 0
- %nop15149 = alloca i1, i1 0
- %nop15150 = alloca i1, i1 0
- %nop15151 = alloca i1, i1 0
- %nop15152 = alloca i1, i1 0
- %nop15153 = alloca i1, i1 0
- %nop15154 = alloca i1, i1 0
- %nop15155 = alloca i1, i1 0
- %nop15156 = alloca i1, i1 0
- %nop15157 = alloca i1, i1 0
- %nop15158 = alloca i1, i1 0
- %nop15159 = alloca i1, i1 0
- %nop15160 = alloca i1, i1 0
- %nop15161 = alloca i1, i1 0
- %nop15162 = alloca i1, i1 0
- %nop15163 = alloca i1, i1 0
- %nop15164 = alloca i1, i1 0
- %nop15165 = alloca i1, i1 0
- %nop15166 = alloca i1, i1 0
- %nop15167 = alloca i1, i1 0
- %nop15168 = alloca i1, i1 0
- %nop15169 = alloca i1, i1 0
- %nop15170 = alloca i1, i1 0
- %nop15171 = alloca i1, i1 0
- %nop15172 = alloca i1, i1 0
- %nop15173 = alloca i1, i1 0
- %nop15174 = alloca i1, i1 0
- %nop15175 = alloca i1, i1 0
- %nop15176 = alloca i1, i1 0
- %nop15177 = alloca i1, i1 0
- %nop15178 = alloca i1, i1 0
- %nop15179 = alloca i1, i1 0
- %nop15180 = alloca i1, i1 0
- %nop15181 = alloca i1, i1 0
- %nop15182 = alloca i1, i1 0
- %nop15183 = alloca i1, i1 0
- %nop15184 = alloca i1, i1 0
- %nop15185 = alloca i1, i1 0
- %nop15186 = alloca i1, i1 0
- %nop15187 = alloca i1, i1 0
- %nop15188 = alloca i1, i1 0
- %nop15189 = alloca i1, i1 0
- %nop15190 = alloca i1, i1 0
- %nop15191 = alloca i1, i1 0
- %nop15192 = alloca i1, i1 0
- %nop15193 = alloca i1, i1 0
- %nop15194 = alloca i1, i1 0
- %nop15195 = alloca i1, i1 0
- %nop15196 = alloca i1, i1 0
- %nop15197 = alloca i1, i1 0
- %nop15198 = alloca i1, i1 0
- %nop15199 = alloca i1, i1 0
- %nop15200 = alloca i1, i1 0
- %nop15201 = alloca i1, i1 0
- %nop15202 = alloca i1, i1 0
- %nop15203 = alloca i1, i1 0
- %nop15204 = alloca i1, i1 0
- %nop15205 = alloca i1, i1 0
- %nop15206 = alloca i1, i1 0
- %nop15207 = alloca i1, i1 0
- %nop15208 = alloca i1, i1 0
- %nop15209 = alloca i1, i1 0
- %nop15210 = alloca i1, i1 0
- %nop15211 = alloca i1, i1 0
- %nop15212 = alloca i1, i1 0
- %nop15213 = alloca i1, i1 0
- %nop15214 = alloca i1, i1 0
- %nop15215 = alloca i1, i1 0
- %nop15216 = alloca i1, i1 0
- %nop15217 = alloca i1, i1 0
- %nop15218 = alloca i1, i1 0
- %nop15219 = alloca i1, i1 0
- %nop15220 = alloca i1, i1 0
- %nop15221 = alloca i1, i1 0
- %nop15222 = alloca i1, i1 0
- %nop15223 = alloca i1, i1 0
- %nop15224 = alloca i1, i1 0
- %nop15225 = alloca i1, i1 0
- %nop15226 = alloca i1, i1 0
- %nop15227 = alloca i1, i1 0
- %nop15228 = alloca i1, i1 0
- %nop15229 = alloca i1, i1 0
- %nop15230 = alloca i1, i1 0
- %nop15231 = alloca i1, i1 0
- %nop15232 = alloca i1, i1 0
- %nop15233 = alloca i1, i1 0
- %nop15234 = alloca i1, i1 0
- %nop15235 = alloca i1, i1 0
- %nop15236 = alloca i1, i1 0
- %nop15237 = alloca i1, i1 0
- %nop15238 = alloca i1, i1 0
- %nop15239 = alloca i1, i1 0
- %nop15240 = alloca i1, i1 0
- %nop15241 = alloca i1, i1 0
- %nop15242 = alloca i1, i1 0
- %nop15243 = alloca i1, i1 0
- %nop15244 = alloca i1, i1 0
- %nop15245 = alloca i1, i1 0
- %nop15246 = alloca i1, i1 0
- %nop15247 = alloca i1, i1 0
- %nop15248 = alloca i1, i1 0
- %nop15249 = alloca i1, i1 0
- %nop15250 = alloca i1, i1 0
- %nop15251 = alloca i1, i1 0
- %nop15252 = alloca i1, i1 0
- %nop15253 = alloca i1, i1 0
- %nop15254 = alloca i1, i1 0
- %nop15255 = alloca i1, i1 0
- %nop15256 = alloca i1, i1 0
- %nop15257 = alloca i1, i1 0
- %nop15258 = alloca i1, i1 0
- %nop15259 = alloca i1, i1 0
- %nop15260 = alloca i1, i1 0
- %nop15261 = alloca i1, i1 0
- %nop15262 = alloca i1, i1 0
- %nop15263 = alloca i1, i1 0
- %nop15264 = alloca i1, i1 0
- %nop15265 = alloca i1, i1 0
- %nop15266 = alloca i1, i1 0
- %nop15267 = alloca i1, i1 0
- %nop15268 = alloca i1, i1 0
- %nop15269 = alloca i1, i1 0
- %nop15270 = alloca i1, i1 0
- %nop15271 = alloca i1, i1 0
- %nop15272 = alloca i1, i1 0
- %nop15273 = alloca i1, i1 0
- %nop15274 = alloca i1, i1 0
- %nop15275 = alloca i1, i1 0
- %nop15276 = alloca i1, i1 0
- %nop15277 = alloca i1, i1 0
- %nop15278 = alloca i1, i1 0
- %nop15279 = alloca i1, i1 0
- %nop15280 = alloca i1, i1 0
- %nop15281 = alloca i1, i1 0
- %nop15282 = alloca i1, i1 0
- %nop15283 = alloca i1, i1 0
- %nop15284 = alloca i1, i1 0
- %nop15285 = alloca i1, i1 0
- %nop15286 = alloca i1, i1 0
- %nop15287 = alloca i1, i1 0
- %nop15288 = alloca i1, i1 0
- %nop15289 = alloca i1, i1 0
- %nop15290 = alloca i1, i1 0
- %nop15291 = alloca i1, i1 0
- %nop15292 = alloca i1, i1 0
- %nop15293 = alloca i1, i1 0
- %nop15294 = alloca i1, i1 0
- %nop15295 = alloca i1, i1 0
- %nop15296 = alloca i1, i1 0
- %nop15297 = alloca i1, i1 0
- %nop15298 = alloca i1, i1 0
- %nop15299 = alloca i1, i1 0
- %nop15300 = alloca i1, i1 0
- %nop15301 = alloca i1, i1 0
- %nop15302 = alloca i1, i1 0
- %nop15303 = alloca i1, i1 0
- %nop15304 = alloca i1, i1 0
- %nop15305 = alloca i1, i1 0
- %nop15306 = alloca i1, i1 0
- %nop15307 = alloca i1, i1 0
- %nop15308 = alloca i1, i1 0
- %nop15309 = alloca i1, i1 0
- %nop15310 = alloca i1, i1 0
- %nop15311 = alloca i1, i1 0
- %nop15312 = alloca i1, i1 0
- %nop15313 = alloca i1, i1 0
- %nop15314 = alloca i1, i1 0
- %nop15315 = alloca i1, i1 0
- %nop15316 = alloca i1, i1 0
- %nop15317 = alloca i1, i1 0
- %nop15318 = alloca i1, i1 0
- %nop15319 = alloca i1, i1 0
- %nop15320 = alloca i1, i1 0
- %nop15321 = alloca i1, i1 0
- %nop15322 = alloca i1, i1 0
- %nop15323 = alloca i1, i1 0
- %nop15324 = alloca i1, i1 0
- %nop15325 = alloca i1, i1 0
- %nop15326 = alloca i1, i1 0
- %nop15327 = alloca i1, i1 0
- %nop15328 = alloca i1, i1 0
- %nop15329 = alloca i1, i1 0
- %nop15330 = alloca i1, i1 0
- %nop15331 = alloca i1, i1 0
- %nop15332 = alloca i1, i1 0
- %nop15333 = alloca i1, i1 0
- %nop15334 = alloca i1, i1 0
- %nop15335 = alloca i1, i1 0
- %nop15336 = alloca i1, i1 0
- %nop15337 = alloca i1, i1 0
- %nop15338 = alloca i1, i1 0
- %nop15339 = alloca i1, i1 0
- %nop15340 = alloca i1, i1 0
- %nop15341 = alloca i1, i1 0
- %nop15342 = alloca i1, i1 0
- %nop15343 = alloca i1, i1 0
- %nop15344 = alloca i1, i1 0
- %nop15345 = alloca i1, i1 0
- %nop15346 = alloca i1, i1 0
- %nop15347 = alloca i1, i1 0
- %nop15348 = alloca i1, i1 0
- %nop15349 = alloca i1, i1 0
- %nop15350 = alloca i1, i1 0
- %nop15351 = alloca i1, i1 0
- %nop15352 = alloca i1, i1 0
- %nop15353 = alloca i1, i1 0
- %nop15354 = alloca i1, i1 0
- %nop15355 = alloca i1, i1 0
- %nop15356 = alloca i1, i1 0
- %nop15357 = alloca i1, i1 0
- %nop15358 = alloca i1, i1 0
- %nop15359 = alloca i1, i1 0
- %nop15360 = alloca i1, i1 0
- %nop15361 = alloca i1, i1 0
- %nop15362 = alloca i1, i1 0
- %nop15363 = alloca i1, i1 0
- %nop15364 = alloca i1, i1 0
- %nop15365 = alloca i1, i1 0
- %nop15366 = alloca i1, i1 0
- %nop15367 = alloca i1, i1 0
- %nop15368 = alloca i1, i1 0
- %nop15369 = alloca i1, i1 0
- %nop15370 = alloca i1, i1 0
- %nop15371 = alloca i1, i1 0
- %nop15372 = alloca i1, i1 0
- %nop15373 = alloca i1, i1 0
- %nop15374 = alloca i1, i1 0
- %nop15375 = alloca i1, i1 0
- %nop15376 = alloca i1, i1 0
- %nop15377 = alloca i1, i1 0
- %nop15378 = alloca i1, i1 0
- %nop15379 = alloca i1, i1 0
- %nop15380 = alloca i1, i1 0
- %nop15381 = alloca i1, i1 0
- %nop15382 = alloca i1, i1 0
- %nop15383 = alloca i1, i1 0
- %nop15384 = alloca i1, i1 0
- %nop15385 = alloca i1, i1 0
- %nop15386 = alloca i1, i1 0
- %nop15387 = alloca i1, i1 0
- %nop15388 = alloca i1, i1 0
- %nop15389 = alloca i1, i1 0
- %nop15390 = alloca i1, i1 0
- %nop15391 = alloca i1, i1 0
- %nop15392 = alloca i1, i1 0
- %nop15393 = alloca i1, i1 0
- %nop15394 = alloca i1, i1 0
- %nop15395 = alloca i1, i1 0
- %nop15396 = alloca i1, i1 0
- %nop15397 = alloca i1, i1 0
- %nop15398 = alloca i1, i1 0
- %nop15399 = alloca i1, i1 0
- %nop15400 = alloca i1, i1 0
- %nop15401 = alloca i1, i1 0
- %nop15402 = alloca i1, i1 0
- %nop15403 = alloca i1, i1 0
- %nop15404 = alloca i1, i1 0
- %nop15405 = alloca i1, i1 0
- %nop15406 = alloca i1, i1 0
- %nop15407 = alloca i1, i1 0
- %nop15408 = alloca i1, i1 0
- %nop15409 = alloca i1, i1 0
- %nop15410 = alloca i1, i1 0
- %nop15411 = alloca i1, i1 0
- %nop15412 = alloca i1, i1 0
- %nop15413 = alloca i1, i1 0
- %nop15414 = alloca i1, i1 0
- %nop15415 = alloca i1, i1 0
- %nop15416 = alloca i1, i1 0
- %nop15417 = alloca i1, i1 0
- %nop15418 = alloca i1, i1 0
- %nop15419 = alloca i1, i1 0
- %nop15420 = alloca i1, i1 0
- %nop15421 = alloca i1, i1 0
- %nop15422 = alloca i1, i1 0
- %nop15423 = alloca i1, i1 0
- %nop15424 = alloca i1, i1 0
- %nop15425 = alloca i1, i1 0
- %nop15426 = alloca i1, i1 0
- %nop15427 = alloca i1, i1 0
- %nop15428 = alloca i1, i1 0
- %nop15429 = alloca i1, i1 0
- %nop15430 = alloca i1, i1 0
- %nop15431 = alloca i1, i1 0
- %nop15432 = alloca i1, i1 0
- %nop15433 = alloca i1, i1 0
- %nop15434 = alloca i1, i1 0
- %nop15435 = alloca i1, i1 0
- %nop15436 = alloca i1, i1 0
- %nop15437 = alloca i1, i1 0
- %nop15438 = alloca i1, i1 0
- %nop15439 = alloca i1, i1 0
- %nop15440 = alloca i1, i1 0
- %nop15441 = alloca i1, i1 0
- %nop15442 = alloca i1, i1 0
- %nop15443 = alloca i1, i1 0
- %nop15444 = alloca i1, i1 0
- %nop15445 = alloca i1, i1 0
- %nop15446 = alloca i1, i1 0
- %nop15447 = alloca i1, i1 0
- %nop15448 = alloca i1, i1 0
- %nop15449 = alloca i1, i1 0
- %nop15450 = alloca i1, i1 0
- %nop15451 = alloca i1, i1 0
- %nop15452 = alloca i1, i1 0
- %nop15453 = alloca i1, i1 0
- %nop15454 = alloca i1, i1 0
- %nop15455 = alloca i1, i1 0
- %nop15456 = alloca i1, i1 0
- %nop15457 = alloca i1, i1 0
- %nop15458 = alloca i1, i1 0
- %nop15459 = alloca i1, i1 0
- %nop15460 = alloca i1, i1 0
- %nop15461 = alloca i1, i1 0
- %nop15462 = alloca i1, i1 0
- %nop15463 = alloca i1, i1 0
- %nop15464 = alloca i1, i1 0
- %nop15465 = alloca i1, i1 0
- %nop15466 = alloca i1, i1 0
- %nop15467 = alloca i1, i1 0
- %nop15468 = alloca i1, i1 0
- %nop15469 = alloca i1, i1 0
- %nop15470 = alloca i1, i1 0
- %nop15471 = alloca i1, i1 0
- %nop15472 = alloca i1, i1 0
- %nop15473 = alloca i1, i1 0
- %nop15474 = alloca i1, i1 0
- %nop15475 = alloca i1, i1 0
- %nop15476 = alloca i1, i1 0
- %nop15477 = alloca i1, i1 0
- %nop15478 = alloca i1, i1 0
- %nop15479 = alloca i1, i1 0
- %nop15480 = alloca i1, i1 0
- %nop15481 = alloca i1, i1 0
- %nop15482 = alloca i1, i1 0
- %nop15483 = alloca i1, i1 0
- %nop15484 = alloca i1, i1 0
- %nop15485 = alloca i1, i1 0
- %nop15486 = alloca i1, i1 0
- %nop15487 = alloca i1, i1 0
- %nop15488 = alloca i1, i1 0
- %nop15489 = alloca i1, i1 0
- %nop15490 = alloca i1, i1 0
- %nop15491 = alloca i1, i1 0
- %nop15492 = alloca i1, i1 0
- %nop15493 = alloca i1, i1 0
- %nop15494 = alloca i1, i1 0
- %nop15495 = alloca i1, i1 0
- %nop15496 = alloca i1, i1 0
- %nop15497 = alloca i1, i1 0
- %nop15498 = alloca i1, i1 0
- %nop15499 = alloca i1, i1 0
- %nop15500 = alloca i1, i1 0
- %nop15501 = alloca i1, i1 0
- %nop15502 = alloca i1, i1 0
- %nop15503 = alloca i1, i1 0
- %nop15504 = alloca i1, i1 0
- %nop15505 = alloca i1, i1 0
- %nop15506 = alloca i1, i1 0
- %nop15507 = alloca i1, i1 0
- %nop15508 = alloca i1, i1 0
- %nop15509 = alloca i1, i1 0
- %nop15510 = alloca i1, i1 0
- %nop15511 = alloca i1, i1 0
- %nop15512 = alloca i1, i1 0
- %nop15513 = alloca i1, i1 0
- %nop15514 = alloca i1, i1 0
- %nop15515 = alloca i1, i1 0
- %nop15516 = alloca i1, i1 0
- %nop15517 = alloca i1, i1 0
- %nop15518 = alloca i1, i1 0
- %nop15519 = alloca i1, i1 0
- %nop15520 = alloca i1, i1 0
- %nop15521 = alloca i1, i1 0
- %nop15522 = alloca i1, i1 0
- %nop15523 = alloca i1, i1 0
- %nop15524 = alloca i1, i1 0
- %nop15525 = alloca i1, i1 0
- %nop15526 = alloca i1, i1 0
- %nop15527 = alloca i1, i1 0
- %nop15528 = alloca i1, i1 0
- %nop15529 = alloca i1, i1 0
- %nop15530 = alloca i1, i1 0
- %nop15531 = alloca i1, i1 0
- %nop15532 = alloca i1, i1 0
- %nop15533 = alloca i1, i1 0
- %nop15534 = alloca i1, i1 0
- %nop15535 = alloca i1, i1 0
- %nop15536 = alloca i1, i1 0
- %nop15537 = alloca i1, i1 0
- %nop15538 = alloca i1, i1 0
- %nop15539 = alloca i1, i1 0
- %nop15540 = alloca i1, i1 0
- %nop15541 = alloca i1, i1 0
- %nop15542 = alloca i1, i1 0
- %nop15543 = alloca i1, i1 0
- %nop15544 = alloca i1, i1 0
- %nop15545 = alloca i1, i1 0
- %nop15546 = alloca i1, i1 0
- %nop15547 = alloca i1, i1 0
- %nop15548 = alloca i1, i1 0
- %nop15549 = alloca i1, i1 0
- %nop15550 = alloca i1, i1 0
- %nop15551 = alloca i1, i1 0
- %nop15552 = alloca i1, i1 0
- %nop15553 = alloca i1, i1 0
- %nop15554 = alloca i1, i1 0
- %nop15555 = alloca i1, i1 0
- %nop15556 = alloca i1, i1 0
- %nop15557 = alloca i1, i1 0
- %nop15558 = alloca i1, i1 0
- %nop15559 = alloca i1, i1 0
- %nop15560 = alloca i1, i1 0
- %nop15561 = alloca i1, i1 0
- %nop15562 = alloca i1, i1 0
- %nop15563 = alloca i1, i1 0
- %nop15564 = alloca i1, i1 0
- %nop15565 = alloca i1, i1 0
- %nop15566 = alloca i1, i1 0
- %nop15567 = alloca i1, i1 0
- %nop15568 = alloca i1, i1 0
- %nop15569 = alloca i1, i1 0
- %nop15570 = alloca i1, i1 0
- %nop15571 = alloca i1, i1 0
- %nop15572 = alloca i1, i1 0
- %nop15573 = alloca i1, i1 0
- %nop15574 = alloca i1, i1 0
- %nop15575 = alloca i1, i1 0
- %nop15576 = alloca i1, i1 0
- %nop15577 = alloca i1, i1 0
- %nop15578 = alloca i1, i1 0
- %nop15579 = alloca i1, i1 0
- %nop15580 = alloca i1, i1 0
- %nop15581 = alloca i1, i1 0
- %nop15582 = alloca i1, i1 0
- %nop15583 = alloca i1, i1 0
- %nop15584 = alloca i1, i1 0
- %nop15585 = alloca i1, i1 0
- %nop15586 = alloca i1, i1 0
- %nop15587 = alloca i1, i1 0
- %nop15588 = alloca i1, i1 0
- %nop15589 = alloca i1, i1 0
- %nop15590 = alloca i1, i1 0
- %nop15591 = alloca i1, i1 0
- %nop15592 = alloca i1, i1 0
- %nop15593 = alloca i1, i1 0
- %nop15594 = alloca i1, i1 0
- %nop15595 = alloca i1, i1 0
- %nop15596 = alloca i1, i1 0
- %nop15597 = alloca i1, i1 0
- %nop15598 = alloca i1, i1 0
- %nop15599 = alloca i1, i1 0
- %nop15600 = alloca i1, i1 0
- %nop15601 = alloca i1, i1 0
- %nop15602 = alloca i1, i1 0
- %nop15603 = alloca i1, i1 0
- %nop15604 = alloca i1, i1 0
- %nop15605 = alloca i1, i1 0
- %nop15606 = alloca i1, i1 0
- %nop15607 = alloca i1, i1 0
- %nop15608 = alloca i1, i1 0
- %nop15609 = alloca i1, i1 0
- %nop15610 = alloca i1, i1 0
- %nop15611 = alloca i1, i1 0
- %nop15612 = alloca i1, i1 0
- %nop15613 = alloca i1, i1 0
- %nop15614 = alloca i1, i1 0
- %nop15615 = alloca i1, i1 0
- %nop15616 = alloca i1, i1 0
- %nop15617 = alloca i1, i1 0
- %nop15618 = alloca i1, i1 0
- %nop15619 = alloca i1, i1 0
- %nop15620 = alloca i1, i1 0
- %nop15621 = alloca i1, i1 0
- %nop15622 = alloca i1, i1 0
- %nop15623 = alloca i1, i1 0
- %nop15624 = alloca i1, i1 0
- %nop15625 = alloca i1, i1 0
- %nop15626 = alloca i1, i1 0
- %nop15627 = alloca i1, i1 0
- %nop15628 = alloca i1, i1 0
- %nop15629 = alloca i1, i1 0
- %nop15630 = alloca i1, i1 0
- %nop15631 = alloca i1, i1 0
- %nop15632 = alloca i1, i1 0
- %nop15633 = alloca i1, i1 0
- %nop15634 = alloca i1, i1 0
- %nop15635 = alloca i1, i1 0
- %nop15636 = alloca i1, i1 0
- %nop15637 = alloca i1, i1 0
- %nop15638 = alloca i1, i1 0
- %nop15639 = alloca i1, i1 0
- %nop15640 = alloca i1, i1 0
- %nop15641 = alloca i1, i1 0
- %nop15642 = alloca i1, i1 0
- %nop15643 = alloca i1, i1 0
- %nop15644 = alloca i1, i1 0
- %nop15645 = alloca i1, i1 0
- %nop15646 = alloca i1, i1 0
- %nop15647 = alloca i1, i1 0
- %nop15648 = alloca i1, i1 0
- %nop15649 = alloca i1, i1 0
- %nop15650 = alloca i1, i1 0
- %nop15651 = alloca i1, i1 0
- %nop15652 = alloca i1, i1 0
- %nop15653 = alloca i1, i1 0
- %nop15654 = alloca i1, i1 0
- %nop15655 = alloca i1, i1 0
- %nop15656 = alloca i1, i1 0
- %nop15657 = alloca i1, i1 0
- %nop15658 = alloca i1, i1 0
- %nop15659 = alloca i1, i1 0
- %nop15660 = alloca i1, i1 0
- %nop15661 = alloca i1, i1 0
- %nop15662 = alloca i1, i1 0
- %nop15663 = alloca i1, i1 0
- %nop15664 = alloca i1, i1 0
- %nop15665 = alloca i1, i1 0
- %nop15666 = alloca i1, i1 0
- %nop15667 = alloca i1, i1 0
- %nop15668 = alloca i1, i1 0
- %nop15669 = alloca i1, i1 0
- %nop15670 = alloca i1, i1 0
- %nop15671 = alloca i1, i1 0
- %nop15672 = alloca i1, i1 0
- %nop15673 = alloca i1, i1 0
- %nop15674 = alloca i1, i1 0
- %nop15675 = alloca i1, i1 0
- %nop15676 = alloca i1, i1 0
- %nop15677 = alloca i1, i1 0
- %nop15678 = alloca i1, i1 0
- %nop15679 = alloca i1, i1 0
- %nop15680 = alloca i1, i1 0
- %nop15681 = alloca i1, i1 0
- %nop15682 = alloca i1, i1 0
- %nop15683 = alloca i1, i1 0
- %nop15684 = alloca i1, i1 0
- %nop15685 = alloca i1, i1 0
- %nop15686 = alloca i1, i1 0
- %nop15687 = alloca i1, i1 0
- %nop15688 = alloca i1, i1 0
- %nop15689 = alloca i1, i1 0
- %nop15690 = alloca i1, i1 0
- %nop15691 = alloca i1, i1 0
- %nop15692 = alloca i1, i1 0
- %nop15693 = alloca i1, i1 0
- %nop15694 = alloca i1, i1 0
- %nop15695 = alloca i1, i1 0
- %nop15696 = alloca i1, i1 0
- %nop15697 = alloca i1, i1 0
- %nop15698 = alloca i1, i1 0
- %nop15699 = alloca i1, i1 0
- %nop15700 = alloca i1, i1 0
- %nop15701 = alloca i1, i1 0
- %nop15702 = alloca i1, i1 0
- %nop15703 = alloca i1, i1 0
- %nop15704 = alloca i1, i1 0
- %nop15705 = alloca i1, i1 0
- %nop15706 = alloca i1, i1 0
- %nop15707 = alloca i1, i1 0
- %nop15708 = alloca i1, i1 0
- %nop15709 = alloca i1, i1 0
- %nop15710 = alloca i1, i1 0
- %nop15711 = alloca i1, i1 0
- %nop15712 = alloca i1, i1 0
- %nop15713 = alloca i1, i1 0
- %nop15714 = alloca i1, i1 0
- %nop15715 = alloca i1, i1 0
- %nop15716 = alloca i1, i1 0
- %nop15717 = alloca i1, i1 0
- %nop15718 = alloca i1, i1 0
- %nop15719 = alloca i1, i1 0
- %nop15720 = alloca i1, i1 0
- %nop15721 = alloca i1, i1 0
- %nop15722 = alloca i1, i1 0
- %nop15723 = alloca i1, i1 0
- %nop15724 = alloca i1, i1 0
- %nop15725 = alloca i1, i1 0
- %nop15726 = alloca i1, i1 0
- %nop15727 = alloca i1, i1 0
- %nop15728 = alloca i1, i1 0
- %nop15729 = alloca i1, i1 0
- %nop15730 = alloca i1, i1 0
- %nop15731 = alloca i1, i1 0
- %nop15732 = alloca i1, i1 0
- %nop15733 = alloca i1, i1 0
- %nop15734 = alloca i1, i1 0
- %nop15735 = alloca i1, i1 0
- %nop15736 = alloca i1, i1 0
- %nop15737 = alloca i1, i1 0
- %nop15738 = alloca i1, i1 0
- %nop15739 = alloca i1, i1 0
- %nop15740 = alloca i1, i1 0
- %nop15741 = alloca i1, i1 0
- %nop15742 = alloca i1, i1 0
- %nop15743 = alloca i1, i1 0
- %nop15744 = alloca i1, i1 0
- %nop15745 = alloca i1, i1 0
- %nop15746 = alloca i1, i1 0
- %nop15747 = alloca i1, i1 0
- %nop15748 = alloca i1, i1 0
- %nop15749 = alloca i1, i1 0
- %nop15750 = alloca i1, i1 0
- %nop15751 = alloca i1, i1 0
- %nop15752 = alloca i1, i1 0
- %nop15753 = alloca i1, i1 0
- %nop15754 = alloca i1, i1 0
- %nop15755 = alloca i1, i1 0
- %nop15756 = alloca i1, i1 0
- %nop15757 = alloca i1, i1 0
- %nop15758 = alloca i1, i1 0
- %nop15759 = alloca i1, i1 0
- %nop15760 = alloca i1, i1 0
- %nop15761 = alloca i1, i1 0
- %nop15762 = alloca i1, i1 0
- %nop15763 = alloca i1, i1 0
- %nop15764 = alloca i1, i1 0
- %nop15765 = alloca i1, i1 0
- %nop15766 = alloca i1, i1 0
- %nop15767 = alloca i1, i1 0
- %nop15768 = alloca i1, i1 0
- %nop15769 = alloca i1, i1 0
- %nop15770 = alloca i1, i1 0
- %nop15771 = alloca i1, i1 0
- %nop15772 = alloca i1, i1 0
- %nop15773 = alloca i1, i1 0
- %nop15774 = alloca i1, i1 0
- %nop15775 = alloca i1, i1 0
- %nop15776 = alloca i1, i1 0
- %nop15777 = alloca i1, i1 0
- %nop15778 = alloca i1, i1 0
- %nop15779 = alloca i1, i1 0
- %nop15780 = alloca i1, i1 0
- %nop15781 = alloca i1, i1 0
- %nop15782 = alloca i1, i1 0
- %nop15783 = alloca i1, i1 0
- %nop15784 = alloca i1, i1 0
- %nop15785 = alloca i1, i1 0
- %nop15786 = alloca i1, i1 0
- %nop15787 = alloca i1, i1 0
- %nop15788 = alloca i1, i1 0
- %nop15789 = alloca i1, i1 0
- %nop15790 = alloca i1, i1 0
- %nop15791 = alloca i1, i1 0
- %nop15792 = alloca i1, i1 0
- %nop15793 = alloca i1, i1 0
- %nop15794 = alloca i1, i1 0
- %nop15795 = alloca i1, i1 0
- %nop15796 = alloca i1, i1 0
- %nop15797 = alloca i1, i1 0
- %nop15798 = alloca i1, i1 0
- %nop15799 = alloca i1, i1 0
- %nop15800 = alloca i1, i1 0
- %nop15801 = alloca i1, i1 0
- %nop15802 = alloca i1, i1 0
- %nop15803 = alloca i1, i1 0
- %nop15804 = alloca i1, i1 0
- %nop15805 = alloca i1, i1 0
- %nop15806 = alloca i1, i1 0
- %nop15807 = alloca i1, i1 0
- %nop15808 = alloca i1, i1 0
- %nop15809 = alloca i1, i1 0
- %nop15810 = alloca i1, i1 0
- %nop15811 = alloca i1, i1 0
- %nop15812 = alloca i1, i1 0
- %nop15813 = alloca i1, i1 0
- %nop15814 = alloca i1, i1 0
- %nop15815 = alloca i1, i1 0
- %nop15816 = alloca i1, i1 0
- %nop15817 = alloca i1, i1 0
- %nop15818 = alloca i1, i1 0
- %nop15819 = alloca i1, i1 0
- %nop15820 = alloca i1, i1 0
- %nop15821 = alloca i1, i1 0
- %nop15822 = alloca i1, i1 0
- %nop15823 = alloca i1, i1 0
- %nop15824 = alloca i1, i1 0
- %nop15825 = alloca i1, i1 0
- %nop15826 = alloca i1, i1 0
- %nop15827 = alloca i1, i1 0
- %nop15828 = alloca i1, i1 0
- %nop15829 = alloca i1, i1 0
- %nop15830 = alloca i1, i1 0
- %nop15831 = alloca i1, i1 0
- %nop15832 = alloca i1, i1 0
- %nop15833 = alloca i1, i1 0
- %nop15834 = alloca i1, i1 0
- %nop15835 = alloca i1, i1 0
- %nop15836 = alloca i1, i1 0
- %nop15837 = alloca i1, i1 0
- %nop15838 = alloca i1, i1 0
- %nop15839 = alloca i1, i1 0
- %nop15840 = alloca i1, i1 0
- %nop15841 = alloca i1, i1 0
- %nop15842 = alloca i1, i1 0
- %nop15843 = alloca i1, i1 0
- %nop15844 = alloca i1, i1 0
- %nop15845 = alloca i1, i1 0
- %nop15846 = alloca i1, i1 0
- %nop15847 = alloca i1, i1 0
- %nop15848 = alloca i1, i1 0
- %nop15849 = alloca i1, i1 0
- %nop15850 = alloca i1, i1 0
- %nop15851 = alloca i1, i1 0
- %nop15852 = alloca i1, i1 0
- %nop15853 = alloca i1, i1 0
- %nop15854 = alloca i1, i1 0
- %nop15855 = alloca i1, i1 0
- %nop15856 = alloca i1, i1 0
- %nop15857 = alloca i1, i1 0
- %nop15858 = alloca i1, i1 0
- %nop15859 = alloca i1, i1 0
- %nop15860 = alloca i1, i1 0
- %nop15861 = alloca i1, i1 0
- %nop15862 = alloca i1, i1 0
- %nop15863 = alloca i1, i1 0
- %nop15864 = alloca i1, i1 0
- %nop15865 = alloca i1, i1 0
- %nop15866 = alloca i1, i1 0
- %nop15867 = alloca i1, i1 0
- %nop15868 = alloca i1, i1 0
- %nop15869 = alloca i1, i1 0
- %nop15870 = alloca i1, i1 0
- %nop15871 = alloca i1, i1 0
- %nop15872 = alloca i1, i1 0
- %nop15873 = alloca i1, i1 0
- %nop15874 = alloca i1, i1 0
- %nop15875 = alloca i1, i1 0
- %nop15876 = alloca i1, i1 0
- %nop15877 = alloca i1, i1 0
- %nop15878 = alloca i1, i1 0
- %nop15879 = alloca i1, i1 0
- %nop15880 = alloca i1, i1 0
- %nop15881 = alloca i1, i1 0
- %nop15882 = alloca i1, i1 0
- %nop15883 = alloca i1, i1 0
- %nop15884 = alloca i1, i1 0
- %nop15885 = alloca i1, i1 0
- %nop15886 = alloca i1, i1 0
- %nop15887 = alloca i1, i1 0
- %nop15888 = alloca i1, i1 0
- %nop15889 = alloca i1, i1 0
- %nop15890 = alloca i1, i1 0
- %nop15891 = alloca i1, i1 0
- %nop15892 = alloca i1, i1 0
- %nop15893 = alloca i1, i1 0
- %nop15894 = alloca i1, i1 0
- %nop15895 = alloca i1, i1 0
- %nop15896 = alloca i1, i1 0
- %nop15897 = alloca i1, i1 0
- %nop15898 = alloca i1, i1 0
- %nop15899 = alloca i1, i1 0
- %nop15900 = alloca i1, i1 0
- %nop15901 = alloca i1, i1 0
- %nop15902 = alloca i1, i1 0
- %nop15903 = alloca i1, i1 0
- %nop15904 = alloca i1, i1 0
- %nop15905 = alloca i1, i1 0
- %nop15906 = alloca i1, i1 0
- %nop15907 = alloca i1, i1 0
- %nop15908 = alloca i1, i1 0
- %nop15909 = alloca i1, i1 0
- %nop15910 = alloca i1, i1 0
- %nop15911 = alloca i1, i1 0
- %nop15912 = alloca i1, i1 0
- %nop15913 = alloca i1, i1 0
- %nop15914 = alloca i1, i1 0
- %nop15915 = alloca i1, i1 0
- %nop15916 = alloca i1, i1 0
- %nop15917 = alloca i1, i1 0
- %nop15918 = alloca i1, i1 0
- %nop15919 = alloca i1, i1 0
- %nop15920 = alloca i1, i1 0
- %nop15921 = alloca i1, i1 0
- %nop15922 = alloca i1, i1 0
- %nop15923 = alloca i1, i1 0
- %nop15924 = alloca i1, i1 0
- %nop15925 = alloca i1, i1 0
- %nop15926 = alloca i1, i1 0
- %nop15927 = alloca i1, i1 0
- %nop15928 = alloca i1, i1 0
- %nop15929 = alloca i1, i1 0
- %nop15930 = alloca i1, i1 0
- %nop15931 = alloca i1, i1 0
- %nop15932 = alloca i1, i1 0
- %nop15933 = alloca i1, i1 0
- %nop15934 = alloca i1, i1 0
- %nop15935 = alloca i1, i1 0
- %nop15936 = alloca i1, i1 0
- %nop15937 = alloca i1, i1 0
- %nop15938 = alloca i1, i1 0
- %nop15939 = alloca i1, i1 0
- %nop15940 = alloca i1, i1 0
- %nop15941 = alloca i1, i1 0
- %nop15942 = alloca i1, i1 0
- %nop15943 = alloca i1, i1 0
- %nop15944 = alloca i1, i1 0
- %nop15945 = alloca i1, i1 0
- %nop15946 = alloca i1, i1 0
- %nop15947 = alloca i1, i1 0
- %nop15948 = alloca i1, i1 0
- %nop15949 = alloca i1, i1 0
- %nop15950 = alloca i1, i1 0
- %nop15951 = alloca i1, i1 0
- %nop15952 = alloca i1, i1 0
- %nop15953 = alloca i1, i1 0
- %nop15954 = alloca i1, i1 0
- %nop15955 = alloca i1, i1 0
- %nop15956 = alloca i1, i1 0
- %nop15957 = alloca i1, i1 0
- %nop15958 = alloca i1, i1 0
- %nop15959 = alloca i1, i1 0
- %nop15960 = alloca i1, i1 0
- %nop15961 = alloca i1, i1 0
- %nop15962 = alloca i1, i1 0
- %nop15963 = alloca i1, i1 0
- %nop15964 = alloca i1, i1 0
- %nop15965 = alloca i1, i1 0
- %nop15966 = alloca i1, i1 0
- %nop15967 = alloca i1, i1 0
- %nop15968 = alloca i1, i1 0
- %nop15969 = alloca i1, i1 0
- %nop15970 = alloca i1, i1 0
- %nop15971 = alloca i1, i1 0
- %nop15972 = alloca i1, i1 0
- %nop15973 = alloca i1, i1 0
- %nop15974 = alloca i1, i1 0
- %nop15975 = alloca i1, i1 0
- %nop15976 = alloca i1, i1 0
- %nop15977 = alloca i1, i1 0
- %nop15978 = alloca i1, i1 0
- %nop15979 = alloca i1, i1 0
- %nop15980 = alloca i1, i1 0
- %nop15981 = alloca i1, i1 0
- %nop15982 = alloca i1, i1 0
- %nop15983 = alloca i1, i1 0
- %nop15984 = alloca i1, i1 0
- %nop15985 = alloca i1, i1 0
- %nop15986 = alloca i1, i1 0
- %nop15987 = alloca i1, i1 0
- %nop15988 = alloca i1, i1 0
- %nop15989 = alloca i1, i1 0
- %nop15990 = alloca i1, i1 0
- %nop15991 = alloca i1, i1 0
- %nop15992 = alloca i1, i1 0
- %nop15993 = alloca i1, i1 0
- %nop15994 = alloca i1, i1 0
- %nop15995 = alloca i1, i1 0
- %nop15996 = alloca i1, i1 0
- %nop15997 = alloca i1, i1 0
- %nop15998 = alloca i1, i1 0
- %nop15999 = alloca i1, i1 0
- %nop16000 = alloca i1, i1 0
- %nop16001 = alloca i1, i1 0
- %nop16002 = alloca i1, i1 0
- %nop16003 = alloca i1, i1 0
- %nop16004 = alloca i1, i1 0
- %nop16005 = alloca i1, i1 0
- %nop16006 = alloca i1, i1 0
- %nop16007 = alloca i1, i1 0
- %nop16008 = alloca i1, i1 0
- %nop16009 = alloca i1, i1 0
- %nop16010 = alloca i1, i1 0
- %nop16011 = alloca i1, i1 0
- %nop16012 = alloca i1, i1 0
- %nop16013 = alloca i1, i1 0
- %nop16014 = alloca i1, i1 0
- %nop16015 = alloca i1, i1 0
- %nop16016 = alloca i1, i1 0
- %nop16017 = alloca i1, i1 0
- %nop16018 = alloca i1, i1 0
- %nop16019 = alloca i1, i1 0
- %nop16020 = alloca i1, i1 0
- %nop16021 = alloca i1, i1 0
- %nop16022 = alloca i1, i1 0
- %nop16023 = alloca i1, i1 0
- %nop16024 = alloca i1, i1 0
- %nop16025 = alloca i1, i1 0
- %nop16026 = alloca i1, i1 0
- %nop16027 = alloca i1, i1 0
- %nop16028 = alloca i1, i1 0
- %nop16029 = alloca i1, i1 0
- %nop16030 = alloca i1, i1 0
- %nop16031 = alloca i1, i1 0
- %nop16032 = alloca i1, i1 0
- %nop16033 = alloca i1, i1 0
- %nop16034 = alloca i1, i1 0
- %nop16035 = alloca i1, i1 0
- %nop16036 = alloca i1, i1 0
- %nop16037 = alloca i1, i1 0
- %nop16038 = alloca i1, i1 0
- %nop16039 = alloca i1, i1 0
- %nop16040 = alloca i1, i1 0
- %nop16041 = alloca i1, i1 0
- %nop16042 = alloca i1, i1 0
- %nop16043 = alloca i1, i1 0
- %nop16044 = alloca i1, i1 0
- %nop16045 = alloca i1, i1 0
- %nop16046 = alloca i1, i1 0
- %nop16047 = alloca i1, i1 0
- %nop16048 = alloca i1, i1 0
- %nop16049 = alloca i1, i1 0
- %nop16050 = alloca i1, i1 0
- %nop16051 = alloca i1, i1 0
- %nop16052 = alloca i1, i1 0
- %nop16053 = alloca i1, i1 0
- %nop16054 = alloca i1, i1 0
- %nop16055 = alloca i1, i1 0
- %nop16056 = alloca i1, i1 0
- %nop16057 = alloca i1, i1 0
- %nop16058 = alloca i1, i1 0
- %nop16059 = alloca i1, i1 0
- %nop16060 = alloca i1, i1 0
- %nop16061 = alloca i1, i1 0
- %nop16062 = alloca i1, i1 0
- %nop16063 = alloca i1, i1 0
- %nop16064 = alloca i1, i1 0
- %nop16065 = alloca i1, i1 0
- %nop16066 = alloca i1, i1 0
- %nop16067 = alloca i1, i1 0
- %nop16068 = alloca i1, i1 0
- %nop16069 = alloca i1, i1 0
- %nop16070 = alloca i1, i1 0
- %nop16071 = alloca i1, i1 0
- %nop16072 = alloca i1, i1 0
- %nop16073 = alloca i1, i1 0
- %nop16074 = alloca i1, i1 0
- %nop16075 = alloca i1, i1 0
- %nop16076 = alloca i1, i1 0
- %nop16077 = alloca i1, i1 0
- %nop16078 = alloca i1, i1 0
- %nop16079 = alloca i1, i1 0
- %nop16080 = alloca i1, i1 0
- %nop16081 = alloca i1, i1 0
- %nop16082 = alloca i1, i1 0
- %nop16083 = alloca i1, i1 0
- %nop16084 = alloca i1, i1 0
- %nop16085 = alloca i1, i1 0
- %nop16086 = alloca i1, i1 0
- %nop16087 = alloca i1, i1 0
- %nop16088 = alloca i1, i1 0
- %nop16089 = alloca i1, i1 0
- %nop16090 = alloca i1, i1 0
- %nop16091 = alloca i1, i1 0
- %nop16092 = alloca i1, i1 0
- %nop16093 = alloca i1, i1 0
- %nop16094 = alloca i1, i1 0
- %nop16095 = alloca i1, i1 0
- %nop16096 = alloca i1, i1 0
- %nop16097 = alloca i1, i1 0
- %nop16098 = alloca i1, i1 0
- %nop16099 = alloca i1, i1 0
- %nop16100 = alloca i1, i1 0
- %nop16101 = alloca i1, i1 0
- %nop16102 = alloca i1, i1 0
- %nop16103 = alloca i1, i1 0
- %nop16104 = alloca i1, i1 0
- %nop16105 = alloca i1, i1 0
- %nop16106 = alloca i1, i1 0
- %nop16107 = alloca i1, i1 0
- %nop16108 = alloca i1, i1 0
- %nop16109 = alloca i1, i1 0
- %nop16110 = alloca i1, i1 0
- %nop16111 = alloca i1, i1 0
- %nop16112 = alloca i1, i1 0
- %nop16113 = alloca i1, i1 0
- %nop16114 = alloca i1, i1 0
- %nop16115 = alloca i1, i1 0
- %nop16116 = alloca i1, i1 0
- %nop16117 = alloca i1, i1 0
- %nop16118 = alloca i1, i1 0
- %nop16119 = alloca i1, i1 0
- %nop16120 = alloca i1, i1 0
- %nop16121 = alloca i1, i1 0
- %nop16122 = alloca i1, i1 0
- %nop16123 = alloca i1, i1 0
- %nop16124 = alloca i1, i1 0
- %nop16125 = alloca i1, i1 0
- %nop16126 = alloca i1, i1 0
- %nop16127 = alloca i1, i1 0
- %nop16128 = alloca i1, i1 0
- %nop16129 = alloca i1, i1 0
- %nop16130 = alloca i1, i1 0
- %nop16131 = alloca i1, i1 0
- %nop16132 = alloca i1, i1 0
- %nop16133 = alloca i1, i1 0
- %nop16134 = alloca i1, i1 0
- %nop16135 = alloca i1, i1 0
- %nop16136 = alloca i1, i1 0
- %nop16137 = alloca i1, i1 0
- %nop16138 = alloca i1, i1 0
- %nop16139 = alloca i1, i1 0
- %nop16140 = alloca i1, i1 0
- %nop16141 = alloca i1, i1 0
- %nop16142 = alloca i1, i1 0
- %nop16143 = alloca i1, i1 0
- %nop16144 = alloca i1, i1 0
- %nop16145 = alloca i1, i1 0
- %nop16146 = alloca i1, i1 0
- %nop16147 = alloca i1, i1 0
- %nop16148 = alloca i1, i1 0
- %nop16149 = alloca i1, i1 0
- %nop16150 = alloca i1, i1 0
- %nop16151 = alloca i1, i1 0
- %nop16152 = alloca i1, i1 0
- %nop16153 = alloca i1, i1 0
- %nop16154 = alloca i1, i1 0
- %nop16155 = alloca i1, i1 0
- %nop16156 = alloca i1, i1 0
- %nop16157 = alloca i1, i1 0
- %nop16158 = alloca i1, i1 0
- %nop16159 = alloca i1, i1 0
- %nop16160 = alloca i1, i1 0
- %nop16161 = alloca i1, i1 0
- %nop16162 = alloca i1, i1 0
- %nop16163 = alloca i1, i1 0
- %nop16164 = alloca i1, i1 0
- %nop16165 = alloca i1, i1 0
- %nop16166 = alloca i1, i1 0
- %nop16167 = alloca i1, i1 0
- %nop16168 = alloca i1, i1 0
- %nop16169 = alloca i1, i1 0
- %nop16170 = alloca i1, i1 0
- %nop16171 = alloca i1, i1 0
- %nop16172 = alloca i1, i1 0
- %nop16173 = alloca i1, i1 0
- %nop16174 = alloca i1, i1 0
- %nop16175 = alloca i1, i1 0
- %nop16176 = alloca i1, i1 0
- %nop16177 = alloca i1, i1 0
- %nop16178 = alloca i1, i1 0
- %nop16179 = alloca i1, i1 0
- %nop16180 = alloca i1, i1 0
- %nop16181 = alloca i1, i1 0
- %nop16182 = alloca i1, i1 0
- %nop16183 = alloca i1, i1 0
- %nop16184 = alloca i1, i1 0
- %nop16185 = alloca i1, i1 0
- %nop16186 = alloca i1, i1 0
- %nop16187 = alloca i1, i1 0
- %nop16188 = alloca i1, i1 0
- %nop16189 = alloca i1, i1 0
- %nop16190 = alloca i1, i1 0
- %nop16191 = alloca i1, i1 0
- %nop16192 = alloca i1, i1 0
- %nop16193 = alloca i1, i1 0
- %nop16194 = alloca i1, i1 0
- %nop16195 = alloca i1, i1 0
- %nop16196 = alloca i1, i1 0
- %nop16197 = alloca i1, i1 0
- %nop16198 = alloca i1, i1 0
- %nop16199 = alloca i1, i1 0
- %nop16200 = alloca i1, i1 0
- %nop16201 = alloca i1, i1 0
- %nop16202 = alloca i1, i1 0
- %nop16203 = alloca i1, i1 0
- %nop16204 = alloca i1, i1 0
- %nop16205 = alloca i1, i1 0
- %nop16206 = alloca i1, i1 0
- %nop16207 = alloca i1, i1 0
- %nop16208 = alloca i1, i1 0
- %nop16209 = alloca i1, i1 0
- %nop16210 = alloca i1, i1 0
- %nop16211 = alloca i1, i1 0
- %nop16212 = alloca i1, i1 0
- %nop16213 = alloca i1, i1 0
- %nop16214 = alloca i1, i1 0
- %nop16215 = alloca i1, i1 0
- %nop16216 = alloca i1, i1 0
- %nop16217 = alloca i1, i1 0
- %nop16218 = alloca i1, i1 0
- %nop16219 = alloca i1, i1 0
- %nop16220 = alloca i1, i1 0
- %nop16221 = alloca i1, i1 0
- %nop16222 = alloca i1, i1 0
- %nop16223 = alloca i1, i1 0
- %nop16224 = alloca i1, i1 0
- %nop16225 = alloca i1, i1 0
- %nop16226 = alloca i1, i1 0
- %nop16227 = alloca i1, i1 0
- %nop16228 = alloca i1, i1 0
- %nop16229 = alloca i1, i1 0
- %nop16230 = alloca i1, i1 0
- %nop16231 = alloca i1, i1 0
- %nop16232 = alloca i1, i1 0
- %nop16233 = alloca i1, i1 0
- %nop16234 = alloca i1, i1 0
- %nop16235 = alloca i1, i1 0
- %nop16236 = alloca i1, i1 0
- %nop16237 = alloca i1, i1 0
- %nop16238 = alloca i1, i1 0
- %nop16239 = alloca i1, i1 0
- %nop16240 = alloca i1, i1 0
- %nop16241 = alloca i1, i1 0
- %nop16242 = alloca i1, i1 0
- %nop16243 = alloca i1, i1 0
- %nop16244 = alloca i1, i1 0
- %nop16245 = alloca i1, i1 0
- %nop16246 = alloca i1, i1 0
- %nop16247 = alloca i1, i1 0
- %nop16248 = alloca i1, i1 0
- %nop16249 = alloca i1, i1 0
- %nop16250 = alloca i1, i1 0
- %nop16251 = alloca i1, i1 0
- %nop16252 = alloca i1, i1 0
- %nop16253 = alloca i1, i1 0
- %nop16254 = alloca i1, i1 0
- %nop16255 = alloca i1, i1 0
- %nop16256 = alloca i1, i1 0
- %nop16257 = alloca i1, i1 0
- %nop16258 = alloca i1, i1 0
- %nop16259 = alloca i1, i1 0
- %nop16260 = alloca i1, i1 0
- %nop16261 = alloca i1, i1 0
- %nop16262 = alloca i1, i1 0
- %nop16263 = alloca i1, i1 0
- %nop16264 = alloca i1, i1 0
- %nop16265 = alloca i1, i1 0
- %nop16266 = alloca i1, i1 0
- %nop16267 = alloca i1, i1 0
- %nop16268 = alloca i1, i1 0
- %nop16269 = alloca i1, i1 0
- %nop16270 = alloca i1, i1 0
- %nop16271 = alloca i1, i1 0
- %nop16272 = alloca i1, i1 0
- %nop16273 = alloca i1, i1 0
- %nop16274 = alloca i1, i1 0
- %nop16275 = alloca i1, i1 0
- %nop16276 = alloca i1, i1 0
- %nop16277 = alloca i1, i1 0
- %nop16278 = alloca i1, i1 0
- %nop16279 = alloca i1, i1 0
- %nop16280 = alloca i1, i1 0
- %nop16281 = alloca i1, i1 0
- %nop16282 = alloca i1, i1 0
- %nop16283 = alloca i1, i1 0
- %nop16284 = alloca i1, i1 0
- %nop16285 = alloca i1, i1 0
- %nop16286 = alloca i1, i1 0
- %nop16287 = alloca i1, i1 0
- %nop16288 = alloca i1, i1 0
- %nop16289 = alloca i1, i1 0
- %nop16290 = alloca i1, i1 0
- %nop16291 = alloca i1, i1 0
- %nop16292 = alloca i1, i1 0
- %nop16293 = alloca i1, i1 0
- %nop16294 = alloca i1, i1 0
- %nop16295 = alloca i1, i1 0
- %nop16296 = alloca i1, i1 0
- %nop16297 = alloca i1, i1 0
- %nop16298 = alloca i1, i1 0
- %nop16299 = alloca i1, i1 0
- %nop16300 = alloca i1, i1 0
- %nop16301 = alloca i1, i1 0
- %nop16302 = alloca i1, i1 0
- %nop16303 = alloca i1, i1 0
- %nop16304 = alloca i1, i1 0
- %nop16305 = alloca i1, i1 0
- %nop16306 = alloca i1, i1 0
- %nop16307 = alloca i1, i1 0
- %nop16308 = alloca i1, i1 0
- %nop16309 = alloca i1, i1 0
- %nop16310 = alloca i1, i1 0
- %nop16311 = alloca i1, i1 0
- %nop16312 = alloca i1, i1 0
- %nop16313 = alloca i1, i1 0
- %nop16314 = alloca i1, i1 0
- %nop16315 = alloca i1, i1 0
- %nop16316 = alloca i1, i1 0
- %nop16317 = alloca i1, i1 0
- %nop16318 = alloca i1, i1 0
- %nop16319 = alloca i1, i1 0
- %nop16320 = alloca i1, i1 0
- %nop16321 = alloca i1, i1 0
- %nop16322 = alloca i1, i1 0
- %nop16323 = alloca i1, i1 0
- %nop16324 = alloca i1, i1 0
- %nop16325 = alloca i1, i1 0
- %nop16326 = alloca i1, i1 0
- %nop16327 = alloca i1, i1 0
- %nop16328 = alloca i1, i1 0
- %nop16329 = alloca i1, i1 0
- %nop16330 = alloca i1, i1 0
- %nop16331 = alloca i1, i1 0
- %nop16332 = alloca i1, i1 0
- %nop16333 = alloca i1, i1 0
- %nop16334 = alloca i1, i1 0
- %nop16335 = alloca i1, i1 0
- %nop16336 = alloca i1, i1 0
- %nop16337 = alloca i1, i1 0
- %nop16338 = alloca i1, i1 0
- %nop16339 = alloca i1, i1 0
- %nop16340 = alloca i1, i1 0
- %nop16341 = alloca i1, i1 0
- %nop16342 = alloca i1, i1 0
- %nop16343 = alloca i1, i1 0
- %nop16344 = alloca i1, i1 0
- %nop16345 = alloca i1, i1 0
- %nop16346 = alloca i1, i1 0
- %nop16347 = alloca i1, i1 0
- %nop16348 = alloca i1, i1 0
- %nop16349 = alloca i1, i1 0
- %nop16350 = alloca i1, i1 0
- %nop16351 = alloca i1, i1 0
- %nop16352 = alloca i1, i1 0
- %nop16353 = alloca i1, i1 0
- %nop16354 = alloca i1, i1 0
- %nop16355 = alloca i1, i1 0
- %nop16356 = alloca i1, i1 0
- %nop16357 = alloca i1, i1 0
- %nop16358 = alloca i1, i1 0
- %nop16359 = alloca i1, i1 0
- %nop16360 = alloca i1, i1 0
- %nop16361 = alloca i1, i1 0
- %nop16362 = alloca i1, i1 0
- %nop16363 = alloca i1, i1 0
- %nop16364 = alloca i1, i1 0
- %nop16365 = alloca i1, i1 0
- %nop16366 = alloca i1, i1 0
- %nop16367 = alloca i1, i1 0
- %nop16368 = alloca i1, i1 0
- %nop16369 = alloca i1, i1 0
- %nop16370 = alloca i1, i1 0
- %nop16371 = alloca i1, i1 0
- %nop16372 = alloca i1, i1 0
- %nop16373 = alloca i1, i1 0
- %nop16374 = alloca i1, i1 0
- %nop16375 = alloca i1, i1 0
- %nop16376 = alloca i1, i1 0
- %nop16377 = alloca i1, i1 0
- br label %for.inc
-
-for.inc:
- %3 = load i32* %i, align 4
- %inc = add nsw i32 %3, 1
- store i32 %inc, i32* %i, align 4
- br label %for.cond
-
-; CHECK: addiu $sp, $sp, -8
-; CHECK: sw $ra, 0($sp)
-; CHECK: lui $[[REG1:[0-9]+]], 65534
-; CHECK: addiu $[[REG1]], $[[REG1]], -12
-; CHECK: addu $[[REG1]], $ra, $[[REG1]]
-; CHECK: lw $ra, 0($sp)
-; CHECK: jr $[[REG1]]
-; CHECK: addiu $sp, $sp, 8
-
-for.end:
- ret i32 0
-}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false"
- "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"
- "no-infs-fp-math"="false" "no-nans-fp-math"="false"
- "stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
- "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips32r6/compatibility.ll b/test/CodeGen/Mips/mips32r6/compatibility.ll
new file mode 100644
index 0000000..8eac8d4
--- /dev/null
+++ b/test/CodeGen/Mips/mips32r6/compatibility.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck %s
+; RUN: not llc -march=mipsel -mcpu=mips32r6 -mattr=+dsp < %s 2>&1 | FileCheck --check-prefix=DSP %s
+
+; CHECK: foo:
+; DSP: MIPS32r6 is not compatible with the DSP ASE
+
+define void @foo() nounwind {
+ ret void
+}
diff --git a/test/CodeGen/Mips/mips64-f128.ll b/test/CodeGen/Mips/mips64-f128.ll
index dc8bbfd..4d590b6 100644
--- a/test/CodeGen/Mips/mips64-f128.ll
+++ b/test/CodeGen/Mips/mips64-f128.ll
@@ -1,5 +1,7 @@
+; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips4 -soft-float -O1 \
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s
; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64 -soft-float -O1 \
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s
@gld0 = external global fp128
@gld1 = external global fp128
diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll
index e01609f..7a52c3d 100644
--- a/test/CodeGen/Mips/mips64-sret.ll
+++ b/test/CodeGen/Mips/mips64-sret.ll
@@ -1,16 +1,23 @@
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -O3 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s
-%struct.S = type { [8 x i32] }
+define void @foo(i32* noalias sret %agg.result) nounwind {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: sw {{.*}}, 0($4)
+; CHECK: jr $ra
+; CHECK-NEXT: move $2, $4
-@g = common global %struct.S zeroinitializer, align 4
+ store i32 42, i32* %agg.result
+ ret void
+}
-define void @f(%struct.S* noalias sret %agg.result) nounwind {
+define void @bar(i32 %v, i32* noalias sret %agg.result) nounwind {
entry:
-; CHECK: move $2, $4
+; CHECK-LABEL: bar:
+; CHECK: sw $4, 0($5)
+; CHECK: jr $ra
+; CHECK-NEXT: move $2, $5
- %0 = bitcast %struct.S* %agg.result to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.S* @g to i8*), i64 32, i32 4, i1 false)
+ store i32 %v, i32* %agg.result
ret void
}
-
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/Mips/mips64countleading.ll b/test/CodeGen/Mips/mips64countleading.ll
index b2b67e5..252f323 100644
--- a/test/CodeGen/Mips/mips64countleading.ll
+++ b/test/CodeGen/Mips/mips64countleading.ll
@@ -1,8 +1,11 @@
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS4 %s
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS64 %s
define i64 @t1(i64 %X) nounwind readnone {
entry:
-; CHECK: dclz
+; CHECK-LABEL: t1:
+; MIPS4-NOT: dclz
+; MIPS64: dclz
%tmp1 = tail call i64 @llvm.ctlz.i64(i64 %X, i1 true)
ret i64 %tmp1
}
@@ -11,7 +14,9 @@ declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
define i64 @t3(i64 %X) nounwind readnone {
entry:
-; CHECK: dclo
+; CHECK-LABEL: t3:
+; MIPS4-NOT: dclo
+; MIPS64: dclo
%neg = xor i64 %X, -1
%tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true)
ret i64 %tmp1
diff --git a/test/CodeGen/Mips/mips64directive.ll b/test/CodeGen/Mips/mips64directive.ll
index fa81b72..3d95f51 100644
--- a/test/CodeGen/Mips/mips64directive.ll
+++ b/test/CodeGen/Mips/mips64directive.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s
; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s
@gl = global i64 1250999896321, align 8
diff --git a/test/CodeGen/Mips/mips64ext.ll b/test/CodeGen/Mips/mips64ext.ll
index 02a35f8..22ea0eb 100644
--- a/test/CodeGen/Mips/mips64ext.ll
+++ b/test/CodeGen/Mips/mips64ext.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s
define i64 @zext64_32(i32 %a) nounwind readnone {
entry:
diff --git a/test/CodeGen/Mips/mips64fpimm0.ll b/test/CodeGen/Mips/mips64fpimm0.ll
index 17716da..19e076d 100644
--- a/test/CodeGen/Mips/mips64fpimm0.ll
+++ b/test/CodeGen/Mips/mips64fpimm0.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=n64 | FileCheck %s
; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s
define double @foo1() nounwind readnone {
diff --git a/test/CodeGen/Mips/mips64fpldst.ll b/test/CodeGen/Mips/mips64fpldst.ll
index 368ab83..2f42270 100644
--- a/test/CodeGen/Mips/mips64fpldst.ll
+++ b/test/CodeGen/Mips/mips64fpldst.ll
@@ -1,3 +1,5 @@
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
diff --git a/test/CodeGen/Mips/mips64imm.ll b/test/CodeGen/Mips/mips64imm.ll
index 1fc8636..c3fc61d 100644
--- a/test/CodeGen/Mips/mips64imm.ll
+++ b/test/CodeGen/Mips/mips64imm.ll
@@ -1,3 +1,4 @@
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s
; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
define i32 @foo1() nounwind readnone {
diff --git a/test/CodeGen/Mips/mips64instrs.ll b/test/CodeGen/Mips/mips64instrs.ll
index 2894d69..58f11f1 100644
--- a/test/CodeGen/Mips/mips64instrs.ll
+++ b/test/CodeGen/Mips/mips64instrs.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips4 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS4 %s
+; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS64 %s
@gll0 = common global i64 0, align 8
@gll1 = common global i64 0, align 8
@@ -135,14 +136,24 @@ declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
define i64 @f18(i64 %X) nounwind readnone {
entry:
-; CHECK: dclz $2, $4
+; CHECK-LABEL: f18:
+
+; The MIPS4 version is too long to reasonably test. At least check we don't get dclz
+; MIPS4-NOT: dclz
+
+; MIPS64: dclz $2, $4
%tmp1 = tail call i64 @llvm.ctlz.i64(i64 %X, i1 true)
ret i64 %tmp1
}
define i64 @f19(i64 %X) nounwind readnone {
entry:
-; CHECK: dclo $2, $4
+; CHECK-LABEL: f19:
+
+; The MIPS4 version is too long to reasonably test. At least check we don't get dclo
+; MIPS4-NOT: dclo
+
+; MIPS64: dclo $2, $4
%neg = xor i64 %X, -1
%tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true)
ret i64 %tmp1
@@ -150,6 +161,7 @@ entry:
define i64 @f20(i64 %a, i64 %b) nounwind readnone {
entry:
+; CHECK-LABEL: f20:
; CHECK: nor
%or = or i64 %b, %a
%neg = xor i64 %or, -1
diff --git a/test/CodeGen/Mips/mips64intldst.ll b/test/CodeGen/Mips/mips64intldst.ll
index 62244f6..c3607ba 100644
--- a/test/CodeGen/Mips/mips64intldst.ll
+++ b/test/CodeGen/Mips/mips64intldst.ll
@@ -1,3 +1,5 @@
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n64 | FileCheck %s -check-prefix=CHECK-N64
; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=-n64,n32 | FileCheck %s -check-prefix=CHECK-N32
diff --git a/test/CodeGen/Mips/mips64lea.ll b/test/CodeGen/Mips/mips64lea.ll
index 54d504f..e866b21 100644
--- a/test/CodeGen/Mips/mips64lea.ll
+++ b/test/CodeGen/Mips/mips64lea.ll
@@ -1,3 +1,4 @@
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s
; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
define void @foo3() nounwind {
diff --git a/test/CodeGen/Mips/mips64load-store-left-right.ll b/test/CodeGen/Mips/mips64load-store-left-right.ll
deleted file mode 100644
index 4561429..0000000
--- a/test/CodeGen/Mips/mips64load-store-left-right.ll
+++ /dev/null
@@ -1,73 +0,0 @@
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=EL %s
-; RUN: llc -march=mips64 -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=EB %s
-
-%struct.SLL = type { i64 }
-%struct.SI = type { i32 }
-%struct.SUI = type { i32 }
-
-@sll = common global %struct.SLL zeroinitializer, align 1
-@si = common global %struct.SI zeroinitializer, align 1
-@sui = common global %struct.SUI zeroinitializer, align 1
-
-define i64 @foo_load_ll() nounwind readonly {
-entry:
-; EL: ldl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
-; EL: ldr $[[R0]], 0($[[R1]])
-; EB: ldl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: ldr $[[R0]], 7($[[R1]])
-
- %0 = load i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1
- ret i64 %0
-}
-
-define i64 @foo_load_i() nounwind readonly {
-entry:
-; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
-; EL: lwr $[[R0]], 0($[[R1]])
-; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: lwr $[[R0]], 3($[[R1]])
-
- %0 = load i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1
- %conv = sext i32 %0 to i64
- ret i64 %conv
-}
-
-define i64 @foo_load_ui() nounwind readonly {
-entry:
-; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
-; EL: lwr $[[R0]], 0($[[R1]])
-; EL: daddiu $[[R2:[0-9]+]], $zero, 1
-; EL: dsll $[[R3:[0-9]+]], $[[R2]], 32
-; EL: daddiu $[[R4:[0-9]+]], $[[R3]], -1
-; EL: and ${{[0-9]+}}, $[[R0]], $[[R4]]
-; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: lwr $[[R0]], 3($[[R1]])
-
-
- %0 = load i32* getelementptr inbounds (%struct.SUI* @sui, i64 0, i32 0), align 1
- %conv = zext i32 %0 to i64
- ret i64 %conv
-}
-
-define void @foo_store_ll(i64 %a) nounwind {
-entry:
-; EL: sdl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
-; EL: sdr $[[R0]], 0($[[R1]])
-; EB: sdl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: sdr $[[R0]], 7($[[R1]])
-
- store i64 %a, i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1
- ret void
-}
-
-define void @foo_store_i(i32 %a) nounwind {
-entry:
-; EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
-; EL: swr $[[R0]], 0($[[R1]])
-; EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: swr $[[R0]], 3($[[R1]])
-
- store i32 %a, i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1
- ret void
-}
-
diff --git a/test/CodeGen/Mips/mips64muldiv.ll b/test/CodeGen/Mips/mips64muldiv.ll
index fd036a2..39c73e9 100644
--- a/test/CodeGen/Mips/mips64muldiv.ll
+++ b/test/CodeGen/Mips/mips64muldiv.ll
@@ -1,3 +1,4 @@
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s
; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
define i64 @m0(i64 %a0, i64 %a1) nounwind readnone {
diff --git a/test/CodeGen/Mips/mips64r6/compatibility.ll b/test/CodeGen/Mips/mips64r6/compatibility.ll
new file mode 100644
index 0000000..429f68d
--- /dev/null
+++ b/test/CodeGen/Mips/mips64r6/compatibility.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=mipsel -mcpu=mips64r6 < %s | FileCheck %s
+; RUN: not llc -march=mipsel -mcpu=mips64r6 -mattr=+dsp < %s 2>&1 | FileCheck --check-prefix=DSP %s
+
+; CHECK: foo:
+; DSP: MIPS64r6 is not compatible with the DSP ASE
+
+define void @foo() nounwind {
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/basic_operations.ll b/test/CodeGen/Mips/msa/basic_operations.ll
index 2725e9a..dbdf42b 100644
--- a/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/test/CodeGen/Mips/msa/basic_operations.ll
@@ -6,10 +6,11 @@
@v8i16 = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@v4i32 = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@v2i64 = global <2 x i64> <i64 0, i64 0>
+@i32 = global i32 0
@i64 = global i64 0
define void @const_v16i8() nounwind {
- ; MIPS32-AE: const_v16i8:
+ ; MIPS32-AE-LABEL: const_v16i8:
store volatile <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8>*@v16i8
; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
@@ -45,7 +46,7 @@ define void @const_v16i8() nounwind {
}
define void @const_v8i16() nounwind {
- ; MIPS32-AE: const_v8i16:
+ ; MIPS32-AE-LABEL: const_v8i16:
store volatile <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16>*@v8i16
; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
@@ -76,7 +77,7 @@ define void @const_v8i16() nounwind {
}
define void @const_v4i32() nounwind {
- ; MIPS32-AE: const_v4i32:
+ ; MIPS32-AE-LABEL: const_v4i32:
store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>*@v4i32
; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
@@ -107,7 +108,7 @@ define void @const_v4i32() nounwind {
}
define void @const_v2i64() nounwind {
- ; MIPS32-AE: const_v2i64:
+ ; MIPS32-AE-LABEL: const_v2i64:
store volatile <2 x i64> <i64 0, i64 0>, <2 x i64>*@v2i64
; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
@@ -137,7 +138,7 @@ define void @const_v2i64() nounwind {
}
define void @nonconst_v16i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h) nounwind {
- ; MIPS32-AE: nonconst_v16i8:
+ ; MIPS32-AE-LABEL: nonconst_v16i8:
%1 = insertelement <16 x i8> undef, i8 %a, i32 0
%2 = insertelement <16 x i8> %1, i8 %b, i32 1
@@ -187,7 +188,7 @@ define void @nonconst_v16i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8
}
define void @nonconst_v8i16(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h) nounwind {
- ; MIPS32-AE: nonconst_v8i16:
+ ; MIPS32-AE-LABEL: nonconst_v8i16:
%1 = insertelement <8 x i16> undef, i16 %a, i32 0
%2 = insertelement <8 x i16> %1, i16 %b, i32 1
@@ -221,7 +222,7 @@ define void @nonconst_v8i16(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16
}
define void @nonconst_v4i32(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
- ; MIPS32-AE: nonconst_v4i32:
+ ; MIPS32-AE-LABEL: nonconst_v4i32:
%1 = insertelement <4 x i32> undef, i32 %a, i32 0
%2 = insertelement <4 x i32> %1, i32 %b, i32 1
@@ -239,7 +240,7 @@ define void @nonconst_v4i32(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
}
define void @nonconst_v2i64(i64 %a, i64 %b) nounwind {
- ; MIPS32-AE: nonconst_v2i64:
+ ; MIPS32-AE-LABEL: nonconst_v2i64:
%1 = insertelement <2 x i64> undef, i64 %a, i32 0
%2 = insertelement <2 x i64> %1, i64 %b, i32 1
@@ -255,7 +256,7 @@ define void @nonconst_v2i64(i64 %a, i64 %b) nounwind {
}
define i32 @extract_sext_v16i8() nounwind {
- ; MIPS32-AE: extract_sext_v16i8:
+ ; MIPS32-AE-LABEL: extract_sext_v16i8:
%1 = load <16 x i8>* @v16i8
; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
@@ -274,7 +275,7 @@ define i32 @extract_sext_v16i8() nounwind {
}
define i32 @extract_sext_v8i16() nounwind {
- ; MIPS32-AE: extract_sext_v8i16:
+ ; MIPS32-AE-LABEL: extract_sext_v8i16:
%1 = load <8 x i16>* @v8i16
; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
@@ -293,7 +294,7 @@ define i32 @extract_sext_v8i16() nounwind {
}
define i32 @extract_sext_v4i32() nounwind {
- ; MIPS32-AE: extract_sext_v4i32:
+ ; MIPS32-AE-LABEL: extract_sext_v4i32:
%1 = load <4 x i32>* @v4i32
; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
@@ -309,7 +310,7 @@ define i32 @extract_sext_v4i32() nounwind {
}
define i64 @extract_sext_v2i64() nounwind {
- ; MIPS32-AE: extract_sext_v2i64:
+ ; MIPS32-AE-LABEL: extract_sext_v2i64:
%1 = load <2 x i64>* @v2i64
; MIPS32-AE-DAG: ld.d [[R1:\$w[0-9]+]],
@@ -328,7 +329,7 @@ define i64 @extract_sext_v2i64() nounwind {
}
define i32 @extract_zext_v16i8() nounwind {
- ; MIPS32-AE: extract_zext_v16i8:
+ ; MIPS32-AE-LABEL: extract_zext_v16i8:
%1 = load <16 x i8>* @v16i8
; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
@@ -346,7 +347,7 @@ define i32 @extract_zext_v16i8() nounwind {
}
define i32 @extract_zext_v8i16() nounwind {
- ; MIPS32-AE: extract_zext_v8i16:
+ ; MIPS32-AE-LABEL: extract_zext_v8i16:
%1 = load <8 x i16>* @v8i16
; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
@@ -364,7 +365,7 @@ define i32 @extract_zext_v8i16() nounwind {
}
define i32 @extract_zext_v4i32() nounwind {
- ; MIPS32-AE: extract_zext_v4i32:
+ ; MIPS32-AE-LABEL: extract_zext_v4i32:
%1 = load <4 x i32>* @v4i32
; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
@@ -380,7 +381,7 @@ define i32 @extract_zext_v4i32() nounwind {
}
define i64 @extract_zext_v2i64() nounwind {
- ; MIPS32-AE: extract_zext_v2i64:
+ ; MIPS32-AE-LABEL: extract_zext_v2i64:
%1 = load <2 x i64>* @v2i64
; MIPS32-AE-DAG: ld.d [[R1:\$w[0-9]+]],
@@ -397,8 +398,200 @@ define i64 @extract_zext_v2i64() nounwind {
; MIPS32-AE: .size extract_zext_v2i64
}
+define i32 @extract_sext_v16i8_vidx() nounwind {
+ ; MIPS32-AE-LABEL: extract_sext_v16i8_vidx:
+
+ %1 = load <16 x i8>* @v16i8
+ ; MIPS32-AE-DAG: lw [[PTR_V:\$[0-9]+]], %got(v16i8)(
+ ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = add <16 x i8> %1, %1
+ ; MIPS32-AE-DAG: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <16 x i8> %2, i32 %3
+ %5 = sext i8 %4 to i32
+ ; MIPS32-AE-DAG: splat.b $w[[R3:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R5:\$[0-9]+]], $f[[R3]]
+ ; MIPS32-AE-DAG: sra [[R6:\$[0-9]+]], [[R5]], 24
+
+ ret i32 %5
+ ; MIPS32-AE: .size extract_sext_v16i8_vidx
+}
+
+define i32 @extract_sext_v8i16_vidx() nounwind {
+ ; MIPS32-AE-LABEL: extract_sext_v8i16_vidx:
+
+ %1 = load <8 x i16>* @v8i16
+ ; MIPS32-AE-DAG: lw [[PTR_V:\$[0-9]+]], %got(v8i16)(
+ ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = add <8 x i16> %1, %1
+ ; MIPS32-AE-DAG: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <8 x i16> %2, i32 %3
+ %5 = sext i16 %4 to i32
+ ; MIPS32-AE-DAG: splat.h $w[[R3:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R5:\$[0-9]+]], $f[[R3]]
+ ; MIPS32-AE-DAG: sra [[R6:\$[0-9]+]], [[R5]], 16
+
+ ret i32 %5
+ ; MIPS32-AE: .size extract_sext_v8i16_vidx
+}
+
+define i32 @extract_sext_v4i32_vidx() nounwind {
+ ; MIPS32-AE-LABEL: extract_sext_v4i32_vidx:
+
+ %1 = load <4 x i32>* @v4i32
+ ; MIPS32-AE-DAG: lw [[PTR_V:\$[0-9]+]], %got(v4i32)(
+ ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = add <4 x i32> %1, %1
+ ; MIPS32-AE-DAG: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <4 x i32> %2, i32 %3
+ ; MIPS32-AE-DAG: splat.w $w[[R3:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R5:\$[0-9]+]], $f[[R3]]
+ ; MIPS32-AE-NOT: sra
+
+ ret i32 %4
+ ; MIPS32-AE: .size extract_sext_v4i32_vidx
+}
+
+define i64 @extract_sext_v2i64_vidx() nounwind {
+ ; MIPS32-AE-LABEL: extract_sext_v2i64_vidx:
+
+ %1 = load <2 x i64>* @v2i64
+ ; MIPS32-AE-DAG: lw [[PTR_V:\$[0-9]+]], %got(v2i64)(
+ ; MIPS32-AE-DAG: ld.d [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = add <2 x i64> %1, %1
+ ; MIPS32-AE-DAG: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <2 x i64> %2, i32 %3
+ ; MIPS32-AE-DAG: splat.w $w[[R3:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R5:\$[0-9]+]], $f[[R3]]
+ ; MIPS32-AE-DAG: splat.w $w[[R4:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R6:\$[0-9]+]], $f[[R4]]
+ ; MIPS32-AE-NOT: sra
+
+ ret i64 %4
+ ; MIPS32-AE: .size extract_sext_v2i64_vidx
+}
+
+define i32 @extract_zext_v16i8_vidx() nounwind {
+ ; MIPS32-AE-LABEL: extract_zext_v16i8_vidx:
+
+ %1 = load <16 x i8>* @v16i8
+ ; MIPS32-AE-DAG: lw [[PTR_V:\$[0-9]+]], %got(v16i8)(
+ ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = add <16 x i8> %1, %1
+ ; MIPS32-AE-DAG: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <16 x i8> %2, i32 %3
+ %5 = zext i8 %4 to i32
+ ; MIPS32-AE-DAG: splat.b $w[[R3:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R5:\$[0-9]+]], $f[[R3]]
+ ; MIPS32-AE-DAG: srl [[R6:\$[0-9]+]], [[R5]], 24
+
+ ret i32 %5
+ ; MIPS32-AE: .size extract_zext_v16i8_vidx
+}
+
+define i32 @extract_zext_v8i16_vidx() nounwind {
+ ; MIPS32-AE-LABEL: extract_zext_v8i16_vidx:
+
+ %1 = load <8 x i16>* @v8i16
+ ; MIPS32-AE-DAG: lw [[PTR_V:\$[0-9]+]], %got(v8i16)(
+ ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = add <8 x i16> %1, %1
+ ; MIPS32-AE-DAG: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <8 x i16> %2, i32 %3
+ %5 = zext i16 %4 to i32
+ ; MIPS32-AE-DAG: splat.h $w[[R3:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R5:\$[0-9]+]], $f[[R3]]
+ ; MIPS32-AE-DAG: srl [[R6:\$[0-9]+]], [[R5]], 16
+
+ ret i32 %5
+ ; MIPS32-AE: .size extract_zext_v8i16_vidx
+}
+
+define i32 @extract_zext_v4i32_vidx() nounwind {
+ ; MIPS32-AE-LABEL: extract_zext_v4i32_vidx:
+
+ %1 = load <4 x i32>* @v4i32
+ ; MIPS32-AE-DAG: lw [[PTR_V:\$[0-9]+]], %got(v4i32)(
+ ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = add <4 x i32> %1, %1
+ ; MIPS32-AE-DAG: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <4 x i32> %2, i32 %3
+ ; MIPS32-AE-DAG: splat.w $w[[R3:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R5:\$[0-9]+]], $f[[R3]]
+ ; MIPS32-AE-NOT: srl
+
+ ret i32 %4
+ ; MIPS32-AE: .size extract_zext_v4i32_vidx
+}
+
+define i64 @extract_zext_v2i64_vidx() nounwind {
+ ; MIPS32-AE-LABEL: extract_zext_v2i64_vidx:
+
+ %1 = load <2 x i64>* @v2i64
+ ; MIPS32-AE-DAG: lw [[PTR_V:\$[0-9]+]], %got(v2i64)(
+ ; MIPS32-AE-DAG: ld.d [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = add <2 x i64> %1, %1
+ ; MIPS32-AE-DAG: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <2 x i64> %2, i32 %3
+ ; MIPS32-AE-DAG: splat.w $w[[R3:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R5:\$[0-9]+]], $f[[R3]]
+ ; MIPS32-AE-DAG: splat.w $w[[R4:[0-9]+]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: mfc1 [[R6:\$[0-9]+]], $f[[R4]]
+ ; MIPS32-AE-NOT: srl
+
+ ret i64 %4
+ ; MIPS32-AE: .size extract_zext_v2i64_vidx
+}
+
define void @insert_v16i8(i32 %a) nounwind {
- ; MIPS32-AE: insert_v16i8:
+ ; MIPS32-AE-LABEL: insert_v16i8:
%1 = load <16 x i8>* @v16i8
; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
@@ -420,7 +613,7 @@ define void @insert_v16i8(i32 %a) nounwind {
}
define void @insert_v8i16(i32 %a) nounwind {
- ; MIPS32-AE: insert_v8i16:
+ ; MIPS32-AE-LABEL: insert_v8i16:
%1 = load <8 x i16>* @v8i16
; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
@@ -442,7 +635,7 @@ define void @insert_v8i16(i32 %a) nounwind {
}
define void @insert_v4i32(i32 %a) nounwind {
- ; MIPS32-AE: insert_v4i32:
+ ; MIPS32-AE-LABEL: insert_v4i32:
%1 = load <4 x i32>* @v4i32
; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
@@ -461,7 +654,7 @@ define void @insert_v4i32(i32 %a) nounwind {
}
define void @insert_v2i64(i64 %a) nounwind {
- ; MIPS32-AE: insert_v2i64:
+ ; MIPS32-AE-LABEL: insert_v2i64:
%1 = load <2 x i64>* @v2i64
; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
@@ -480,8 +673,131 @@ define void @insert_v2i64(i64 %a) nounwind {
; MIPS32-AE: .size insert_v2i64
}
+define void @insert_v16i8_vidx(i32 %a) nounwind {
+ ; MIPS32-AE: insert_v16i8_vidx:
+
+ %1 = load <16 x i8>* @v16i8
+ ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
+
+ %2 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %a2 = trunc i32 %a to i8
+ %a3 = sext i8 %a2 to i32
+ %a4 = trunc i32 %a3 to i8
+ ; MIPS32-AE-NOT: andi
+ ; MIPS32-AE-NOT: sra
+
+ %3 = insertelement <16 x i8> %1, i8 %a4, i32 %2
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[IDX]]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][0], $4
+ ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[IDX]]
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+ store <16 x i8> %3, <16 x i8>* @v16i8
+ ; MIPS32-AE-DAG: st.b [[R1]]
+
+ ret void
+ ; MIPS32-AE: .size insert_v16i8_vidx
+}
+
+define void @insert_v8i16_vidx(i32 %a) nounwind {
+ ; MIPS32-AE: insert_v8i16_vidx:
+
+ %1 = load <8 x i16>* @v8i16
+ ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
+
+ %2 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %a2 = trunc i32 %a to i16
+ %a3 = sext i16 %a2 to i32
+ %a4 = trunc i32 %a3 to i16
+ ; MIPS32-AE-NOT: andi
+ ; MIPS32-AE-NOT: sra
+
+ %3 = insertelement <8 x i16> %1, i16 %a4, i32 %2
+ ; MIPS32-AE-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 1
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+ ; MIPS32-AE-DAG: insert.h [[R1]][0], $4
+ ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+ store <8 x i16> %3, <8 x i16>* @v8i16
+ ; MIPS32-AE-DAG: st.h [[R1]]
+
+ ret void
+ ; MIPS32-AE: .size insert_v8i16_vidx
+}
+
+define void @insert_v4i32_vidx(i32 %a) nounwind {
+ ; MIPS32-AE: insert_v4i32_vidx:
+
+ %1 = load <4 x i32>* @v4i32
+ ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ %2 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ ; MIPS32-AE-NOT: andi
+ ; MIPS32-AE-NOT: sra
+
+ %3 = insertelement <4 x i32> %1, i32 %a, i32 %2
+ ; MIPS32-AE-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 2
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+ ; MIPS32-AE-DAG: insert.w [[R1]][0], $4
+ ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+ store <4 x i32> %3, <4 x i32>* @v4i32
+ ; MIPS32-AE-DAG: st.w [[R1]]
+
+ ret void
+ ; MIPS32-AE: .size insert_v4i32_vidx
+}
+
+define void @insert_v2i64_vidx(i64 %a) nounwind {
+ ; MIPS32-AE: insert_v2i64_vidx:
+
+ %1 = load <2 x i64>* @v2i64
+ ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ %2 = load i32* @i32
+ ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ ; MIPS32-AE-NOT: andi
+ ; MIPS32-AE-NOT: sra
+
+ %3 = insertelement <2 x i64> %1, i64 %a, i32 %2
+ ; TODO: This code could be a lot better but it works. The legalizer splits
+ ; 64-bit inserts into two 32-bit inserts because there is no i64 type on
+ ; MIPS32. The obvious optimisation is to perform both insert.w's at once while
+ ; the vector is rotated.
+ ; MIPS32-AE-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 2
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+ ; MIPS32-AE-DAG: insert.w [[R1]][0], $4
+ ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+ ; MIPS32-AE-DAG: addiu [[IDX2:\$[0-9]+]], [[IDX]], 1
+ ; MIPS32-AE-DAG: sll [[BIDX:\$[0-9]+]], [[IDX2]], 2
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+ ; MIPS32-AE-DAG: insert.w [[R1]][0], $5
+ ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+ ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+ store <2 x i64> %3, <2 x i64>* @v2i64
+ ; MIPS32-AE-DAG: st.w [[R1]]
+
+ ret void
+ ; MIPS32-AE: .size insert_v2i64_vidx
+}
+
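The insert_*_vidx tests above all use the same MSA lowering for a run-time element index: rotate the vector with sld.b so the wanted element sits in lane 0, perform the insert there, then rotate back by the negated byte offset (the extract_*_vidx tests instead use splat.b/h/w to move the element into lane 0 and read it with mfc1). A minimal standalone sketch of IR that drives that path, assuming MSA is enabled (e.g. -mattr=+msa); the global and function names here are illustrative and not part of the test file:

@vec = global <4 x i32> zeroinitializer
@n   = global i32 0

define void @insert_at_runtime_index(i32 %val) nounwind {
  ; Load the vector and a run-time element index.
  %v = load <4 x i32>* @vec
  %i = load i32* @n
  ; A non-constant index forces the sld.b / insert.w / sld.b rotation
  ; sequence checked in insert_v4i32_vidx; a constant index would be
  ; expected to select a plain insert.w with an immediate lane instead.
  %r = insertelement <4 x i32> %v, i32 %val, i32 %i
  store <4 x i32> %r, <4 x i32>* @vec
  ret void
}

The i64 case splits into two such rotations on MIPS32, as the TODO above notes, because each 32-bit half is inserted separately.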
define void @truncstore() nounwind {
- ; MIPS32-AE: truncstore:
+ ; MIPS32-AE-LABEL: truncstore:
store volatile <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, <4 x i8>*@v4i8
; TODO: What code should be emitted?
diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll
index c8cef44..a0c9d29 100644
--- a/test/CodeGen/Mips/msa/basic_operations_float.ll
+++ b/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -3,11 +3,12 @@
@v4f32 = global <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
@v2f64 = global <2 x double> <double 0.0, double 0.0>
+@i32 = global i32 0
@f32 = global float 0.0
@f64 = global double 0.0
define void @const_v4f32() nounwind {
- ; MIPS32: const_v4f32:
+ ; MIPS32-LABEL: const_v4f32:
store volatile <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, <4 x float>*@v4f32
; MIPS32: ldi.b [[R1:\$w[0-9]+]], 0
@@ -38,7 +39,7 @@ define void @const_v4f32() nounwind {
}
define void @const_v2f64() nounwind {
- ; MIPS32: const_v2f64:
+ ; MIPS32-LABEL: const_v2f64:
store volatile <2 x double> <double 0.0, double 0.0>, <2 x double>*@v2f64
; MIPS32: ldi.b [[R1:\$w[0-9]+]], 0
@@ -72,7 +73,7 @@ define void @const_v2f64() nounwind {
}
define void @nonconst_v4f32() nounwind {
- ; MIPS32: nonconst_v4f32:
+ ; MIPS32-LABEL: nonconst_v4f32:
%1 = load float *@f32
%2 = insertelement <4 x float> undef, float %1, i32 0
@@ -88,7 +89,7 @@ define void @nonconst_v4f32() nounwind {
}
define void @nonconst_v2f64() nounwind {
- ; MIPS32: nonconst_v2f64:
+ ; MIPS32-LABEL: nonconst_v2f64:
%1 = load double *@f64
%2 = insertelement <2 x double> undef, double %1, i32 0
@@ -102,7 +103,7 @@ define void @nonconst_v2f64() nounwind {
}
define float @extract_v4f32() nounwind {
- ; MIPS32: extract_v4f32:
+ ; MIPS32-LABEL: extract_v4f32:
%1 = load <4 x float>* @v4f32
; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
@@ -120,7 +121,7 @@ define float @extract_v4f32() nounwind {
}
define float @extract_v4f32_elt0() nounwind {
- ; MIPS32: extract_v4f32_elt0:
+ ; MIPS32-LABEL: extract_v4f32_elt0:
%1 = load <4 x float>* @v4f32
; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
@@ -138,7 +139,7 @@ define float @extract_v4f32_elt0() nounwind {
}
define float @extract_v4f32_elt2() nounwind {
- ; MIPS32: extract_v4f32_elt2:
+ ; MIPS32-LABEL: extract_v4f32_elt2:
%1 = load <4 x float>* @v4f32
; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
@@ -155,8 +156,29 @@ define float @extract_v4f32_elt2() nounwind {
; MIPS32: .size extract_v4f32_elt2
}
+define float @extract_v4f32_vidx() nounwind {
+ ; MIPS32-LABEL: extract_v4f32_vidx:
+
+ %1 = load <4 x float>* @v4f32
+ ; MIPS32-DAG: lw [[PTR_V:\$[0-9]+]], %got(v4f32)(
+ ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = fadd <4 x float> %1, %1
+ ; MIPS32-DAG: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <4 x float> %2, i32 %3
+ ; MIPS32-DAG: splat.w $w0, [[R1]]{{\[}}[[IDX]]]
+
+ ret float %4
+ ; MIPS32: .size extract_v4f32_vidx
+}
+
define double @extract_v2f64() nounwind {
- ; MIPS32: extract_v2f64:
+ ; MIPS32-LABEL: extract_v2f64:
%1 = load <2 x double>* @v2f64
; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
@@ -179,7 +201,7 @@ define double @extract_v2f64() nounwind {
}
define double @extract_v2f64_elt0() nounwind {
- ; MIPS32: extract_v2f64_elt0:
+ ; MIPS32-LABEL: extract_v2f64_elt0:
%1 = load <2 x double>* @v2f64
; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
@@ -199,8 +221,29 @@ define double @extract_v2f64_elt0() nounwind {
; MIPS32: .size extract_v2f64_elt0
}
+define double @extract_v2f64_vidx() nounwind {
+ ; MIPS32-LABEL: extract_v2f64_vidx:
+
+ %1 = load <2 x double>* @v2f64
+ ; MIPS32-DAG: lw [[PTR_V:\$[0-9]+]], %got(v2f64)(
+ ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = fadd <2 x double> %1, %1
+ ; MIPS32-DAG: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = load i32* @i32
+ ; MIPS32-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %4 = extractelement <2 x double> %2, i32 %3
+ ; MIPS32-DAG: splat.d $w0, [[R1]]{{\[}}[[IDX]]]
+
+ ret double %4
+ ; MIPS32: .size extract_v2f64_vidx
+}
+
define void @insert_v4f32(float %a) nounwind {
- ; MIPS32: insert_v4f32:
+ ; MIPS32-LABEL: insert_v4f32:
%1 = load <4 x float>* @v4f32
; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
@@ -217,7 +260,7 @@ define void @insert_v4f32(float %a) nounwind {
}
define void @insert_v2f64(double %a) nounwind {
- ; MIPS32: insert_v2f64:
+ ; MIPS32-LABEL: insert_v2f64:
%1 = load <2 x double>* @v2f64
; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
@@ -232,3 +275,55 @@ define void @insert_v2f64(double %a) nounwind {
ret void
; MIPS32: .size insert_v2f64
}
+
+define void @insert_v4f32_vidx(float %a) nounwind {
+ ; MIPS32-LABEL: insert_v4f32_vidx:
+
+ %1 = load <4 x float>* @v4f32
+ ; MIPS32-DAG: lw [[PTR_V:\$[0-9]+]], %got(v4f32)(
+ ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = load i32* @i32
+ ; MIPS32-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %3 = insertelement <4 x float> %1, float %a, i32 %2
+ ; float argument passed in $f12
+ ; MIPS32-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 2
+ ; MIPS32-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+ ; MIPS32-DAG: insve.w [[R1]][0], $w12[0]
+ ; MIPS32-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+ ; MIPS32-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+ store <4 x float> %3, <4 x float>* @v4f32
+ ; MIPS32-DAG: st.w [[R1]]
+
+ ret void
+ ; MIPS32: .size insert_v4f32_vidx
+}
+
+define void @insert_v2f64_vidx(double %a) nounwind {
+ ; MIPS32-LABEL: insert_v2f64_vidx:
+
+ %1 = load <2 x double>* @v2f64
+ ; MIPS32-DAG: lw [[PTR_V:\$[0-9]+]], %got(v2f64)(
+ ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+ %2 = load i32* @i32
+ ; MIPS32-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+ ; MIPS32-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+ %3 = insertelement <2 x double> %1, double %a, i32 %2
+ ; double argument passed in $f12
+ ; MIPS32-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 3
+ ; MIPS32-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+ ; MIPS32-DAG: insve.d [[R1]][0], $w12[0]
+ ; MIPS32-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+ ; MIPS32-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+ store <2 x double> %3, <2 x double>* @v2f64
+ ; MIPS32-DAG: st.d [[R1]]
+
+ ret void
+ ; MIPS32: .size insert_v2f64_vidx
+}
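For the floating-point vectors the variable-index extract above is simpler than the integer one: the selected element can stay in an MSA register, so a single splat.w/splat.d with the run-time index moves it into element 0 of the return register ($w0) and no mfc1 copy or shift is needed. A minimal sketch of IR that takes that path; @vals, @n and the function name are illustrative:

@vals = global <4 x float> zeroinitializer
@n    = global i32 0

define float @extract_at_runtime_index() nounwind {
  ; Load the vector and a run-time element index.
  %v = load <4 x float>* @vals
  %i = load i32* @n
  ; With a non-constant index this is expected to lower to a single
  ; splat.w $w0, $wN[$idx], as checked in extract_v4f32_vidx above.
  %e = extractelement <4 x float> %v, i32 %i
  ret float %e
}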
diff --git a/test/CodeGen/Mips/optimize-fp-math.ll b/test/CodeGen/Mips/optimize-fp-math.ll
index 8b71dc4..7886f29 100644
--- a/test/CodeGen/Mips/optimize-fp-math.ll
+++ b/test/CodeGen/Mips/optimize-fp-math.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=64
; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=64
; 32-LABEL: test_sqrtf_float_:
diff --git a/test/CodeGen/Mips/remat-immed-load.ll b/test/CodeGen/Mips/remat-immed-load.ll
index d93964b..b53b156 100644
--- a/test/CodeGen/Mips/remat-immed-load.ll
+++ b/test/CodeGen/Mips/remat-immed-load.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck %s -check-prefix=64
; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=64
define void @f0() nounwind {
diff --git a/test/CodeGen/Mips/sint-fp-store_pattern.ll b/test/CodeGen/Mips/sint-fp-store_pattern.ll
index c44ea08..2735d78 100644
--- a/test/CodeGen/Mips/sint-fp-store_pattern.ll
+++ b/test/CodeGen/Mips/sint-fp-store_pattern.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=64
; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=64
@gint_ = external global i32
diff --git a/test/CodeGen/Mips/start-asm-file.ll b/test/CodeGen/Mips/start-asm-file.ll
new file mode 100644
index 0000000..8872464
--- /dev/null
+++ b/test/CodeGen/Mips/start-asm-file.ll
@@ -0,0 +1,91 @@
+; Check the emission of directives at the start of an asm file.
+; This test is XFAILED until we fix the emission of '.option pic0' on
+; N32. At the moment we check whether the subtarget is Mips64 when we
+; should be checking the Subtarget's ABI.
+
+; ### O32 ABI ###
+; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
+; RUN: -relocation-model=static %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-STATIC-O32 -check-prefix=CHECK-STATIC-O32-NLEGACY %s
+
+; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
+; RUN: -relocation-model=pic %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-PIC-O32 -check-prefix=CHECK-PIC-O32-NLEGACY %s
+
+; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
+; RUN: -relocation-model=static -mattr=+nan2008 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-STATIC-O32 -check-prefix=CHECK-STATIC-O32-N2008 %s
+
+; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
+; RUN: -relocation-model=pic -mattr=+nan2008 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-PIC-O32 -check-prefix=CHECK-PIC-O32-N2008 %s
+
+; ### N32 ABI ###
+; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
+; RUN: -relocation-model=static -mattr=-n64,+n32 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-STATIC-N32 -check-prefix=CHECK-STATIC-N32-NLEGACY %s
+
+; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
+; RUN: -relocation-model=pic -mattr=-n64,+n32 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-PIC-N32 -check-prefix=CHECK-PIC-N32-NLEGACY %s
+
+; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
+; RUN: -relocation-model=static -mattr=-n64,+n32,+nan2008 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-STATIC-N32 -check-prefix=CHECK-STATIC-N32-N2008 %s
+
+; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
+; RUN: -relocation-model=pic -mattr=-n64,+n32,+nan2008 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-PIC-N32 -check-prefix=CHECK-PIC-N32-N2008 %s
+
+; ### N64 ABI ###
+; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
+; RUN: -relocation-model=static -mattr=+n64 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-STATIC-N64 -check-prefix=CHECK-STATIC-N64-NLEGACY %s
+
+; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
+; RUN: -relocation-model=pic -mattr=+n64 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-PIC-N64 -check-prefix=CHECK-PIC-N64-NLEGACY %s
+
+; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
+; RUN: -relocation-model=static -mattr=+n64,+nan2008 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-STATIC-N64 -check-prefix=CHECK-STATIC-N64-N2008 %s
+
+; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
+; RUN: -relocation-model=pic -mattr=+n64,+nan2008 %s -o - | \
+; RUN: FileCheck -check-prefix=CHECK-PIC-N64 -check-prefix=CHECK-PIC-N64-N2008 %s
+
+; CHECK-STATIC-O32: .abicalls
+; CHECK-STATIC-O32: .option pic0
+; CHECK-STATIC-O32: .section .mdebug.abi32
+; CHECK-STATIC-O32-NLEGACY: .nan legacy
+; CHECK-STATIC-O32-N2008: .nan 2008
+
+; CHECK-PIC-O32: .abicalls
+; CHECK-PIC-O32-NOT: .option pic0
+; CHECK-PIC-O32: .section .mdebug.abi32
+; CHECK-PIC-O32-NLEGACY: .nan legacy
+; CHECK-PIC-O32-N2008: .nan 2008
+
+; CHECK-STATIC-N32: .abicalls
+; CHECK-STATIC-N32: .option pic0
+; CHECK-STATIC-N32: .section .mdebug.abiN32
+; CHECK-STATIC-N32-NLEGACY: .nan legacy
+; CHECK-STATIC-N32-N2008: .nan 2008
+
+; CHECK-PIC-N32: .abicalls
+; CHECK-PIC-N32-NOT: .option pic0
+; CHECK-PIC-N32: .section .mdebug.abiN32
+; CHECK-PIC-N32-NLEGACY: .nan legacy
+; CHECK-PIC-N32-N2008: .nan 2008
+
+; CHECK-STATIC-N64: .abicalls
+; CHECK-STATIC-N64-NOT: .option pic0
+; CHECK-STATIC-N64: .section .mdebug.abi64
+; CHECK-STATIC-N64-NLEGACY: .nan legacy
+; CHECK-STATIC-N64-N2008: .nan 2008
+
+; CHECK-PIC-N64: .abicalls
+; CHECK-PIC-N64-NOT: .option pic0
+; CHECK-PIC-N64: .section .mdebug.abi64
+; CHECK-PIC-N64-NLEGACY: .nan legacy
+; CHECK-PIC-N64-N2008: .nan 2008
diff --git a/test/CodeGen/Mips/tls-alias.ll b/test/CodeGen/Mips/tls-alias.ll
index 3c81054..80fbe87 100644
--- a/test/CodeGen/Mips/tls-alias.ll
+++ b/test/CodeGen/Mips/tls-alias.ll
@@ -5,6 +5,6 @@
define i32* @zed() {
; CHECK-DAG: __tls_get_addr
-; CHECK-DAG: %tlsgd(bar)
+; CHECK-DAG: %tlsldm(bar)
ret i32* @bar
}
diff --git a/test/CodeGen/Mips/unalignedload.ll b/test/CodeGen/Mips/unalignedload.ll
index 19f3af7..2002b1c 100644
--- a/test/CodeGen/Mips/unalignedload.ll
+++ b/test/CodeGen/Mips/unalignedload.ll
@@ -1,5 +1,9 @@
-; RUN: llc < %s -march=mipsel | FileCheck %s -check-prefix=CHECK-EL
-; RUN: llc < %s -march=mips | FileCheck %s -check-prefix=CHECK-EB
+; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EL -check-prefix=MIPS32-EL
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EB -check-prefix=MIPS32-EB
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EL -check-prefix=MIPS32-EL
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EB -check-prefix=MIPS32-EB
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EL -check-prefix=MIPS32R6-EL
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EB -check-prefix=MIPS32R6-EB
%struct.S2 = type { %struct.S1, %struct.S1 }
%struct.S1 = type { i8, i8 }
%struct.S4 = type { [7 x i8] }
@@ -7,21 +11,71 @@
@s2 = common global %struct.S2 zeroinitializer, align 1
@s4 = common global %struct.S4 zeroinitializer, align 1
-define void @foo1() nounwind {
+define void @bar1() nounwind {
entry:
-; CHECK-EL-DAG: lbu ${{[0-9]+}}, 2($[[R0:[0-9]+]])
-; CHECK-EL-DAG: lbu ${{[0-9]+}}, 3($[[R0]])
-; CHECK-EL: jalr
-; CHECK-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[R2:[0-9]+]])
-; CHECK-EL-DAG: lwr $[[R1]], 0($[[R2]])
-
-; CHECK-EB-DAG: lbu ${{[0-9]+}}, 3($[[R0:[0-9]+]])
-; CHECK-EB-DAG: lbu ${{[0-9]+}}, 2($[[R0]])
-; CHECK-EB: jalr
-; CHECK-EB-DAG: lwl $[[R1:[0-9]+]], 0($[[R2:[0-9]+]])
-; CHECK-EB-DAG: lwr $[[R1]], 3($[[R2]])
+; ALL-LABEL: bar1:
+
+; ALL-DAG: lw $[[R0:[0-9]+]], %got(s2)(
+
+; MIPS32-EL-DAG: lbu $[[PART1:[0-9]+]], 2($[[R0]])
+; MIPS32-EL-DAG: lbu $[[PART2:[0-9]+]], 3($[[R0]])
+; MIPS32-EL-DAG: sll $[[T0:[0-9]+]], $[[PART2]], 8
+; MIPS32-EL-DAG: or $4, $[[T0]], $[[PART1]]
+
+; MIPS32-EB-DAG: lbu $[[PART1:[0-9]+]], 2($[[R0]])
+; MIPS32-EB-DAG: lbu $[[PART2:[0-9]+]], 3($[[R0]])
+; MIPS32-EB-DAG: sll $[[T0:[0-9]+]], $[[PART1]], 8
+; MIPS32-EB-DAG: or $[[T1:[0-9]+]], $[[T0]], $[[PART2]]
+; MIPS32-EB-DAG: sll $4, $[[T1]], 16
+
+; MIPS32R6-DAG: lhu $[[PART1:[0-9]+]], 2($[[R0]])
tail call void @foo2(%struct.S1* byval getelementptr inbounds (%struct.S2* @s2, i32 0, i32 1)) nounwind
+ ret void
+}
+
+define void @bar2() nounwind {
+entry:
+; ALL-LABEL: bar2:
+
+; ALL-DAG: lw $[[R2:[0-9]+]], %got(s4)(
+
+; MIPS32-EL-DAG: lwl $[[R1:4]], 3($[[R2]])
+; MIPS32-EL-DAG: lwr $[[R1]], 0($[[R2]])
+; MIPS32-EL-DAG: lbu $[[T0:[0-9]+]], 4($[[R2]])
+; MIPS32-EL-DAG: lbu $[[T1:[0-9]+]], 5($[[R2]])
+; MIPS32-EL-DAG: lbu $[[T2:[0-9]+]], 6($[[R2]])
+; MIPS32-EL-DAG: sll $[[T3:[0-9]+]], $[[T1]], 8
+; MIPS32-EL-DAG: or $[[T4:[0-9]+]], $[[T3]], $[[T0]]
+; MIPS32-EL-DAG: sll $[[T5:[0-9]+]], $[[T2]], 16
+; MIPS32-EL-DAG: or $5, $[[T4]], $[[T5]]
+
+; MIPS32-EB-DAG: lwl $[[R1:4]], 0($[[R2]])
+; MIPS32-EB-DAG: lwr $[[R1]], 3($[[R2]])
+; MIPS32-EB-DAG: lbu $[[T0:[0-9]+]], 4($[[R2]])
+; MIPS32-EB-DAG: lbu $[[T1:[0-9]+]], 5($[[R2]])
+; MIPS32-EB-DAG: lbu $[[T2:[0-9]+]], 6($[[R2]])
+; MIPS32-EB-DAG: sll $[[T3:[0-9]+]], $[[T0]], 8
+; MIPS32-EB-DAG: or $[[T4:[0-9]+]], $[[T3]], $[[T1]]
+; MIPS32-EB-DAG: sll $[[T5:[0-9]+]], $[[T4]], 16
+; MIPS32-EB-DAG: sll $[[T6:[0-9]+]], $[[T2]], 8
+; MIPS32-EB-DAG: or $5, $[[T5]], $[[T6]]
+
+; FIXME: We should be able to do better than this using lhu
+; MIPS32R6-EL-DAG: lw $4, 0($[[R2]])
+; MIPS32R6-EL-DAG: lhu $[[T0:[0-9]+]], 4($[[R2]])
+; MIPS32R6-EL-DAG: lbu $[[T1:[0-9]+]], 6($[[R2]])
+; MIPS32R6-EL-DAG: sll $[[T2:[0-9]+]], $[[T1]], 16
+; MIPS32R6-EL-DAG: or $5, $[[T0]], $[[T2]]
+
+; FIXME: We should be able to do better than this using lhu
+; MIPS32R6-EB-DAG: lw $4, 0($[[R2]])
+; MIPS32R6-EB-DAG: lhu $[[T0:[0-9]+]], 4($[[R2]])
+; MIPS32R6-EB-DAG: lbu $[[T1:[0-9]+]], 6($[[R2]])
+; MIPS32R6-EB-DAG: sll $[[T2:[0-9]+]], $[[T0]], 16
+; MIPS32R6-EB-DAG: sll $[[T3:[0-9]+]], $[[T1]], 8
+; MIPS32R6-EB-DAG: or $5, $[[T2]], $[[T3]]
+
tail call void @foo4(%struct.S4* byval @s4) nounwind
ret void
}
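The byval arguments in bar1/bar2 are only byte-aligned, so the outgoing argument words have to be assembled from unaligned memory: pre-R6 cores pair lwl/lwr (plus lbu for the odd bytes), while MIPS32r6, which removed lwl/lwr, falls back to plain lw/lhu/lbu sequences as the FIXMEs above note. A minimal sketch of an unaligned word load that exercises the same lowering; the function name is illustrative:

define i32 @load_unaligned_word(i32* %p) nounwind {
  ; align 1 tells the backend the pointer may not be word-aligned, which
  ; is lowered with an lwl/lwr pair on mips32/mips32r2; mips32r6 has no
  ; lwl/lwr and emits a plain lw instead.
  %v = load i32* %p, align 1
  ret i32 %v
}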
diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll
new file mode 100644
index 0000000..0622aa3
--- /dev/null
+++ b/test/CodeGen/NVPTX/access-non-generic.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: opt < %s -S -nvptx-favor-non-generic -dce | FileCheck %s --check-prefix IR
+
+@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
+@scalar = internal addrspace(3) global float 0.000000e+00, align 4
+
+; Verifies nvptx-favor-non-generic correctly optimizes generic address space
+; usage to non-generic address space usage for the patterns we claim to handle:
+; 1. load cast
+; 2. store cast
+; 3. load gep cast
+; 4. store gep cast
+; gep and cast can be an instruction or a constant expression. This function
+; tries all possible combinations.
+define float @ld_st_shared_f32(i32 %i, float %v) {
+; IR-LABEL: @ld_st_shared_f32
+; IR-NOT: addrspacecast
+; PTX-LABEL: ld_st_shared_f32(
+ ; load cast
+ %1 = load float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
+ ; store cast
+ store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
+ ; use syncthreads to disable optimizations across components
+ call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+ ; cast; load
+ %2 = addrspacecast float addrspace(3)* @scalar to float*
+ %3 = load float* %2, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
+ ; cast; store
+ store float %v, float* %2, align 4
+; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
+ call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+ ; load gep cast
+ %4 = load float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
+ ; store gep cast
+ store float %v, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
+ call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+ ; gep cast; load
+ %5 = getelementptr inbounds [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
+ %6 = load float* %5, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
+ ; gep cast; store
+ store float %v, float* %5, align 4
+; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
+ call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+ ; cast; gep; load
+ %7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
+ %8 = getelementptr inbounds [10 x float]* %7, i32 0, i32 %i
+ %9 = load float* %8, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
+ ; cast; gep; store
+ store float %v, float* %8, align 4
+; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
+ call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+ %sum2 = fadd float %1, %3
+ %sum3 = fadd float %sum2, %4
+ %sum4 = fadd float %sum3, %6
+ %sum5 = fadd float %sum4, %9
+ ret float %sum5
+}
+
+; Verifies nvptx-favor-non-generic keeps addrspacecasts between pointers of
+; different element types.
+define i32 @ld_int_from_float() {
+; IR-LABEL: @ld_int_from_float
+; IR: addrspacecast
+; PTX-LABEL: ld_int_from_float(
+; PTX: cvta.shared.u{{(32|64)}}
+ %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+ ret i32 %1
+}
+
+declare void @llvm.cuda.syncthreads() #3
+
+attributes #3 = { noduplicate nounwind }
+
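The pass exercised above, -nvptx-favor-non-generic, only rewrites an access when the element types on both sides of the addrspacecast match (the ld_int_from_float case keeps its cast). The rewrite amounts to turning a generic load/store through a cast shared-memory pointer into a direct addrspace(3) access so the backend can emit ld.shared/st.shared. A minimal before/after sketch, with @buf as an illustrative global:

@buf = internal addrspace(3) global float 0.000000e+00, align 4

; Written against the generic address space...
define float @read_generic() {
  %p = addrspacecast float addrspace(3)* @buf to float*
  %v = load float* %p, align 4
  ret float %v
}

; ...which the pass is expected to rewrite into the equivalent of:
define float @read_shared() {
  %v = load float addrspace(3)* @buf, align 4
  ret float %v
}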
diff --git a/test/CodeGen/NVPTX/addrspacecast-gvar.ll b/test/CodeGen/NVPTX/addrspacecast-gvar.ll
new file mode 100644
index 0000000..6afbdb8
--- /dev/null
+++ b/test/CodeGen/NVPTX/addrspacecast-gvar.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK: .visible .global .align 4 .u32 g = 42;
+; CHECK: .visible .global .align 4 .u32 g2 = generic(g);
+; CHECK: .visible .global .align 4 .u32 g3 = g;
+
+@g = addrspace(1) global i32 42
+@g2 = addrspace(1) global i32* addrspacecast (i32 addrspace(1)* @g to i32*)
+@g3 = addrspace(1) global i32 addrspace(1)* @g
diff --git a/test/CodeGen/NVPTX/addrspacecast.ll b/test/CodeGen/NVPTX/addrspacecast.ll
index 98ea655..03b9a98 100644
--- a/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/test/CodeGen/NVPTX/addrspacecast.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefix=PTX32
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefix=PTX64
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -disable-nvptx-favor-non-generic | FileCheck %s -check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -disable-nvptx-favor-non-generic | FileCheck %s -check-prefix=PTX64
define i32 @conv1(i32 addrspace(1)* %ptr) {
diff --git a/test/CodeGen/NVPTX/local-stack-frame.ll b/test/CodeGen/NVPTX/local-stack-frame.ll
index 178dff1..c0d7d1c 100644
--- a/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -3,16 +3,16 @@
; Ensure we access the local stack properly
-; PTX32: mov.u32 %r{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}};
-; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
-; PTX32: st.u32 [%SP+0], %r{{[0-9]+}};
-; PTX64: mov.u64 %rl{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX64: cvta.local.u64 %SP, %rl{{[0-9]+}};
-; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
-; PTX64: st.u32 [%SP+0], %r{{[0-9]+}};
+; PTX32: mov.u32 %r{{[0-9]+}}, __local_depot{{[0-9]+}};
+; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}};
+; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
+; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
+; PTX64: mov.u64 %rl{{[0-9]+}}, __local_depot{{[0-9]+}};
+; PTX64: cvta.local.u64 %SP, %rl{{[0-9]+}};
+; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
+; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
define void @foo(i32 %a) {
%local = alloca i32, align 4
- store i32 %a, i32* %local
+ store volatile i32 %a, i32* %local
ret void
}
diff --git a/test/CodeGen/NVPTX/surf-read.ll b/test/CodeGen/NVPTX/surf-read.ll
new file mode 100644
index 0000000..a69d03e
--- /dev/null
+++ b/test/CodeGen/NVPTX/surf-read.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-nvcl"
+
+declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
+
+; CHECK: .entry foo
+define void @foo(i64 %img, float* %red, i32 %idx) {
+; CHECK: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [foo_param_0, {%r{{[0-9]+}}}]
+ %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
+; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+ %ret = sitofp i32 %val to float
+; CHECK: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+ store float %ret, float* %red
+ ret void
+}
+
+!nvvm.annotations = !{!1, !2}
+!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (i64, float*, i32)* @foo, metadata !"rdwrimage", i32 0}
diff --git a/test/CodeGen/NVPTX/surf-write.ll b/test/CodeGen/NVPTX/surf-write.ll
new file mode 100644
index 0000000..880231f
--- /dev/null
+++ b/test/CodeGen/NVPTX/surf-write.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-nvcl"
+
+declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
+
+; CHECK: .entry foo
+define void @foo(i64 %img, i32 %val, i32 %idx) {
+; CHECK: sust.b.1d.b32.trap [foo_param_0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+ tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
+ ret void
+}
+
+!nvvm.annotations = !{!1, !2}
+!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (i64, i32, i32)* @foo, metadata !"wroimage", i32 0}
diff --git a/test/CodeGen/NVPTX/tex-read.ll b/test/CodeGen/NVPTX/tex-read.ll
new file mode 100644
index 0000000..291060b
--- /dev/null
+++ b/test/CodeGen/NVPTX/tex-read.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-nvcl"
+
+declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64, i64, i32)
+
+; CHECK: .entry foo
+define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {
+; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
+ %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64 %img, i64 %sampler, i32 %idx)
+ %ret = extractvalue { float, float, float, float } %val, 0
+; CHECK: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+ store float %ret, float* %red
+ ret void
+}
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"rdoimage", i32 0}
+!3 = metadata !{void (i64, i64, float*, i32)* @foo, metadata !"sampler", i32 1}
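All three new NVCL tests rely on the same !nvvm.annotations convention: one entry with "kernel", i32 1 turns the function into a PTX entry point (the .entry checked above), and further entries such as "rdoimage"/"wroimage"/"rdwrimage" or "sampler" appear to tag the i64 handle parameters by position. A minimal sketch of the kernel annotation alone, with @blank as an illustrative function name:

target triple = "nvptx-unknown-nvcl"

define void @blank(float* %out) {
  ; Trivial body; the interesting part is the metadata below.
  store float 0.000000e+00, float* %out
  ret void
}

; Marks @blank as a kernel so it is emitted as a .entry rather than a .func.
!nvvm.annotations = !{!0}
!0 = metadata !{void (float*)* @blank, metadata !"kernel", i32 1}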
diff --git a/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll b/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
index ccf5297..df83f8b 100644
--- a/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
+++ b/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
@@ -1,4 +1,5 @@
; RUN: llc -mcpu=g5 < %s | FileCheck %s
+; RUN: llc -mcpu=g5 -addr-sink-using-gep=1 < %s | FileCheck %s
;; Formerly crashed, see PR 1508
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
target triple = "powerpc64-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll b/test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll
index 00a402e..8802b97 100644
--- a/test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll
+++ b/test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll
@@ -1,6 +1,5 @@
; RUN: llc < %s -march=ppc32 -mcpu=g5 | grep vadduhm
; RUN: llc < %s -march=ppc32 -mcpu=g5 | grep vsubuhm
-; XFAIL: *
define <4 x i32> @test() nounwind {
ret <4 x i32> < i32 4293066722, i32 4293066722, i32 4293066722, i32 4293066722>
diff --git a/test/CodeGen/PowerPC/aa-tbaa.ll b/test/CodeGen/PowerPC/aa-tbaa.ll
index d7f80fa..1939841 100644
--- a/test/CodeGen/PowerPC/aa-tbaa.ll
+++ b/test/CodeGen/PowerPC/aa-tbaa.ll
@@ -1,4 +1,4 @@
-; RUN: llc -enable-misched -misched=shuffle -enable-aa-sched-mi -post-RA-scheduler=0 -mcpu=ppc64 < %s | FileCheck %s
+; RUN: llc -enable-misched -misched=shuffle -enable-aa-sched-mi -use-tbaa-in-sched-mi=0 -post-RA-scheduler=0 -mcpu=ppc64 < %s | FileCheck %s
; REQUIRES: asserts
; -misched=shuffle is NDEBUG only!
diff --git a/test/CodeGen/PowerPC/alias.ll b/test/CodeGen/PowerPC/alias.ll
new file mode 100644
index 0000000..86e4114
--- /dev/null
+++ b/test/CodeGen/PowerPC/alias.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -code-model=medium| FileCheck --check-prefix=CHECK --check-prefix=MEDIUM %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK --check-prefix=LARGE %s
+
+@foo = global i32 42
+@fooa = alias i32* @foo
+
+@foo2 = global i64 42
+@foo2a = alias i64* @foo2
+
+; CHECK-LABEL: bar:
+define i32 @bar() {
+; MEDIUM: addis 3, 2, fooa@toc@ha
+; LARGE: addis 3, 2, .LC1@toc@ha
+ %a = load i32* @fooa
+ ret i32 %a
+}
+
+; CHECK-LABEL: bar2:
+define i64 @bar2() {
+; MEDIUM: addis 3, 2, foo2a@toc@ha
+; MEDIUM: addi 3, 3, foo2a@toc@l
+; LARGE: addis 3, 2, .LC3@toc@ha
+ %a = load i64* @foo2a
+ ret i64 %a
+}
+
+; LARGE: .LC1:
+; LARGE-NEXT: .tc fooa[TC],fooa
+
+; LARGE: .LC3:
+; LARGE-NEXT: .tc foo2a[TC],foo2a
diff --git a/test/CodeGen/PowerPC/cc.ll b/test/CodeGen/PowerPC/cc.ll
new file mode 100644
index 0000000..f92121b
--- /dev/null
+++ b/test/CodeGen/PowerPC/cc.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i64 @test1(i64 %a, i64 %b) {
+entry:
+ %c = icmp eq i64 %a, %b
+ br label %foo
+
+foo:
+ call { i64, i64 } asm sideeffect "sc", "={r0},={r3},{r0},~{cr0},~{cr1},~{cr2},~{cr3},~{cr4},~{cr5},~{cr6},~{cr7}" (i64 %a)
+ br i1 %c, label %bar, label %end
+
+bar:
+ ret i64 %b
+
+end:
+ ret i64 %a
+
+; CHECK-LABEL: @test1
+; CHECK: mfcr [[REG1:[0-9]+]]
+; CHECK-DAG: cmpd
+; CHECK-DAG: mfocrf [[REG2:[0-9]+]],
+; CHECK-DAG: stw [[REG1]], 8(1)
+; CHECK-DAG: stw [[REG2]], -4(1)
+
+; CHECK: sc
+; CHECK: lwz [[REG3:[0-9]+]], -4(1)
+; CHECK: mtocrf 128, [[REG3]]
+
+; CHECK: lwz [[REG4:[0-9]+]], 8(1)
+; CHECK-DAG: mtocrf 32, [[REG4]]
+; CHECK-DAG: mtocrf 16, [[REG4]]
+; CHECK-DAG: mtocrf 8, [[REG4]]
+; CHECK: blr
+}
+
+define i64 @test2(i64 %a, i64 %b) {
+entry:
+ %c = icmp eq i64 %a, %b
+ br label %foo
+
+foo:
+ call { i64, i64 } asm sideeffect "sc", "={r0},={r3},{r0},~{cc}" (i64 %a)
+ br i1 %c, label %bar, label %end
+
+bar:
+ ret i64 %b
+
+end:
+ ret i64 %a
+
+; CHECK-LABEL: @test2
+; CHECK: mfcr [[REG1:[0-9]+]]
+; CHECK-DAG: cmpd
+; CHECK-DAG: mfocrf [[REG2:[0-9]+]],
+; CHECK-DAG: stw [[REG1]], 8(1)
+; CHECK-DAG: stw [[REG2]], -4(1)
+
+; CHECK: sc
+; CHECK: lwz [[REG3:[0-9]+]], -4(1)
+; CHECK: mtocrf 128, [[REG3]]
+
+; CHECK: lwz [[REG4:[0-9]+]], 8(1)
+; CHECK-DAG: mtocrf 32, [[REG4]]
+; CHECK-DAG: mtocrf 16, [[REG4]]
+; CHECK-DAG: mtocrf 8, [[REG4]]
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/ctrloop-le.ll b/test/CodeGen/PowerPC/ctrloop-le.ll
index 7b8185e..60b0536 100644
--- a/test/CodeGen/PowerPC/ctrloop-le.ll
+++ b/test/CodeGen/PowerPC/ctrloop-le.ll
@@ -2,6 +2,9 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "powerpc64-unknown-linux-gnu"
; RUN: llc < %s -march=ppc64 | FileCheck %s
+; XFAIL: *
+; SE needs improvement
+
; CHECK: test_pos1_ir_sle
; CHECK: bdnz
; a < b
diff --git a/test/CodeGen/PowerPC/ctrloop-lt.ll b/test/CodeGen/PowerPC/ctrloop-lt.ll
index eaab61a..a9dc42c 100644
--- a/test/CodeGen/PowerPC/ctrloop-lt.ll
+++ b/test/CodeGen/PowerPC/ctrloop-lt.ll
@@ -2,6 +2,9 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "powerpc64-unknown-linux-gnu"
; RUN: llc < %s -march=ppc64 | FileCheck %s
+; XFAIL: *
+; SE needs improvement
+
; CHECK: test_pos1_ir_slt
; CHECK: bdnz
; a < b
diff --git a/test/CodeGen/PowerPC/ctrloop-sh.ll b/test/CodeGen/PowerPC/ctrloop-sh.ll
new file mode 100644
index 0000000..d8e6fc7
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -0,0 +1,72 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-p:32:32-i128:64-n32"
+target triple = "powerpc-ellcc-linux"
+
+; Function Attrs: nounwind
+define void @foo1(i128* %a, i128* readonly %b, i128* readonly %c) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %0 = load i128* %b, align 16
+ %1 = load i128* %c, align 16
+ %shl = shl i128 %0, %1
+ store i128 %shl, i128* %a, align 16
+ %inc = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %inc, 2048
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+
+; CHECK-LABEL: @foo1
+; CHECK-NOT: mtctr
+}
+
+; Function Attrs: nounwind
+define void @foo2(i128* %a, i128* readonly %b, i128* readonly %c) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %0 = load i128* %b, align 16
+ %1 = load i128* %c, align 16
+ %shl = ashr i128 %0, %1
+ store i128 %shl, i128* %a, align 16
+ %inc = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %inc, 2048
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+
+; CHECK-LABEL: @foo2
+; CHECK-NOT: mtctr
+}
+
+; Function Attrs: nounwind
+define void @foo3(i128* %a, i128* readonly %b, i128* readonly %c) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %0 = load i128* %b, align 16
+ %1 = load i128* %c, align 16
+ %shl = lshr i128 %0, %1
+ store i128 %shl, i128* %a, align 16
+ %inc = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %inc, 2048
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+
+; CHECK-LABEL: @foo3
+; CHECK-NOT: mtctr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index 0d6c4a6..6beea55 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll
@@ -28,8 +28,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!10 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
!11 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
!12 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!13 = metadata !{metadata !14}
-!14 = metadata !{metadata !15, metadata !16}
+!13 = metadata !{metadata !15, metadata !16}
!15 = metadata !{i32 721153, metadata !5, metadata !"argc", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
!16 = metadata !{i32 721153, metadata !5, metadata !"argv", metadata !6, i32 33554433, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
!17 = metadata !{i32 1, i32 14, metadata !5, null}
diff --git a/test/CodeGen/PowerPC/indexed-load.ll b/test/CodeGen/PowerPC/indexed-load.ll
new file mode 100644
index 0000000..59fc058
--- /dev/null
+++ b/test/CodeGen/PowerPC/indexed-load.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+
+; The SplitIndexingFromLoad transformation exposed an isel backend bug. This
+; testcase used to generate stwx 4, 3, 64. stwx does not have an
+; immediate-offset format (note the 64) and it should not be matched.
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%class.test = type { [64 x i8], [5 x i8] }
+
+; CHECK-LABEL: f:
+; CHECK-NOT: stwx {{[0-9]+}}, {{[0-9]+}}, 64
+define void @f(%class.test* %this) {
+entry:
+ %Subminor.i.i = getelementptr inbounds %class.test* %this, i64 0, i32 1
+ %0 = bitcast [5 x i8]* %Subminor.i.i to i40*
+ %bf.load2.i.i = load i40* %0, align 4
+ %bf.clear7.i.i = and i40 %bf.load2.i.i, -8589934592
+ store i40 %bf.clear7.i.i, i40* %0, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/mcm-10.ll b/test/CodeGen/PowerPC/mcm-10.ll
index b479559..c3ab747 100644
--- a/test/CodeGen/PowerPC/mcm-10.ll
+++ b/test/CodeGen/PowerPC/mcm-10.ll
@@ -18,7 +18,8 @@ entry:
; CHECK-LABEL: test_fn_static:
; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
-; CHECK: lwz {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK: lwa {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-NOT: extsw
; CHECK: stw {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
; CHECK: .type [[VAR]],@object
; CHECK: .local [[VAR]]
diff --git a/test/CodeGen/PowerPC/mcm-11.ll b/test/CodeGen/PowerPC/mcm-11.ll
index c49e865..033045c 100644
--- a/test/CodeGen/PowerPC/mcm-11.ll
+++ b/test/CodeGen/PowerPC/mcm-11.ll
@@ -18,7 +18,8 @@ entry:
; CHECK-LABEL: test_file_static:
; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
-; CHECK: lwz {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK: lwa {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+; CHECK-NOT: extsw
; CHECK: stw {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
; CHECK: .type [[VAR]],@object
; CHECK: .data
diff --git a/test/CodeGen/PowerPC/mcm-obj-2.ll b/test/CodeGen/PowerPC/mcm-obj-2.ll
index a6e9855..c42cf0c 100644
--- a/test/CodeGen/PowerPC/mcm-obj-2.ll
+++ b/test/CodeGen/PowerPC/mcm-obj-2.ll
@@ -22,7 +22,7 @@ entry:
; CHECK: Relocations [
; CHECK: Section (2) .rela.text {
; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM2:[^ ]+]]
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO [[SYM2]]
+; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM2]]
; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO [[SYM2]]
@gi = global i32 5, align 4
@@ -39,7 +39,7 @@ entry:
; accessing file-scope variable gi.
;
; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_HA [[SYM3:[^ ]+]]
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO [[SYM3]]
+; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO_DS [[SYM3]]
; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TOC16_LO [[SYM3]]
define double @test_double_const() nounwind {
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r0.ll b/test/CodeGen/PowerPC/named-reg-alloc-r0.ll
new file mode 100644
index 0000000..e683f99
--- /dev/null
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r0.ll
@@ -0,0 +1,15 @@
+; RUN: not llc < %s -mtriple=powerpc-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=powerpc-unknown-linux-gnu 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+
+define i32 @get_reg() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK: Invalid register name global variable
+ %reg = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %reg
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"r0\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll b/test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll
new file mode 100644
index 0000000..b047f9f
--- /dev/null
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=powerpc64-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+
+define i64 @get_reg() nounwind {
+entry:
+ %reg = call i64 @llvm.read_register.i64(metadata !0)
+ ret i64 %reg
+
+; CHECK-LABEL: @get_reg
+; CHECK: mr 3, 1
+
+; CHECK-DARWIN-LABEL: @get_reg
+; CHECK-DARWIN: mr r3, r1
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+
+!0 = metadata !{metadata !"r1\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r1.ll b/test/CodeGen/PowerPC/named-reg-alloc-r1.ll
new file mode 100644
index 0000000..9d0eb34
--- /dev/null
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r1.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc < %s -mtriple=powerpc64-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+
+define i32 @get_reg() nounwind {
+entry:
+ %reg = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %reg
+
+; CHECK-LABEL: @get_reg
+; CHECK: mr 3, 1
+
+; CHECK-DARWIN-LABEL: @get_reg
+; CHECK-DARWIN: mr r3, r1
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"r1\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll b/test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll
new file mode 100644
index 0000000..df5085b
--- /dev/null
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=powerpc64-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+
+define i64 @get_reg() nounwind {
+entry:
+ %reg = call i64 @llvm.read_register.i64(metadata !0)
+ ret i64 %reg
+
+; CHECK-LABEL: @get_reg
+; CHECK: mr 3, 13
+
+; CHECK-DARWIN-LABEL: @get_reg
+; CHECK-DARWIN: mr r3, r13
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+
+!0 = metadata !{metadata !"r13\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r13.ll b/test/CodeGen/PowerPC/named-reg-alloc-r13.ll
new file mode 100644
index 0000000..900ebb2
--- /dev/null
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r13.ll
@@ -0,0 +1,18 @@
+; RUN: not llc < %s -mtriple=powerpc-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+
+define i32 @get_reg() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK-DARWIN: Invalid register name global variable
+ %reg = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %reg
+
+; CHECK-LABEL: @get_reg
+; CHECK: mr 3, 13
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"r13\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll b/test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll
new file mode 100644
index 0000000..0da33fa
--- /dev/null
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll
@@ -0,0 +1,17 @@
+; RUN: not llc < %s -mtriple=powerpc64-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+
+define i64 @get_reg() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK-DARWIN: Invalid register name global variable
+ %reg = call i64 @llvm.read_register.i64(metadata !0)
+ ret i64 %reg
+
+; CHECK-LABEL: @get_reg
+; CHECK: mr 3, 2
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+
+!0 = metadata !{metadata !"r2\00"}
diff --git a/test/CodeGen/PowerPC/named-reg-alloc-r2.ll b/test/CodeGen/PowerPC/named-reg-alloc-r2.ll
new file mode 100644
index 0000000..51e7e3e
--- /dev/null
+++ b/test/CodeGen/PowerPC/named-reg-alloc-r2.ll
@@ -0,0 +1,18 @@
+; RUN: not llc < %s -mtriple=powerpc-apple-darwin 2>&1 | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu 2>&1 | FileCheck %s
+
+define i32 @get_reg() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK-DARWIN: Invalid register name global variable
+ %reg = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %reg
+
+; CHECK-LABEL: @get_reg
+; CHECK: mr 3, 2
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"r2\00"}
diff --git a/test/CodeGen/PowerPC/rlwimi-dyn-and.ll b/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
new file mode 100644
index 0000000..e02801f
--- /dev/null
+++ b/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @test1() #0 {
+entry:
+ %conv67.reload = load i32* undef
+ %const = bitcast i32 65535 to i32
+ br label %next
+
+next:
+ %shl161 = shl nuw nsw i32 %conv67.reload, 15
+ %0 = load i8* undef, align 1
+ %conv169 = zext i8 %0 to i32
+ %shl170 = shl nuw nsw i32 %conv169, 7
+ %const_mat = add i32 %const, -32767
+ %shl161.masked = and i32 %shl161, %const_mat
+ %conv174 = or i32 %shl170, %shl161.masked
+ ret i32 %conv174
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: rlwimi 3, {{[0-9]+}}, 15, 0, 16
+; CHECK: blr
+}
+
+define i32 @test2() #0 {
+entry:
+ %conv67.reload = load i32* undef
+ %const = bitcast i32 65535 to i32
+ br label %next
+
+next:
+ %shl161 = shl nuw nsw i32 %conv67.reload, 15
+ %0 = load i8* undef, align 1
+ %conv169 = zext i8 %0 to i32
+ %shl170 = shl nuw nsw i32 %conv169, 7
+ %shl161.masked = and i32 %shl161, 32768
+ %conv174 = or i32 %shl170, %shl161.masked
+ ret i32 %conv174
+
+; CHECK-LABEL: @test2
+; CHECK: slwi 3, {{[0-9]+}}, 7
+; CHECK: rlwimi 3, {{[0-9]+}}, 15, 16, 16
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/splat-bug.ll b/test/CodeGen/PowerPC/splat-bug.ll
new file mode 100644
index 0000000..4b5250b
--- /dev/null
+++ b/test/CodeGen/PowerPC/splat-bug.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mcpu=ppc64 -O0 -fast-isel=false < %s | FileCheck %s
+
+; Checks for a previous bug where vspltisb/vaddubm were issued in place
+; of vspltish/vadduhm.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@a = external global <16 x i8>
+
+define void @foo() nounwind ssp {
+; CHECK: foo:
+ store <16 x i8> <i8 0, i8 16, i8 0, i8 16, i8 0, i8 16, i8 0, i8 16, i8 0, i8 16, i8 0, i8 16, i8 0, i8 16, i8 0, i8 16>, <16 x i8>* @a
+; CHECK: vspltish [[REG:[0-9]+]], 8
+; CHECK: vadduhm {{[0-9]+}}, [[REG]], [[REG]]
+ ret void
+}
+
diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
index fffaefe..7dec426 100644
--- a/test/CodeGen/R600/32-bit-local-address-space.ll
+++ b/test/CodeGen/R600/32-bit-local-address-space.ll
@@ -33,7 +33,7 @@ entry:
; CHECK-LABEL: @local_address_gep_const_offset
; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VPTR]], 4,
+; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VPTR]], 0x4,
define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
entry:
%0 = getelementptr i32 addrspace(3)* %in, i32 1
@@ -44,7 +44,7 @@ entry:
; Offset too large, can't fold into 16-bit immediate offset.
; CHECK-LABEL: @local_address_gep_large_const_offset
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 65540
+; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; CHECK: DS_READ_B32 [[VPTR]]
define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
@@ -119,7 +119,7 @@ define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32
; CHECK-LABEL: @local_address_gep_const_offset_store
; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
; CHECK: V_MOV_B32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_WRITE_B32 [[VPTR]], [[VAL]], 4
+; CHECK: DS_WRITE_B32 [[VPTR]], [[VAL]], 0x4
define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
%gep = getelementptr i32 addrspace(3)* %out, i32 1
store i32 %val, i32 addrspace(3)* %gep, align 4
@@ -128,7 +128,7 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v
; Offset too large, can't fold into 16-bit immediate offset.
; CHECK-LABEL: @local_address_gep_large_const_offset_store
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 65540
+; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; CHECK: DS_WRITE_B32 [[VPTR]], v{{[0-9]+}}, 0
define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
index 0d6bfb1..2d82c1e 100644
--- a/test/CodeGen/R600/64bit-kernel-args.ll
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; SI-CHECK: @f64_kernel_arg
-; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 9
-; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 11
+; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
+; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
; SI-CHECK: BUFFER_STORE_DWORDX2
define void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index e9db52a..711a2bc 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll
@@ -140,3 +140,28 @@ entry:
store i64 %1, i64 addrspace(1)* %out
ret void
}
+
+; Test i64 add inside a branch. We don't allow SALU instructions inside of
+; branches.
+; FIXME: We are being conservative here. We could allow this in some cases.
+; FUNC-LABEL: @add64_in_branch
+; SI-CHECK-NOT: S_ADD_I32
+; SI-CHECK-NOT: S_ADDC_U32
+define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
index 7081b07..c9eaeda 100644
--- a/test/CodeGen/R600/add_i64.ll
+++ b/test/CodeGen/R600/add_i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
index 15d2ed2..f75a8ac 100644
--- a/test/CodeGen/R600/address-space.ll
+++ b/test/CodeGen/R600/address-space.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck %s
; Test that codegenprepare understands address space sizes
@@ -10,8 +10,8 @@
; CHECK-LABEL: @do_as_ptr_calcs:
; CHECK: S_LOAD_DWORD [[SREG1:s[0-9]+]],
; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 20
-; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 12
+; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0x14
+; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0xc
define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
entry:
%x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll
index cb2a1c8..c2362da 100644
--- a/test/CodeGen/R600/array-ptr-calc-i32.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.SI.tid() nounwind readnone
declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll
index 652bbfe..e254c5f 100644
--- a/test/CodeGen/R600/array-ptr-calc-i64.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i64.ll
@@ -1,5 +1,5 @@
; XFAIL: *
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
declare i32 @llvm.SI.tid() readnone
diff --git a/test/CodeGen/R600/call.ll b/test/CodeGen/R600/call.ll
new file mode 100644
index 0000000..d803474
--- /dev/null
+++ b/test/CodeGen/R600/call.ll
@@ -0,0 +1,33 @@
+; RUN: not llc -march=r600 -mcpu=SI -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported call to function defined_function in test_call
+
+
+declare i32 @external_function(i32) nounwind
+
+define i32 @defined_function(i32 %x) nounwind noinline {
+ %y = add i32 %x, 8
+ ret i32 %y
+}
+
+define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+ %a = load i32 addrspace(1)* %in
+ %b = load i32 addrspace(1)* %b_ptr
+ %c = call i32 @defined_function(i32 %b) nounwind
+ %result = add i32 %a, %c
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+ %a = load i32 addrspace(1)* %in
+ %b = load i32 addrspace(1)* %b_ptr
+ %c = call i32 @external_function(i32 %b) nounwind
+ %result = add i32 %a, %c
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
index 2e70d47..dc056e0 100644
--- a/test/CodeGen/R600/extload.ll
+++ b/test/CodeGen/R600/extload.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: @anyext_load_i8:
; EG: AND_INT
@@ -87,8 +87,9 @@ define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
}
; FUNC-LABEL: @zextload_global_i8_to_i64
+; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
; SI: BUFFER_LOAD_UBYTE [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, 0
+; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
; SI: BUFFER_STORE_DWORDX2
define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
%a = load i8 addrspace(1)* %in, align 8
@@ -98,8 +99,9 @@ define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
}
; FUNC-LABEL: @zextload_global_i16_to_i64
+; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
; SI: BUFFER_LOAD_USHORT [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, 0
+; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
; SI: BUFFER_STORE_DWORDX2
define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
%a = load i16 addrspace(1)* %in, align 8
@@ -109,8 +111,9 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
}
; FUNC-LABEL: @zextload_global_i32_to_i64
+; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, 0
+; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
; SI: BUFFER_STORE_DWORDX2
define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%a = load i32 addrspace(1)* %in, align 8
diff --git a/test/CodeGen/R600/extract_vector_elt_i16.ll b/test/CodeGen/R600/extract_vector_elt_i16.ll
new file mode 100644
index 0000000..5cd1b04
--- /dev/null
+++ b/test/CodeGen/R600/extract_vector_elt_i16.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @extract_vector_elt_v2i16
+; SI: BUFFER_LOAD_USHORT
+; SI: BUFFER_STORE_SHORT
+; SI: BUFFER_LOAD_USHORT
+; SI: BUFFER_STORE_SHORT
+define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
+ %p0 = extractelement <2 x i16> %foo, i32 0
+ %p1 = extractelement <2 x i16> %foo, i32 1
+ %out1 = getelementptr i16 addrspace(1)* %out, i32 1
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
+
+; FUNC-LABEL: @extract_vector_elt_v4i16
+; SI: BUFFER_LOAD_USHORT
+; SI: BUFFER_STORE_SHORT
+; SI: BUFFER_LOAD_USHORT
+; SI: BUFFER_STORE_SHORT
+define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
+ %p0 = extractelement <4 x i16> %foo, i32 0
+ %p1 = extractelement <4 x i16> %foo, i32 2
+ %out1 = getelementptr i16 addrspace(1)* %out, i32 1
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index 2cd3a4f..b87ce22 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll
@@ -49,6 +49,17 @@ entry:
ret void
}
+; SI-CHECK-LABEL: @fabs_fold
+; SI-CHECK-NOT: V_AND_B32_e32
+; SI-CHECK: V_MUL_F32_e64 v{{[0-9]+}}, s{{[0-9]+}}, |v{{[0-9]+}}|
+define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
+entry:
+ %0 = call float @fabs(float %in0)
+ %1 = fmul float %0, %in1
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
declare float @fabs(float ) readnone
declare <2 x float> @llvm.fabs.v2f32(<2 x float> ) readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float> ) readnone
diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll
index 5c5ee7e..9c3a7e3 100644
--- a/test/CodeGen/R600/fconst64.ll
+++ b/test/CodeGen/R600/fconst64.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; CHECK: @fconst_f64
-; CHECK: V_MOV_B32_e32 {{v[0-9]+}}, 0.000000e+00
-; CHECK-NEXT: V_MOV_B32_e32 {{v[0-9]+}}, 2.312500e+00
+; CHECK-DAG: S_MOV_B32 {{s[0-9]+}}, 0x40140000
+; CHECK-DAG: S_MOV_B32 {{s[0-9]+}}, 0
define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
%r1 = load double addrspace(1)* %in
diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
index f4e6be6..4cddc73 100644
--- a/test/CodeGen/R600/fneg.ll
+++ b/test/CodeGen/R600/fneg.ll
@@ -51,7 +51,7 @@ entry:
; R600-CHECK: -KC0[2].Z
; SI-CHECK-LABEL: @fneg_free
; XXX: We could use V_ADD_F32_e64 with the negate bit here instead.
-; SI-CHECK: V_SUB_F32_e64 v{{[0-9]}}, 0.000000e+00, s{{[0-9]}}, 0, 0, 0, 0
+; SI-CHECK: V_SUB_F32_e64 v{{[0-9]}}, 0.000000e+00, s{{[0-9]}}, 0, 0
define void @fneg_free(float addrspace(1)* %out, i32 %in) {
entry:
%0 = bitcast i32 %in to float
@@ -59,3 +59,14 @@ entry:
store float %1, float addrspace(1)* %out
ret void
}
+
+; SI-CHECK-LABEL: @fneg_fold
+; SI-CHECK-NOT: V_XOR_B32
+; SI-CHECK: V_MUL_F32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
+define void @fneg_fold(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fsub float -0.0, %in
+ %1 = fmul float %0, %in
+ store float %1, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fp_to_uint.f64.ll b/test/CodeGen/R600/fp_to_uint.f64.ll
new file mode 100644
index 0000000..bf607ce
--- /dev/null
+++ b/test/CodeGen/R600/fp_to_uint.f64.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @fp_to_uint_i32_f64
+; SI: V_CVT_U32_F64_e32
+define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
+ %cast = fptoui double %in to i32
+ store i32 %cast, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
index ee914fa..ab2c0bf 100644
--- a/test/CodeGen/R600/gep-address-space.ll
+++ b/test/CodeGen/R600/gep-address-space.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
; CHECK-LABEL: @use_gep_address_space:
; CHECK: V_MOV_B32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_WRITE_B32 [[PTR]], v{{[0-9]+}}, 64
+; CHECK: DS_WRITE_B32 [[PTR]], v{{[0-9]+}}, 0x40
%p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16
store i32 99, i32 addrspace(3)* %p
ret void
diff --git a/test/CodeGen/R600/gv-const-addrspace-fail.ll b/test/CodeGen/R600/gv-const-addrspace-fail.ll
new file mode 100644
index 0000000..ebd7811
--- /dev/null
+++ b/test/CodeGen/R600/gv-const-addrspace-fail.ll
@@ -0,0 +1,58 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+
+@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1
+
+; FUNC-LABEL: @test_i8
+; EG: CF_END
+; SI: BUFFER_STORE_BYTE
+; SI: S_ENDPGM
+define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
+ %arrayidx = getelementptr inbounds [1 x i8] addrspace(2)* @a, i32 0, i32 %s
+ %1 = load i8 addrspace(2)* %arrayidx, align 1
+ store i8 %1, i8 addrspace(1)* %out
+ ret void
+}
+
+@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
+
+; FUNC-LABEL: @test_i16
+; EG: CF_END
+; SI: BUFFER_STORE_SHORT
+; SI: S_ENDPGM
+define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
+ %arrayidx = getelementptr inbounds [1 x i16] addrspace(2)* @b, i32 0, i32 %s
+ %1 = load i16 addrspace(2)* %arrayidx, align 2
+ store i16 %1, i16 addrspace(1)* %out
+ ret void
+}
+
+%struct.bar = type { float, [5 x i8] }
+
+; The illegal i8s aren't handled
+@struct_bar_gv = internal addrspace(2) unnamed_addr constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
+
+; FUNC-LABEL: @struct_bar_gv_load
+define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
+ %gep = getelementptr inbounds [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
+ %load = load i8 addrspace(2)* %gep, align 1
+ store i8 %load, i8 addrspace(1)* %out, align 1
+ ret void
+}
+
+
+; The private load isn't scalarized.
+@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> <i32 1, i32 2, i32 3, i32 4>,
+ <4 x i32> <i32 5, i32 6, i32 7, i32 8>,
+ <4 x i32> <i32 9, i32 10, i32 11, i32 12>,
+ <4 x i32> <i32 13, i32 14, i32 15, i32 16> ]
+
+; FUNC-LABEL: @array_vector_gv_load
+define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) {
+ %gep = getelementptr inbounds [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
+ %load = load <4 x i32> addrspace(2)* %gep, align 16
+ store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
index cda7ab1..0176061 100644
--- a/test/CodeGen/R600/gv-const-addrspace.ll
+++ b/test/CodeGen/R600/gv-const-addrspace.ll
@@ -1,4 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+
+@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
; XXX: Test on SI once 64-bit adds are supported.
@@ -6,12 +10,12 @@
; FUNC-LABEL: @float
-; R600-DAG: MOV {{\** *}}T2.X
-; R600-DAG: MOV {{\** *}}T3.X
-; R600-DAG: MOV {{\** *}}T4.X
-; R600-DAG: MOV {{\** *}}T5.X
-; R600-DAG: MOV {{\** *}}T6.X
-; R600: MOVA_INT
+; EG-DAG: MOV {{\** *}}T2.X
+; EG-DAG: MOV {{\** *}}T3.X
+; EG-DAG: MOV {{\** *}}T4.X
+; EG-DAG: MOV {{\** *}}T5.X
+; EG-DAG: MOV {{\** *}}T6.X
+; EG: MOVA_INT
define void @float(float addrspace(1)* %out, i32 %index) {
entry:
@@ -25,12 +29,12 @@ entry:
; FUNC-LABEL: @i32
-; R600-DAG: MOV {{\** *}}T2.X
-; R600-DAG: MOV {{\** *}}T3.X
-; R600-DAG: MOV {{\** *}}T4.X
-; R600-DAG: MOV {{\** *}}T5.X
-; R600-DAG: MOV {{\** *}}T6.X
-; R600: MOVA_INT
+; EG-DAG: MOV {{\** *}}T2.X
+; EG-DAG: MOV {{\** *}}T3.X
+; EG-DAG: MOV {{\** *}}T4.X
+; EG-DAG: MOV {{\** *}}T5.X
+; EG-DAG: MOV {{\** *}}T6.X
+; EG: MOVA_INT
define void @i32(i32 addrspace(1)* %out, i32 %index) {
entry:
@@ -39,3 +43,30 @@ entry:
store i32 %1, i32 addrspace(1)* %out
ret void
}
+
+
+%struct.foo = type { float, [5 x i32] }
+
+@struct_foo_gv = internal addrspace(2) unnamed_addr constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
+
+; FUNC-LABEL: @struct_foo_gv_load
+
+define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
+ %gep = getelementptr inbounds [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
+ %load = load i32 addrspace(2)* %gep, align 4
+ store i32 %load, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>,
+ <1 x i32> <i32 2>,
+ <1 x i32> <i32 3>,
+ <1 x i32> <i32 4> ]
+
+; FUNC-LABEL: @array_v1_gv_load
+define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
+ %gep = getelementptr inbounds [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
+ %load = load <1 x i32> addrspace(2)* %gep, align 4
+ store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/infinite-loop.ll b/test/CodeGen/R600/infinite-loop.ll
index a60bc37..68ffaae 100644
--- a/test/CodeGen/R600/infinite-loop.ll
+++ b/test/CodeGen/R600/infinite-loop.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: @infinite_loop:
-; SI: V_MOV_B32_e32 [[REG:v[0-9]+]], 999
+; SI: V_MOV_B32_e32 [[REG:v[0-9]+]], 0x3e7
; SI: BB0_1:
; SI: BUFFER_STORE_DWORD [[REG]]
; SI: S_WAITCNT vmcnt(0) expcnt(0)
diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll
index 530d1cc..43b4efc 100644
--- a/test/CodeGen/R600/insert_vector_elt.ll
+++ b/test/CodeGen/R600/insert_vector_elt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
@@ -173,3 +173,29 @@ define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8>
store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
ret void
}
+
+; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
+; the compiler doesn't crash.
+; SI-LABEL: @insert_split_bb
+define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
+entry:
+ %0 = insertelement <2 x i32> undef, i32 %a, i32 0
+ %1 = icmp eq i32 %a, 0
+ br i1 %1, label %if, label %else
+
+if:
+ %2 = load i32 addrspace(1)* %in
+ %3 = insertelement <2 x i32> %0, i32 %2, i32 1
+ br label %endif
+
+else:
+ %4 = getelementptr i32 addrspace(1)* %in, i32 1
+ %5 = load i32 addrspace(1)* %4
+ %6 = insertelement <2 x i32> %0, i32 %5, i32 1
+ br label %endif
+
+endif:
+ %7 = phi <2 x i32> [%3, %if], [%6, %else]
+ store <2 x i32> %7, <2 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/insert_vector_elt_f64.ll b/test/CodeGen/R600/insert_vector_elt_f64.ll
index e334be1..595bc59 100644
--- a/test/CodeGen/R600/insert_vector_elt_f64.ll
+++ b/test/CodeGen/R600/insert_vector_elt_f64.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
; XFAIL: *
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: @dynamic_insertelement_v2f64:
diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll
index 247e316..6fc6979 100644
--- a/test/CodeGen/R600/kernel-args.ll
+++ b/test/CodeGen/R600/kernel-args.ll
@@ -17,7 +17,7 @@ entry:
; EG-CHECK-LABEL: @i8_zext_arg
; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI-CHECK-LABEL: @i8_zext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
@@ -29,7 +29,7 @@ entry:
; EG-CHECK-LABEL: @i8_sext_arg
; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI-CHECK-LABEL: @i8_sext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
@@ -53,7 +53,7 @@ entry:
; EG-CHECK-LABEL: @i16_zext_arg
; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI-CHECK-LABEL: @i16_zext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
@@ -65,7 +65,7 @@ entry:
; EG-CHECK-LABEL: @i16_sext_arg
; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI-CHECK-LABEL: @i16_sext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
@@ -77,7 +77,7 @@ entry:
; EG-CHECK-LABEL: @i32_arg
; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI-CHECK-LABEL: @i32_arg
-; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
store i32 %in, i32 addrspace(1)* %out, align 4
@@ -87,7 +87,7 @@ entry:
; EG-CHECK-LABEL: @f32_arg
; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI-CHECK-LABEL: @f32_arg
-; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
store float %in, float addrspace(1)* %out, align 4
@@ -122,7 +122,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI-CHECK-LABEL: @v2i32_arg
-; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 11
+; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
@@ -133,7 +133,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI-CHECK-LABEL: @v2f32_arg
-; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 11
+; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
@@ -166,7 +166,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI-CHECK-LABEL: @v3i32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 13
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
@@ -178,7 +178,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI-CHECK-LABEL: @v3f32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 13
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
@@ -223,7 +223,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI-CHECK-LABEL: @v4i32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 13
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
@@ -236,7 +236,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI-CHECK-LABEL: @v4f32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 13
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
@@ -300,7 +300,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI-CHECK-LABEL: @v8i32_arg
-; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 17
+; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
@@ -317,7 +317,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI-CHECK-LABEL: @v8f32_arg
-; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 17
+; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
@@ -422,7 +422,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI-CHECK-LABEL: @v16i32_arg
-; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 25
+; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
@@ -447,7 +447,7 @@ entry:
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI-CHECK-LABEL: @v16f32_arg
-; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 25
+; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
index c3f000a..eb50942 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
@@ -1,11 +1,12 @@
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
; FUNC-LABEL: @bfe_i32_arg_arg_arg
; SI: V_BFE_I32
; EG: BFE_INT
+; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac
define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
%bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
@@ -38,3 +39,388 @@ define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) n
store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
ret void
}
+
+; FUNC-LABEL: @v_bfe_print_arg
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
+define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
+ %load = load i32 addrspace(1)* %src0, align 4
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_arg_0_width_reg_offset
+; SI-NOT: BFE
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_arg_0_width_imm_offset
+; SI-NOT: BFE
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_6
+; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: S_ENDPGM
+define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_7
+; SI-NOT: SHL
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: The shifts should be 1 BFE
+; FUNC-LABEL: @bfe_i32_test_8
+; SI: BUFFER_LOAD_DWORD
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: S_ENDPGM
+define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_9
+; SI-NOT: BFE
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_10
+; SI-NOT: BFE
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_11
+; SI-NOT: BFE
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_12
+; SI-NOT: BFE
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_13
+; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = ashr i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_14
+; SI-NOT: LSHR
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = lshr i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_0
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_1
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_2
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_3
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_4
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_5
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_6
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0xffffff80
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_7
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_8
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_9
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_10
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_11
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -6
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_12
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_13
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_14
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 40
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_15
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_16
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_17
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_i32_constant_fold_test_18
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; XXX - This should really be a single BFE, but the sext_inreg of the
+; extended type i24 is never custom lowered.
+; FUNC-LABEL: @bfe_sext_in_reg_i24
+; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
+; SI: V_LSHLREV_B32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
+; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
+; XSI: V_BFE_I32 [[BFE:v[0-9]+]], [[LOAD]], 0, 8
+; XSI-NOT: SHL
+; XSI-NOT: SHR
+; XSI: BUFFER_STORE_DWORD [[BFE]],
+define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
+ %shl = shl i32 %bfe, 8
+ %ashr = ashr i32 %shl, 8
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
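The constant-fold cases above each pin the folded BFE result to a specific immediate. As an editor's aid, the following Python sketch models the signed bit-field extract those immediates follow; the helper name bfe_i32 and the assumption that offset and width stay within [0, 31] (true of every test in this file) are illustrative only, not taken from LLVM itself.

    def bfe_i32(src, offset, width):
        # Extract `width` bits of `src` starting at bit `offset`, then
        # sign-extend the extracted field. A width of 0 yields 0, matching
        # constant_fold_test_0 and constant_fold_test_1.
        if width == 0:
            return 0
        field = (src >> offset) & ((1 << width) - 1)
        if field & (1 << (width - 1)):   # sign bit of the extracted field
            field -= 1 << width          # sign-extend to a full integer
        return field

    assert bfe_i32(160, 4, 4) == -6                       # constant_fold_test_11
    assert bfe_i32(128, 0, 8) & 0xffffffff == 0xffffff80  # constant_fold_test_6
    assert bfe_i32(255, 1, 31) == 0x7f                    # constant_fold_test_17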
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
index 0d47863..1a62253 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
@@ -38,3 +38,517 @@ define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) n
store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
ret void
}
+
+; FUNC-LABEL: @bfe_u32_arg_0_width_reg_offset
+; SI-NOT: BFE
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_arg_0_width_imm_offset
+; SI-NOT: BFE
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_zextload_i8
+; SI: BUFFER_LOAD_UBYTE
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %load = load i8 addrspace(1)* %in
+ %ext = zext i8 %load to i32
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_zext_in_reg_i8
+; SI: BUFFER_LOAD_DWORD
+; SI: V_ADD_I32
+; SI-NEXT: V_AND_B32_e32
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_zext_in_reg_i16
+; SI: BUFFER_LOAD_DWORD
+; SI: V_ADD_I32
+; SI-NEXT: V_AND_B32_e32
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 65535
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_1
+; SI: BUFFER_LOAD_DWORD
+; SI: V_ADD_I32
+; SI: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_3
+; SI: BUFFER_LOAD_DWORD
+; SI: V_ADD_I32
+; SI-NEXT: V_AND_B32_e32 {{v[0-9]+}}, 0xf8
+; SI-NEXT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_7
+; SI: BUFFER_LOAD_DWORD
+; SI: V_ADD_I32
+; SI-NEXT: V_AND_B32_e32 {{v[0-9]+}}, 0x80
+; SI-NEXT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_zext_in_reg_i16_offset_8
+; SI: BUFFER_LOAD_DWORD
+; SI: V_ADD_I32
+; SI-NEXT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 65535
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_1
+; SI: BUFFER_LOAD_DWORD
+; SI: V_AND_B32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI: S_ENDPGM
+; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
+define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_4
+; SI-NOT: LSHL
+; SI-NOT: SHR
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = lshr i32 %shl, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_5
+; SI: BUFFER_LOAD_DWORD
+; SI-NOT: LSHL
+; SI-NOT: SHR
+; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
+; SI: S_ENDPGM
+define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = ashr i32 %shl, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_6
+; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: S_ENDPGM
+define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_7
+; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_8
+; SI-NOT: BFE
+; SI: V_AND_B32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_9
+; SI-NOT: BFE
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_10
+; SI-NOT: BFE
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_11
+; SI-NOT: BFE
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_12
+; SI-NOT: BFE
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_13
+; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = ashr i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_14
+; SI-NOT: LSHR
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = lshr i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_0
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_1
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_2
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_3
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_4
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_5
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_6
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x80
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_7
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_8
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_9
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_10
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_11
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_12
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_13
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_14
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 40
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_15
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_16
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_17
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_18
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
new file mode 100644
index 0000000..95795ea
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FIXME: Store of i32 seems to be broken pre-EG somehow?
+
+declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: @test_imad24
+; SI: V_MAD_I32_I24
+; CM: MULADD_INT24
+; R600: MULLO_INT
+; R600: ADD_INT
+define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %mad, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
new file mode 100644
index 0000000..8ee3520
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
+
+; FUNC-LABEL: @test_imul24
+; SI: V_MUL_I32_I24
+; CM: MUL_INT24
+; R600: MULLO_INT
+define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone
+ store i32 %mul, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
new file mode 100644
index 0000000..afdfb18
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: @test_umad24
+; SI: V_MAD_U32_U24
+; EG: MULADD_UINT24
+; R600: MULLO_UINT
+; R600: ADD_INT
+define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %mad, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
new file mode 100644
index 0000000..72a3602
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone
+
+; FUNC-LABEL: @test_umul24
+; SI: V_MUL_U32_U24
+; R600: MUL_UINT24
+; R600: MULLO_UINT
+define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone
+ store i32 %mul, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
index 569efb6..740581a 100644
--- a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: @test1
-;CHECK: TBUFFER_STORE_FORMAT_XYZW {{v\[[0-9]+:[0-9]+\]}}, 32, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: TBUFFER_STORE_FORMAT_XYZW {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
define void @test1(i32 %a1, i32 %vaddr) #0 {
%vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -11,7 +11,7 @@ define void @test1(i32 %a1, i32 %vaddr) #0 {
}
;CHECK-LABEL: @test2
-;CHECK: TBUFFER_STORE_FORMAT_XYZ {{v\[[0-9]+:[0-9]+\]}}, 24, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: TBUFFER_STORE_FORMAT_XYZ {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
define void @test2(i32 %a1, i32 %vaddr) #0 {
%vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -21,7 +21,7 @@ define void @test2(i32 %a1, i32 %vaddr) #0 {
}
;CHECK-LABEL: @test3
-;CHECK: TBUFFER_STORE_FORMAT_XY {{v\[[0-9]+:[0-9]+\]}}, 16, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: TBUFFER_STORE_FORMAT_XY {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
define void @test3(i32 %a1, i32 %vaddr) #0 {
%vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
@@ -31,7 +31,7 @@ define void @test3(i32 %a1, i32 %vaddr) #0 {
}
;CHECK-LABEL: @test4
-;CHECK: TBUFFER_STORE_FORMAT_X {{v[0-9]+}}, 8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: TBUFFER_STORE_FORMAT_X {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
define void @test4(i32 %vdata, i32 %vaddr) #0 {
call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index aaf2305..9e7a4de 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -1,19 +1,40 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-;CHECK: MULADD_IEEE *
-;CHECK: FRACT *
-;CHECK: ADD *
-;CHECK: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;FUNC-LABEL: test
+;EG: MULADD_IEEE *
+;EG: FRACT *
+;EG: ADD *
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG-NOT: COS
+;SI: V_COS_F32
+;SI-NOT: V_COS_F32
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = call float @llvm.cos.f32(float %r0)
- %vec = insertelement <4 x float> undef, float %r1, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+define void @test(float addrspace(1)* %out, float %x) #1 {
+ %cos = call float @llvm.cos.f32(float %x)
+ store float %cos, float addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: testv
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG-NOT: COS
+;SI: V_COS_F32
+;SI: V_COS_F32
+;SI: V_COS_F32
+;SI: V_COS_F32
+;SI-NOT: V_COS_F32
+
+define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
+ %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx)
+ store <4 x float> %cos, <4 x float> addrspace(1)* %out
ret void
}
declare float @llvm.cos.f32(float) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone
attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.rint.f64.ll b/test/CodeGen/R600/llvm.rint.f64.ll
new file mode 100644
index 0000000..a7a909a
--- /dev/null
+++ b/test/CodeGen/R600/llvm.rint.f64.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @f64
+; CI: V_RNDNE_F64_e32
+define void @f64(double addrspace(1)* %out, double %in) {
+entry:
+ %0 = call double @llvm.rint.f64(double %in)
+ store double %0, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @v2f64
+; CI: V_RNDNE_F64_e32
+; CI: V_RNDNE_F64_e32
+define void @v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+entry:
+ %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in)
+ store <2 x double> %0, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @v4f64
+; CI: V_RNDNE_F64_e32
+; CI: V_RNDNE_F64_e32
+; CI: V_RNDNE_F64_e32
+; CI: V_RNDNE_F64_e32
+define void @v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+entry:
+ %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in)
+ store <4 x double> %0, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+
+declare double @llvm.rint.f64(double) #0
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0
+declare <4 x double> @llvm.rint.v4f64(<4 x double>) #0
diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll
index c174b33..db8352f 100644
--- a/test/CodeGen/R600/llvm.rint.ll
+++ b/test/CodeGen/R600/llvm.rint.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; R600-CHECK: @f32
-; R600-CHECK: RNDNE
-; SI-CHECK: @f32
-; SI-CHECK: V_RNDNE_F32_e32
+; FUNC-LABEL: @f32
+; R600: RNDNE
+
+; SI: V_RNDNE_F32_e32
define void @f32(float addrspace(1)* %out, float %in) {
entry:
%0 = call float @llvm.rint.f32(float %in)
@@ -12,12 +12,12 @@ entry:
ret void
}
-; R600-CHECK: @v2f32
-; R600-CHECK: RNDNE
-; R600-CHECK: RNDNE
-; SI-CHECK: @v2f32
-; SI-CHECK: V_RNDNE_F32_e32
-; SI-CHECK: V_RNDNE_F32_e32
+; FUNC-LABEL: @v2f32
+; R600: RNDNE
+; R600: RNDNE
+
+; SI: V_RNDNE_F32_e32
+; SI: V_RNDNE_F32_e32
define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
entry:
%0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in)
@@ -25,16 +25,16 @@ entry:
ret void
}
-; R600-CHECK: @v4f32
-; R600-CHECK: RNDNE
-; R600-CHECK: RNDNE
-; R600-CHECK: RNDNE
-; R600-CHECK: RNDNE
-; SI-CHECK: @v4f32
-; SI-CHECK: V_RNDNE_F32_e32
-; SI-CHECK: V_RNDNE_F32_e32
-; SI-CHECK: V_RNDNE_F32_e32
-; SI-CHECK: V_RNDNE_F32_e32
+; FUNC-LABEL: @v4f32
+; R600: RNDNE
+; R600: RNDNE
+; R600: RNDNE
+; R600: RNDNE
+
+; SI: V_RNDNE_F32_e32
+; SI: V_RNDNE_F32_e32
+; SI: V_RNDNE_F32_e32
+; SI: V_RNDNE_F32_e32
define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
entry:
%0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in)
@@ -42,13 +42,8 @@ entry:
ret void
}
-; Function Attrs: nounwind readonly
declare float @llvm.rint.f32(float) #0
-
-; Function Attrs: nounwind readonly
declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0
-
-; Function Attrs: nounwind readonly
declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0
attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index 9eb9983..41c363c 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -1,19 +1,41 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-;CHECK: MULADD_IEEE *
-;CHECK: FRACT *
-;CHECK: ADD *
-;CHECK: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;FUNC-LABEL: test
+;EG: MULADD_IEEE *
+;EG: FRACT *
+;EG: ADD *
+;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG-NOT: SIN
+;SI: V_MUL_F32
+;SI: V_SIN_F32
+;SI-NOT: V_SIN_F32
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = call float @llvm.sin.f32( float %r0)
- %vec = insertelement <4 x float> undef, float %r1, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+define void @test(float addrspace(1)* %out, float %x) #1 {
+ %sin = call float @llvm.sin.f32(float %x)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: testv
+;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG-NOT: SIN
+;SI: V_SIN_F32
+;SI: V_SIN_F32
+;SI: V_SIN_F32
+;SI: V_SIN_F32
+;SI-NOT: V_SIN_F32
+
+define void @testv(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
+ %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
+ store <4 x float> %sin, <4 x float> addrspace(1)* %out
ret void
}
declare float @llvm.sin.f32(float) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone
attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll
index 0d0d186..4eee37f 100644
--- a/test/CodeGen/R600/llvm.sqrt.ll
+++ b/test/CodeGen/R600/llvm.sqrt.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 --mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 --mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; R600-CHECK-LABEL: @sqrt_f32
; R600-CHECK: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
diff --git a/test/CodeGen/R600/load-i1.ll b/test/CodeGen/R600/load-i1.ll
new file mode 100644
index 0000000..9ba81b8
--- /dev/null
+++ b/test/CodeGen/R600/load-i1.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+
+; SI-LABEL: @global_copy_i1_to_i1
+; SI: BUFFER_LOAD_UBYTE
+; SI: V_AND_B32_e32 v{{[0-9]+}}, 1
+; SI: BUFFER_STORE_BYTE
+; SI: S_ENDPGM
+define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1 addrspace(1)* %in
+ store i1 %load, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; SI-LABEL: @global_sextload_i1_to_i32
+; XSI: BUFFER_LOAD_BYTE
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1 addrspace(1)* %in
+ %ext = sext i1 %load to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @global_zextload_i1_to_i32
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1 addrspace(1)* %in
+ %ext = zext i1 %load to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @global_sextload_i1_to_i64
+; XSI: BUFFER_LOAD_BYTE
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1 addrspace(1)* %in
+ %ext = sext i1 %load to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @global_zextload_i1_to_i64
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1 addrspace(1)* %in
+ %ext = zext i1 %load to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @i1_arg
+; SI: BUFFER_LOAD_UBYTE
+; SI: V_AND_B32_e32
+; SI: BUFFER_STORE_BYTE
+; SI: S_ENDPGM
+define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+ store i1 %x, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; SI-LABEL: @i1_arg_zext_i32
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = zext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @i1_arg_zext_i64
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = zext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @i1_arg_sext_i32
+; XSI: BUFFER_LOAD_BYTE
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = sext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @i1_arg_sext_i64
+; XSI: BUFFER_LOAD_BYTE
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = sext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/local-64.ll b/test/CodeGen/R600/local-64.ll
index 87f18ae..c52b41b 100644
--- a/test/CodeGen/R600/local-64.ll
+++ b/test/CodeGen/R600/local-64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: @local_i32_load
-; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 28, [M0]
+; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 0x1c, [M0]
; SI: BUFFER_STORE_DWORD [[REG]],
define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
%gep = getelementptr i32 addrspace(3)* %in, i32 7
@@ -11,7 +11,7 @@ define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounw
}
; SI-LABEL: @local_i32_load_0_offset
-; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 0, [M0]
+; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 0x0, [M0]
; SI: BUFFER_STORE_DWORD [[REG]],
define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
%val = load i32 addrspace(3)* %in, align 4
@@ -21,7 +21,7 @@ define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %
; SI-LABEL: @local_i8_load_i16_max_offset
; SI-NOT: ADD
-; SI: DS_READ_U8 [[REG:v[0-9]+]], {{v[0-9]+}}, -1, [M0]
+; SI: DS_READ_U8 [[REG:v[0-9]+]], {{v[0-9]+}}, 0xffff, [M0]
; SI: BUFFER_STORE_BYTE [[REG]],
define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
%gep = getelementptr i8 addrspace(3)* %in, i32 65535
@@ -31,9 +31,9 @@ define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
}
; SI-LABEL: @local_i8_load_over_i16_max_offset
-; SI: S_ADD_I32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 65536
+; SI: S_ADD_I32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
; SI: V_MOV_B32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
-; SI: DS_READ_U8 [[REG:v[0-9]+]], [[VREGADDR]], 0, [M0]
+; SI: DS_READ_U8 [[REG:v[0-9]+]], [[VREGADDR]], 0x0, [M0]
; SI: BUFFER_STORE_BYTE [[REG]],
define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
%gep = getelementptr i8 addrspace(3)* %in, i32 65536
@@ -44,7 +44,7 @@ define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspa
; SI-LABEL: @local_i64_load
; SI-NOT: ADD
-; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 56, [M0]
+; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 0x38, [M0]
; SI: BUFFER_STORE_DWORDX2 [[REG]],
define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
%gep = getelementptr i64 addrspace(3)* %in, i32 7
@@ -54,7 +54,7 @@ define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounw
}
; SI-LABEL: @local_i64_load_0_offset
-; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0, [M0]
+; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0x0, [M0]
; SI: BUFFER_STORE_DWORDX2 [[REG]],
define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
%val = load i64 addrspace(3)* %in, align 8
@@ -64,7 +64,7 @@ define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %
; SI-LABEL: @local_f64_load
; SI-NOT: ADD
-; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 56, [M0]
+; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 0x38, [M0]
; SI: BUFFER_STORE_DWORDX2 [[REG]],
define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
%gep = getelementptr double addrspace(3)* %in, i32 7
@@ -74,7 +74,7 @@ define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in)
}
; SI-LABEL: @local_f64_load_0_offset
-; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0, [M0]
+; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0x0, [M0]
; SI: BUFFER_STORE_DWORDX2 [[REG]],
define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
%val = load double addrspace(3)* %in, align 8
@@ -84,7 +84,7 @@ define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace
; SI-LABEL: @local_i64_store
; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 56 [M0]
+; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x38 [M0]
define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
%gep = getelementptr i64 addrspace(3)* %out, i32 7
store i64 5678, i64 addrspace(3)* %gep, align 8
@@ -93,7 +93,7 @@ define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
; SI-LABEL: @local_i64_store_0_offset
; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0 [M0]
+; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
store i64 1234, i64 addrspace(3)* %out, align 8
ret void
@@ -101,7 +101,7 @@ define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
; SI-LABEL: @local_f64_store
; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 56 [M0]
+; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x38 [M0]
define void @local_f64_store(double addrspace(3)* %out) nounwind {
%gep = getelementptr double addrspace(3)* %out, i32 7
store double 16.0, double addrspace(3)* %gep, align 8
@@ -109,7 +109,7 @@ define void @local_f64_store(double addrspace(3)* %out) nounwind {
}
; SI-LABEL: @local_f64_store_0_offset
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0 [M0]
+; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
store double 20.0, double addrspace(3)* %out, align 8
ret void
@@ -117,8 +117,8 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
; SI-LABEL: @local_v2i64_store
; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 120 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 112 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x78 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x70 [M0]
define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <2 x i64> addrspace(3)* %out, i32 7
store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
@@ -127,8 +127,8 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
; SI-LABEL: @local_v2i64_store_0_offset
; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x8 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
ret void
@@ -136,10 +136,10 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
; SI-LABEL: @local_v4i64_store
; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 248 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 240 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 232 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 224 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xf8 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xf0 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xe8 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xe0 [M0]
define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <4 x i64> addrspace(3)* %out, i32 7
store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
@@ -148,10 +148,10 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
; SI-LABEL: @local_v4i64_store_0_offset
; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 24 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 16 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x18 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x10 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x8 [M0]
+; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
ret void
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index 616000d..1e42285 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -28,8 +28,8 @@
; constant offsets.
; EG-CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]], 16
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR]], 0,
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]], 0x10
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR]], 0x0,
define void @local_memory_two_objects(i32 addrspace(1)* %out) {
entry:
diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll
index 8a9cba2..128f661 100644
--- a/test/CodeGen/R600/loop-idiom.ll
+++ b/test/CodeGen/R600/loop-idiom.ll
@@ -1,5 +1,5 @@
; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "r600--"
diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll
index df063ec..abb5290 100644
--- a/test/CodeGen/R600/mad_int24.ll
+++ b/test/CodeGen/R600/mad_int24.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; EG-CHECK: @i32_mad24
+; FUNC-LABEL: @i32_mad24
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
-; EG-CHECK: MULLO_INT
-; CM-CHECK: MULADD_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
-; SI-CHECK: V_MAD_I32_I24
+; EG: MULLO_INT
+; Make sure we aren't masking the inputs.
+; CM-NOT: AND
+; CM: MULADD_INT24
+; SI-NOT: AND
+; SI: V_MAD_I32_I24
define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%0 = shl i32 %a, 8
diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
index 3dcadc9..0f0893b 100644
--- a/test/CodeGen/R600/mad_uint24.ll
+++ b/test/CodeGen/R600/mad_uint24.ll
@@ -1,11 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; EG-CHECK-LABEL: @u32_mad24
-; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
-; SI-CHECK-LABEL: @u32_mad24
-; SI-CHECK: V_MAD_U32_U24
+; FUNC-LABEL: @u32_mad24
+; EG: MULADD_UINT24
+; SI: V_MAD_U32_U24
define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
@@ -19,18 +18,14 @@ entry:
ret void
}
-; EG-CHECK-LABEL: @i16_mad24
-; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
-; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
-; EG-CHECK-DAG: VTX_READ_16 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
+; FUNC-LABEL: @i16_mad24
; The order of A and B does not matter.
-; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
+; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
; The result must be sign-extended
-; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
-; EG-CHECK: 16
-; SI-CHECK-LABEL: @i16_mad24
-; SI-CHECK: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
+; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
+; EG: 16
+; SI: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
entry:
@@ -41,18 +36,13 @@ entry:
ret void
}
-; EG-CHECK-LABEL: @i8_mad24
-; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
-; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
-; EG-CHECK-DAG: VTX_READ_8 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
-; The order of A and B does not matter.
-; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
+; FUNC-LABEL: @i8_mad24
+; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
; The result must be sign-extended
-; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
-; EG-CHECK: 8
-; SI-CHECK-LABEL: @i8_mad24
-; SI-CHECK: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
+; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
+; EG: 8
+; SI: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
entry:
@@ -62,3 +52,24 @@ entry:
store i32 %2, i32 addrspace(1)* %out
ret void
}
+
+; This tests for a bug where the mad_u24 pattern matcher would call
+; SimplifyDemandedBits on the first operand of the mul instruction
+; assuming that the pattern would be matched to a 24-bit mad. This
+; led to some instructions being incorrectly erased when the entire
+; 24-bit mad pattern wasn't being matched.
+
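+; For reference, the full pattern is roughly (a sketch; the 24-bit operands
+; are assumed to come from shifts or masks that clear the top eight bits):
+;   %a.shl = shl i32 %a, 8
+;   %a24 = lshr i32 %a.shl, 8
+;   %b.shl = shl i32 %b, 8
+;   %b24 = lshr i32 %b.shl, 8
+;   %mul = mul i32 %a24, %b24
+;   %mad = add i32 %mul, %c
+; The IR below only supplies part of this shape.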
+; Check that the select instruction is not deleted.
+; FUNC-LABEL: @i24_i32_i32_mad
+; EG: CNDE_INT
+; SI: V_CNDMASK
+define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+ %0 = ashr i32 %a, 8
+ %1 = icmp ne i32 %c, 0
+ %2 = select i1 %1, i32 %0, i32 34
+ %3 = mul i32 %2, %c
+ %4 = add i32 %3, %d
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll
index 2d5ddeb..f465d3d 100644
--- a/test/CodeGen/R600/mubuf.ll
+++ b/test/CodeGen/R600/mubuf.ll
@@ -6,7 +6,7 @@
; MUBUF load with an immediate byte offset that fits into 12-bits
; CHECK-LABEL: @mubuf_load0
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4 ; encoding: [0x04,0x80
+; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32 addrspace(1)* %in, i64 1
@@ -17,7 +17,7 @@ entry:
; MUBUF load with the largest possible immediate offset
; CHECK-LABEL: @mubuf_load1
-; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4095 ; encoding: [0xff,0x8f
+; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0xfff ; encoding: [0xff,0x8f
define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = getelementptr i8 addrspace(1)* %in, i64 4095
@@ -28,7 +28,7 @@ entry:
; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
; CHECK-LABEL: @mubuf_load2
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0 ; encoding: [0x00,0x80
+; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x0 ; encoding: [0x00,0x80
define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32 addrspace(1)* %in, i64 1024
@@ -40,7 +40,7 @@ entry:
; MUBUF load with a 12-bit immediate offset and a register offset
; CHECK-LABEL: @mubuf_load3
; CHECK-NOT: ADD
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4 ; encoding: [0x04,0x80
+; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
entry:
%0 = getelementptr i32 addrspace(1)* %in, i64 %offset
@@ -56,7 +56,7 @@ entry:
; MUBUF store with an immediate byte offset that fits into 12-bits
; CHECK-LABEL: @mubuf_store0
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4 ; encoding: [0x04,0x80
+; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
define void @mubuf_store0(i32 addrspace(1)* %out) {
entry:
%0 = getelementptr i32 addrspace(1)* %out, i64 1
@@ -66,7 +66,7 @@ entry:
; MUBUF store with the largest possible immediate offset
; CHECK-LABEL: @mubuf_store1
-; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4095 ; encoding: [0xff,0x8f
+; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0xfff ; encoding: [0xff,0x8f
define void @mubuf_store1(i8 addrspace(1)* %out) {
entry:
@@ -77,7 +77,7 @@ entry:
; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
; CHECK-LABEL: @mubuf_store2
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0 ; encoding: [0x00,0x80
+; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x0 ; encoding: [0x00,0x80
define void @mubuf_store2(i32 addrspace(1)* %out) {
entry:
%0 = getelementptr i32 addrspace(1)* %out, i64 1024
@@ -88,7 +88,7 @@ entry:
; MUBUF store with a 12-bit immediate offset and a register offset
; CHECK-LABEL: @mubuf_store3
; CHECK-NOT: ADD
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 4 ; encoding: [0x04,0x80
+; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
entry:
%0 = getelementptr i32 addrspace(1)* %out, i64 %offset
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index e176148..6ed754c 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -1,15 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; mul24 and mad24 are affected
-;EG-CHECK: @test2
-;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;FUNC-LABEL: @test2
+;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @test2
-;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -20,17 +19,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
ret void
}
-;EG-CHECK: @test4
-;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;FUNC-LABEL: @test4
+;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @test4
-;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -52,3 +50,32 @@ define void @trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
store i32 %trunc, i32 addrspace(1)* %out, align 8
ret void
}
+
+; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
+; 32 bits of both arguments are sign bits.
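+; Rationale (a sketch): when both operands are sign-extensions of 32-bit
+; values, the exact product fits in 64 bits, so
+;   lo32(result) = mul_lo(%in, 80)
+;   hi32(result) = mul_hi_signed(%in, 80)
+; and no cross-term multiplies or adds are needed.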
+; FUNC-LABEL: @mul64_sext_c
+; EG-DAG: MULLO_INT
+; EG-DAG: MULHI_INT
+; SI-DAG: V_MUL_LO_I32
+; SI-DAG: V_MUL_HI_I32
+define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = sext i32 %in to i64
+ %1 = mul i64 %0, 80
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
+
+; A standard 64-bit multiply. The expansion should be around 6 instructions.
+; It would be difficult to match the expansion correctly without writing
+; a really complicated list of FileCheck expressions. I don't want
+; to confuse people who may 'break' this test with a correct optimization,
+; so this test just uses FUNC-LABEL to make sure the compiler does not
+; crash with a 'failed to select' error.
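+; For reference, the usual lowering is roughly (a sketch; the exact schedule
+; may differ):
+;   lo32 = mul_lo(a.lo, b.lo)
+;   hi32 = mul_hi(a.lo, b.lo) + mul_lo(a.lo, b.hi) + mul_lo(a.hi, b.lo)
+; i.e. three low multiplies, one high multiply and two adds.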
+; FUNC-LABEL: @mul64
+define void @mul64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = mul i64 %a, %b
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll
index 66a1a9e..046911b 100644
--- a/test/CodeGen/R600/mul_int24.ll
+++ b/test/CodeGen/R600/mul_int24.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; EG-CHECK: @i32_mul24
+; FUNC-LABEL: @i32_mul24
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
-; EG-CHECK: MULLO_INT
-; CM-CHECK: MUL_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK: V_MUL_I32_I24
+; EG: MULLO_INT
+; Make sure we are not masking the inputs
+; CM-NOT: AND
+; CM: MUL_INT24
+; SI-NOT: AND
+; SI: V_MUL_I32_I24
define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
%0 = shl i32 %a, 8
diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
index a413961..419f275 100644
--- a/test/CodeGen/R600/mul_uint24.ll
+++ b/test/CodeGen/R600/mul_uint24.ll
@@ -1,11 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; EG-CHECK-LABEL: @u32_mul24
-; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK-LABEL: @u32_mul24
-; SI-CHECK: V_MUL_U32_U24
+; FUNC-LABEL: @u32_mul24
+; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI: V_MUL_U32_U24
define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
@@ -18,17 +17,13 @@ entry:
ret void
}
-; EG-CHECK-LABEL: @i16_mul24
-; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
-; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
-; The order of A and B does not matter.
-; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
+; FUNC-LABEL: @i16_mul24
+; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
; The result must be sign-extended
-; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-; EG-CHECK: 16
-; SI-CHECK-LABEL: @i16_mul24
-; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
+; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
+; EG: 16
+; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
entry:
%0 = mul i16 %a, %b
@@ -37,16 +32,12 @@ entry:
ret void
}
-; EG-CHECK-LABEL: @i8_mul24
-; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
-; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
-; The order of A and B does not matter.
-; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
+; FUNC-LABEL: @i8_mul24
+; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
; The result must be sign-extended
-; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-; SI-CHECK-LABEL: @i8_mul24
-; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
+; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
+; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
entry:
@@ -55,3 +46,21 @@ entry:
store i32 %1, i32 addrspace(1)* %out
ret void
}
+
+; Multiply with 24-bit inputs and 64-bit output
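+; Both inputs are masked to 24 bits, so the full product fits in 48 bits and
+; (as a sketch) lo32 is a single 24-bit multiply while hi32 only needs the
+; matching 24-bit mulhi; no cross-term multiplies are required.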
+; FUNC-LABEL: @mul24_i64
+; EG: MUL_UINT24
+; EG: MULHI
+; SI: V_MUL_U32_U24
+; FIXME: SI supports a 24-bit mulhi that should be used here.
+; SI: V_MUL_HI_U32
+define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = shl i64 %a, 40
+ %a_24 = lshr i64 %0, 40
+ %1 = shl i64 %b, 40
+ %b_24 = lshr i64 %1, 40
+ %2 = mul i64 %a_24, %b_24
+ store i64 %2, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll
index d5fc014..8640127 100644
--- a/test/CodeGen/R600/mulhu.ll
+++ b/test/CodeGen/R600/mulhu.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: V_MOV_B32_e32 v{{[0-9]+}}, -1431655765
+;CHECK: V_MOV_B32_e32 v{{[0-9]+}}, 0xaaaaaaab
;CHECK: V_MUL_HI_U32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
;CHECK-NEXT: V_LSHRREV_B32_e32 v0, 1, v0
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 2cc991e..9878366 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -89,8 +89,8 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
}
; SI-LABEL: @vector_or_i64_loadimm
-; SI-DAG: S_MOV_B32 [[LO_S_IMM:s[0-9]+]], -545810305
-; SI-DAG: S_MOV_B32 [[HI_S_IMM:s[0-9]+]], 5231
+; SI-DAG: S_MOV_B32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
+; SI-DAG: S_MOV_B32 [[HI_S_IMM:s[0-9]+]], 0x146f
; SI-DAG: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 4920320..d3453f2 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
; This test checks that uses and defs of the AR register happen in the same
; instruction clause.
@@ -119,7 +119,7 @@ for.end:
; R600-CHECK: *
; R600-CHECK: MOVA_INT
-; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 65536
+; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 0x10000
; SI-CHECK: V_MOVRELS_B32_e32
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
entry:
@@ -142,7 +142,7 @@ entry:
; R600-CHECK: *
; R600-CHECK-NEXT: MOVA_INT
-; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 256
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
; SI-CHECK: V_MOVRELS_B32_e32
define void @char_array(i32 addrspace(1)* %out, i32 %index) {
entry:
diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
index 5a930b2..f322bc7 100644
--- a/test/CodeGen/R600/pv.ll
+++ b/test/CodeGen/R600/pv.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=r600 | FileCheck %s
;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
-;CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
+;CHECK: MAX T{{[0-9].[XYZW]}}, PV.X, 0.0
define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
main_body:
diff --git a/test/CodeGen/R600/register-count-comments.ll b/test/CodeGen/R600/register-count-comments.ll
index a64b280..329077c 100644
--- a/test/CodeGen/R600/register-count-comments.ll
+++ b/test/CodeGen/R600/register-count-comments.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
declare i32 @llvm.SI.tid() nounwind readnone
diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll
index e461bf9..e7719b6 100644
--- a/test/CodeGen/R600/salu-to-valu.ll
+++ b/test/CodeGen/R600/salu-to-valu.ll
@@ -46,3 +46,45 @@ declare i32 @llvm.r600.read.tidig.x() #1
declare i32 @llvm.r600.read.tidig.y() #1
attributes #1 = { nounwind readnone }
+
+; Test moving an SMRD instruction to the VALU
+
+; CHECK-LABEL: @smrd_valu
+; CHECK: BUFFER_LOAD_DWORD [[OUT:v[0-9]+]]
+; CHECK: BUFFER_STORE_DWORD [[OUT]]
+
+define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) {
+entry:
+ %0 = icmp ne i32 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i32 addrspace(2)* addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = getelementptr i32 addrspace(2)* addrspace(1)* %in
+ %3 = load i32 addrspace(2)* addrspace(1)* %2
+ br label %endif
+
+endif:
+ %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else]
+ %5 = getelementptr i32 addrspace(2)* %4, i32 3000
+ %6 = load i32 addrspace(2)* %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test moving an SMRD with an immediate offset to the VALU
+
+; CHECK-LABEL: @smrd_valu2
+; CHECK: BUFFER_LOAD_DWORD
+define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %1 = add i32 %0, 4
+ %2 = getelementptr [8 x i32] addrspace(2)* %in, i32 %0, i32 4
+ %3 = load i32 addrspace(2)* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
index 2a286d1..3d2142d 100644
--- a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
@@ -1,6 +1,6 @@
; XFAIL: *
; REQUIRES: asserts
-; RUN: llc -O0 -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI
+; RUN: llc -O0 -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI
declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
diff --git a/test/CodeGen/R600/selectcc.ll b/test/CodeGen/R600/selectcc.ll
new file mode 100644
index 0000000..a8f57cf
--- /dev/null
+++ b/test/CodeGen/R600/selectcc.ll
@@ -0,0 +1,19 @@
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @selectcc_i64
+; EG: XOR_INT
+; EG: XOR_INT
+; EG: OR_INT
+; EG: CNDE_INT
+; EG: CNDE_INT
+; SI: V_CMP_EQ_I64
+; SI: V_CNDMASK
+; SI: V_CNDMASK
+define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
+entry:
+ %0 = icmp eq i64 %lhs, %rhs
+ %1 = select i1 %0, i64 %true, i64 %false
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 8d34c4a..5bd95b7 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,5 +1,5 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; FUNC-LABEL: @setcc_v2i32
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
@@ -96,7 +96,9 @@ entry:
; R600-DAG: SETNE_INT
; SI: V_CMP_O_F32
; SI: V_CMP_NEQ_F32
-; SI: S_AND_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_AND_B32_e32
define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp one float %a, %b
@@ -128,7 +130,9 @@ entry:
; R600-DAG: SETNE_INT
; SI: V_CMP_U_F32
; SI: V_CMP_EQ_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ueq float %a, %b
@@ -142,7 +146,9 @@ entry:
; R600: SETE_DX10
; SI: V_CMP_U_F32
; SI: V_CMP_GT_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ugt float %a, %b
@@ -156,7 +162,9 @@ entry:
; R600: SETE_DX10
; SI: V_CMP_U_F32
; SI: V_CMP_GE_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp uge float %a, %b
@@ -170,7 +178,9 @@ entry:
; R600: SETE_DX10
; SI: V_CMP_U_F32
; SI: V_CMP_LT_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ult float %a, %b
@@ -184,7 +194,9 @@ entry:
; R600: SETE_DX10
; SI: V_CMP_U_F32
; SI: V_CMP_LE_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fcmp ule float %a, %b
diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll
index 9202fc0..54a33b3 100644
--- a/test/CodeGen/R600/setcc64.ll
+++ b/test/CodeGen/R600/setcc64.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; XXX: Merge this into setcc, once R600 supports 64-bit operations
@@ -59,7 +59,9 @@ entry:
; FUNC-LABEL: @f64_one
; SI: V_CMP_O_F64
; SI: V_CMP_NEQ_F64
-; SI: S_AND_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_AND_B32_e32
define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp one double %a, %b
@@ -81,7 +83,9 @@ entry:
; FUNC-LABEL: @f64_ueq
; SI: V_CMP_U_F64
; SI: V_CMP_EQ_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ueq double %a, %b
@@ -93,7 +97,9 @@ entry:
; FUNC-LABEL: @f64_ugt
; SI: V_CMP_U_F64
; SI: V_CMP_GT_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ugt double %a, %b
@@ -105,7 +111,9 @@ entry:
; FUNC-LABEL: @f64_uge
; SI: V_CMP_U_F64
; SI: V_CMP_GE_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp uge double %a, %b
@@ -117,7 +125,9 @@ entry:
; FUNC-LABEL: @f64_ult
; SI: V_CMP_U_F64
; SI: V_CMP_LT_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ult double %a, %b
@@ -129,7 +139,9 @@ entry:
; FUNC-LABEL: @f64_ule
; SI: V_CMP_U_F64
; SI: V_CMP_LE_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ule double %a, %b
diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll
index 8633a4b..e90e788 100644
--- a/test/CodeGen/R600/seto.ll
+++ b/test/CodeGen/R600/seto.ll
@@ -1,6 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: V_CMP_O_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0, 0, 0
+;CHECK-LABEL: @main
+;CHECK: V_CMP_O_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0
define void @main(float %p) {
main_body:
diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll
index c77a37e..3b1db8b 100644
--- a/test/CodeGen/R600/setuo.ll
+++ b/test/CodeGen/R600/setuo.ll
@@ -1,6 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: V_CMP_U_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0, 0, 0
+;CHECK-LABEL: @main
+;CHECK: V_CMP_U_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0
define void @main(float %p) {
main_body:
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
index eef3f07..1b02e4b 100644
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -1,15 +1,18 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
; FUNC-LABEL: @sext_in_reg_i1_i32
; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
+; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
; SI: BUFFER_STORE_DWORD [[EXTRACT]],
-; EG: BFE_INT
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1
+; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
%shl = shl i32 %in, 31
%sext = ashr i32 %shl, 31
@@ -19,10 +22,14 @@ define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
; FUNC-LABEL: @sext_in_reg_i8_to_i32
; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 8
-; SI: BUFFER_STORE_DWORD [[EXTRACT]],
-
-; EG: BFE_INT
+; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: ADD_INT
+; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
+; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 24
@@ -33,10 +40,14 @@ define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounw
; FUNC-LABEL: @sext_in_reg_i16_to_i32
; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 16
-; SI: BUFFER_STORE_DWORD [[EXTRACT]],
-
-; EG: BFE_INT
+; SI: S_SEXT_I32_I16 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: ADD_INT
+; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
+; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 16
@@ -47,10 +58,14 @@ define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) noun
; FUNC-LABEL: @sext_in_reg_i8_to_v1i32
; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 8
-; SI: BUFFER_STORE_DWORD [[EXTRACT]],
-
-; EG: BFE_INT
+; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: ADD_INT
+; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
+; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
%c = add <1 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <1 x i32> %c, <i32 24>
@@ -59,13 +74,35 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a,
ret void
}
+; FUNC-LABEL: @sext_in_reg_i1_to_i64
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: S_BFE_I32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10000
+; SI: S_MOV_B32 {{s[0-9]+}}, -1
+; SI: BUFFER_STORE_DWORDX2
+define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = add i64 %a, %b
+ %shl = shl i64 %c, 63
+ %ashr = ashr i64 %shl, 63
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
; FUNC-LABEL: @sext_in_reg_i8_to_i64
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
-; SI: BUFFER_STORE_DWORD
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: S_MOV_B32 {{s[0-9]+}}, -1
+; SI: BUFFER_STORE_DWORDX2
-; EG: BFE_INT
-; EG: ASHR
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG: ADD_INT
+; EG-NEXT: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: ASHR [[RES_HI]]
+; EG-NOT: BFE_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO: Check the address computation; using | with variables inside {{}} does not work,
+;; and the _LO/_HI order might also differ.
define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = add i64 %a, %b
%shl = shl i64 %c, 56
@@ -75,12 +112,21 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
}
; FUNC-LABEL: @sext_in_reg_i16_to_i64
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 16
-; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
-; SI: BUFFER_STORE_DWORD
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: S_SEXT_I32_I16 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: S_MOV_B32 {{s[0-9]+}}, -1
+; SI: BUFFER_STORE_DWORDX2
-; EG: BFE_INT
-; EG: ASHR
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG: ADD_INT
+; EG-NEXT: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: ASHR [[RES_HI]]
+; EG-NOT: BFE_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO: Check the address computation; using | with variables inside {{}} does not work,
+;; and the _LO/_HI order might also differ.
define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = add i64 %a, %b
%shl = shl i64 %c, 48
@@ -95,6 +141,17 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
; SI: S_ADD_I32 [[ADD:s[0-9]+]],
; SI: S_ASHR_I32 s{{[0-9]+}}, [[ADD]], 31
; SI: BUFFER_STORE_DWORDX2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE_INT
+; EG: ADD_INT {{\*?}} [[RES_LO]]
+; EG: ASHR [[RES_HI]]
+; EG: ADD_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO: Check the address computation; using | with variables inside {{}} does not work,
+;; and the _LO/_HI order might also differ.
define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = add i64 %a, %b
%shl = shl i64 %c, 32
@@ -105,8 +162,8 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
-; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
+; XSI: S_ASHR_I32 {{v[0-9]+}}, [[EXTRACT]], 31
; XSI: BUFFER_STORE_DWORD
; XEG: BFE_INT
; XEG: ASHR
@@ -122,7 +179,13 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
; SI-NOT: BFE
; SI: S_LSHL_B32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6
; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG]], 7
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%c = add i32 %a, %b
%x = shl i32 %c, 6
@@ -136,7 +199,15 @@ define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a,
; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG0]], 7
; SI: S_LSHL_B32 [[REG1:s[0-9]+]], {{s[0-9]}}, 6
; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG1]], 7
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
@@ -147,11 +218,14 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: BUFFER_STORE_DWORDX2
-; EG: BFE
-; EG: BFE
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 31, i32 31>
@@ -161,16 +235,18 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
}
; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: BUFFER_STORE_DWORDX4
-; EG: BFE
-; EG: BFE
-; EG: BFE
-; EG: BFE
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
@@ -180,12 +256,14 @@ define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
}
; FUNC-LABEL: @sext_in_reg_v2i8_to_v2i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: BUFFER_STORE_DWORDX2
-; EG: BFE
-; EG: BFE
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 24, i32 24>
@@ -195,16 +273,18 @@ define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
}
; FUNC-LABEL: @sext_in_reg_v4i8_to_v4i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: BUFFER_STORE_DWORDX4
-; EG: BFE
-; EG: BFE
-; EG: BFE
-; EG: BFE
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
@@ -214,16 +294,18 @@ define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
}
; FUNC-LABEL: @sext_in_reg_v2i16_to_v2i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: S_SEXT_I32_I16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: S_SEXT_I32_I16 {{s[0-9]+}}, {{s[0-9]+}}
; SI: BUFFER_STORE_DWORDX2
-; EG: BFE
-; EG: BFE
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
- %shl = shl <2 x i32> %c, <i32 24, i32 24>
- %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
+ %shl = shl <2 x i32> %c, <i32 16, i32 16>
+ %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
ret void
}
@@ -252,8 +334,36 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
ret void
}
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+ %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
; FIXME: The BFE should really be eliminated. I think it should happen
-; when computeMaskedBitsForTargetNode is implemented for imax.
+; when computeKnownBitsForTargetNode is implemented for imax.
; FUNC-LABEL: @sext_in_reg_to_illegal_type
; SI: BUFFER_LOAD_SBYTE
@@ -269,3 +379,146 @@ define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 ad
store i16 %tmp6, i16 addrspace(1)* %out, align 2
ret void
}
+
+declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: @bfe_0_width
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32 addrspace(1)* %ptr, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_8_bfe_8
+; SI: V_BFE_I32
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bfe_8_bfe_16
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI: S_ENDPGM
+define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; This really should be folded into 1
+; FUNC-LABEL: @bfe_16_bfe_8
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Make sure there isn't a redundant BFE
+; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe
+; SI: S_SEXT_I32_I8 s{{[0-9]+}}, s{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe_wrong
+define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sextload_i8_to_i32_bfe
+; SI: BUFFER_LOAD_SBYTE
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
+ %load = load i8 addrspace(1)* %ptr, align 1
+ %sext = sext i8 %load to i32
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sextload_i8_to_i32_bfe_0:
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
+ %load = load i8 addrspace(1)* %ptr, align 1
+ %sext = sext i8 %load to i32
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i1_bfe_offset_0:
+; SI-NOT: SHR
+; SI-NOT: SHL
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: S_ENDPGM
+define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = ashr i32 %shl, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i1_bfe_offset_1
+; SI: BUFFER_LOAD_DWORD
+; SI-NOT: SHL
+; SI-NOT: SHR
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
+; SI: S_ENDPGM
+define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 30
+ %shr = ashr i32 %shl, 30
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i2_bfe_offset_1:
+; SI: BUFFER_LOAD_DWORD
+; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
+; SI: S_ENDPGM
+define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 30
+ %shr = ashr i32 %shl, 30
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/sgpr-control-flow.ll b/test/CodeGen/R600/sgpr-control-flow.ll
new file mode 100644
index 0000000..06ad24d
--- /dev/null
+++ b/test/CodeGen/R600/sgpr-control-flow.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+;
+;
+; Most SALU instructions ignore control flow, so we need to make sure
+; they don't overwrite values from other blocks.
+
+; SI-NOT: S_ADD
+
+define void @sgpr_if_else(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+ %0 = icmp eq i32 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = add i32 %b, %c
+ br label %endif
+
+else:
+ %2 = add i32 %d, %e
+ br label %endif
+
+endif:
+ %3 = phi i32 [%1, %if], [%2, %else]
+ %4 = add i32 %3, %a
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
index d74161b..9d8a623 100644
--- a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; Copy VGPR -> SGPR used twice as an instruction operand, which is then
; used in an REG_SEQUENCE that also needs to be handled.
diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
index 5472c1b..c581d86 100644
--- a/test/CodeGen/R600/sgpr-copy.ll
+++ b/test/CodeGen/R600/sgpr-copy.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
; This test checks that no VGPR to SGPR copies are created by the register
; allocator.
diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll
index cd3ba2b..daa4667 100644
--- a/test/CodeGen/R600/si-annotate-cf-assertion.ll
+++ b/test/CodeGen/R600/si-annotate-cf-assertion.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
; XFAIL: *
-; RUN: llc -march=r600 -mcpu=SI -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -asm-verbose=false < %s | FileCheck %s
define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
new file mode 100644
index 0000000..d9f60ea
--- /dev/null
+++ b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; 64-bit select was originally lowered with a build_pair, and this
+; could be simplified to 1 cndmask instead of 2, but that broke when
+; it started being implemented with a v2i32 build_vector and
+; bitcasting.
+define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, i64 %a, i64 %b
+ %trunc = trunc i64 %select to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @trunc_load_alloca_i64:
+; SI: V_MOVRELS_B32
+; SI-NOT: V_MOVRELS_B32
+; SI: S_ENDPGM
+define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
+ %idx = add i32 %a, %b
+ %alloca = alloca i64, i32 4
+ %gep0 = getelementptr i64* %alloca, i64 0
+ %gep1 = getelementptr i64* %alloca, i64 1
+ %gep2 = getelementptr i64* %alloca, i64 2
+ %gep3 = getelementptr i64* %alloca, i64 3
+ store i64 24, i64* %gep0, align 8
+ store i64 9334, i64* %gep1, align 8
+ store i64 3935, i64* %gep2, align 8
+ store i64 9342, i64* %gep3, align 8
+ %gep = getelementptr i64* %alloca, i32 %idx
+ %load = load i64* %gep, align 8
+ %mask = and i64 %load, 4294967296
+ %add = add i64 %mask, -1
+ store i64 %add, i64 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
index 43231df..dec6185 100644
--- a/test/CodeGen/R600/smrd.ll
+++ b/test/CodeGen/R600/smrd.ll
@@ -2,7 +2,7 @@
; SMRD load with an immediate offset.
; CHECK-LABEL: @smrd0
-; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 1 ; encoding: [0x01
+; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32 addrspace(2)* %ptr, i64 1
@@ -13,7 +13,7 @@ entry:
; SMRD load with the largest possible immediate offset.
; CHECK-LABEL: @smrd1
-; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 255 ; encoding: [0xff
+; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32 addrspace(2)* %ptr, i64 255
@@ -24,7 +24,7 @@ entry:
; SMRD load with an offset greater than the largest possible immediate.
; CHECK-LABEL: @smrd2
-; CHECK: S_MOV_B32 s[[OFFSET:[0-9]]], 1024
+; CHECK: S_MOV_B32 s[[OFFSET:[0-9]]], 0x400
; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
@@ -34,9 +34,27 @@ entry:
ret void
}
+; SMRD load with a 64-bit offset
+; CHECK-LABEL: @smrd3
+; CHECK-DAG: S_MOV_B32 s[[SHI:[0-9]+]], 4
+; CHECK-DAG: S_MOV_B32 s[[SLO:[0-9]+]], 0
+; FIXME: We don't need to copy these values to VGPRs
+; CHECK-DAG: V_MOV_B32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; CHECK-DAG: V_MOV_B32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; FIXME: We should be able to use S_LOAD_DWORD here
+; BUFFER_LOAD_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v[[[VLO]]:[[VHI]]] + 0x0
+
+define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
+ %1 = load i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
; SMRD load using the load.const intrinsic with an immediate offset
; CHECK-LABEL: @smrd_load_const0
-; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 4 ; encoding: [0x04
+; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
main_body:
%20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -49,7 +67,7 @@ main_body:
; SMRD load using the load.const intrinsic with an offset greater than the largest possible
; immediate offset.
; CHECK-LABEL: @smrd_load_const1
-; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 255 ; encoding: [0xff
+; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
main_body:
%20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
diff --git a/test/CodeGen/R600/store-v3i64.ll b/test/CodeGen/R600/store-v3i64.ll
index 58229f6..58d28b5 100644
--- a/test/CodeGen/R600/store-v3i64.ll
+++ b/test/CodeGen/R600/store-v3i64.ll
@@ -1,5 +1,5 @@
; XFAIL: *
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI
; SI-LABEL: @global_store_v3i64:
; SI: BUFFER_STORE_DWORDX4
diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll
index 3af7d91..41c5edc 100644
--- a/test/CodeGen/R600/store-vector-ptrs.ll
+++ b/test/CodeGen/R600/store-vector-ptrs.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
; XFAIL: *
-; RUN: llc -march=r600 -mcpu=SI < %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s
define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
%p = getelementptr <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index a3c5331..c0c8ccc 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -177,6 +177,26 @@ entry:
ret void
}
+; FUNC-LABEL: @store_i64_i8
+; EG-CHECK: MEM_RAT MSKOR
+; SI-CHECK: BUFFER_STORE_BYTE
+define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i8
+ store i8 %0, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @store_i64_i16
+; EG-CHECK: MEM_RAT MSKOR
+; SI-CHECK: BUFFER_STORE_SHORT
+define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i16
+ store i16 %0, i16 addrspace(1)* %out
+ ret void
+}
+
;===------------------------------------------------------------------------===;
; Local Address Space
;===------------------------------------------------------------------------===;
@@ -272,6 +292,26 @@ entry:
ret void
}
+; FUNC-LABEL: @store_local_i64_i8
+; EG-CHECK: LDS_BYTE_WRITE
+; SI-CHECK: DS_WRITE_B8
+define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i8
+ store i8 %0, i8 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: @store_local_i64_i16
+; EG-CHECK: LDS_SHORT_WRITE
+; SI-CHECK: DS_WRITE_B16
+define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i16
+ store i16 %0, i16 addrspace(3)* %out
+ ret void
+}
+
; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer
; should not try to split the 64-bit store back into 2 32-bit stores.
@@ -297,3 +337,29 @@ entry:
}
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; When i128 was a legal type, this program generated "cannot select" errors:
+
+; FUNC-LABEL: @i128-const-store
+; FIXME: We should be able to do this with one store instruction
+; EG-CHECK: STORE_RAW
+; EG-CHECK: STORE_RAW
+; EG-CHECK: STORE_RAW
+; EG-CHECK: STORE_RAW
+; CM-CHECK: STORE_DWORD
+; CM-CHECK: STORE_DWORD
+; CM-CHECK: STORE_DWORD
+; CM-CHECK: STORE_DWORD
+; SI: BUFFER_STORE_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+define void @i128-const-store(i32 addrspace(1)* %out) {
+entry:
+ store i32 1, i32 addrspace(1)* %out, align 4
+ %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
+ store i32 1, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i32 addrspace(1)* %out, i64 2
+ store i32 2, i32 addrspace(1)* %arrayidx4, align 4
+ %arrayidx6 = getelementptr inbounds i32 addrspace(1)* %out, i64 3
+ store i32 2, i32 addrspace(1)* %arrayidx6, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index 5fdd2b8..e321ed6 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll
@@ -1,13 +1,12 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-;EG-CHECK: @test2
-;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;FUNC-LABEL: @test2
+;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @test2
-;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +17,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
ret void
}
-;EG-CHECK: @test4
-;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;FUNC-LABEL: @test4
+;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @test4
-;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -38,3 +36,24 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
+
+;FUNC-LABEL: @test5
+
+;EG-DAG: SETGE_UINT
+;EG-DAG: CNDE_INT
+;EG-DAG: SUB_INT
+;EG-DAG: SUB_INT
+;EG-DAG: SUB_INT
+
+;SI: S_XOR_B64
+;SI-DAG: S_ADD_I32
+;SI-DAG: S_ADDC_U32
+;SI-DAG: S_ADD_I32
+;SI-DAG: S_ADDC_U32
+
+define void @test5(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = sub i64 %a, %b
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/trunc-store-i1.ll b/test/CodeGen/R600/trunc-store-i1.ll
index a888943..a3975c8 100644
--- a/test/CodeGen/R600/trunc-store-i1.ll
+++ b/test/CodeGen/R600/trunc-store-i1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: @global_truncstore_i32_to_i1
diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
index 8a759dc..31cdfcd 100644
--- a/test/CodeGen/R600/trunc.ll
+++ b/test/CodeGen/R600/trunc.ll
@@ -3,7 +3,7 @@
define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
; SI-LABEL: @trunc_i64_to_i32_store
-; SI: S_LOAD_DWORD s0, s[0:1], 11
+; SI: S_LOAD_DWORD s0, s[0:1], 0xb
; SI: V_MOV_B32_e32 v0, s0
; SI: BUFFER_STORE_DWORD v0
@@ -31,8 +31,9 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
; SI-LABEL: @trunc_shl_i64:
; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}},
-; SI: V_ADD_I32_e32 v[[LO_ADD:[0-9]+]], s[[LO_SREG]],
-; SI: V_LSHL_B64 v{{\[}}[[LO_VREG:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2
+; SI: S_ADD_I32 s[[LO_ADD:[0-9]+]], s[[LO_SREG]],
+; SI: S_LSHL_B64 s{{\[}}[[LO_SREG2:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2
+; SI: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
; SI: BUFFER_STORE_DWORD v[[LO_VREG]],
define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
%aa = add i64 %a, 234 ; Prevent shrinking store.
diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll
new file mode 100644
index 0000000..3b69687
--- /dev/null
+++ b/test/CodeGen/R600/uaddo.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+
+; SI-LABEL: @uaddo_i64_zext
+; SI: ADD
+; SI: ADDC
+; SI: ADDC
+define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %uadd, 0
+ %carry = extractvalue { i64, i1 } %uadd, 1
+ %ext = zext i1 %carry to i64
+ %add2 = add i64 %val, %ext
+ store i64 %add2, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll
new file mode 100644
index 0000000..a71315a
--- /dev/null
+++ b/test/CodeGen/R600/udivrem64.ll
@@ -0,0 +1,82 @@
+;XUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+
+;FUNC-LABEL: @test_udiv
+;EG: RECIP_UINT
+;EG: LSHL {{.*}}, 1,
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;SI: S_ENDPGM
+define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %result = udiv i64 %x, %y
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: @test_urem
+;EG: RECIP_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: AND_INT {{.*}}, 1,
+;SI: S_ENDPGM
+define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %result = urem i64 %x, %y
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
new file mode 100644
index 0000000..75150c2
--- /dev/null
+++ b/test/CodeGen/R600/uint_to_fp.f64.ll
@@ -0,0 +1,9 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @uint_to_fp_f64_i32
+; SI: V_CVT_F64_U32_e32
+define void @uint_to_fp_f64_i32(double addrspace(1)* %out, i32 %in) {
+ %cast = uitofp i32 %in to double
+ store double %cast, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll
index 2824ff8..4df69d1 100644
--- a/test/CodeGen/R600/unaligned-load-store.ll
+++ b/test/CodeGen/R600/unaligned-load-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: @unaligned_load_store_i32:
; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
diff --git a/test/CodeGen/R600/v_cndmask.ll b/test/CodeGen/R600/v_cndmask.ll
index f8e9655..84087ee 100644
--- a/test/CodeGen/R600/v_cndmask.ll
+++ b/test/CodeGen/R600/v_cndmask.ll
@@ -3,7 +3,8 @@
; SI: @v_cnd_nan
; SI: V_CNDMASK_B32_e64 v{{[0-9]}},
; SI-DAG: v{{[0-9]}}
-; SI-DAG: {{nan|#QNAN}}
+; All nan values are converted to 0xffffffff
+; SI-DAG: -1
define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) {
entry:
%0 = icmp ne i32 %c, 0
diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll
new file mode 100644
index 0000000..5d5e3ff
--- /dev/null
+++ b/test/CodeGen/R600/valu-i1.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI %s
+
+; Make sure the i1 values created by the cfg structurizer pass are
+; moved using VALU instructions
+; SI-NOT: S_MOV_B64 s[{{[0-9]:[0-9]}}], -1
+; SI: V_MOV_B32_e32 v{{[0-9]}}, -1
+define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) {
+entry:
+ switch i32 %a, label %default [
+ i32 0, label %case0
+ i32 1, label %case1
+ ]
+
+case0:
+ %arrayidx1 = getelementptr i32 addrspace(1)* %dst, i32 %b
+ store i32 0, i32 addrspace(1)* %arrayidx1, align 4
+ br label %end
+
+case1:
+ %arrayidx5 = getelementptr i32 addrspace(1)* %dst, i32 %b
+ store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+ br label %end
+
+default:
+ %cmp8 = icmp eq i32 %a, 2
+ %arrayidx10 = getelementptr i32 addrspace(1)* %dst, i32 %b
+ br i1 %cmp8, label %if, label %else
+
+if:
+ store i32 2, i32 addrspace(1)* %arrayidx10, align 4
+ br label %end
+
+else:
+ store i32 3, i32 addrspace(1)* %arrayidx10, align 4
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 9618d7f..90079b0 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -19,7 +19,7 @@ entry:
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; R600-CHECK: MOV [[VAL]], KC0[0].Y
; SI-CHECK: @ngroups_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 1
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x1
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @ngroups_y (i32 addrspace(1)* %out) {
@@ -33,7 +33,7 @@ entry:
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; R600-CHECK: MOV [[VAL]], KC0[0].Z
; SI-CHECK: @ngroups_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 2
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x2
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @ngroups_z (i32 addrspace(1)* %out) {
@@ -47,7 +47,7 @@ entry:
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; R600-CHECK: MOV [[VAL]], KC0[0].W
; SI-CHECK: @global_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 3
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x3
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @global_size_x (i32 addrspace(1)* %out) {
@@ -61,7 +61,7 @@ entry:
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; R600-CHECK: MOV [[VAL]], KC0[1].X
; SI-CHECK: @global_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 4
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x4
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @global_size_y (i32 addrspace(1)* %out) {
@@ -75,7 +75,7 @@ entry:
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; R600-CHECK: MOV [[VAL]], KC0[1].Y
; SI-CHECK: @global_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 5
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x5
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @global_size_z (i32 addrspace(1)* %out) {
@@ -89,7 +89,7 @@ entry:
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; R600-CHECK: MOV [[VAL]], KC0[1].Z
; SI-CHECK: @local_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 6
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x6
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @local_size_x (i32 addrspace(1)* %out) {
@@ -103,7 +103,7 @@ entry:
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; R600-CHECK: MOV [[VAL]], KC0[1].W
; SI-CHECK: @local_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 7
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x7
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @local_size_y (i32 addrspace(1)* %out) {
@@ -117,7 +117,7 @@ entry:
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; R600-CHECK: MOV [[VAL]], KC0[2].X
; SI-CHECK: @local_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 8
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x8
; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @local_size_z (i32 addrspace(1)* %out) {
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index 49ed12d..5a5c86d 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -72,3 +72,21 @@ define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+; SI-CHECK-LABEL: @scalar_not_i32
+; SI-CHECK: S_NOT_B32
+define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
+ %result = xor i32 %a, -1
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-CHECK-LABEL: @vector_not_i32
+; SI-CHECK: V_NOT_B32
+define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %a = load i32 addrspace(1)* %in0
+ %b = load i32 addrspace(1)* %in1
+ %result = xor i32 %a, -1
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index a114bfc..8585d4a 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -6,8 +6,9 @@
; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
; SI-CHECK: @test
-; SI-CHECK: V_MOV_B32_e32 v[[ZERO:[0-9]]], 0
-; SI-CHECK: BUFFER_STORE_DWORDX2 v[0:[[ZERO]]{{\]}}
+; SI-CHECK: S_MOV_B32 [[ZERO:s[0-9]]], 0
+; SI-CHECK: V_MOV_B32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
+; SI-CHECK: BUFFER_STORE_DWORDX2 v[0:[[V_ZERO]]{{\]}}
define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%0 = mul i32 %a, %b
@@ -26,3 +27,14 @@ entry:
store i32 %1, i32 addrspace(1)* %out
ret void
}
+
+; SI-CHECK-LABEL: @zext_i1_to_i64
+; SI-CHECK: V_CMP_EQ_I32
+; SI-CHECK: V_CNDMASK_B32
+; SI-CHECK: S_MOV_B32 s{{[0-9]+}}, 0
+define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp eq i32 %a, %b
+ %ext = zext i1 %cmp to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/SPARC/2011-01-11-FrameAddr.ll b/test/CodeGen/SPARC/2011-01-11-FrameAddr.ll
index 050b76d..1c8e7d8 100644
--- a/test/CodeGen/SPARC/2011-01-11-FrameAddr.ll
+++ b/test/CodeGen/SPARC/2011-01-11-FrameAddr.ll
@@ -60,13 +60,13 @@ declare i8* @llvm.frameaddress(i32) nounwind readnone
define i8* @retaddr() nounwind readnone {
entry:
;V8-LABEL: retaddr:
-;V8: or %g0, %o7, {{.+}}
+;V8: mov %o7, {{.+}}
;V9-LABEL: retaddr:
-;V9: or %g0, %o7, {{.+}}
+;V9: mov %o7, {{.+}}
;SPARC64-LABEL: retaddr
-;SPARC64: or %g0, %o7, {{.+}}
+;SPARC64: mov %o7, {{.+}}
%0 = tail call i8* @llvm.returnaddress(i32 0)
ret i8* %0
diff --git a/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll b/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
index 60bdf06..8a3edc6 100644
--- a/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
+++ b/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
@@ -1,6 +1,7 @@
;RUN: llc -march=sparc < %s -verify-machineinstrs | FileCheck %s
;RUN: llc -march=sparc -O0 < %s -verify-machineinstrs | FileCheck %s -check-prefix=UNOPT
+target triple = "sparc-unknown-linux-gnu"
define i32 @test(i32 %a) nounwind {
entry:
@@ -59,7 +60,7 @@ entry:
;CHECK: !NO_APP
;CHECK-NEXT: cmp
;CHECK-NEXT: bg
-;CHECK-NEXT: or
+;CHECK-NEXT: mov
tail call void asm sideeffect "sethi 0, %g0", ""() nounwind
%0 = icmp slt i32 %a, 0
br i1 %0, label %bb, label %bb1
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index 3771888..a88e19a5 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -44,7 +44,7 @@ define void @intarg(i8 %a0, ; %i0
; CHECK: sra %i0, 0, [[R:%[gilo][0-7]]]
; CHECK: stx [[R]], [%sp+2223]
; Use %o0-%o5 for outgoing arguments
-; CHECK: or %g0, 5, %o5
+; CHECK: mov 5, %o5
; CHECK: call intarg
; CHECK-NOT: add %sp
; CHECK: restore
@@ -208,7 +208,7 @@ define i32 @inreg_if(float inreg %a0, ; %f0
; CHECK: call_inreg_if
; CHECK: fmovs %f3, %f0
-; CHECK: or %g0, %i2, %o0
+; CHECK: mov %i2, %o0
; CHECK: call inreg_if
define void @call_inreg_if(i32* %p, float %f3, i32 %i2) {
%x = call i32 @inreg_if(float %f3, i32 %i2)
diff --git a/test/CodeGen/SPARC/64bit.ll b/test/CodeGen/SPARC/64bit.ll
index 7ab19f3..b18f1bc 100644
--- a/test/CodeGen/SPARC/64bit.ll
+++ b/test/CodeGen/SPARC/64bit.ll
@@ -2,11 +2,11 @@
; RUN: llc < %s -march=sparcv9 -mattr=+popc | FileCheck %s -check-prefix=OPT
; CHECK-LABEL: ret2:
-; CHECK: or %g0, %i1, %i0
+; CHECK: mov %i1, %i0
; OPT-LABEL: ret2:
; OPT: retl
-; OPT: or %g0, %o1, %o0
+; OPT: mov %o1, %o0
define i64 @ret2(i64 %a, i64 %b) {
ret i64 %b
}
@@ -39,21 +39,21 @@ define i64 @sra_reg(i64 %a, i64 %b) {
; restore %g0, %g0, %o0
;
; CHECK: ret_imm0
-; CHECK: or %g0, 0, %i0
+; CHECK: mov 0, %i0
; OPT: ret_imm0
; OPT: retl
-; OPT: or %g0, 0, %o0
+; OPT: mov 0, %o0
define i64 @ret_imm0() {
ret i64 0
}
; CHECK: ret_simm13
-; CHECK: or %g0, -4096, %i0
+; CHECK: mov -4096, %i0
; OPT: ret_simm13
; OPT: retl
-; OPT: or %g0, -4096, %o0
+; OPT: mov -4096, %o0
define i64 @ret_simm13() {
ret i64 -4096
}
diff --git a/test/CodeGen/SPARC/64cond.ll b/test/CodeGen/SPARC/64cond.ll
index 1bd17a4..e491d61 100644
--- a/test/CodeGen/SPARC/64cond.ll
+++ b/test/CodeGen/SPARC/64cond.ll
@@ -112,9 +112,9 @@ entry:
; CHECK-LABEL: setcc_resultty
; CHECK-DAG: srax %i0, 63, %o0
-; CHECK-DAG: or %g0, %i0, %o1
-; CHECK-DAG: or %g0, 0, %o2
-; CHECK-DAG: or %g0, 32, %o3
+; CHECK-DAG: mov %i0, %o1
+; CHECK-DAG: mov 0, %o2
+; CHECK-DAG: mov 32, %o3
; CHECK-DAG: call __multi3
; CHECK: cmp
; CHECK: movne %xcc, 1, [[R:%[gilo][0-7]]]
diff --git a/test/CodeGen/SPARC/atomics.ll b/test/CodeGen/SPARC/atomics.ll
index 4e3e7ae..5e41300 100644
--- a/test/CodeGen/SPARC/atomics.ll
+++ b/test/CodeGen/SPARC/atomics.ll
@@ -33,7 +33,7 @@ entry:
}
; CHECK-LABEL: test_cmpxchg_i32
-; CHECK: or %g0, 123, [[R:%[gilo][0-7]]]
+; CHECK: mov 123, [[R:%[gilo][0-7]]]
; CHECK: cas [%o1], %o0, [[R]]
define i32 @test_cmpxchg_i32(i32 %a, i32* %ptr) {
@@ -43,7 +43,7 @@ entry:
}
; CHECK-LABEL: test_cmpxchg_i64
-; CHECK: or %g0, 123, [[R:%[gilo][0-7]]]
+; CHECK: mov 123, [[R:%[gilo][0-7]]]
; CHECK: casx [%o1], %o0, [[R]]
define i64 @test_cmpxchg_i64(i64 %a, i64* %ptr) {
@@ -53,7 +53,7 @@ entry:
}
; CHECK-LABEL: test_swap_i32
-; CHECK: or %g0, 42, [[R:%[gilo][0-7]]]
+; CHECK: mov 42, [[R:%[gilo][0-7]]]
; CHECK: swap [%o1], [[R]]
define i32 @test_swap_i32(i32 %a, i32* %ptr) {
diff --git a/test/CodeGen/SPARC/exception.ll b/test/CodeGen/SPARC/exception.ll
index 3a3f59f..eca9c8b 100644
--- a/test/CodeGen/SPARC/exception.ll
+++ b/test/CodeGen/SPARC/exception.ll
@@ -1,9 +1,7 @@
; RUN: llc < %s -march=sparc -relocation-model=static | FileCheck -check-prefix=V8ABS %s
; RUN: llc < %s -march=sparc -relocation-model=pic | FileCheck -check-prefix=V8PIC %s
-; RUN: llc < %s -march=sparc -relocation-model=pic -disable-cfi | FileCheck -check-prefix=V8PIC_NOCFI %s
; RUN: llc < %s -march=sparcv9 -relocation-model=static | FileCheck -check-prefix=V9ABS %s
; RUN: llc < %s -march=sparcv9 -relocation-model=pic | FileCheck -check-prefix=V9PIC %s
-; RUN: llc < %s -march=sparcv9 -relocation-model=pic -disable-cfi | FileCheck -check-prefix=V9PIC_NOCFI %s
%struct.__fundamental_type_info_pseudo = type { %struct.__type_info_pseudo }
@@ -47,22 +45,6 @@
; V8PIC: .L_ZTIi.DW.stub:
; V8PIC-NEXT: .word _ZTIi
-; V8PIC_NOCFI-LABEL: main:
-; V8PIC_NOCFI: .section .gcc_except_table
-; V8PIC_NOCFI-NOT: .section
-; V8PIC_NOCFI: .word %r_disp32(.L_ZTIi.DW.stub)
-; V8PIC_NOCFI: .data
-; V8PIC_NOCFI: .L_ZTIi.DW.stub:
-; V8PIC_NOCFI-NEXT: .word _ZTIi
-; V8PIC_NOCFI: .section .eh_frame
-; V8PIC_NOCFI-NOT: .section
-; V8PIC_NOCFI: .byte 15 ! CIE Return Address Column
-; V8PIC_NOCFI: .word %r_disp32(DW.ref.__gxx_personality_v0)
-; V8PIC_NOCFI: .byte 12 ! DW_CFA_def_cfa
-; V8PIC_NOCFI: .byte 14 ! Reg 14
-; V8PIC_NOCFI-NEXT: .byte 0 ! Offset 0
-; V8PIC_NOCFI: .word %r_disp32(.Ltmp{{.+}}) ! FDE initial location
-
; V9ABS-LABEL: main:
; V9ABS: .cfi_startproc
@@ -89,22 +71,6 @@
; V9PIC: .L_ZTIi.DW.stub:
; V9PIC-NEXT: .xword _ZTIi
-; V9PIC_NOCFI-LABEL: main:
-; V9PIC_NOCFI: .section .gcc_except_table
-; V9PIC_NOCFI-NOT: .section
-; V9PIC_NOCFI: .word %r_disp32(.L_ZTIi.DW.stub)
-; V9PIC_NOCFI: .data
-; V9PIC_NOCFI: .L_ZTIi.DW.stub:
-; V9PIC_NOCFI-NEXT: .xword _ZTIi
-; V9PIC_NOCFI: .section .eh_frame
-; V9PIC_NOCFI-NOT: .section
-; V9PIC_NOCFI: .byte 15 ! CIE Return Address Column
-; V9PIC_NOCFI: .word %r_disp32(DW.ref.__gxx_personality_v0)
-; V9PIC_NOCFI: .byte 12 ! DW_CFA_def_cfa
-; V9PIC_NOCFI-NEXT: .byte 14 ! Reg 14
-; V9PIC_NOCFI: .ascii "\377\017" ! Offset 2047
-; V9PIC_NOCFI: .word %r_disp32(.Ltmp{{.+}}) ! FDE initial location
-
define i32 @main(i32 %argc, i8** nocapture readnone %argv) unnamed_addr #0 {
entry:
%0 = icmp eq i32 %argc, 2
diff --git a/test/CodeGen/SPARC/leafproc.ll b/test/CodeGen/SPARC/leafproc.ll
index 963fac0..abb8ed9 100644
--- a/test/CodeGen/SPARC/leafproc.ll
+++ b/test/CodeGen/SPARC/leafproc.ll
@@ -11,7 +11,7 @@ entry:
; CHECK-LABEL: return_int_const:
; CHECK: retl
-; CHECK-NEXT: or %g0, 1729, %o0
+; CHECK-NEXT: mov 1729, %o0
define i32 @return_int_const() {
entry:
ret i32 1729
@@ -58,9 +58,9 @@ entry:
; CHECK-LABEL: leaf_proc_with_local_array:
; CHECK: add %sp, -104, %sp
-; CHECK: or %g0, 1, [[R1:%[go][0-7]]]
+; CHECK: mov 1, [[R1:%[go][0-7]]]
; CHECK: st [[R1]], [%sp+96]
-; CHECK: or %g0, 2, [[R2:%[go][0-7]]]
+; CHECK: mov 2, [[R2:%[go][0-7]]]
; CHECK: st [[R2]], [%sp+100]
; CHECK: ld {{.+}}, %o0
; CHECK: retl
diff --git a/test/CodeGen/SPARC/parts.ll b/test/CodeGen/SPARC/parts.ll
index 57add49..47feb15 100644
--- a/test/CodeGen/SPARC/parts.ll
+++ b/test/CodeGen/SPARC/parts.ll
@@ -2,10 +2,10 @@
; CHECK-LABEL: test
; CHECK: srl %i1, 0, %o2
-; CHECK-NEXT: or %g0, %i2, %o0
+; CHECK-NEXT: mov %i2, %o0
; CHECK-NEXT: call __ashlti3
-; CHECK-NEXT: or %g0, %i3, %o1
-; CHECK-NEXT: or %g0, %o0, %i0
+; CHECK-NEXT: mov %i3, %o1
+; CHECK-NEXT: mov %o0, %i0
define i128 @test(i128 %a, i128 %b) {
entry:
diff --git a/test/CodeGen/SPARC/sret-secondary.ll b/test/CodeGen/SPARC/sret-secondary.ll
new file mode 100644
index 0000000..4efcabf
--- /dev/null
+++ b/test/CodeGen/SPARC/sret-secondary.ll
@@ -0,0 +1,8 @@
+; RUN: not llc -march=sparc < %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: sparc only supports sret on the first parameter
+
+define void @foo(i32 %a, i32* sret %out) {
+ store i32 %a, i32* %out
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/alias-01.ll b/test/CodeGen/SystemZ/alias-01.ll
index dc90481..8839aad 100644
--- a/test/CodeGen/SystemZ/alias-01.ll
+++ b/test/CodeGen/SystemZ/alias-01.ll
@@ -2,9 +2,6 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; The use of TBAA in CodeGen has been temporarily disabled pending correctness fixes.
-; XFAIL: *
-
; Check that there are no spills.
define void @f1(<16 x i32> *%src1, <16 x float> *%dest) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb/2009-06-18-ThumbCommuteMul.ll b/test/CodeGen/Thumb/2009-06-18-ThumbCommuteMul.ll
index 5c883b3..ca6df7c 100644
--- a/test/CodeGen/Thumb/2009-06-18-ThumbCommuteMul.ll
+++ b/test/CodeGen/Thumb/2009-06-18-ThumbCommuteMul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb | grep r0 | count 1
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
define i32 @a(i32 %x, i32 %y) nounwind readnone {
entry:
@@ -6,3 +6,5 @@ entry:
ret i32 %mul
}
+; CHECK: r0
+
diff --git a/test/CodeGen/Thumb/2010-06-18-SibCallCrash.ll b/test/CodeGen/Thumb/2010-06-18-SibCallCrash.ll
index ad8b064..e1efd3b 100644
--- a/test/CodeGen/Thumb/2010-06-18-SibCallCrash.ll
+++ b/test/CodeGen/Thumb/2010-06-18-SibCallCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=thumb < %s
+; RUN: llc -mtriple=thumb-eabi %s -o /dev/null
; rdar://8104457
define arm_apcscc void @t(i32* %m) nounwind {
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index b87bf24..ffc9584 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -151,5 +151,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!98 = metadata !{i32 52, i32 0, metadata !1, null}
!101 = metadata !{metadata !"ggEdgeDiscrepancy.cc", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
!102 = metadata !{i32 0}
-!103 = metadata !{metadata !3}
+!103 = metadata !{metadata !3, metadata !77}
!104 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/Thumb/DbgValueOtherTargets.test b/test/CodeGen/Thumb/DbgValueOtherTargets.test
index afb18a4..557892b 100644
--- a/test/CodeGen/Thumb/DbgValueOtherTargets.test
+++ b/test/CodeGen/Thumb/DbgValueOtherTargets.test
@@ -1 +1 @@
-RUN: llc -O0 -march=thumb -asm-verbose < %S/../Inputs/DbgValueOtherTargets.ll | FileCheck %S/../Inputs/DbgValueOtherTargets.ll
+RUN: llc -O0 -mtriple=thumb-eabi -asm-verbose %S/../Inputs/DbgValueOtherTargets.ll -o - | FileCheck %S/../Inputs/DbgValueOtherTargets.ll
diff --git a/test/CodeGen/Thumb/barrier.ll b/test/CodeGen/Thumb/barrier.ll
index 1c27fa0..92d9bb2 100644
--- a/test/CodeGen/Thumb/barrier.ll
+++ b/test/CodeGen/Thumb/barrier.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=thumbv6-apple-darwin | FileCheck %s -check-prefix=V6
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=-db | FileCheck %s -check-prefix=V6
-; RUN: llc < %s -march=thumb -mcpu=cortex-m0 | FileCheck %s -check-prefix=V6M
+; RUN: llc -mtriple=thumbv6-apple-darwin %s -o - | FileCheck %s -check-prefix=V6
+; RUN: llc -mtriple=thumbv7-apple-darwin -mattr=-db %s -o - | FileCheck %s -check-prefix=V6
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m0 %s -o - | FileCheck %s -check-prefix=V6M
define void @t1() {
; V6-LABEL: t1:
diff --git a/test/CodeGen/Thumb/dyn-stackalloc.ll b/test/CodeGen/Thumb/dyn-stackalloc.ll
index 6c6de55..6bc39af 100644
--- a/test/CodeGen/Thumb/dyn-stackalloc.ll
+++ b/test/CodeGen/Thumb/dyn-stackalloc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra | FileCheck %s
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra | FileCheck %s -check-prefix=CHECK -check-prefix=RA_GREEDY
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic | FileCheck %s -check-prefix=CHECK -check-prefix=RA_BASIC
%struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
%struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
@@ -45,7 +45,8 @@ define void @t2(%struct.comment* %vc, i8* %tag, i8* %contents) {
; CHECK: sub sp, #
; CHECK: mov r[[R0:[0-9]+]], sp
; CHECK: str r{{[0-9+]}}, [r[[R0]]
-; CHECK: str r{{[0-9+]}}, [r[[R0]]
+; RA_GREEDY: str r{{[0-9+]}}, [r[[R0]]
+; RA_BASIC: stm r[[R0]]!
; CHECK-NOT: ldr r0, [sp
; CHECK: mov r[[R1:[0-9]+]], sp
; CHECK: subs r[[R2:[0-9]+]], r[[R1]], r{{[0-9]+}}
diff --git a/test/CodeGen/Thumb/fpconv.ll b/test/CodeGen/Thumb/fpconv.ll
index 7da36dd..0ade798 100644
--- a/test/CodeGen/Thumb/fpconv.ll
+++ b/test/CodeGen/Thumb/fpconv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb
+; RUN: llc -mtriple=thumb-eabi %s -o /dev/null
define float @f1(double %x) {
entry:
diff --git a/test/CodeGen/Thumb/fpow.ll b/test/CodeGen/Thumb/fpow.ll
index be3dc0b..18b1c91 100644
--- a/test/CodeGen/Thumb/fpow.ll
+++ b/test/CodeGen/Thumb/fpow.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb
+; RUN: llc -mtriple=thumb-eabi %s -o /dev/null
define double @t(double %x, double %y) nounwind optsize {
entry:
diff --git a/test/CodeGen/Thumb/inlineasm-imm-thumb.ll b/test/CodeGen/Thumb/inlineasm-imm-thumb.ll
index d557b9d..4e4f8fa 100644
--- a/test/CodeGen/Thumb/inlineasm-imm-thumb.ll
+++ b/test/CodeGen/Thumb/inlineasm-imm-thumb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -no-integrated-as
+; RUN: llc -mtriple=thumb-eabi -no-integrated-as %s -o /dev/null
; Test Thumb-mode "I" constraint, for ADD immediate.
define i32 @testI(i32 %x) {
diff --git a/test/CodeGen/Thumb/inlineasm-thumb.ll b/test/CodeGen/Thumb/inlineasm-thumb.ll
index f2683c8..2547ce8 100644
--- a/test/CodeGen/Thumb/inlineasm-thumb.ll
+++ b/test/CodeGen/Thumb/inlineasm-thumb.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=thumb | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
+
define i32 @t1(i32 %x, i32 %y) nounwind {
entry:
; CHECK: mov r0, r12
diff --git a/test/CodeGen/Thumb/ispositive.ll b/test/CodeGen/Thumb/ispositive.ll
index 7b28227..8d39687 100644
--- a/test/CodeGen/Thumb/ispositive.ll
+++ b/test/CodeGen/Thumb/ispositive.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
define i32 @test1(i32 %X) {
entry:
diff --git a/test/CodeGen/Thumb/ldr_ext.ll b/test/CodeGen/Thumb/ldr_ext.ll
index 9a28124..2d25af3 100644
--- a/test/CodeGen/Thumb/ldr_ext.ll
+++ b/test/CodeGen/Thumb/ldr_ext.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb | FileCheck %s -check-prefix=V5
-; RUN: llc < %s -march=thumb -mattr=+v6 | FileCheck %s -check-prefix=V6
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s -check-prefix=V5
+; RUN: llc -mtriple=thumb-eabi -mattr=+v6 %s -o - | FileCheck %s -check-prefix=V6
; rdar://7176514
diff --git a/test/CodeGen/Thumb/ldr_frame.ll b/test/CodeGen/Thumb/ldr_frame.ll
index 6c58638..0e879d7 100644
--- a/test/CodeGen/Thumb/ldr_frame.ll
+++ b/test/CodeGen/Thumb/ldr_frame.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
define i32 @f1() {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb/long-setcc.ll b/test/CodeGen/Thumb/long-setcc.ll
index 8f2d98f..3460edb 100644
--- a/test/CodeGen/Thumb/long-setcc.ll
+++ b/test/CodeGen/Thumb/long-setcc.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=thumb | grep cmp | count 1
-
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
define i1 @t1(i64 %x) {
%B = icmp slt i64 %x, 0
@@ -15,3 +14,9 @@ define i1 @t3(i32 %x) {
%tmp = icmp ugt i32 %x, -1
ret i1 %tmp
}
+
+; CHECK: cmp
+; CHECK-NOT: cmp
+
+
+
diff --git a/test/CodeGen/Thumb/long.ll b/test/CodeGen/Thumb/long.ll
index 197e19e..2449e5a 100644
--- a/test/CodeGen/Thumb/long.ll
+++ b/test/CodeGen/Thumb/long.ll
@@ -1,10 +1,5 @@
-; RUN: llc < %s -march=thumb | \
-; RUN: grep mvn | count 1
-; RUN: llc < %s -march=thumb | \
-; RUN: grep adc | count 1
-; RUN: llc < %s -march=thumb | \
-; RUN: grep sbc | count 1
-; RUN: llc < %s -mtriple=thumb-apple-darwin | grep __muldi3
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-apple-darwin %s -o - | FileCheck %s -check-prefix CHECK-DARWIN
define i64 @f1() {
entry:
@@ -74,3 +69,14 @@ entry:
ret i64 %retval
}
+; CHECK: mvn
+; CHECK-NOT: mvn
+
+; CHECK: adc
+; CHECK-NOT: adc
+
+; CHECK: sbc
+; CHECK-NOT: sbc
+
+; CHECK-DARWIN: __muldi3
+
diff --git a/test/CodeGen/Thumb/long_shift.ll b/test/CodeGen/Thumb/long_shift.ll
index 2431714..6aa1afd 100644
--- a/test/CodeGen/Thumb/long_shift.ll
+++ b/test/CodeGen/Thumb/long_shift.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb
+; RUN: llc -mtriple=thumb-eabi %s -o /dev/null
define i64 @f0(i64 %A, i64 %B) {
%tmp = bitcast i64 %A to i64
diff --git a/test/CodeGen/Thumb/mul.ll b/test/CodeGen/Thumb/mul.ll
index c1a2fb2..13a2cfb 100644
--- a/test/CodeGen/Thumb/mul.ll
+++ b/test/CodeGen/Thumb/mul.ll
@@ -1,22 +1,32 @@
-; RUN: llc < %s -march=thumb | grep mul | count 3
-; RUN: llc < %s -march=thumb | grep lsl | count 1
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
define i32 @f1(i32 %u) {
%tmp = mul i32 %u, %u
ret i32 %tmp
}
+; CHECK: mul{{s?}}
+
define i32 @f2(i32 %u, i32 %v) {
%tmp = mul i32 %u, %v
ret i32 %tmp
}
+; CHECK: mul{{s?}}
+
define i32 @f3(i32 %u) {
%tmp = mul i32 %u, 5
ret i32 %tmp
}
+; CHECK: mul{{s?}}
+
define i32 @f4(i32 %u) {
%tmp = mul i32 %u, 4
ret i32 %tmp
}
+
+; CHECK: lsl
+; CHECK-NOT: mul{{s?}}
+; CHECK-NOT: lsl
+
diff --git a/test/CodeGen/Thumb/rev.ll b/test/CodeGen/Thumb/rev.ll
index dcba00e..3e94702 100644
--- a/test/CodeGen/Thumb/rev.ll
+++ b/test/CodeGen/Thumb/rev.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+v6 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mattr=+v6 %s -o - | FileCheck %s
define i32 @test1(i32 %X) nounwind {
; CHECK: test1
diff --git a/test/CodeGen/Thumb/segmented-stacks-dynamic.ll b/test/CodeGen/Thumb/segmented-stacks-dynamic.ll
index 067c07b..5d51f40 100644
--- a/test/CodeGen/Thumb/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/Thumb/segmented-stacks-dynamic.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-linux
-; RUN: llc < %s -mtriple=thumb-linux-androideabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
-; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks -filetype=obj
-; RUN: llc < %s -mtriple=thumb-linux-androideabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-linux
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
+; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -filetype=obj
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -filetype=obj
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define i32 @test_basic(i32 %l) {
+define i32 @test_basic(i32 %l) #0 {
%mem = alloca i32, i32 %l
call void @dummy_use (i32* %mem, i32 %l)
%terminate = icmp eq i32 %l, 0
@@ -61,3 +61,5 @@ false:
; Thumb-android: pop {r4, r5}
}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/Thumb/segmented-stacks.ll b/test/CodeGen/Thumb/segmented-stacks.ll
index 5649b00..d6e25c7 100644
--- a/test/CodeGen/Thumb/segmented-stacks.ll
+++ b/test/CodeGen/Thumb/segmented-stacks.ll
@@ -1,13 +1,13 @@
-; RUN: llc < %s -mtriple=thumb-linux-androideabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
-; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-linux
-; RUN: llc < %s -mtriple=thumb-linux-androideabi -segmented-stacks -filetype=obj
-; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
+; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-linux
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -filetype=obj
+; RUN: llc < %s -mtriple=thumb-linux-unknown-gnueabi -filetype=obj
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define void @test_basic() {
+define void @test_basic() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -54,9 +54,11 @@ define void @test_basic() {
}
-define i32 @test_nested(i32 * nest %closure, i32 %other) {
+define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
%addend = load i32 * %closure
%result = add i32 %other, %addend
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
ret i32 %result
; Thumb-android: test_nested:
@@ -68,7 +70,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; Thumb-android-NEXT: cmp r4, r5
; Thumb-android-NEXT: blo .LBB1_2
-; Thumb-android: mov r4, #0
+; Thumb-android: mov r4, #56
; Thumb-android-NEXT: mov r5, #0
; Thumb-android-NEXT: push {lr}
; Thumb-android-NEXT: bl __morestack
@@ -88,7 +90,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; Thumb-linux-NEXT: cmp r4, r5
; Thumb-linux-NEXT: blo .LBB1_2
-; Thumb-linux: mov r4, #0
+; Thumb-linux: mov r4, #56
; Thumb-linux-NEXT: mov r5, #0
; Thumb-linux-NEXT: push {lr}
; Thumb-linux-NEXT: bl __morestack
@@ -101,7 +103,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
}
-define void @test_large() {
+define void @test_large() #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 0)
ret void
@@ -150,7 +152,7 @@ define void @test_large() {
}
-define fastcc void @test_fastcc() {
+define fastcc void @test_fastcc() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -197,7 +199,7 @@ define fastcc void @test_fastcc() {
}
-define fastcc void @test_fastcc_large() {
+define fastcc void @test_fastcc_large() #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 0)
ret void
@@ -245,3 +247,15 @@ define fastcc void @test_fastcc_large() {
; Thumb-linux: pop {r4, r5}
}
+
+define void @test_nostack() #0 {
+ ret void
+
+; Thumb-android-LABEL: test_nostack:
+; Thumb-android-NOT: bl __morestack
+
+; Thumb-linux-LABEL: test_nostack:
+; Thumb-linux-NOT: bl __morestack
+}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll b/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll
index 3f6407a..97c66d9 100644
--- a/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll
+++ b/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1022e
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1022e %s -o /dev/null
%iterator = type { i8**, i8**, i8**, i8*** }
%insert_iterator = type { %deque*, %iterator }
diff --git a/test/CodeGen/Thumb/stack-frame.ll b/test/CodeGen/Thumb/stack-frame.ll
index b103b33..09d480a 100644
--- a/test/CodeGen/Thumb/stack-frame.ll
+++ b/test/CodeGen/Thumb/stack-frame.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=thumb
-; RUN: llc < %s -march=thumb | grep add | count 1
+; RUN: llc -mtriple=thumb-eabi < %s -o - | FileCheck %s
define void @f1() {
%c = alloca i8, align 1
@@ -10,4 +9,6 @@ define i32 @f2() {
ret i32 1
}
+; CHECK: add
+; CHECK-NOT: add
diff --git a/test/CodeGen/Thumb/thumb-imm.ll b/test/CodeGen/Thumb/thumb-imm.ll
index 74a57ff..592e694 100644
--- a/test/CodeGen/Thumb/thumb-imm.ll
+++ b/test/CodeGen/Thumb/thumb-imm.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=thumb | not grep CPI
-
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
define i32 @test1() {
ret i32 1000
@@ -8,3 +7,6 @@ define i32 @test1() {
define i32 @test2() {
ret i32 -256
}
+
+; CHECK-NOT: CPI
+
diff --git a/test/CodeGen/Thumb/thumb-ldm.ll b/test/CodeGen/Thumb/thumb-ldm.ll
new file mode 100644
index 0000000..dd98e6f
--- /dev/null
+++ b/test/CodeGen/Thumb/thumb-ldm.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
+
+@X = external global [0 x i32] ; <[0 x i32]*> [#uses=5]
+
+define i32 @t1() {
+; CHECK-LABEL: t1:
+; CHECK: push {r7, lr}
+; CHECK: ldm
+; CHECK: pop {r7, pc}
+ %tmp = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 0) ; <i32> [#uses=1]
+ %tmp3 = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 1) ; <i32> [#uses=1]
+ %tmp4 = call i32 @f1( i32 %tmp, i32 %tmp3 ) ; <i32> [#uses=1]
+ ret i32 %tmp4
+}
+
+define i32 @t2() {
+; CHECK-LABEL: t2:
+; CHECK: push {r7, lr}
+; CHECK: ldm
+; CHECK: pop {r7, pc}
+ %tmp = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 2) ; <i32> [#uses=1]
+ %tmp3 = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 3) ; <i32> [#uses=1]
+ %tmp5 = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 4) ; <i32> [#uses=1]
+ %tmp6 = call i32 @f2( i32 %tmp, i32 %tmp3, i32 %tmp5 ) ; <i32> [#uses=1]
+ ret i32 %tmp6
+}
+
+define i32 @t3() {
+; CHECK-LABEL: t3:
+; CHECK: push {r7, lr}
+; CHECK: ldm
+; CHECK: pop {r7, pc}
+ %tmp = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 1) ; <i32> [#uses=1]
+ %tmp3 = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 2) ; <i32> [#uses=1]
+ %tmp5 = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 3) ; <i32> [#uses=1]
+ %tmp6 = call i32 @f2( i32 %tmp, i32 %tmp3, i32 %tmp5 ) ; <i32> [#uses=1]
+ ret i32 %tmp6
+}
+
+declare i32 @f1(i32, i32)
+
+declare i32 @f2(i32, i32, i32)
diff --git a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
new file mode 100644
index 0000000..06cfd9b
--- /dev/null
+++ b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s
+
+@d = external global [64 x i32]
+@s = external global [64 x i32]
+
+; Function Attrs: nounwind
+define void @t1() #0 {
+entry:
+; CHECK: ldr [[REG0:r[0-9]]],
+; CHECK: ldm [[REG0]]!,
+; CHECK: ldr [[REG1:r[0-9]]],
+; CHECK: stm [[REG1]]!,
+; CHECK: subs [[REG0]], #32
+; CHECK-NEXT: ldrb
+; CHECK: subs [[REG1]], #32
+; CHECK-NEXT: strb
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 33, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @t2() #0 {
+entry:
+; CHECK: ldr [[REG0:r[0-9]]],
+; CHECK: ldm [[REG0]]!,
+; CHECK: ldr [[REG1:r[0-9]]],
+; CHECK: stm [[REG1]]!,
+; CHECK: ldrh
+; CHECK: ldrb
+; CHECK: strb
+; CHECK: strh
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
diff --git a/test/CodeGen/Thumb/trap.ll b/test/CodeGen/Thumb/trap.ll
index e04059c..7d2f6f1 100644
--- a/test/CodeGen/Thumb/trap.ll
+++ b/test/CodeGen/Thumb/trap.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
; rdar://7961298
define void @t() nounwind {
diff --git a/test/CodeGen/Thumb/tst_teq.ll b/test/CodeGen/Thumb/tst_teq.ll
index 21ada3e..2b6d9a3 100644
--- a/test/CodeGen/Thumb/tst_teq.ll
+++ b/test/CodeGen/Thumb/tst_teq.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb | grep tst
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
define i32 @f(i32 %a) {
entry:
@@ -15,3 +15,6 @@ entry:
%retval = select i1 %0, i32 20, i32 10 ; <i32> [#uses=1]
ret i32 %retval
}
+
+; CHECK: tst
+
diff --git a/test/CodeGen/Thumb/vargs.ll b/test/CodeGen/Thumb/vargs.ll
index 50a1a07..4078b01 100644
--- a/test/CodeGen/Thumb/vargs.ll
+++ b/test/CodeGen/Thumb/vargs.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=thumb
-; RUN: llc < %s -mtriple=thumb-linux | grep pop | count 2
-; RUN: llc < %s -mtriple=thumb-darwin | grep pop | count 2
+; RUN: llc -mtriple=thumb-eabi %s -o /dev/null
+; RUN: llc -mtriple=thumb-linux %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-darwin %s -o - | FileCheck %s
@str = internal constant [4 x i8] c"%d\0A\00" ; <[4 x i8]*> [#uses=1]
@@ -34,3 +34,8 @@ declare void @llvm.va_start(i8*)
declare i32 @printf(i8*, ...)
declare void @llvm.va_end(i8*)
+
+; CHECK: pop
+; CHECK: pop
+; CHECK-NOT: pop
+
diff --git a/test/CodeGen/Thumb2/bfi.ll b/test/CodeGen/Thumb2/bfi.ll
index 3612e27..4f056d5 100644
--- a/test/CodeGen/Thumb2/bfi.ll
+++ b/test/CodeGen/Thumb2/bfi.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=thumb -mattr=+v6t2 < %s | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mattr=+v6t2 %s -o - | FileCheck %s
%struct.F = type { [3 x i8], i8 }
diff --git a/test/CodeGen/Thumb2/bfx.ll b/test/CodeGen/Thumb2/bfx.ll
index e380b8f..9bd8d70 100644
--- a/test/CodeGen/Thumb2/bfx.ll
+++ b/test/CodeGen/Thumb2/bfx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @sbfx1(i32 %a) {
; CHECK: sbfx1
diff --git a/test/CodeGen/Thumb2/carry.ll b/test/CodeGen/Thumb2/carry.ll
index 48fba4e..26622e2 100644
--- a/test/CodeGen/Thumb2/carry.ll
+++ b/test/CodeGen/Thumb2/carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
entry:
diff --git a/test/CodeGen/Thumb2/div.ll b/test/CodeGen/Thumb2/div.ll
index e783c88..b273a89 100644
--- a/test/CodeGen/Thumb2/div.ll
+++ b/test/CodeGen/Thumb2/div.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin -mcpu=arm1156t2-s -mattr=+thumb2 \
+; RUN: llc -mtriple=thumb-apple-darwin -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
; RUN: | FileCheck %s -check-prefix=CHECK-THUMB
-; RUN: llc < %s -march=thumb -mcpu=cortex-m3 -mattr=+thumb2 \
+; RUN: llc -mtriple=thumb-apple-darwin -mcpu=cortex-m3 -mattr=+thumb2 %s -o - \
; RUN: | FileCheck %s -check-prefix=CHECK-THUMBV7M
-; RUN: llc < %s -march=thumb -mcpu=swift \
+; RUN: llc -mtriple=thumb-apple-darwin -mcpu=swift %s -o - \
; RUN: | FileCheck %s -check-prefix=CHECK-HWDIV
-; RUN: llc < %s -march=thumb -mcpu=cortex-r5 \
+; RUN: llc -mtriple=thumb-apple-darwin -mcpu=cortex-r5 %s -o - \
; RUN: | FileCheck %s -check-prefix=CHECK-HWDIV
define i32 @f1(i32 %a, i32 %b) {
diff --git a/test/CodeGen/Thumb2/ifcvt-neon.ll b/test/CodeGen/Thumb2/ifcvt-neon.ll
index 6832053..501b0b6 100644
--- a/test/CodeGen/Thumb2/ifcvt-neon.ll
+++ b/test/CodeGen/Thumb2/ifcvt-neon.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
; rdar://7368193
@a = common global float 0.000000e+00 ; <float*> [#uses=2]
diff --git a/test/CodeGen/Thumb2/longMACt.ll b/test/CodeGen/Thumb2/longMACt.ll
index abe65f2..7322d0f 100644
--- a/test/CodeGen/Thumb2/longMACt.ll
+++ b/test/CodeGen/Thumb2/longMACt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; Check generated signed and unsigned multiply accumulate long.
define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
diff --git a/test/CodeGen/Thumb2/mul_const.ll b/test/CodeGen/Thumb2/mul_const.ll
index 41de477..7064798 100644
--- a/test/CodeGen/Thumb2/mul_const.ll
+++ b/test/CodeGen/Thumb2/mul_const.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; rdar://7069502
define i32 @t1(i32 %v) nounwind readnone {
diff --git a/test/CodeGen/Thumb2/segmented-stacks.ll b/test/CodeGen/Thumb2/segmented-stacks.ll
index 602fc84..38bf915 100644
--- a/test/CodeGen/Thumb2/segmented-stacks.ll
+++ b/test/CodeGen/Thumb2/segmented-stacks.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=thumb-linux-androideabi -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
-; RUN: llc < %s -mtriple=thumb-linux-androideabi -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -segmented-stacks -filetype=obj
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -filetype=obj
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define void @test_basic() {
+define void @test_basic() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -30,3 +30,5 @@ define void @test_basic() {
; Thumb-android: pop {r4, r5}
}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/Thumb2/thumb2-adc.ll b/test/CodeGen/Thumb2/thumb2-adc.ll
index 58e4c59..a97654c 100644
--- a/test/CodeGen/Thumb2/thumb2-adc.ll
+++ b/test/CodeGen/Thumb2/thumb2-adc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 734439407618 = 0x000000ab00000002
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-add.ll b/test/CodeGen/Thumb2/thumb2-add.ll
index 5e81fcf..8ff931a 100644
--- a/test/CodeGen/Thumb2/thumb2-add.ll
+++ b/test/CodeGen/Thumb2/thumb2-add.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @t2ADDrc_255(i32 %lhs) {
; CHECK-LABEL: t2ADDrc_255:
diff --git a/test/CodeGen/Thumb2/thumb2-add2.ll b/test/CodeGen/Thumb2/thumb2-add2.ll
index ff0e087..9d64fd2 100644
--- a/test/CodeGen/Thumb2/thumb2-add2.ll
+++ b/test/CodeGen/Thumb2/thumb2-add2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 171 = 0x000000ab
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-add3.ll b/test/CodeGen/Thumb2/thumb2-add3.ll
index bb7788f..03a8170 100644
--- a/test/CodeGen/Thumb2/thumb2-add3.ll
+++ b/test/CodeGen/Thumb2/thumb2-add3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
%tmp = add i32 %a, 4095
diff --git a/test/CodeGen/Thumb2/thumb2-add4.ll b/test/CodeGen/Thumb2/thumb2-add4.ll
index ed68d62..ad9642d 100644
--- a/test/CodeGen/Thumb2/thumb2-add4.ll
+++ b/test/CodeGen/Thumb2/thumb2-add4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 171 = 0x000000ab
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-add5.ll b/test/CodeGen/Thumb2/thumb2-add5.ll
index 7ef756f..f60e0be 100644
--- a/test/CodeGen/Thumb2/thumb2-add5.ll
+++ b/test/CodeGen/Thumb2/thumb2-add5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-add6.ll b/test/CodeGen/Thumb2/thumb2-add6.ll
index c4a13be..af09293 100644
--- a/test/CodeGen/Thumb2/thumb2-add6.ll
+++ b/test/CodeGen/Thumb2/thumb2-add6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-and.ll b/test/CodeGen/Thumb2/thumb2-and.ll
index 3ffcfd7..1984b3f 100644
--- a/test/CodeGen/Thumb2/thumb2-and.ll
+++ b/test/CodeGen/Thumb2/thumb2-and.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-and2.ll b/test/CodeGen/Thumb2/thumb2-and2.ll
index 3bfe9b2..70de9c9 100644
--- a/test/CodeGen/Thumb2/thumb2-and2.ll
+++ b/test/CodeGen/Thumb2/thumb2-and2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 171 = 0x000000ab
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-asr.ll b/test/CodeGen/Thumb2/thumb2-asr.ll
index fbe3971..a4cccd5 100644
--- a/test/CodeGen/Thumb2/thumb2-asr.ll
+++ b/test/CodeGen/Thumb2/thumb2-asr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-asr2.ll b/test/CodeGen/Thumb2/thumb2-asr2.ll
index 321b3f5..da050fb 100644
--- a/test/CodeGen/Thumb2/thumb2-asr2.ll
+++ b/test/CodeGen/Thumb2/thumb2-asr2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-bcc.ll b/test/CodeGen/Thumb2/thumb2-bcc.ll
index 61171ac..e7b3822 100644
--- a/test/CodeGen/Thumb2/thumb2-bcc.ll
+++ b/test/CodeGen/Thumb2/thumb2-bcc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; If-conversion defeats the purpose of this test, which is to check CBZ
; generation, so use memory barrier instruction to make sure it doesn't
; happen and we get actual branches.
diff --git a/test/CodeGen/Thumb2/thumb2-bfc.ll b/test/CodeGen/Thumb2/thumb2-bfc.ll
index 844fb4a..dbf697cd 100644
--- a/test/CodeGen/Thumb2/thumb2-bfc.ll
+++ b/test/CodeGen/Thumb2/thumb2-bfc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 4278190095 = 0xff00000f
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-bic.ll b/test/CodeGen/Thumb2/thumb2-bic.ll
index fc57ec8..68d92b8 100644
--- a/test/CodeGen/Thumb2/thumb2-bic.ll
+++ b/test/CodeGen/Thumb2/thumb2-bic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-clz.ll b/test/CodeGen/Thumb2/thumb2-clz.ll
index a5cd074..52b540b 100644
--- a/test/CodeGen/Thumb2/thumb2-clz.ll
+++ b/test/CodeGen/Thumb2/thumb2-clz.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+v7 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+v7 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-cmn.ll b/test/CodeGen/Thumb2/thumb2-cmn.ll
index da7d4b1..efa1505 100644
--- a/test/CodeGen/Thumb2/thumb2-cmn.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; These tests could be improved by 'movs r0, #0' being rematerialized below the
; test as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-cmn2.ll b/test/CodeGen/Thumb2/thumb2-cmn2.ll
index a09a149..42473c2 100644
--- a/test/CodeGen/Thumb2/thumb2-cmn2.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmn2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; -0x000000bb = 4294967109
define i1 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-cmp.ll b/test/CodeGen/Thumb2/thumb2-cmp.ll
index 06c611d..8f08617 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; test as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-cmp2.ll b/test/CodeGen/Thumb2/thumb2-cmp2.ll
index 8ca3caf..4d84003 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp2.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; test as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-eor.ll b/test/CodeGen/Thumb2/thumb2-eor.ll
index 6dfc5cd..2028299 100644
--- a/test/CodeGen/Thumb2/thumb2-eor.ll
+++ b/test/CodeGen/Thumb2/thumb2-eor.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-eor2.ll b/test/CodeGen/Thumb2/thumb2-eor2.ll
index cf27448..f26aafe 100644
--- a/test/CodeGen/Thumb2/thumb2-eor2.ll
+++ b/test/CodeGen/Thumb2/thumb2-eor2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 0x000000bb = 187
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-jtb.ll b/test/CodeGen/Thumb2/thumb2-jtb.ll
index 11620c2..ce7fb9f 100644
--- a/test/CodeGen/Thumb2/thumb2-jtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-jtb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -arm-adjust-jump-tables=0 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 -arm-adjust-jump-tables=0 %s -o - | FileCheck %s
; Do not use tbb / tbh if any destination is before the jumptable.
; rdar://7102917
diff --git a/test/CodeGen/Thumb2/thumb2-ldm.ll b/test/CodeGen/Thumb2/thumb2-ldm.ll
index 8716d80..adfcf2b 100644
--- a/test/CodeGen/Thumb2/thumb2-ldm.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldm.ll
@@ -5,6 +5,7 @@
define i32 @t1() {
; CHECK-LABEL: t1:
; CHECK: push {r7, lr}
+; CHECK: ldrd
; CHECK: pop {r7, pc}
%tmp = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 0) ; <i32> [#uses=1]
%tmp3 = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 1) ; <i32> [#uses=1]
@@ -27,6 +28,7 @@ define i32 @t2() {
define i32 @t3() {
; CHECK-LABEL: t3:
; CHECK: push {r7, lr}
+; CHECK: ldm
; CHECK: pop {r7, pc}
%tmp = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 1) ; <i32> [#uses=1]
%tmp3 = load i32* getelementptr ([0 x i32]* @X, i32 0, i32 2) ; <i32> [#uses=1]
diff --git a/test/CodeGen/Thumb2/thumb2-ldr.ll b/test/CodeGen/Thumb2/thumb2-ldr.ll
index 09212d3..c25ed78 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32* %v) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-ldr_ext.ll b/test/CodeGen/Thumb2/thumb2-ldr_ext.ll
index b865cf4..b50b333 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr_ext.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr_ext.ll
@@ -1,7 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep ldrb | count 1
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep ldrh | count 1
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep ldrsb | count 1
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | grep ldrsh | count 1
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @test1(i8* %v.pntr.s0.u1) {
%tmp.u = load i8* %v.pntr.s0.u1
@@ -26,3 +23,16 @@ define i32 @test4() {
%tmp1.s = sext i16 %tmp.s to i32
ret i32 %tmp1.s
}
+
+; CHECK: ldrb
+; CHECK-NOT: ldrb
+
+; CHECK: ldrh
+; CHECK-NOT: ldrh
+
+; CHECK: ldrsb
+; CHECK-NOT: ldrsb
+
+; CHECK: ldrsh
+; CHECK-NOT: ldrsh
+
diff --git a/test/CodeGen/Thumb2/thumb2-ldr_post.ll b/test/CodeGen/Thumb2/thumb2-ldr_post.ll
index 4f04647..c26e6b1 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr_post.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr_post.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @test(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b ; <i32> [#uses=2]
diff --git a/test/CodeGen/Thumb2/thumb2-ldr_pre.ll b/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
index 4907dec..cafb02a 100644
--- a/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldr_pre.ll
@@ -1,7 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | \
-; RUN: grep "ldr.*\!" | count 3
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | \
-; RUN: grep "ldrsb.*\!" | count 1
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32* @test1(i32* %X, i32* %dest) {
%Y = getelementptr i32* %X, i32 4 ; <i32*> [#uses=2]
@@ -10,6 +7,8 @@ define i32* @test1(i32* %X, i32* %dest) {
ret i32* %Y
}
+; CHECK: ldr{{.*}}!
+
define i32 @test2(i32 %a, i32 %b) {
%tmp1 = sub i32 %a, 64 ; <i32> [#uses=2]
%tmp2 = inttoptr i32 %tmp1 to i32* ; <i32*> [#uses=1]
@@ -19,6 +18,8 @@ define i32 @test2(i32 %a, i32 %b) {
ret i32 %tmp5
}
+; CHECK: ldr{{.*}}!
+
define i8* @test3(i8* %X, i32* %dest) {
%tmp1 = getelementptr i8* %X, i32 4
%tmp2 = load i8* %tmp1
@@ -26,3 +27,6 @@ define i8* @test3(i8* %X, i32* %dest) {
store i32 %tmp3, i32* %dest
ret i8* %tmp1
}
+
+; CHECK: ldrsb{{.*}}!
+
diff --git a/test/CodeGen/Thumb2/thumb2-ldrb.ll b/test/CodeGen/Thumb2/thumb2-ldrb.ll
index c79f732..0b3441e 100644
--- a/test/CodeGen/Thumb2/thumb2-ldrb.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldrb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i8 @f1(i8* %v) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-ldrh.ll b/test/CodeGen/Thumb2/thumb2-ldrh.ll
index 7ba9f22..db5dcfa 100644
--- a/test/CodeGen/Thumb2/thumb2-ldrh.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldrh.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i16 @f1(i16* %v) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-lsl.ll b/test/CodeGen/Thumb2/thumb2-lsl.ll
index 015a9dd..05441c8 100644
--- a/test/CodeGen/Thumb2/thumb2-lsl.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsl2.ll b/test/CodeGen/Thumb2/thumb2-lsl2.ll
index c64897a..5a456b0 100644
--- a/test/CodeGen/Thumb2/thumb2-lsl2.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsl2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsr.ll b/test/CodeGen/Thumb2/thumb2-lsr.ll
index 24973c7..48c2ec4 100644
--- a/test/CodeGen/Thumb2/thumb2-lsr.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsr2.ll b/test/CodeGen/Thumb2/thumb2-lsr2.ll
index 0b199bb..5d158af 100644
--- a/test/CodeGen/Thumb2/thumb2-lsr2.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsr2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsr3.ll b/test/CodeGen/Thumb2/thumb2-lsr3.ll
index c814123..c9344c8 100644
--- a/test/CodeGen/Thumb2/thumb2-lsr3.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsr3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i1 @test1(i64 %poscnt, i32 %work) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-mla.ll b/test/CodeGen/Thumb2/thumb2-mla.ll
index a99ffe7..0c97d50 100644
--- a/test/CodeGen/Thumb2/thumb2-mla.ll
+++ b/test/CodeGen/Thumb2/thumb2-mla.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 \
-; RUN: -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 \
+; RUN: -arm-use-mulops=false %s -o - | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
diff --git a/test/CodeGen/Thumb2/thumb2-mls.ll b/test/CodeGen/Thumb2/thumb2-mls.ll
index 45d6d13..9b0e7ff 100644
--- a/test/CodeGen/Thumb2/thumb2-mls.ll
+++ b/test/CodeGen/Thumb2/thumb2-mls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
diff --git a/test/CodeGen/Thumb2/thumb2-mov.ll b/test/CodeGen/Thumb2/thumb2-mov.ll
index 7c0dc01..e563362 100644
--- a/test/CodeGen/Thumb2/thumb2-mov.ll
+++ b/test/CodeGen/Thumb2/thumb2-mov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; Test #<const>
diff --git a/test/CodeGen/Thumb2/thumb2-mul.ll b/test/CodeGen/Thumb2/thumb2-mul.ll
index 5f68250..4815f4b 100644
--- a/test/CodeGen/Thumb2/thumb2-mul.ll
+++ b/test/CodeGen/Thumb2/thumb2-mul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-mulhi.ll b/test/CodeGen/Thumb2/thumb2-mulhi.ll
index e32bd26..db9b644 100644
--- a/test/CodeGen/Thumb2/thumb2-mulhi.ll
+++ b/test/CodeGen/Thumb2/thumb2-mulhi.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2dsp | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2dsp %s -o - | FileCheck %s
define i32 @smulhi(i32 %x, i32 %y) {
; CHECK: smulhi
diff --git a/test/CodeGen/Thumb2/thumb2-mvn.ll b/test/CodeGen/Thumb2/thumb2-mvn.ll
index a5592f6..adf982f 100644
--- a/test/CodeGen/Thumb2/thumb2-mvn.ll
+++ b/test/CodeGen/Thumb2/thumb2-mvn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
+; RUN: llc -mtriple=thumbv7-apple-darwin %s -o - | FileCheck %s
; 0x000000bb = 187
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-mvn2.ll b/test/CodeGen/Thumb2/thumb2-mvn2.ll
index cee6f23..323c2cc 100644
--- a/test/CodeGen/Thumb2/thumb2-mvn2.ll
+++ b/test/CodeGen/Thumb2/thumb2-mvn2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-neg.ll b/test/CodeGen/Thumb2/thumb2-neg.ll
index 491e4de..bec6097 100644
--- a/test/CodeGen/Thumb2/thumb2-neg.ll
+++ b/test/CodeGen/Thumb2/thumb2-neg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-orn.ll b/test/CodeGen/Thumb2/thumb2-orn.ll
index 08676b1..e1f0bba 100644
--- a/test/CodeGen/Thumb2/thumb2-orn.ll
+++ b/test/CodeGen/Thumb2/thumb2-orn.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
-
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
%tmp = xor i32 %b, 4294967295
diff --git a/test/CodeGen/Thumb2/thumb2-orn2.ll b/test/CodeGen/Thumb2/thumb2-orn2.ll
index a8f4a84..c8347df 100644
--- a/test/CodeGen/Thumb2/thumb2-orn2.ll
+++ b/test/CodeGen/Thumb2/thumb2-orn2.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
-
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 0x000000bb = 187
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-orr.ll b/test/CodeGen/Thumb2/thumb2-orr.ll
index 776d7fe..f962866 100644
--- a/test/CodeGen/Thumb2/thumb2-orr.ll
+++ b/test/CodeGen/Thumb2/thumb2-orr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-orr2.ll b/test/CodeGen/Thumb2/thumb2-orr2.ll
index 37885e2..045cc1d 100644
--- a/test/CodeGen/Thumb2/thumb2-orr2.ll
+++ b/test/CodeGen/Thumb2/thumb2-orr2.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
-
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 0x000000bb = 187
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-pack.ll b/test/CodeGen/Thumb2/thumb2-pack.ll
index 9a0d889..4825628 100644
--- a/test/CodeGen/Thumb2/thumb2-pack.ll
+++ b/test/CodeGen/Thumb2/thumb2-pack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk %s -o - | FileCheck %s
; CHECK: test1
; CHECK: pkhbt r0, r0, r1, lsl #16
diff --git a/test/CodeGen/Thumb2/thumb2-rev.ll b/test/CodeGen/Thumb2/thumb2-rev.ll
index d710113..873a2d4 100644
--- a/test/CodeGen/Thumb2/thumb2-rev.ll
+++ b/test/CodeGen/Thumb2/thumb2-rev.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+v7,+t2xtpk | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+v7,+t2xtpk %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-ror.ll b/test/CodeGen/Thumb2/thumb2-ror.ll
index 3a21560..71b0015 100644
--- a/test/CodeGen/Thumb2/thumb2-ror.ll
+++ b/test/CodeGen/Thumb2/thumb2-ror.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
-; RUN: llc < %s -march=thumb | FileCheck %s -check-prefix=THUMB1
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s -check-prefix=THUMB1
; CHECK-LABEL: f1:
; CHECK: ror.w r0, r0, #22
diff --git a/test/CodeGen/Thumb2/thumb2-rsb.ll b/test/CodeGen/Thumb2/thumb2-rsb.ll
index 94a1fb0..1c5acad 100644
--- a/test/CodeGen/Thumb2/thumb2-rsb.ll
+++ b/test/CodeGen/Thumb2/thumb2-rsb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
%tmp = shl i32 %b, 5
diff --git a/test/CodeGen/Thumb2/thumb2-rsb2.ll b/test/CodeGen/Thumb2/thumb2-rsb2.ll
index 248ab16..838e55e 100644
--- a/test/CodeGen/Thumb2/thumb2-rsb2.ll
+++ b/test/CodeGen/Thumb2/thumb2-rsb2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 171 = 0x000000ab
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-sbc.ll b/test/CodeGen/Thumb2/thumb2-sbc.ll
index 7c69451..b04dae6 100644
--- a/test/CodeGen/Thumb2/thumb2-sbc.ll
+++ b/test/CodeGen/Thumb2/thumb2-sbc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 < %s | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK: f1
diff --git a/test/CodeGen/Thumb2/thumb2-select.ll b/test/CodeGen/Thumb2/thumb2-select.ll
index 949b611..105c267 100644
--- a/test/CodeGen/Thumb2/thumb2-select.ll
+++ b/test/CodeGen/Thumb2/thumb2-select.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 -show-mc-encoding | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 -show-mc-encoding %s -o - \
+; RUN: | FileCheck %s
define i32 @f1(i32 %a.s) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-select_xform.ll b/test/CodeGen/Thumb2/thumb2-select_xform.ll
index f8ceba2..20f0e5e 100644
--- a/test/CodeGen/Thumb2/thumb2-select_xform.ll
+++ b/test/CodeGen/Thumb2/thumb2-select_xform.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK: t1
diff --git a/test/CodeGen/Thumb2/thumb2-shifter.ll b/test/CodeGen/Thumb2/thumb2-shifter.ll
index 05dd90c..538fc22 100644
--- a/test/CodeGen/Thumb2/thumb2-shifter.ll
+++ b/test/CodeGen/Thumb2/thumb2-shifter.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb -mcpu=cortex-a8 | FileCheck %s --check-prefix=A8
-; RUN: llc < %s -march=thumb -mcpu=swift | FileCheck %s --check-prefix=SWIFT
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s --check-prefix=A8
+; RUN: llc -mtriple=thumb-eabi -mcpu=swift %s -o - | FileCheck %s --check-prefix=SWIFT
; rdar://12892707
diff --git a/test/CodeGen/Thumb2/thumb2-smla.ll b/test/CodeGen/Thumb2/thumb2-smla.ll
index f96263e..8573d39 100644
--- a/test/CodeGen/Thumb2/thumb2-smla.ll
+++ b/test/CodeGen/Thumb2/thumb2-smla.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp -arm-use-mulops=false %s -o - | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f3(i32 %a, i16 %x, i32 %y) {
; CHECK: f3
diff --git a/test/CodeGen/Thumb2/thumb2-smul.ll b/test/CodeGen/Thumb2/thumb2-smul.ll
index 742e766..67783d2 100644
--- a/test/CodeGen/Thumb2/thumb2-smul.ll
+++ b/test/CodeGen/Thumb2/thumb2-smul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+t2dsp %s -o - | FileCheck %s
@x = weak global i16 0 ; <i16*> [#uses=1]
@y = weak global i16 0 ; <i16*> [#uses=0]
diff --git a/test/CodeGen/Thumb2/thumb2-str.ll b/test/CodeGen/Thumb2/thumb2-str.ll
index f800974..4008145 100644
--- a/test/CodeGen/Thumb2/thumb2-str.ll
+++ b/test/CodeGen/Thumb2/thumb2-str.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32* %v) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-str_post.ll b/test/CodeGen/Thumb2/thumb2-str_post.ll
index 716c2d2..aed849e 100644
--- a/test/CodeGen/Thumb2/thumb2-str_post.ll
+++ b/test/CodeGen/Thumb2/thumb2-str_post.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i16 @test1(i32* %X, i16* %A) {
; CHECK-LABEL: test1:
diff --git a/test/CodeGen/Thumb2/thumb2-str_pre.ll b/test/CodeGen/Thumb2/thumb2-str_pre.ll
index 83b3779..e957400 100644
--- a/test/CodeGen/Thumb2/thumb2-str_pre.ll
+++ b/test/CodeGen/Thumb2/thumb2-str_pre.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define void @test1(i32* %X, i32* %A, i32** %dest) {
; CHECK: test1
diff --git a/test/CodeGen/Thumb2/thumb2-strb.ll b/test/CodeGen/Thumb2/thumb2-strb.ll
index 39e376d..a2558ec 100644
--- a/test/CodeGen/Thumb2/thumb2-strb.ll
+++ b/test/CodeGen/Thumb2/thumb2-strb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i8 @f1(i8 %a, i8* %v) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-strh.ll b/test/CodeGen/Thumb2/thumb2-strh.ll
index 9444383..cbe73d5 100644
--- a/test/CodeGen/Thumb2/thumb2-strh.ll
+++ b/test/CodeGen/Thumb2/thumb2-strh.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i16 @f1(i16 %a, i16* %v) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-sub.ll b/test/CodeGen/Thumb2/thumb2-sub.ll
index ad5eda1..1c69aeb 100644
--- a/test/CodeGen/Thumb2/thumb2-sub.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 171 = 0x000000ab
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-sub2.ll b/test/CodeGen/Thumb2/thumb2-sub2.ll
index f114892..8afc4cb 100644
--- a/test/CodeGen/Thumb2/thumb2-sub2.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
%tmp = sub i32 %a, 4095
diff --git a/test/CodeGen/Thumb2/thumb2-sub3.ll b/test/CodeGen/Thumb2/thumb2-sub3.ll
index ae12b28..a3702f4 100644
--- a/test/CodeGen/Thumb2/thumb2-sub3.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub3.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 < %s | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; 171 = 0x000000ab
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-sub4.ll b/test/CodeGen/Thumb2/thumb2-sub4.ll
index 873080a..0ff7567 100644
--- a/test/CodeGen/Thumb2/thumb2-sub4.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-sub5.ll b/test/CodeGen/Thumb2/thumb2-sub5.ll
index 02c83f6..e12d3e1 100644
--- a/test/CodeGen/Thumb2/thumb2-sub5.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+32bit \
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+32bit %s -o - \
; RUN: | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
diff --git a/test/CodeGen/Thumb2/thumb2-sxt-uxt.ll b/test/CodeGen/Thumb2/thumb2-sxt-uxt.ll
index 792ebef..47b94c5 100644
--- a/test/CodeGen/Thumb2/thumb2-sxt-uxt.ll
+++ b/test/CodeGen/Thumb2/thumb2-sxt-uxt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=cortex-m3 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s
define i32 @test1(i16 zeroext %z) nounwind {
; CHECK-LABEL: test1:
diff --git a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
index 75bbd83..cef3490 100644
--- a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk %s -o - \
+; RUN: | FileCheck %s
define i32 @test0(i8 %A) {
; CHECK: test0
diff --git a/test/CodeGen/Thumb2/thumb2-teq.ll b/test/CodeGen/Thumb2/thumb2-teq.ll
index 6b34e70..258b7e4 100644
--- a/test/CodeGen/Thumb2/thumb2-teq.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; test as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-teq2.ll b/test/CodeGen/Thumb2/thumb2-teq2.ll
index ea43e560..3b4970b 100644
--- a/test/CodeGen/Thumb2/thumb2-teq2.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; tst as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-tst.ll b/test/CodeGen/Thumb2/thumb2-tst.ll
index c17510d..8cf6f14 100644
--- a/test/CodeGen/Thumb2/thumb2-tst.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; tst as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-tst2.ll b/test/CodeGen/Thumb2/thumb2-tst2.ll
index 764e3d4..178a2a5 100644
--- a/test/CodeGen/Thumb2/thumb2-tst2.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; These tests would be improved by 'movs r0, #0' being rematerialized below the
; tst as 'mov.w r0, #0'.
diff --git a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
index 61e849e..bcd4a0f 100644
--- a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb -mcpu=cortex-a8 | FileCheck %s --check-prefix=A8
-; RUN: llc < %s -march=thumb -mcpu=cortex-m3 | FileCheck %s --check-prefix=M3
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s --check-prefix=A8
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s --check-prefix=M3
; rdar://11318438
define zeroext i8 @test1(i32 %A.u) {
diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll
index 2074f98..b8b1bc8 100644
--- a/test/CodeGen/Thumb2/thumb2-uxtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=thumb -mcpu=cortex-a8 | FileCheck %s -check-prefix=ARMv7A
-; RUN: llc < %s -march=thumb -mcpu=cortex-m3 | FileCheck %s -check-prefix=ARMv7M
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=ARMv7A
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s -check-prefix=ARMv7M
define i32 @test1(i32 %x) {
; ARMv7A: test1
diff --git a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
index e1f8901..4d7c3a1 100644
--- a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
+++ b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
@@ -1,8 +1,14 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-darwin | \
-; RUN: grep push | count 3
+; RUN: llc < %s -march=x86 -mtriple=i686-darwin | FileCheck %s
+; RUN: llc < %s -march=x86 -mtriple=i686-darwin -addr-sink-using-gep=1 | FileCheck %s
define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) nounwind {
entry:
+; CHECK-LABEL: @foo
+; CHECK: push
+; CHECK: push
+; CHECK: push
+; CHECK-NOT: push
+
icmp sgt i32 %size, 0 ; <i1>:0 [#uses=1]
br i1 %0, label %bb.preheader, label %return
diff --git a/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll b/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
index e673d31..e64375a 100644
--- a/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
+++ b/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=pic | grep TLSGD | count 2
+; RUN: llc < %s -relocation-model=pic | FileCheck %s
; PR2137
; ModuleID = '1.c'
@@ -11,6 +11,8 @@ target triple = "i386-pc-linux-gnu"
@__libc_resp = hidden alias %struct.__res_state** @__resp ; <%struct.__res_state**> [#uses=2]
define i32 @foo() {
+; CHECK-LABEL: foo:
+; CHECK: leal __libc_resp@TLSLD
entry:
%retval = alloca i32 ; <i32*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -24,6 +26,8 @@ return: ; preds = %entry
}
define i32 @bar() {
+; CHECK-LABEL: bar:
+; CHECK: leal __libc_resp@TLSLD
entry:
%retval = alloca i32 ; <i32*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index 91fec3b..09e34ef 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -76,7 +76,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!49}
-!46 = metadata !{metadata !0, metadata !9, metadata !16, metadata !17, metadata !20}
+!46 = metadata !{metadata !16, metadata !17, metadata !20}
!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index 0ef3aa5..f6d6852 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index d0a262d..8487c60 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -22,7 +22,7 @@ define i32 @foo_f() {
@bar_i = alias internal i32* @bar
; CHECK-DAG: .globl A
-@A = alias bitcast (i32* @bar to i64*)
+@A = alias i64, i32* @bar
; CHECK-DAG: .globl bar_h
; CHECK-DAG: .hidden bar_h
diff --git a/test/CodeGen/X86/atom-bypass-slow-division-64.ll b/test/CodeGen/X86/atom-bypass-slow-division-64.ll
index d1b52a4..5980b79 100644
--- a/test/CodeGen/X86/atom-bypass-slow-division-64.ll
+++ b/test/CodeGen/X86/atom-bypass-slow-division-64.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mcpu=atom -march=x86-64 | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
; Additional tests for 64-bit divide bypass
diff --git a/test/CodeGen/X86/avoid_complex_am.ll b/test/CodeGen/X86/avoid_complex_am.ll
new file mode 100644
index 0000000..7f09519
--- /dev/null
+++ b/test/CodeGen/X86/avoid_complex_am.ll
@@ -0,0 +1,40 @@
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
+; Complex addressing modes are costly.
+; Make loop-reduce prefer unscaled accesses.
+; On X86, reg1 + 1*reg2 has the same cost as reg1 + 8*reg2.
+; Therefore, LSR currently prefers to fold as much computation as possible
+; in the addressing mode.
+; <rdar://problem/16730541>
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+; CHECK: @mulDouble
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ 0, %entry ]
+; Only one induction variable should have been generated.
+; CHECK-NOT: phi
+ %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = add nsw i64 %indvars.iv, -1
+ %arrayidx = getelementptr inbounds double* %b, i64 %tmp
+ %tmp1 = load double* %arrayidx, align 8
+; The induction variable should carry the scaling factor: 1.
+; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds double* %c, i64 %indvars.iv.next
+ %tmp2 = load double* %arrayidx2, align 8
+ %mul = fmul double %tmp1, %tmp2
+ %arrayidx4 = getelementptr inbounds double* %a, i64 %indvars.iv
+ store double %mul, double* %arrayidx4, align 8
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; Comparison should be 19 * 1 = 19.
+; CHECK: icmp eq i32 {{%[^,]+}}, 19
+ %exitcond = icmp eq i32 %lftr.wideiv, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index 5fcd5ff..e21c7a0 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -3,7 +3,16 @@
; AVX128 tests:
;CHECK-LABEL: vsel_float:
-;CHECK: vblendvps
+; select mask is <i1 true, i1 false, i1 true, i1 false>.
+; Big endian representation is 0101 = 5.
+; '1' means takes the first argument, '0' means takes the second argument.
+; This is the opposite of the intel syntax, thus we expect
+; the inverted mask: 1010 = 10.
+; According to the ABI:
+; v1 is in xmm0 => first argument is xmm0.
+; v2 is in xmm1 => second argument is xmm1.
+; result is in xmm0 => destination argument.
+;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -12,7 +21,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK-LABEL: vsel_i32:
-;CHECK: vblendvps
+;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -52,7 +61,13 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
;CHECK-LABEL: vsel_float8:
;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
+; which translates into the boolean mask (big endian representation):
+; 00010001 = 17.
+; '1' means takes the first argument, '0' means takes the second argument.
+; This is the opposite of the intel syntax, thus we expect
+; the inverted mask: 11101110 = 238.
+;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0
;CHECK: ret
define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -61,7 +76,7 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
;CHECK-LABEL: vsel_i328:
;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0
;CHECK-NEXT: ret
define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
@@ -69,7 +84,15 @@ define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
}
;CHECK-LABEL: vsel_double8:
-;CHECK: vblendvpd
+; select mask is 2x: 0001 => intel mask: ~0001 = 14
+; ABI:
+; v1 is in ymm0 and ymm1.
+; v2 is in ymm2 and ymm3.
+; result is in ymm0 and ymm1.
+; Compute the low part: res.low = blend v1.low, v2.low, blendmask
+;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0
+; Compute the high part.
+;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1
;CHECK: ret
define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
@@ -77,7 +100,8 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
}
;CHECK-LABEL: vsel_i648:
-;CHECK: vblendvpd
+;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0
+;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1
;CHECK: ret
define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
@@ -86,7 +110,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
;CHECK-LABEL: vsel_double4:
;CHECK-NOT: vinsertf128
-;CHECK: vblendvpd
+;CHECK: vshufpd $10
;CHECK-NEXT: ret
define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -112,4 +136,25 @@ define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %min
}
+; If we can figure out a blend has a constant mask, we should emit the
+; blend instruction with an immediate mask
+define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
+; CHECK-LABEL: constant_blendvpd_avx:
+; CHECK-NOT: mov
+; CHECK: vblendpd
+; CHECK: ret
+ %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
+ ret <4 x double> %1
+}
+
+define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
+; CHECK-LABEL: constant_blendvps_avx:
+; CHECK-NOT: mov
+; CHECK: vblendps
+; CHECK: ret
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
+ ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 02aa617..f407ba4 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -306,3 +306,29 @@ define void @test20() {
store <3 x double> %a1, <3 x double>* undef, align 1
ret void
}
+
+define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
+; CHECK-LABEL: test_insert_64_zext
+; CHECK-NOT: xor
+; CHECK: vmovq
+ %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %1
+}
+
+;; Ensure we don't use insertps from non v4x32 vectors.
+;; On SSE4.1 it works because bigger vectors use more than 1 register.
+;; On AVX they get passed in a single register.
+;; FIXME: We could probably optimize this case, if we're only using the
+;; first 4 indices.
+define <4 x i32> @insert_from_diff_size(<8 x i32> %x) {
+; CHECK-LABEL: insert_from_diff_size:
+; CHECK-NOT: insertps
+; CHECK: ret
+ %vecext = extractelement <8 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %a.0 = extractelement <8 x i32> %x, i32 0
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3
+ ret <4 x i32> %vecinit3
+}
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
new file mode 100644
index 0000000..6069c14
--- /dev/null
+++ b/test/CodeGen/X86/avx.ll
@@ -0,0 +1,136 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
+
+define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v4i32
+; CHECK: vblendvps
+; CHECK: ret
+ %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8i32
+; CHECK: vblendvps
+; CHECK: ret
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %ret
+}
+
+define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8f32
+; CHECK: vblendvps
+; CHECK: ret
+ %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+ ret <8 x float> %ret
+}
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+ ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+ ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+ %2 = load <4 x float>* %1, align 16
+ %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+ ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %b, align 4
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: vbroadcastss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+ %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+ %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+ %11 = fadd <4 x float> %7, %8
+ %12 = fadd <4 x float> %9, %10
+ %13 = fadd <4 x float> %11, %12
+ ret <4 x float> %13
+}
diff --git a/test/CodeGen/X86/avx1-logical-load-folding.ll b/test/CodeGen/X86/avx1-logical-load-folding.ll
new file mode 100644
index 0000000..32301b1
--- /dev/null
+++ b/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -0,0 +1,60 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @test1(float* %A, float* %C) #0 {
+ %tmp1 = bitcast float* %A to <8 x float>*
+ %tmp2 = load <8 x float>* %tmp1, align 32
+ %tmp3 = bitcast <8 x float> %tmp2 to <8 x i32>
+ %tmp4 = and <8 x i32> %tmp3, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %tmp5 = bitcast <8 x i32> %tmp4 to <8 x float>
+ %tmp6 = extractelement <8 x float> %tmp5, i32 0
+ store float %tmp6, float* %C
+ ret void
+
+ ; CHECK: vandps LCPI0_0(%rip), %ymm0, %ymm0
+}
+
+; Function Attrs: nounwind ssp uwtable
+define void @test2(float* %A, float* %C) #0 {
+ %tmp1 = bitcast float* %A to <8 x float>*
+ %tmp2 = load <8 x float>* %tmp1, align 32
+ %tmp3 = bitcast <8 x float> %tmp2 to <8 x i32>
+ %tmp4 = or <8 x i32> %tmp3, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %tmp5 = bitcast <8 x i32> %tmp4 to <8 x float>
+ %tmp6 = extractelement <8 x float> %tmp5, i32 0
+ store float %tmp6, float* %C
+ ret void
+
+ ; CHECK: vorps LCPI1_0(%rip), %ymm0, %ymm0
+}
+
+; Function Attrs: nounwind ssp uwtable
+define void @test3(float* %A, float* %C) #0 {
+ %tmp1 = bitcast float* %A to <8 x float>*
+ %tmp2 = load <8 x float>* %tmp1, align 32
+ %tmp3 = bitcast <8 x float> %tmp2 to <8 x i32>
+ %tmp4 = xor <8 x i32> %tmp3, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %tmp5 = bitcast <8 x i32> %tmp4 to <8 x float>
+ %tmp6 = extractelement <8 x float> %tmp5, i32 0
+ store float %tmp6, float* %C
+ ret void
+
+ ; CHECK: vxorps LCPI2_0(%rip), %ymm0, %ymm0
+}
+
+define void @test4(float* %A, float* %C) #0 {
+ %tmp1 = bitcast float* %A to <8 x float>*
+ %tmp2 = load <8 x float>* %tmp1, align 32
+ %tmp3 = bitcast <8 x float> %tmp2 to <8 x i32>
+ %tmp4 = xor <8 x i32> %tmp3, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %tmp5 = and <8 x i32> %tmp4, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %tmp6 = bitcast <8 x i32> %tmp5 to <8 x float>
+ %tmp7 = extractelement <8 x float> %tmp6, i32 0
+ store float %tmp7, float * %C
+ ret void
+
+ ;CHECK: vandnps LCPI3_0(%rip), %ymm0, %ymm0
+}
diff --git a/test/CodeGen/X86/avx2-blend.ll b/test/CodeGen/X86/avx2-blend.ll
new file mode 100644
index 0000000..b02442b
--- /dev/null
+++ b/test/CodeGen/X86/avx2-blend.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+
+define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
+; CHECK-LABEL: constant_pblendvb_avx2:
+; CHECK: vmovdqa
+; CHECK: vpblendvb
+ %1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
+ ret <32 x i8> %1
+}
+
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index 4ae2905..e355301 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -52,6 +52,16 @@ entry:
; CHECK: vpaddd %ymm0, %ymm0, %ymm0
; CHECK: ret
+define <8 x i32> @test_vpslld_var(i32 %shift) {
+ %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
+ %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
+ ret <8 x i32> %tmp
+}
+
+; CHECK-LABEL: test_vpslld_var:
+; CHECK: vpslld %xmm0, %ymm1, %ymm0
+; CHECK: ret
+
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
entry:
%shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 1d83485..2476ea1 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -24,6 +24,22 @@ define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
ret <16 x i32> %b
}
+; CHECK-LABEL: fptoui_256
+; CHECK: vcvttps2udq
+; CHECK: ret
+define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
+ %b = fptoui <8 x float> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: fptoui_128
+; CHECK: vcvttps2udq
+; CHECK: ret
+define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
+ %b = fptoui <4 x float> %a to <4 x i32>
+ ret <4 x i32> %b
+}
+
; CHECK-LABEL: fptoui01
; CHECK: vcvttpd2udq
; CHECK: ret
@@ -184,6 +200,22 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind {
ret <16 x float> %b
}
+; CHECK-LABEL: uitof32_256
+; CHECK: vcvtudq2ps
+; CHECK: ret
+define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
+ %b = uitofp <8 x i32> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: uitof32_128
+; CHECK: vcvtudq2ps
+; CHECK: ret
+define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
+ %b = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %b
+}
+
; CHECK-LABEL: @fptosi02
; CHECK: vcvttss2si {{.*}} encoding: [0x62
; CHECK: ret
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index e429a22..20bf7e4 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -1,14 +1,14 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-declare <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float>, i16, <16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dps.mask.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double>, i8, <8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpd.mask.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
+declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
+declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
+declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
-declare <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qps.mask.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
+declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
;CHECK-LABEL: gather_mask_dps
;CHECK: kmovw
@@ -17,9 +17,9 @@ declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x dou
;CHECK: vscatterdps
;CHECK: ret
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
- %x = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
ret void
}
@@ -30,9 +30,9 @@ define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8*
;CHECK: vscatterdpd
;CHECK: ret
define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
ret void
}
@@ -43,9 +43,9 @@ define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %b
;CHECK: vscatterqps
;CHECK: ret
define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
ret void
}
@@ -56,23 +56,23 @@ define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %ba
;CHECK: vscatterqpd
;CHECK: ret
define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
ret void
}
;;
;; Integer Gather/Scatter
;;
-declare <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32>, i16, <16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpi.mask.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64>, i8, <8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpq.mask.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
+declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
+declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
-declare <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpi.mask.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
;CHECK-LABEL: gather_mask_dd
;CHECK: kmovw
@@ -81,9 +81,9 @@ declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64
;CHECK: vpscatterdd
;CHECK: ret
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
- %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpi.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
ret void
}
@@ -94,9 +94,9 @@ define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %ba
;CHECK: vpscatterqd
;CHECK: ret
define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpi.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
ret void
}
@@ -107,9 +107,9 @@ define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base,
;CHECK: vpscatterqq
;CHECK: ret
define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
ret void
}
@@ -120,116 +120,19 @@ define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base,
;CHECK: vpscatterdq
;CHECK: ret
define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
ret void
}
-;; FP Intinsics without masks
-
-declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dps.512 (i8*, <16 x i32>, <16 x float>, i32)
-declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qps.512 (i8*, <8 x i64>, <8 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, <8 x i64>, <8 x double>, i32)
-
-;CHECK-LABEL: gather_dps
-;CHECK: kxnorw
-;CHECK: vgatherdps
-;CHECK: vscatterdps
-;CHECK: ret
-define void @gather_dps(<16 x i32> %ind, i8* %base, i8* %stbuf) {
- %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>%ind, i8* %base, i32 4)
- %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, <16 x i32>%ind2, <16 x float> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qps
-;CHECK: kxnorw
-;CHECK: vgatherqps
-;CHECK: vscatterqps
-;CHECK: ret
-define void @gather_qps(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, <8 x i64>%ind2, <8 x float> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpd
-;CHECK: kxnorw
-;CHECK: vgatherqpd
-;CHECK: vpadd
-;CHECK: vscatterqpd
-;CHECK: ret
-define void @gather_qpd(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, <8 x i64>%ind2, <8 x double> %x, i32 4)
- ret void
-}
-
-;; Integer Intinsics without masks
-
-declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, <16 x i32>, <16 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, <8 x i32>, <8 x i64>, i32)
-
-declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, <8 x i64>, <8 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, <8 x i64>, <8 x i64>, i32)
-
-;CHECK-LABEL: gather_dpi
-;CHECK: kxnorw
-;CHECK: vpgatherdd
-;CHECK: vpscatterdd
-;CHECK: ret
-define void @gather_dpi(<16 x i32> %ind, i8* %base, i8* %stbuf) {
- %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>%ind, i8* %base, i32 4)
- %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, <16 x i32>%ind2, <16 x i32> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpq
-;CHECK: vpxord %zmm
-;CHECK: kxnorw
-;CHECK: vpgatherqq
-;CHECK: vpadd
-;CHECK: vpscatterqq
-;CHECK: ret
-define void @gather_qpq(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i64> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpi
-;CHECK: vpxor %ymm
-;CHECK: kxnorw
-;CHECK: vpgatherqd
-;CHECK: vpadd
-;CHECK: vpscatterqd
-;CHECK: ret
-define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
- ret void
-}
;CHECK-LABEL: gather_mask_dpd_execdomain
;CHECK: vgatherdpd
;CHECK: vmovapd
;CHECK: ret
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
@@ -239,7 +142,7 @@ define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %m
;CHECK: vmovapd
;CHECK: ret
define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
@@ -249,7 +152,7 @@ define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %m
;CHECK: vmovaps
;CHECK: ret
define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
- %res = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
ret <16 x float> %res;
}
@@ -258,7 +161,7 @@ define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %s
;CHECK: vmovaps
;CHECK: ret
define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
- %res = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
ret <8 x float> %res;
}
@@ -268,7 +171,7 @@ define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src,
;CHECK: ret
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x double>* %src, align 64
- call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
ret void
}
@@ -278,7 +181,7 @@ define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8
;CHECK: ret
define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x double>* %src, align 64
- call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
ret void
}
@@ -288,7 +191,7 @@ define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8
;CHECK: ret
define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
%x = load <16 x float>* %src, align 64
- call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
ret void
}
@@ -298,6 +201,35 @@ define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i1
;CHECK: ret
define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x float>* %src, align 32
- call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_qps
+;CHECK: kxnorw
+;CHECK: vgatherqps
+;CHECK: vpadd
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
+ %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: prefetch
+;CHECK: gatherpf0
+;CHECK: gatherpf1
+;CHECK: scatterpf0
+;CHECK: scatterpf1
+;CHECK: ret
+declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
+declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
+define void @prefetch(<8 x i64> %ind, i8* %base) {
+ call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
+ call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1)
+ call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
+ call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1)
ret void
}
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 6557ac3..b360c71 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -158,3 +158,41 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
%res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
ret i64 %res
}
+
+;CHECK-LABEL: test15
+;CHECK: kshiftlw
+;CHECK: kmovw
+;CHECK: ret
+define i16 @test15(i1 *%addr) {
+ %x = load i1 * %addr, align 128
+ %x1 = insertelement <16 x i1> undef, i1 %x, i32 10
+ %x2 = bitcast <16 x i1>%x1 to i16
+ ret i16 %x2
+}
+
+;CHECK-LABEL: test16
+;CHECK: kshiftlw
+;CHECK: kshiftrw
+;CHECK: korw
+;CHECK: ret
+define i16 @test16(i1 *%addr, i16 %a) {
+ %x = load i1 * %addr, align 128
+ %a1 = bitcast i16 %a to <16 x i1>
+ %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
+ %x2 = bitcast <16 x i1>%x1 to i16
+ ret i16 %x2
+}
+
+;CHECK-LABEL: test17
+;CHECK: kshiftlw
+;CHECK: kshiftrw
+;CHECK: korw
+;CHECK: ret
+define i8 @test17(i1 *%addr, i8 %a) {
+ %x = load i1 * %addr, align 128
+ %a1 = bitcast i8 %a to <8 x i1>
+ %x1 = insertelement <8 x i1> %a1, i1 %x, i32 10
+ %x2 = bitcast <8 x i1>%x1 to i8
+ ret i8 %x2
+}
+
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 3fb38ed..e19841a 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -78,7 +78,7 @@ declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8
define <8 x double> @test7(<8 x double> %a) {
; CHECK: vrndscalepd {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x0b]
- %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> zeroinitializer, i8 -1, i32 4)
+ %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
ret <8 x double>%res
}
@@ -86,7 +86,7 @@ declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <1
define <16 x float> @test8(<16 x float> %a) {
; CHECK: vrndscaleps {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x0b]
- %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> zeroinitializer, i16 -1, i32 4)
+ %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
ret <16 x float>%res
}
@@ -536,4 +536,12 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
ret void
}
-declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )
\ No newline at end of file
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )
+
+define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%perm) {
+; CHECK: vpermt2ps {{.*}}encoding: [0x62,0xf2,0x6d,0x48,0x7f,0xc1]
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>%perm, <16 x float>%x, <16 x float>%y, i16 -1)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 13e6843..009802f 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -153,3 +153,31 @@ define void @test18(i8 * %addr, <8 x i64> %data) {
ret void
}
+; CHECK-LABEL: store_i1_1
+; CHECK: movb
+; CHECK: movb
+; CHECK: ret
+define void @store_i1_1() {
+ store i1 true, i1 addrspace(3)* undef, align 128
+ store i1 false, i1 addrspace(2)* undef, align 128
+ ret void
+}
+
+; CHECK-LABEL: store_i1_2
+; CHECK: movb
+; CHECK: ret
+define void @store_i1_2(i64 %a, i64 %b) {
+ %res = icmp eq i64 %a, %b
+ store i1 %res, i1 addrspace(3)* undef, align 128
+ ret void
+}
+
+; CHECK-LABEL: store_i1_3
+; CHECK: kmovw
+; CHECK: ret
+define void @store_i1_3(i16 %a) {
+ %a_vec = bitcast i16 %a to <16 x i1>
+ %res = extractelement <16 x i1> %a_vec, i32 4
+ store i1 %res, i1 addrspace(3)* undef, align 128
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 59d7010..23ddc3a 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -231,3 +231,22 @@ define <16 x i32> @test27(<4 x i32>%a) {
%res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i32> %res
}
+
+; CHECK-LABEL: @test28
+; CHECK: vinserti64x4 $1
+; CHECK: ret
+define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
+ %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i32> %res
+}
+
+; CHECK-LABEL: @test29
+; CHECK: vinserti64x4 $0
+; CHECK: ret
+define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
+ %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %res
+}
+
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 6b46596..34aaf2c 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -4,7 +4,7 @@
; Verify that we produce movss instead of blendvps when possible.
;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
;CHECK: movss
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
@@ -13,7 +13,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
}
;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
;CHECK: movss
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
@@ -21,14 +21,18 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
ret <4 x i8> %vsel
}
-
-; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
-; reduce the mask in this case.
;CHECK-LABEL: vsel_8xi16:
-;CHECK: andps
-;CHECK: andps
-;CHECK: orps
+; The select mask is
+; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
+; which translates into the boolean mask (big endian representation):
+; 00010001 = 17.
+; '1' means takes the first argument, '0' means takes the second argument.
+; This is the opposite of the intel syntax, thus we expect
+; the inverted mask: 11101110 = 238.
+; According to the ABI:
+; v1 is in xmm0 => first argument is xmm0.
+; v2 is in xmm1 => second argument is xmm1.
+;CHECK: pblendw $238, %xmm1, %xmm0
;CHECK: ret
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index 242075a..a707209 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -216,6 +216,23 @@ entry:
; CHECK: bzhiq
}
+define i64 @bzhi64_constant_mask(i64 %x) #0 {
+entry:
+ %and = and i64 %x, 4611686018427387903
+ ret i64 %and
+; CHECK-LABEL: bzhi64_constant_mask:
+; CHECK: movb $62, %al
+; CHECK: bzhiq %rax, %r[[ARG1:di|cx]], %rax
+}
+
+define i64 @bzhi64_small_constant_mask(i64 %x) #0 {
+entry:
+ %and = and i64 %x, 2147483647
+ ret i64 %and
+; CHECK-LABEL: bzhi64_small_constant_mask:
+; CHECK: andq $2147483647, %r[[ARG1]]
+}
+
define i32 @blsi32(i32 %x) nounwind readnone {
%tmp = sub i32 0, %x
%tmp2 = and i32 %x, %tmp
diff --git a/test/CodeGen/X86/br-fold.ll b/test/CodeGen/X86/br-fold.ll
index 5223463..fd1e73b 100644
--- a/test/CodeGen/X86/br-fold.ll
+++ b/test/CodeGen/X86/br-fold.ll
@@ -1,7 +1,19 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck -check-prefix=X64_DARWIN %s
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck -check-prefix=X64_LINUX %s
+; RUN: llc -mtriple=x86_64-pc-windows < %s | FileCheck -check-prefix=X64_WINDOWS %s
+; RUN: llc -mtriple=x86_64-pc-windows-gnu < %s | FileCheck -check-prefix=X64_WINDOWS_GNU %s
-; CHECK: orq
-; CHECK-NEXT: %bb8.i329
+; X64_DARWIN: orq
+; X64_DARWIN-NEXT: %bb8.i329
+
+; X64_LINUX: orq %rax, %rcx
+; X64_LINUX-NEXT: %bb8.i329
+
+; X64_WINDOWS: orq %rax, %rcx
+; X64_WINDOWS-NEXT: ud2
+
+; X64_WINDOWS_GNU: orq %rax, %rcx
+; X64_WINDOWS_GNU-NEXT: ud2
@_ZN11xercesc_2_513SchemaSymbols21fgURI_SCHEMAFORSCHEMAE = external constant [33 x i16], align 32 ; <[33 x i16]*> [#uses=1]
@_ZN11xercesc_2_56XMLUni16fgNotationStringE = external constant [9 x i16], align 16 ; <[9 x i16]*> [#uses=1]
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index 6b77176..3c931db 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -1,19 +1,144 @@
-; RUN: llc < %s -mcpu=x86_64 | FileCheck %s
+; RUN: llc < %s -mcpu=x86-64 | FileCheck %s -check-prefix=CHECK-NOSSSE3
+; RUN: llc < %s -mcpu=core2 | FileCheck %s -check-prefix=CHECK-SSSE3
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
-define <2 x i64> @foo(<2 x i64> %v) #0 {
+define <8 x i16> @test1(<8 x i16> %v) #0 {
+entry:
+ %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
+ ret <8 x i16> %r
+
+; CHECK-NOSSSE3-LABEL: @test1
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test1
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test1
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <4 x i32> @test2(<4 x i32> %v) #0 {
+entry:
+ %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
+ ret <4 x i32> %r
+
+; CHECK-NOSSSE3-LABEL: @test2
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test2
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test2
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <2 x i64> @test3(<2 x i64> %v) #0 {
entry:
%r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
ret <2 x i64> %r
+
+; CHECK-NOSSSE3-LABEL: @test3
+; CHECK-NOSSSE3: bswapq
+; CHECK-NOSSSE3: bswapq
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test3
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test3
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
+declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
+
+define <16 x i16> @test4(<16 x i16> %v) #0 {
+entry:
+ %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
+ ret <16 x i16> %r
+
+; CHECK-SSSE3-LABEL: @test4
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test4
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <8 x i32> @test5(<8 x i32> %v) #0 {
+entry:
+ %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
+ ret <8 x i32> %r
+
+; CHECK-SSSE3-LABEL: @test5
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test5
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <4 x i64> @test6(<4 x i64> %v) #0 {
+entry:
+ %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
+ ret <4 x i64> %r
+
+; CHECK-SSSE3-LABEL: @test6
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test6
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
}
-; CHECK-LABEL: @foo
-; CHECK: bswapq
-; CHECK: bswapq
-; CHECK: retq
+declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
+
+define <4 x i16> @test7(<4 x i16> %v) #0 {
+entry:
+ %r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v)
+ ret <4 x i16> %r
+
+; CHECK-SSSE3-LABEL: @test7
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: psrld $16
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test7
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2: vpsrld $16
+; CHECK-AVX2-NEXT: retq
+}
attributes #0 = { nounwind uwtable }
diff --git a/test/CodeGen/X86/cdecl-method-return.ll b/test/CodeGen/X86/cdecl-method-return.ll
deleted file mode 100644
index 2baa47a..0000000
--- a/test/CodeGen/X86/cdecl-method-return.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -mtriple=i686-pc-win32 -mcpu=core2 | FileCheck %s
-
-; The sret flag causes the first two parameters to be reordered on the stack.
-
-define x86_cdeclmethodcc void @foo(i32* sret %dst, i32* %src) {
- %v = load i32* %src
- store i32 %v, i32* %dst
- ret void
-}
-
-; CHECK-LABEL: _foo:
-; CHECK: movl 8(%esp), %[[dst:[^ ]*]]
-; CHECK: movl 4(%esp), %[[src:[^ ]*]]
-; CHECK: movl (%[[src]]), %[[v:[^ ]*]]
-; CHECK: movl %[[v]], (%[[dst]])
-; CHECK: retl
-
-define i32 @bar() {
- %src = alloca i32
- %dst = alloca i32
- store i32 42, i32* %src
- call x86_cdeclmethodcc void @foo(i32* sret %dst, i32* %src)
- %v = load i32* %dst
- ret i32 %v
-}
-
-; CHECK-LABEL: _bar:
-; CHECK: movl $42, [[src:[^,]*]]
-; CHECK: leal [[src]], %[[reg:[^ ]*]]
-; CHECK: movl %[[reg]], (%esp)
-; CHECK: leal [[dst:[^,]*]], %[[reg:[^ ]*]]
-; CHECK: movl %[[reg]], 4(%esp)
-; CHECK: calll _foo
-; CHECK: movl [[dst]], %eax
-; CHECK: retl
-
-; If we don't have the sret flag, parameters are not reordered.
-
-define x86_cdeclmethodcc void @baz(i32* %dst, i32* %src) {
- %v = load i32* %src
- store i32 %v, i32* %dst
- ret void
-}
-
-; CHECK-LABEL: _baz:
-; CHECK: movl 4(%esp), %[[dst:[^ ]*]]
-; CHECK: movl 8(%esp), %[[src:[^ ]*]]
-; CHECK: movl (%[[src]]), %[[v:[^ ]*]]
-; CHECK: movl %[[v]], (%[[dst]])
-; CHECK: retl
-
-define i32 @qux() {
- %src = alloca i32
- %dst = alloca i32
- store i32 42, i32* %src
- call x86_cdeclmethodcc void @baz(i32* %dst, i32* %src)
- %v = load i32* %dst
- ret i32 %v
-}
-
-; CHECK-LABEL: _qux:
-; CHECK: movl $42, [[src:[^,]*]]
-; CHECK: leal [[src]], %[[reg:[^ ]*]]
-; CHECK: movl %[[reg]], 4(%esp)
-; CHECK: leal [[dst:[^,]*]], %[[reg:[^ ]*]]
-; CHECK: movl %[[reg]], (%esp)
-; CHECK: calll _baz
-; CHECK: movl [[dst]], %eax
-; CHECK: retl
diff --git a/test/CodeGen/X86/cfi.ll b/test/CodeGen/X86/cfi.ll
new file mode 100644
index 0000000..b57ff45
--- /dev/null
+++ b/test/CodeGen/X86/cfi.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck --check-prefix=STATIC %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic | FileCheck --check-prefix=PIC %s
+
+; STATIC: .cfi_personality 3, __gxx_personality_v0
+; STATIC: .cfi_lsda 3, .Lexception0
+
+; PIC: .cfi_personality 155, DW.ref.__gxx_personality_v0
+; PIC: .cfi_lsda 27, .Lexception0
+
+
+define void @bar() {
+entry:
+ %call = invoke i32 @foo()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+ ret void
+
+lpad:
+ %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
+ catch i8* null
+ ret void
+}
+
+declare i32 @foo()
+
+declare i32 @__gxx_personality_v0(...)
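+
+; A note on the encoding bytes checked above (a sketch based on the DWARF EH
+; pointer-encoding values, not something this test spells out): 3 is
+; DW_EH_PE_udata4, an absolute 4-byte reference, while 155 (0x9b) is
+; DW_EH_PE_indirect|DW_EH_PE_pcrel|DW_EH_PE_sdata4 and 27 (0x1b) is
+; DW_EH_PE_pcrel|DW_EH_PE_sdata4, i.e. the PC-relative forms needed for PIC.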
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index 551d9bc..cdcdc96 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -26,9 +26,22 @@ cond_true: ; preds = %0
ReturnBlock: ; preds = %0
ret i32 0
; CHECK-LABEL: test2:
-; CHECK: movl (%rsi), %eax
-; CHECK: shll $3, %eax
-; CHECK: testl %eax, %eax
+; CHECK: testl $536870911, (%rsi)
+}
+
+define i8 @test2b(i8 %X, i8* %y) nounwind {
+ %tmp = load i8* %y ; <i8> [#uses=1]
+ %tmp1 = shl i8 %tmp, 3 ; <i8> [#uses=1]
+ %tmp1.upgrd.2 = icmp eq i8 %tmp1, 0 ; <i1> [#uses=1]
+ br i1 %tmp1.upgrd.2, label %ReturnBlock, label %cond_true
+
+cond_true: ; preds = %0
+ ret i8 1
+
+ReturnBlock: ; preds = %0
+ ret i8 0
+; CHECK-LABEL: test2b:
+; CHECK: testb $31, (%rsi)
}
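+
+; A sketch of where the masks above come from (illustrative, not an extra
+; check): shifting left by 3 and then comparing against zero only depends on
+; the bits that survive the shift, so the i32 case folds to testing the low
+; 32-3 = 29 bits (0x1fffffff = 536870911) and the i8 case to the low
+; 8-3 = 5 bits (0x1f = 31).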
define i64 @test3(i64 %x) nounwind {
@@ -68,8 +81,8 @@ define i32 @test5(double %A) nounwind {
bb12:; preds = %entry
ret i32 32
; CHECK-LABEL: test5:
-; CHECK: ucomisd LCPI4_0(%rip), %xmm0
-; CHECK: ucomisd LCPI4_1(%rip), %xmm0
+; CHECK: ucomisd LCPI5_0(%rip), %xmm0
+; CHECK: ucomisd LCPI5_1(%rip), %xmm0
}
declare i32 @foo(...)
@@ -163,3 +176,25 @@ define i32 @test12() uwtable ssp {
}
declare zeroext i1 @test12b()
+
+define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
+ %and = and i32 %mask, 8
+ %tobool = icmp ne i32 %and, 0
+ %cond = select i1 %tobool, i32 %intra, i32 %base
+ ret i32 %cond
+
+; CHECK-LABEL: test13:
+; CHECK: testb $8, %dil
+; CHECK: cmovnel
+}
+
+define i32 @test14(i32 %mask, i32 %base, i32 %intra) #0 {
+ %s = lshr i32 %mask, 7
+ %tobool = icmp sgt i32 %s, -1
+ %cond = select i1 %tobool, i32 %intra, i32 %base
+ ret i32 %cond
+
+; CHECK-LABEL: test14:
+; CHECK: shrl $7, %edi
+; CHECK-NEXT: cmovnsl %edx, %esi
+}
diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index e3d6b34..78e1dd2 100644
--- a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -1,4 +1,5 @@
; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
+; RUN: opt -S -codegenprepare -addr-sink-using-gep=1 %s -o - | FileCheck -check-prefix=CHECK-GEP %s
; This file tests the different cases that are involved when codegen prepare
; tries to get sign extension out of the way of the addressing mode.
; These tests require an actual target as addressing mode decisions depend
@@ -281,6 +282,25 @@ define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, i8* %base) {
; CHECK: [[ADDR2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[BASE2]] to i32*
; CHECK: load i32* [[ADDR2]]
; CHECK: ret
+; CHECK-GEP-LABEL: @checkProfitability
+; CHECK-GEP-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64
+; CHECK-GEP-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64
+; CHECK-GEP: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1
+; CHECK-GEP: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
+; CHECK-GEP: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; BB then
+; CHECK-GEP: [[BASE1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to i32*
+; CHECK-GEP: [[BCC1:%[a-zA-Z_0-9-]+]] = bitcast i32* [[BASE1]] to i8*
+; CHECK-GEP: [[FULL1:%[a-zA-Z_0-9-]+]] = getelementptr i8* [[BCC1]], i64 48
+; CHECK-GEP: [[ADDR1:%[a-zA-Z_0-9-]+]] = bitcast i8* [[FULL1]] to i32*
+; CHECK-GEP: load i32* [[ADDR1]]
+; BB else
+; CHECK-GEP: [[BASE2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to i32*
+; CHECK-GEP: [[BCC2:%[a-zA-Z_0-9-]+]] = bitcast i32* [[BASE2]] to i8*
+; CHECK-GEP: [[FULL2:%[a-zA-Z_0-9-]+]] = getelementptr i8* [[BCC2]], i64 48
+; CHECK-GEP: [[ADDR2:%[a-zA-Z_0-9-]+]] = bitcast i8* [[FULL2]] to i32*
+; CHECK-GEP: load i32* [[ADDR2]]
+; CHECK-GEP: ret
define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) {
%shl = shl nsw i32 %arg1, 1
%add1 = add nsw i32 %shl, %arg2
diff --git a/test/CodeGen/X86/codegen-prepare-crash.ll b/test/CodeGen/X86/codegen-prepare-crash.ll
new file mode 100644
index 0000000..c328817
--- /dev/null
+++ b/test/CodeGen/X86/codegen-prepare-crash.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external global [10 x i32]
+
+define void @f(i32 %u) {
+ %1 = add i32 %u, 4
+ br label %P.Proc8.exit
+
+P.Proc8.exit:
+ %valueindex35.i = getelementptr [10 x i32]* @g, i32 0, i32 %1
+ store i32 %u, i32* %valueindex35.i
+ ret void
+}
diff --git a/test/CodeGen/X86/codegen-prepare.ll b/test/CodeGen/X86/codegen-prepare.ll
index 316accf..4ff0f1c 100644
--- a/test/CodeGen/X86/codegen-prepare.ll
+++ b/test/CodeGen/X86/codegen-prepare.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -addr-sink-using-gep=1 | FileCheck %s
; Check that the CodeGenPrepare Pass
; does not wrongly rewrite the address computed by Instruction %4
diff --git a/test/CodeGen/X86/combine-avx-intrinsics.ll b/test/CodeGen/X86/combine-avx-intrinsics.ll
new file mode 100644
index 0000000..f610f7f
--- /dev/null
+++ b/test/CodeGen/X86/combine-avx-intrinsics.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s
+
+
+define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0) {
+ %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a0, i32 7)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test_x86_avx_blend_pd_256
+; CHECK-NOT: vblendpd
+; CHECK: ret
+
+
+define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0) {
+ %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a0, i32 7)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test_x86_avx_blend_ps_256
+; CHECK-NOT: vblendps
+; CHECK: ret
+
+
+define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test_x86_avx_blendv_pd_256
+; CHECK-NOT: vblendvpd
+; CHECK: ret
+
+
+define <8 x float> @test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test_x86_avx_blendv_ps_256
+; CHECK-NOT: vblendvps
+; CHECK: ret
+
+
+define <4 x double> @test2_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test2_x86_avx_blend_pd_256
+; CHECK-NOT: vblendpd
+; CHECK: ret
+
+
+define <8 x float> @test2_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test2_x86_avx_blend_ps_256
+; CHECK-NOT: vblendps
+; CHECK: ret
+
+
+define <4 x double> @test2_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> zeroinitializer)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test2_x86_avx_blendv_pd_256
+; CHECK-NOT: vblendvpd
+; CHECK: ret
+
+
+define <8 x float> @test2_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test2_x86_avx_blendv_ps_256
+; CHECK-NOT: vblendvps
+; CHECK: ret
+
+
+define <4 x double> @test3_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 -1)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test3_x86_avx_blend_pd_256
+; CHECK-NOT: vblendpd
+; CHECK: ret
+
+
+define <8 x float> @test3_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 -1)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test3_x86_avx_blend_ps_256
+; CHECK-NOT: vblendps
+; CHECK: ret
+
+
+define <4 x double> @test3_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %Mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <4 x double>
+ %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %Mask)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test3_x86_avx_blendv_pd_256
+; CHECK-NOT: vblendvpd
+; CHECK: ret
+
+
+define <8 x float> @test3_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %Mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x float>
+ %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %Mask)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test3_x86_avx_blendv_ps_256
+; CHECK-NOT: vblendvps
+; CHECK: ret
+
+
+
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32)
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
diff --git a/test/CodeGen/X86/combine-avx2-intrinsics.ll b/test/CodeGen/X86/combine-avx2-intrinsics.ll
new file mode 100644
index 0000000..8794f8b
--- /dev/null
+++ b/test/CodeGen/X86/combine-avx2-intrinsics.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s
+
+; Verify that the backend correctly combines AVX2 builtin intrinsics.
+
+
+define <8 x i32> @test_psra_1(<8 x i32> %A) {
+ %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 3)
+ %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
+ %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 2)
+ ret <8 x i32> %3
+}
+; CHECK-LABEL: test_psra_1
+; CHECK: vpsrad $8, %ymm0, %ymm0
+; CHECK-NEXT: ret
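+; (A sketch of the arithmetic behind the $8 above, not an extra check: the
+; middle psra.d takes its count from the low 64 bits of <3, 0, 7, 0>, i.e. 3,
+; and consecutive arithmetic shift counts add up, so 3 + 3 + 2 = 8.)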
+
+define <16 x i16> @test_psra_2(<16 x i16> %A) {
+ %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 3)
+ %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+ %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 2)
+ ret <16 x i16> %3
+}
+; CHECK-LABEL: test_psra_2
+; CHECK: vpsraw $8, %ymm0, %ymm0
+; CHECK-NEXT: ret
+
+define <16 x i16> @test_psra_3(<16 x i16> %A) {
+ %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
+ %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+ %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0)
+ ret <16 x i16> %3
+}
+; CHECK-LABEL: test_psra_3
+; CHECK-NOT: vpsraw
+; CHECK: ret
+
+define <8 x i32> @test_psra_4(<8 x i32> %A) {
+ %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
+ %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
+ %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0)
+ ret <8 x i32> %3
+}
+; CHECK-LABEL: test_psra_4
+; CHECK-NOT: vpsrad
+; CHECK: ret
+
+
+define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {
+ %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+; CHECK-LABEL: test_x86_avx2_pblendvb
+; CHECK-NOT: vpblendvb
+; CHECK: ret
+
+
+define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0) {
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a0, i32 7)
+ ret <16 x i16> %res
+}
+; CHECK-LABEL: test_x86_avx2_pblendw
+; CHECK-NOT: vpblendw
+; CHECK: ret
+
+
+define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0) {
+ %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a0, i32 7)
+ ret <4 x i32> %res
+}
+; CHECK-LABEL: test_x86_avx2_pblendd_128
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0) {
+ %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a0, i32 7)
+ ret <8 x i32> %res
+}
+; CHECK-LABEL: test_x86_avx2_pblendd_256
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <32 x i8> @test2_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {
+ %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> zeroinitializer)
+ ret <32 x i8> %res
+}
+; CHECK-LABEL: test2_x86_avx2_pblendvb
+; CHECK-NOT: vpblendvb
+; CHECK: ret
+
+
+define <16 x i16> @test2_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 0)
+ ret <16 x i16> %res
+}
+; CHECK-LABEL: test2_x86_avx2_pblendw
+; CHECK-NOT: vpblendw
+; CHECK: ret
+
+
+define <4 x i32> @test2_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+ %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 0)
+ ret <4 x i32> %res
+}
+; CHECK-LABEL: test2_x86_avx2_pblendd_128
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <8 x i32> @test2_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+ %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 0)
+ ret <8 x i32> %res
+}
+; CHECK-LABEL: test2_x86_avx2_pblendd_256
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <32 x i8> @test3_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {
+ %1 = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %1)
+ ret <32 x i8> %res
+}
+; CHECK-LABEL: test3_x86_avx2_pblendvb
+; CHECK-NOT: vpblendvb
+; CHECK: ret
+
+
+define <16 x i16> @test3_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 -1)
+ ret <16 x i16> %res
+}
+; CHECK-LABEL: test3_x86_avx2_pblendw
+; CHECK-NOT: vpblendw
+; CHECK: ret
+
+
+define <4 x i32> @test3_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+ %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 -1)
+ ret <4 x i32> %res
+}
+; CHECK-LABEL: test3_x86_avx2_pblendd_128
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <8 x i32> @test3_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+ %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 -1)
+ ret <8 x i32> %res
+}
+; CHECK-LABEL: test3_x86_avx2_pblendd_256
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32)
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32)
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32)
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>)
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32)
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32)
+
diff --git a/test/CodeGen/X86/combine-sse2-intrinsics.ll b/test/CodeGen/X86/combine-sse2-intrinsics.ll
new file mode 100644
index 0000000..fa500e5
--- /dev/null
+++ b/test/CodeGen/X86/combine-sse2-intrinsics.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; Verify that the backend correctly combines SSE2 builtin intrinsics.
+
+
+define <4 x i32> @test_psra_1(<4 x i32> %A) {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3)
+ %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
+ %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 2)
+ ret <4 x i32> %3
+}
+; CHECK-LABEL: test_psra_1
+; CHECK: psrad $8, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_psra_2(<8 x i16> %A) {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 3)
+ %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+ %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 2)
+ ret <8 x i16> %3
+}
+; CHECK-LABEL: test_psra_2
+; CHECK: psraw $8, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_psra_3(<4 x i32> %A) {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
+ %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
+ %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 0)
+ ret <4 x i32> %3
+}
+; CHECK-LABEL: test_psra_3
+; CHECK-NOT: psrad
+; CHECK: ret
+
+
+define <8 x i16> @test_psra_4(<8 x i16> %A) {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
+ %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+ %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0)
+ ret <8 x i16> %3
+}
+; CHECK-LABEL: test_psra_4
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+
diff --git a/test/CodeGen/X86/combine-sse41-intrinsics.ll b/test/CodeGen/X86/combine-sse41-intrinsics.ll
new file mode 100644
index 0000000..254991a
--- /dev/null
+++ b/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -0,0 +1,182 @@
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 | FileCheck %s
+
+
+define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0)
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test_x86_sse41_blend_pd
+; CHECK-NOT: blendpd
+; CHECK: ret
+
+
+define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test_x86_sse41_blend_ps
+; CHECK-NOT: blendps
+; CHECK: ret
+
+
+define <2 x double> @test_x86_sse41_blendv_pd(<2 x double> %a0, <2 x double> %a1) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> zeroinitializer)
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test_x86_sse41_blendv_pd
+; CHECK-NOT: blendvpd
+; CHECK: ret
+
+
+define <4 x float> @test_x86_sse41_blendv_ps(<4 x float> %a0, <4 x float> %a1) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test_x86_sse41_blendv_ps
+; CHECK-NOT: blendvps
+; CHECK: ret
+
+
+define <16 x i8> @test_x86_sse41_pblendv_b(<16 x i8> %a0, <16 x i8> %a1) {
+ %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> zeroinitializer)
+ ret <16 x i8> %1
+}
+; CHECK-LABEL: test_x86_sse41_pblendv_b
+; CHECK-NOT: pblendvb
+; CHECK: ret
+
+
+define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
+ %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test_x86_sse41_pblend_w
+; CHECK-NOT: pblendw
+; CHECK: ret
+
+
+define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1)
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test2_x86_sse41_blend_pd
+; CHECK-NOT: blendpd
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test2_x86_sse41_blend_ps
+; CHECK-NOT: blendps
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <2 x double> @test2_x86_sse41_blendv_pd(<2 x double> %a0, <2 x double> %a1) {
+ %Mask = bitcast <2 x i64> <i64 -1, i64 -1> to <2 x double>
+ %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %Mask )
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test2_x86_sse41_blendv_pd
+; CHECK-NOT: blendvpd
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test2_x86_sse41_blendv_ps(<4 x float> %a0, <4 x float> %a1) {
+ %Mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x float>
+ %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %Mask)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test2_x86_sse41_blendv_ps
+; CHECK-NOT: blendvps
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <16 x i8> @test2_x86_sse41_pblendv_b(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+ %Mask = bitcast <2 x i64> <i64 -1, i64 -1> to <16 x i8>
+ %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %Mask)
+ ret <16 x i8> %1
+}
+; CHECK-LABEL: test2_x86_sse41_pblendv_b
+; CHECK-NOT: pblendvb
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
+ %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test2_x86_sse41_pblend_w
+; CHECK-NOT: pblendw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7)
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test3_x86_sse41_blend_pd
+; CHECK-NOT: blendpd
+; CHECK: ret
+
+
+define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test3_x86_sse41_blend_ps
+; CHECK-NOT: blendps
+; CHECK: ret
+
+
+define <2 x double> @test3_x86_sse41_blendv_pd(<2 x double> %a0, <2 x double> %a1) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a0, <2 x double> %a1 )
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test3_x86_sse41_blendv_pd
+; CHECK-NOT: blendvpd
+; CHECK: ret
+
+
+define <4 x float> @test3_x86_sse41_blendv_ps(<4 x float> %a0, <4 x float> %a1) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test3_x86_sse41_blendv_ps
+; CHECK-NOT: blendvps
+; CHECK: ret
+
+
+define <16 x i8> @test3_x86_sse41_pblendv_b(<16 x i8> %a0, <16 x i8> %a1) {
+ %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %1
+}
+; CHECK-LABEL: test3_x86_sse41_pblendv_b
+; CHECK-NOT: pblendvb
+; CHECK: ret
+
+
+define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) {
+ %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test3_x86_sse41_pblend_w
+; CHECK-NOT: pblendw
+; CHECK: ret
+
+
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32)
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32)
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>)
+
diff --git a/test/CodeGen/X86/constant-hoisting-shift-immediate.ll b/test/CodeGen/X86/constant-hoisting-shift-immediate.ll
new file mode 100644
index 0000000..883be35
--- /dev/null
+++ b/test/CodeGen/X86/constant-hoisting-shift-immediate.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -O3 -march=x86-64 | FileCheck %s
+define i64 @foo(i1 %z, i192* %p, i192* %q)
+{
+; If const 128 is hoisted to a variable, then in basic block L_val2 we would
+; have %lshr2 = lshr i192 %data2, %const, and the definition of %const would
+; be in another basic block. As a result, very inefficient code might be
+; produced. Here we check that this doesn't occur.
+entry:
+ %data1 = load i192* %p, align 8
+ %lshr1 = lshr i192 %data1, 128
+ %val1 = trunc i192 %lshr1 to i64
+ br i1 %z, label %End, label %L_val2
+
+; CHECK: movq 16(%rdx), %rax
+; CHECK-NEXT: retq
+L_val2:
+ %data2 = load i192* %q, align 8
+ %lshr2 = lshr i192 %data2, 128
+ %val2 = trunc i192 %lshr2 to i64
+ br label %End
+
+End:
+ %p1 = phi i64 [%val1,%entry], [%val2,%L_val2]
+ ret i64 %p1
+}
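+
+; A sketch of the arithmetic the CHECK lines rely on (illustrative only): on a
+; little-endian target, lshr of an i192 by 128 followed by a trunc to i64 just
+; selects bits [128,191], i.e. the 64-bit word at byte offset 16, which is why
+; the expected code is a single 64-bit load from offset 16 with no shifts.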
diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index 98ae1d5..21225e3 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll
@@ -7,7 +7,7 @@ entry:
%div = udiv i16 %x, 33
ret i16 %div
; CHECK-LABEL: test1:
-; CHECK: imull $63551, %eax, %eax
+; CHECK: imull $63551, %eax
; CHECK-NEXT: shrl $21, %eax
; CHECK-NEXT: ret
}
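+; For reference, a sketch of the magic constant (nothing beyond what the CHECK
+; lines already require): udiv by 33 is lowered to a multiply-by-reciprocal,
+; roughly x/33 == (x * 63551) >> 21 for 16-bit x, since 63551 = ceil(2^21/33).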
@@ -18,7 +18,7 @@ entry:
ret i16 %div
; CHECK-LABEL: test2:
-; CHECK: imull $43691, %eax, %eax
+; CHECK: imull $43691, %eax
; CHECK-NEXT: shrl $17, %eax
; CHECK-NEXT: ret
}
@@ -30,7 +30,7 @@ entry:
; CHECK-LABEL: test3:
; CHECK: movzbl 8(%esp), %eax
-; CHECK-NEXT: imull $171, %eax, %eax
+; CHECK-NEXT: imull $171, %eax
; CHECK-NEXT: shrl $9, %eax
; CHECK-NEXT: ret
}
@@ -40,7 +40,7 @@ entry:
%div = sdiv i16 %x, 33 ; <i32> [#uses=1]
ret i16 %div
; CHECK-LABEL: test4:
-; CHECK: imull $1986, %eax, %
+; CHECK: imull $1986, %eax
}
define i32 @test5(i32 %A) nounwind {
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index a38c2d8..f4dec4f 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -40,18 +40,18 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl Var1
@Var1 = dllexport global i32 1, align 4
-; CHECK: .rdata,"r"
+; CHECK: .rdata,"rd"
; CHECK: .globl Var2
@Var2 = dllexport unnamed_addr constant i32 1
; CHECK: .comm Var3
@Var3 = common dllexport global i32 0, align 4
-; CHECK: .section .data,"w",discard,WeakVar1
+; CHECK: .section .data,"wd",discard,WeakVar1
; CHECK: .globl WeakVar1
@WeakVar1 = weak_odr dllexport global i32 1, align 4
-; CHECK: .section .rdata,"r",discard,WeakVar2
+; CHECK: .section .rdata,"rd",discard,WeakVar2
; CHECK: .globl WeakVar2
@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
@@ -66,39 +66,43 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl alias3
; CHECK: alias3 = notExported
-@alias3 = dllexport alias void()* @alias
+@alias3 = dllexport alias void()* @notExported
; CHECK: .weak weak_alias
; CHECK: weak_alias = f1
@weak_alias = dllexport alias weak_odr void()* @f1
+@blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
+@blob_alias = dllexport alias i32 (), [6 x i8]* @blob
; CHECK: .section .drectve
-; WIN32: /EXPORT:Var1,DATA
-; WIN32: /EXPORT:Var2,DATA
-; WIN32: /EXPORT:Var3,DATA
-; WIN32: /EXPORT:WeakVar1,DATA
-; WIN32: /EXPORT:WeakVar2,DATA
-; WIN32: /EXPORT:f1
-; WIN32: /EXPORT:f2
-; WIN32: /EXPORT:lnk1
-; WIN32: /EXPORT:lnk2
-; WIN32: /EXPORT:weak1
-; WIN32: /EXPORT:alias
-; WIN32: /EXPORT:alias2
-; WIN32: /EXPORT:alias3
-; WIN32: /EXPORT:weak_alias
-; MINGW: -export:Var1,data
-; MINGW: -export:Var2,data
-; MINGW: -export:Var3,data
-; MINGW: -export:WeakVar1,data
-; MINGW: -export:WeakVar2,data
-; MINGW: -export:f1
-; MINGW: -export:f2
-; MINGW: -export:lnk1
-; MINGW: -export:lnk2
-; MINGW: -export:weak1
-; MINGW: -export:alias
-; MINGW: -export:alias2
-; MINGW: -export:alias3
-; MINGW: -export:weak_alias
+; WIN32: " /EXPORT:Var1,DATA"
+; WIN32: " /EXPORT:Var2,DATA"
+; WIN32: " /EXPORT:Var3,DATA"
+; WIN32: " /EXPORT:WeakVar1,DATA"
+; WIN32: " /EXPORT:WeakVar2,DATA"
+; WIN32: " /EXPORT:f1"
+; WIN32: " /EXPORT:f2"
+; WIN32: " /EXPORT:lnk1"
+; WIN32: " /EXPORT:lnk2"
+; WIN32: " /EXPORT:weak1"
+; WIN32: " /EXPORT:alias"
+; WIN32: " /EXPORT:alias2"
+; WIN32: " /EXPORT:alias3"
+; WIN32: " /EXPORT:weak_alias"
+; WIN32: " /EXPORT:blob_alias"
+; MINGW: " -export:Var1,data"
+; MINGW: " -export:Var2,data"
+; MINGW: " -export:Var3,data"
+; MINGW: " -export:WeakVar1,data"
+; MINGW: " -export:WeakVar2,data"
+; MINGW: " -export:f1"
+; MINGW: " -export:f2"
+; MINGW: " -export:lnk1"
+; MINGW: " -export:lnk2"
+; MINGW: " -export:weak1"
+; MINGW: " -export:alias"
+; MINGW: " -export:alias2"
+; MINGW: " -export:alias3"
+; MINGW: " -export:weak_alias"
+; MINGW: " -export:blob_alias"
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index 1b34d23..e2c3f13 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -1,5 +1,9 @@
-; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
-; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck -check-prefix=CHECK -check-prefix=MINGW %s
+; RUN: llc -mtriple i386-pc-win32 < %s \
+; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-CL %s
+; RUN: llc -mtriple i386-pc-mingw32 < %s \
+; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-GCC %s
+; RUN: llc -mtriple i686-pc-cygwin %s -o - \
+; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-GCC %s
; CHECK: .text
@@ -55,18 +59,18 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl _Var1
@Var1 = dllexport global i32 1, align 4
-; CHECK: .rdata,"r"
+; CHECK: .rdata,"rd"
; CHECK: .globl _Var2
@Var2 = dllexport unnamed_addr constant i32 1
; CHECK: .comm _Var3
@Var3 = common dllexport global i32 0, align 4
-; CHECK: .section .data,"w",discard,_WeakVar1
+; CHECK: .section .data,"wd",discard,_WeakVar1
; CHECK: .globl _WeakVar1
@WeakVar1 = weak_odr dllexport global i32 1, align 4
-; CHECK: .section .rdata,"r",discard,_WeakVar2
+; CHECK: .section .rdata,"rd",discard,_WeakVar2
; CHECK: .globl _WeakVar2
@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
@@ -81,7 +85,7 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl _alias3
; CHECK: _alias3 = _notExported
-@alias3 = dllexport alias void()* @alias
+@alias3 = dllexport alias void()* @notExported
; CHECK: .weak _weak_alias
; CHECK: _weak_alias = _f1
@@ -89,37 +93,38 @@ define weak_odr dllexport void @weak1() {
; CHECK: .section .drectve
-; WIN32: /EXPORT:_Var1,DATA
-; WIN32: /EXPORT:_Var2,DATA
-; WIN32: /EXPORT:_Var3,DATA
-; WIN32: /EXPORT:_WeakVar1,DATA
-; WIN32: /EXPORT:_WeakVar2,DATA
-; WIN32: /EXPORT:_f1
-; WIN32: /EXPORT:_f2
-; WIN32: /EXPORT:_stdfun@0
-; WIN32: /EXPORT:@fastfun@0
-; WIN32: /EXPORT:_thisfun
-; WIN32: /EXPORT:_lnk1
-; WIN32: /EXPORT:_lnk2
-; WIN32: /EXPORT:_weak1
-; WIN32: /EXPORT:_alias
-; WIN32: /EXPORT:_alias2
-; WIN32: /EXPORT:_alias3
-; WIN32: /EXPORT:_weak_alias
-; MINGW: -export:_Var1,data
-; MINGW: -export:_Var2,data
-; MINGW: -export:_Var3,data
-; MINGW: -export:_WeakVar1,data
-; MINGW: -export:_WeakVar2,data
-; MINGW: -export:_f1
-; MINGW: -export:_f2
-; MINGW: -export:_stdfun@0
-; MINGW: -export:@fastfun@0
-; MINGW: -export:_thisfun
-; MINGW: -export:_lnk1
-; MINGW: -export:_lnk2
-; MINGW: -export:_weak1
-; MINGW: -export:_alias
-; MINGW: -export:_alias2
-; MINGW: -export:_alias3
-; MINGW: -export:_weak_alias
+; CHECK-CL: " /EXPORT:_Var1,DATA"
+; CHECK-CL: " /EXPORT:_Var2,DATA"
+; CHECK-CL: " /EXPORT:_Var3,DATA"
+; CHECK-CL: " /EXPORT:_WeakVar1,DATA"
+; CHECK-CL: " /EXPORT:_WeakVar2,DATA"
+; CHECK-CL: " /EXPORT:_f1"
+; CHECK-CL: " /EXPORT:_f2"
+; CHECK-CL: " /EXPORT:_stdfun@0"
+; CHECK-CL: " /EXPORT:@fastfun@0"
+; CHECK-CL: " /EXPORT:_thisfun"
+; CHECK-CL: " /EXPORT:_lnk1"
+; CHECK-CL: " /EXPORT:_lnk2"
+; CHECK-CL: " /EXPORT:_weak1"
+; CHECK-CL: " /EXPORT:_alias"
+; CHECK-CL: " /EXPORT:_alias2"
+; CHECK-CL: " /EXPORT:_alias3"
+; CHECK-CL: " /EXPORT:_weak_alias"
+; CHECK-GCC: " -export:Var1,data"
+; CHECK-GCC: " -export:Var2,data"
+; CHECK-GCC: " -export:Var3,data"
+; CHECK-GCC: " -export:WeakVar1,data"
+; CHECK-GCC: " -export:WeakVar2,data"
+; CHECK-GCC: " -export:f1"
+; CHECK-GCC: " -export:f2"
+; CHECK-GCC: " -export:stdfun@0"
+; CHECK-GCC: " -export:@fastfun@0"
+; CHECK-GCC: " -export:thisfun"
+; CHECK-GCC: " -export:lnk1"
+; CHECK-GCC: " -export:lnk2"
+; CHECK-GCC: " -export:weak1"
+; CHECK-GCC: " -export:alias"
+; CHECK-GCC: " -export:alias2"
+; CHECK-GCC: " -export:alias3"
+; CHECK-GCC: " -export:weak_alias"
+
diff --git a/test/CodeGen/X86/expand-opaque-const.ll b/test/CodeGen/X86/expand-opaque-const.ll
new file mode 100644
index 0000000..6e461cf
--- /dev/null
+++ b/test/CodeGen/X86/expand-opaque-const.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mcpu=generic -O1 -relocation-model=pic < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i686-apple-darwin"
+
+define i64 @test_lshr() {
+entry:
+; CHECK-NOT: movl $-1, 16(%esp)
+; CHECK-NOT: movl $-1, %eax
+ %retval = alloca i64
+ %op1 = alloca i64
+ %op2 = alloca i64
+ store i64 -6687208052682386272, i64* %op1
+ store i64 7106745059734980448, i64* %op2
+ %tmp1 = load i64* %op1
+ %tmp2 = load i64* %op2
+ %tmp = xor i64 %tmp2, 7106745059734980448
+ %tmp3 = lshr i64 %tmp1, %tmp
+ store i64 %tmp3, i64* %retval
+ %tmp4 = load i64* %retval
+ ret i64 %tmp4
+}
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 2135f94..514d929 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86 -mattr=+avx,+f16c | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx,+f16c | FileCheck %s
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
; CHECK: vcvtph2ps
@@ -30,3 +31,16 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
+
+define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
+; CHECK-LABEL: test_x86_vcvtps2ph_128_scalar
+; CHECK-NOT: vmov
+; CHECK: vcvtph2ps (%
+
+ %load = load i64* %ptr
+ %ins1 = insertelement <2 x i64> undef, i64 %load, i32 0
+ %ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1
+ %bc = bitcast <2 x i64> %ins2 to <8 x i16>
+ %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc) #2
+ ret <4 x float> %res
+}
diff --git a/test/CodeGen/X86/fma-do-not-commute.ll b/test/CodeGen/X86/fma-do-not-commute.ll
new file mode 100644
index 0000000..4e21172
--- /dev/null
+++ b/test/CodeGen/X86/fma-do-not-commute.ll
@@ -0,0 +1,30 @@
+; RUN: llc -fp-contract=fast -mattr=+fma -disable-cgp < %s -o - | FileCheck %s
+; Check that the 2nd and 3rd arguments of fmaXXX231 reg1, reg2, mem3 are not commuted.
+; <rdar://problem/16800495>
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; CHECK-LABEL: test1:
+; %arg lives in xmm0 and it shouldn't be redefined until it is used in the FMA.
+; CHECK-NOT: {{.*}}, %xmm0
+; %addr lives in rdi.
+; %addr2 lives in rsi.
+; CHECK: vmovss (%rsi), [[ADDR2:%xmm[0-9]+]]
+; The assembly syntax is in the reverse order.
+; CHECK: vfmadd231ss (%rdi), [[ADDR2]], %xmm0
+define void @test1(float* %addr, float* %addr2, float %arg) {
+entry:
+ br label %loop
+
+loop:
+ %sum0 = phi float [ %fma, %loop ], [ %arg, %entry ]
+ %addrVal = load float* %addr, align 4
+ %addr2Val = load float* %addr2, align 4
+ %fmul = fmul float %addrVal, %addr2Val
+ %fma = fadd float %sum0, %fmul
+ br i1 true, label %exit, label %loop
+
+exit:
+ store float %fma, float* %addr, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
index e85d8f7..96c5be4 100644
--- a/test/CodeGen/X86/fold-load-vec.ll
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
; loads from m32.
define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
; CHECK: insertps
entry:
%source.addr = alloca <4 x float>*, align 8
diff --git a/test/CodeGen/X86/gcc_except_table.ll b/test/CodeGen/X86/gcc_except_table.ll
index 7a29b07..8c328ec 100644
--- a/test/CodeGen/X86/gcc_except_table.ll
+++ b/test/CodeGen/X86/gcc_except_table.ll
@@ -50,7 +50,3 @@ eh.resume:
declare void @_Z1fv() optsize
declare i32 @__gxx_personality_v0(...)
-
-; CHECK: Leh_func_end0:
-; CHECK: GCC_except_table0
-; CHECK: = Leh_func_end0-
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index 5ad5047..c763f39 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=i386-apple-darwin9.7 | FileCheck %s -check-prefix=DARWIN
; RUN: llc < %s -mtriple=i386-apple-darwin10 -relocation-model=static | FileCheck %s -check-prefix=DARWIN-STATIC
; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=DARWIN64
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -fdata-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
-; RUN: llc < %s -mtriple=i686-pc-win32 -fdata-sections -ffunction-sections | FileCheck %s -check-prefix=WIN32-SECTIONS
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -data-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=i686-pc-win32 -data-sections -function-sections | FileCheck %s -check-prefix=WIN32-SECTIONS
define void @F1() {
ret void
@@ -18,13 +18,13 @@ define void @F1() {
; LINUX: .type G1,@object
; LINUX: .comm G1,4,4
-; DARWIN: .comm _G1,4,2
+; DARWIN: .comm _G1,4,2
; const int G2 __attribute__((weak)) = 42;
-@G2 = weak_odr unnamed_addr constant i32 42
+@G2 = weak_odr unnamed_addr constant i32 42
; TODO: linux drops this into .rodata, we drop it into ".gnu.linkonce.r.G2"
@@ -48,7 +48,7 @@ define void @F1() {
; LINUX-SECTIONS: .section .rodata.G3,"a",@progbits
; LINUX-SECTIONS: .globl G3
-; WIN32-SECTIONS: .section .rdata,"r",one_only,_G3
+; WIN32-SECTIONS: .section .rdata,"rd",one_only,_G3
; WIN32-SECTIONS: .globl _G3
@@ -85,25 +85,25 @@ define void @F1() {
; PR4584
@"foo bar" = linkonce global i32 42
-; LINUX: .type "foo bar",@object
+; LINUX: .type "foo bar",@object
; LINUX: .section ".data.foo bar","aGw",@progbits,"foo bar",comdat
-; LINUX: .weak "foo bar"
+; LINUX: .weak "foo bar"
; LINUX: "foo bar":
-; DARWIN: .section __DATA,__datacoal_nt,coalesced
-; DARWIN: .globl "_foo bar"
-; DARWIN: .weak_definition "_foo bar"
+; DARWIN: .section __DATA,__datacoal_nt,coalesced
+; DARWIN: .globl "_foo bar"
+; DARWIN: .weak_definition "_foo bar"
; DARWIN: "_foo bar":
; PR4650
@G6 = weak_odr unnamed_addr constant [1 x i8] c"\01"
-; LINUX: .type G6,@object
-; LINUX: .section .rodata.G6,"aG",@progbits,G6,comdat
-; LINUX: .weak G6
+; LINUX: .type G6,@object
+; LINUX: .section .rodata.G6,"aG",@progbits,G6,comdat
+; LINUX: .weak G6
; LINUX: G6:
-; LINUX: .byte 1
-; LINUX: .size G6, 1
+; LINUX: .byte 1
+; LINUX: .size G6, 1
; DARWIN: .section __TEXT,__const_coal,coalesced
; DARWIN: .globl _G6
@@ -114,58 +114,58 @@ define void @F1() {
@G7 = unnamed_addr constant [10 x i8] c"abcdefghi\00"
-; DARWIN: __TEXT,__cstring,cstring_literals
-; DARWIN: .globl _G7
+; DARWIN: __TEXT,__cstring,cstring_literals
+; DARWIN: .globl _G7
; DARWIN: _G7:
-; DARWIN: .asciz "abcdefghi"
+; DARWIN: .asciz "abcdefghi"
-; LINUX: .section .rodata.str1.1,"aMS",@progbits,1
-; LINUX: .globl G7
+; LINUX: .section .rodata.str1.1,"aMS",@progbits,1
+; LINUX: .globl G7
; LINUX: G7:
-; LINUX: .asciz "abcdefghi"
+; LINUX: .asciz "abcdefghi"
; LINUX-SECTIONS: .section .rodata.G7,"aMS",@progbits,1
-; LINUX-SECTIONS: .globl G7
+; LINUX-SECTIONS: .globl G7
-; WIN32-SECTIONS: .section .rdata,"r",one_only,_G7
-; WIN32-SECTIONS: .globl _G7
+; WIN32-SECTIONS: .section .rdata,"rd",one_only,_G7
+; WIN32-SECTIONS: .globl _G7
@G8 = unnamed_addr constant [4 x i16] [ i16 1, i16 2, i16 3, i16 0 ]
-; DARWIN: .section __TEXT,__const
-; DARWIN: .globl _G8
+; DARWIN: .section __TEXT,__const
+; DARWIN: .globl _G8
; DARWIN: _G8:
-; LINUX: .section .rodata.str2.2,"aMS",@progbits,2
-; LINUX: .globl G8
+; LINUX: .section .rodata.str2.2,"aMS",@progbits,2
+; LINUX: .globl G8
; LINUX:G8:
@G9 = unnamed_addr constant [4 x i32] [ i32 1, i32 2, i32 3, i32 0 ]
-; DARWIN: .globl _G9
+; DARWIN: .globl _G9
; DARWIN: _G9:
-; LINUX: .section .rodata.str4.4,"aMS",@progbits,4
-; LINUX: .globl G9
+; LINUX: .section .rodata.str4.4,"aMS",@progbits,4
+; LINUX: .globl G9
; LINUX:G9
@G10 = weak global [100 x i32] zeroinitializer, align 32 ; <[100 x i32]*> [#uses=0]
-; DARWIN: .section __DATA,__datacoal_nt,coalesced
+; DARWIN: .section __DATA,__datacoal_nt,coalesced
; DARWIN: .globl _G10
-; DARWIN: .weak_definition _G10
-; DARWIN: .align 5
+; DARWIN: .weak_definition _G10
+; DARWIN: .align 5
; DARWIN: _G10:
-; DARWIN: .space 400
+; DARWIN: .space 400
-; LINUX: .bss
-; LINUX: .weak G10
-; LINUX: .align 32
+; LINUX: .bss
+; LINUX: .weak G10
+; LINUX: .align 32
; LINUX: G10:
-; LINUX: .zero 400
+; LINUX: .zero 400
@@ -190,7 +190,7 @@ define void @F1() {
; LINUX-SECTIONS: .asciz "foo"
; LINUX-SECTIONS: .size .LG14, 4
-; WIN32-SECTIONS: .section .rdata,"r"
+; WIN32-SECTIONS: .section .rdata,"rd"
; WIN32-SECTIONS: L_G14:
; WIN32-SECTIONS: .asciz "foo"
diff --git a/test/CodeGen/X86/indirect-hidden.ll b/test/CodeGen/X86/indirect-hidden.ll
new file mode 100644
index 0000000..309375d
--- /dev/null
+++ b/test/CodeGen/X86/indirect-hidden.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=i686-apple-macosx -o - %s | FileCheck %s
+
+; x86 doesn't normally use indirect symbols, particularly hidden ones, but it
+; can be tricked into it for exception-handling typeids.
+
+@hidden_typeid = external hidden constant i8*
+@normal_typeid = external constant i8*
+
+declare void @throws()
+
+define void @get_indirect_hidden() {
+ invoke void @throws() to label %end unwind label %lpad
+lpad:
+ %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @hidden_typeid to i8*)
+ br label %end
+
+end:
+ ret void
+}
+
+define void @get_indirect() {
+ invoke void @throws() to label %end unwind label %lpad
+lpad:
+ %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @normal_typeid to i8*)
+ br label %end
+
+end:
+ ret void
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+; CHECK: .section __IMPORT,__pointers,non_lazy_symbol_pointers
+
+; CHECK-NOT: __DATA,__data
+; CHECK: .indirect_symbol _normal_typeid
+; CHECK-NEXT: .long 0
+
+; CHECK-NOT: __DATA,__data
+; CHECK: .indirect_symbol _hidden_typeid
+; CHECK-NEXT: .long 0
diff --git a/test/CodeGen/X86/isel-sink.ll b/test/CodeGen/X86/isel-sink.ll
index 458f19d..e4af9b6 100644
--- a/test/CodeGen/X86/isel-sink.ll
+++ b/test/CodeGen/X86/isel-sink.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -addr-sink-using-gep=1 | FileCheck %s
define i32 @test(i32* %X, i32 %B) {
; CHECK-LABEL: test:
diff --git a/test/CodeGen/X86/lit.local.cfg b/test/CodeGen/X86/lit.local.cfg
index 1637fa4..3d91b03 100644
--- a/test/CodeGen/X86/lit.local.cfg
+++ b/test/CodeGen/X86/lit.local.cfg
@@ -4,7 +4,7 @@
#
# It should be possible to remove this override once all the bots have cycled
# cleanly.
-config.suffixes = ['.ll', '.c', '.cpp', '.test', '.txt']
+config.suffixes = ['.ll', '.test', '.txt']
targets = set(config.root.targets_to_build.split())
if not 'X86' in targets:
diff --git a/test/CodeGen/X86/live-out-reg-info.ll b/test/CodeGen/X86/live-out-reg-info.ll
index 8cd9774..283ee3a 100644
--- a/test/CodeGen/X86/live-out-reg-info.ll
+++ b/test/CodeGen/X86/live-out-reg-info.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86-64 | grep testb
; Make sure dagcombine doesn't eliminate the comparison due
-; to an off-by-one bug with ComputeMaskedBits information.
+; to an off-by-one bug with computeKnownBits information.
declare void @qux()
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
new file mode 100644
index 0000000..b9b29a5
--- /dev/null
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -mattr=+sse2 | FileCheck %s
+
+
+define double @test1(double %A) {
+ %1 = bitcast double %A to <2 x i32>
+ %add = add <2 x i32> %1, <i32 3, i32 5>
+ %2 = bitcast <2 x i32> %add to double
+ ret double %2
+}
+; FIXME: Ideally we should be able to fold the entire body of @test1 into a
+; single paddd instruction. At the moment we produce the sequence
+; pshufd+paddq+pshufd.
+
+; CHECK-LABEL: test1
+; CHECK-NOT: movsd
+; CHECK: pshufd
+; CHECK-NEXT: paddq
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define double @test2(double %A, double %B) {
+ %1 = bitcast double %A to <2 x i32>
+ %2 = bitcast double %B to <2 x i32>
+ %add = add <2 x i32> %1, %2
+ %3 = bitcast <2 x i32> %add to double
+ ret double %3
+}
+; FIXME: Ideally we should be able to fold the entire body of @test2 into a
+; single 'paddd %xmm1, %xmm0' instruction. At the moment we produce the
+; sequence pshufd+pshufd+paddq+pshufd.
+
+; CHECK-LABEL: test2
+; CHECK-NOT: movsd
+; CHECK: pshufd
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: paddq
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define i64 @test3(i64 %A) {
+ %1 = bitcast i64 %A to <2 x float>
+ %add = fadd <2 x float> %1, <float 3.0, float 5.0>
+ %2 = bitcast <2 x float> %add to i64
+ ret i64 %2
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: pshufd
+; CHECK: addps
+; CHECK-NOT: pshufd
+; CHECK: ret
+
+
+define i64 @test4(i64 %A) {
+ %1 = bitcast i64 %A to <2 x i32>
+ %add = add <2 x i32> %1, <i32 3, i32 5>
+ %2 = bitcast <2 x i32> %add to i64
+ ret i64 %2
+}
+; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd.
+; Ideally, we should fold that sequence into a single paddd.
+
+; CHECK-LABEL: test4
+; CHECK: pshufd
+; CHECK-NEXT: paddq
+; CHECK-NEXT: pshufd
+; CHECK: ret
+
+
+define double @test5(double %A) {
+ %1 = bitcast double %A to <2 x float>
+ %add = fadd <2 x float> %1, <float 3.0, float 5.0>
+ %2 = bitcast <2 x float> %add to double
+ ret double %2
+}
+; CHECK-LABEL: test5
+; CHECK: addps
+; CHECK-NEXT: ret
+
+
+define double @test6(double %A) {
+ %1 = bitcast double %A to <4 x i16>
+ %add = add <4 x i16> %1, <i16 3, i16 4, i16 5, i16 6>
+ %2 = bitcast <4 x i16> %add to double
+ ret double %2
+}
+; FIXME: Ideally we should be able to fold the entire body of @test6 into a
+; single paddw instruction.
+
+; CHECK-LABEL: test6
+; CHECK-NOT: movsd
+; CHECK: punpcklwd
+; CHECK-NEXT: paddd
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test7(double %A, double %B) {
+ %1 = bitcast double %A to <4 x i16>
+ %2 = bitcast double %B to <4 x i16>
+ %add = add <4 x i16> %1, %2
+ %3 = bitcast <4 x i16> %add to double
+ ret double %3
+}
+; FIXME: Ideally we should be able to fold the entire body of @test7 into a
+; single 'paddw %xmm1, %xmm0' instruction. At the moment we produce the
+; sequence punpcklwd+punpcklwd+paddd+pshufb.
+
+; CHECK-LABEL: test7
+; CHECK-NOT: movsd
+; CHECK: punpcklwd
+; CHECK-NEXT: punpcklwd
+; CHECK-NEXT: paddd
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test8(double %A) {
+ %1 = bitcast double %A to <8 x i8>
+ %add = add <8 x i8> %1, <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10>
+ %2 = bitcast <8 x i8> %add to double
+ ret double %2
+}
+; FIXME: Ideally we should be able to fold the entire body of @test8 into a
+; single paddb instruction. At the moment we produce the sequence
+; punpcklbw+paddw+pshufb.
+
+; CHECK-LABEL: test8
+; CHECK-NOT: movsd
+; CHECK: punpcklbw
+; CHECK-NEXT: paddw
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test9(double %A, double %B) {
+ %1 = bitcast double %A to <8 x i8>
+ %2 = bitcast double %B to <8 x i8>
+ %add = add <8 x i8> %1, %2
+ %3 = bitcast <8 x i8> %add to double
+ ret double %3
+}
+; FIXME: Ideally we should be able to fold the entire body of @test9 into a
+; single 'paddb %xmm1, %xmm0' instruction. At the moment we produce the
+; sequence punpcklbw+punpcklbw+paddw+pshufb.
+
+; CHECK-LABEL: test9
+; CHECK-NOT: movsd
+; CHECK: punpcklbw
+; CHECK-NEXT: punpcklbw
+; CHECK-NEXT: paddw
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
diff --git a/test/CodeGen/X86/lower-vec-shift.ll b/test/CodeGen/X86/lower-vec-shift.ll
new file mode 100644
index 0000000..c28f82a
--- /dev/null
+++ b/test/CodeGen/X86/lower-vec-shift.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+
+
+; Verify that the following shifts are lowered into a sequence of two shifts plus
+; a blend. On pre-AVX2 targets, instead of scalarizing a logical or arithmetic
+; packed shift right by a constant build_vector, the backend should always try to
+; emit the simpler sequence of two shifts plus a blend when possible.
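+;
+; A rough sketch of that strategy for test1 below (exact registers aside):
+; lshr <8 x i16> %a, <3,3,2,2,2,2,2,2> becomes one whole-vector shift by 3,
+; one whole-vector shift by 2, and a blend/movss that takes the two low lanes
+; from the copy shifted by 3 and the remaining lanes from the copy shifted by 2.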
+
+define <8 x i16> @test1(<8 x i16> %a) {
+ %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ ret <8 x i16> %lshr
+}
+; CHECK-LABEL: test1
+; SSE: psrlw
+; SSE-NEXT: psrlw
+; SSE-NEXT: movss
+; AVX: vpsrlw
+; AVX-NEXT: vpsrlw
+; AVX-NEXT: vmovss
+; AVX2: vpsrlw
+; AVX2-NEXT: vpsrlw
+; AVX2-NEXT: vmovss
+; CHECK: ret
+
+
+define <8 x i16> @test2(<8 x i16> %a) {
+ %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
+ ret <8 x i16> %lshr
+}
+; CHECK-LABEL: test2
+; SSE: psrlw
+; SSE-NEXT: psrlw
+; SSE-NEXT: movsd
+; AVX: vpsrlw
+; AVX-NEXT: vpsrlw
+; AVX-NEXT: vmovsd
+; AVX2: vpsrlw
+; AVX2-NEXT: vpsrlw
+; AVX2-NEXT: vmovsd
+; CHECK: ret
+
+
+define <4 x i32> @test3(<4 x i32> %a) {
+ %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
+ ret <4 x i32> %lshr
+}
+; CHECK-LABEL: test3
+; SSE: psrld
+; SSE-NEXT: psrld
+; SSE-NEXT: movss
+; AVX: vpsrld
+; AVX-NEXT: vpsrld
+; AVX-NEXT: vmovss
+; AVX2: vpsrlvd
+; CHECK: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a) {
+ %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
+ ret <4 x i32> %lshr
+}
+; CHECK-LABEL: test4
+; SSE: psrld
+; SSE-NEXT: psrld
+; SSE-NEXT: movsd
+; AVX: vpsrld
+; AVX-NEXT: vpsrld
+; AVX-NEXT: vmovsd
+; AVX2: vpsrlvd
+; CHECK: ret
+
+
+define <8 x i16> @test5(<8 x i16> %a) {
+ %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ ret <8 x i16> %lshr
+}
+
+define <8 x i16> @test6(<8 x i16> %a) {
+ %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
+ ret <8 x i16> %lshr
+}
+; CHECK-LABEL: test6
+; SSE: psraw
+; SSE-NEXT: psraw
+; SSE-NEXT: movsd
+; AVX: vpsraw
+; AVX-NEXT: vpsraw
+; AVX-NEXT: vmovsd
+; AVX2: vpsraw
+; AVX2-NEXT: vpsraw
+; AVX2-NEXT: vmovsd
+; CHECK: ret
+
+
+define <4 x i32> @test7(<4 x i32> %a) {
+ %lshr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
+ ret <4 x i32> %lshr
+}
+; CHECK-LABEL: test7
+; SSE: psrad
+; SSE-NEXT: psrad
+; SSE-NEXT: movss
+; AVX: vpsrad
+; AVX-NEXT: vpsrad
+; AVX-NEXT: vmovss
+; AVX2: vpsravd
+; CHECK: ret
+
+
+define <4 x i32> @test8(<4 x i32> %a) {
+ %lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
+ ret <4 x i32> %lshr
+}
+; CHECK-LABEL: test8
+; SSE: psrad
+; SSE-NEXT: psrad
+; SSE-NEXT: movsd
+; AVX: vpsrad
+; AVX-NEXT: vpsrad
+; AVX-NEXT: vmovsd
+; AVX2: vpsravd
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/lzcnt-tzcnt.ll b/test/CodeGen/X86/lzcnt-tzcnt.ll
new file mode 100644
index 0000000..07e4b9d
--- /dev/null
+++ b/test/CodeGen/X86/lzcnt-tzcnt.ll
@@ -0,0 +1,447 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+bmi,+lzcnt | FileCheck %s
+
+; LZCNT and TZCNT will always produce the operand size when the input operand
+; is zero. This test verifies that we efficiently select LZCNT/TZCNT
+; based on the fact that the 'icmp+select' sequence is always redundant
+; in every function defined below.
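+;
+; A sketch of why the select is redundant, using the shape of test2_ctlz below:
+; with +lzcnt, a 32-bit lzcnt of zero is defined to produce 32, exactly the
+; value the select would pick, so in
+;   %cnt  = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+;   %zero = icmp eq i32 %v, 0
+;   %res  = select i1 %zero, i32 32, i32 %cnt
+; %res always equals %cnt and the compare/cmov pair can be dropped.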
+
+
+define i16 @test1_ctlz(i16 %v) {
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 %v, 0
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test1_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test2_ctlz(i32 %v) {
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 %v, 0
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test2_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test3_ctlz(i64 %v) {
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test3_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test4_ctlz(i16 %v) {
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test4_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test5_ctlz(i32 %v) {
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test5_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test6_ctlz(i64 %v) {
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test6_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test7_ctlz(i16 %v) {
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 %cnt, i16 16
+ ret i16 %cond
+}
+; CHECK-LABEL: test7_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test8_ctlz(i32 %v) {
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 %cnt, i32 32
+ ret i32 %cond
+}
+; CHECK-LABEL: test8_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test9_ctlz(i64 %v) {
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 %cnt, i64 64
+ ret i64 %cond
+}
+; CHECK-LABEL: test9_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test10_ctlz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 %v, 0
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test10_ctlz
+; CHECK-NOT: movw
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test11_ctlz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 %v, 0
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test11_ctlz
+; CHECK-NOT: movd
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test12_ctlz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test12_ctlz
+; CHECK-NOT: movq
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test13_ctlz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test13_ctlz
+; CHECK-NOT: movw
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test14_ctlz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test14_ctlz
+; CHECK-NOT: movd
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test15_ctlz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test15_ctlz
+; CHECK-NOT: movq
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test16_ctlz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 %cnt, i16 16
+ ret i16 %cond
+}
+; CHECK-LABEL: test16_ctlz
+; CHECK-NOT: movw
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test17_ctlz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 %cnt, i32 32
+ ret i32 %cond
+}
+; CHECK-LABEL: test17_ctlz
+; CHECK-NOT: movd
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test18_ctlz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 %cnt, i64 64
+ ret i64 %cond
+}
+; CHECK-LABEL: test18_ctlz
+; CHECK-NOT: movq
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test1_cttz(i16 %v) {
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 %v, 0
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test1_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test2_cttz(i32 %v) {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 %v, 0
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test2_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test3_cttz(i64 %v) {
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test3_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test4_cttz(i16 %v) {
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test4_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test5_cttz(i32 %v) {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test5_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test6_cttz(i64 %v) {
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test6_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test7_cttz(i16 %v) {
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 %cnt, i16 16
+ ret i16 %cond
+}
+; CHECK-LABEL: test7_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test8_cttz(i32 %v) {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 %cnt, i32 32
+ ret i32 %cond
+}
+; CHECK-LABEL: test8_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test9_cttz(i64 %v) {
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 %cnt, i64 64
+ ret i64 %cond
+}
+; CHECK-LABEL: test9_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test10_cttz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 %v, 0
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test10_cttz
+; CHECK-NOT: movw
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test11_cttz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 %v, 0
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test11_cttz
+; CHECK-NOT: movd
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test12_cttz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test12_cttz
+; CHECK-NOT: movq
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test13_cttz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test13_cttz
+; CHECK-NOT: movw
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test14_cttz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test14_cttz
+; CHECK-NOT: movd
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test15_cttz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test15_cttz
+; CHECK-NOT: movq
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test16_cttz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 %cnt, i16 16
+ ret i16 %cond
+}
+; CHECK-LABEL: test16_cttz
+; CHECK-NOT: movw
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test17_cttz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 %cnt, i32 32
+ ret i32 %cond
+}
+; CHECK-LABEL: test17_cttz
+; CHECK-NOT: movd
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test18_cttz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 %cnt, i64 64
+ ret i64 %cond
+}
+; CHECK-LABEL: test18_cttz
+; CHECK-NOT: movq
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+
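Editor's note: the lzcnt/tzcnt tests above all exercise the same fold. A guarded count-leading/trailing-zeros, where the zero input is special-cased to the bit width, collapses to a single LZCNT/TZCNT because those instructions already define the zero-input result as the operand width. A rough C-level equivalent of one of the patterns being matched (an illustrative sketch, not part of the test suite; the function name is made up):

    /* With LZCNT available, this whole expression should compile to a single
       lzcntl: the guard on zero is redundant because LZCNT(0) == 32, whereas
       __builtin_clz(0) alone would be undefined at the C level. */
    unsigned clz_or_32(unsigned x) {
      return x == 0 ? 32 : (unsigned)__builtin_clz(x);
    }
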
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index 4a4d178..9ddc847 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -5,7 +5,7 @@
; CHECK-LABEL: count_up
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: inc
+; CHECK: incq
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_up(double* %d, i64 %n) nounwind {
@@ -71,7 +71,7 @@ return:
; CHECK-LABEL: count_up_signed
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: inc
+; CHECK: incq
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_up_signed(double* %d, i64 %n) nounwind {
@@ -174,7 +174,7 @@ return:
; CHECK-LABEL: another_count_down
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: decq
+; CHECK: addq $-8,
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @another_count_down(double* %d, i64 %n) nounwind {
diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll
index 940688c..f98963d 100644
--- a/test/CodeGen/X86/merge_store.ll
+++ b/test/CodeGen/X86/merge_store.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -addr-sink-using-gep=1 | FileCheck %s
define void @merge_store(i32* nocapture %a) {
; CHECK-LABEL: merge_store:
diff --git a/test/CodeGen/X86/mod128.ll b/test/CodeGen/X86/mod128.ll
new file mode 100644
index 0000000..4fdee11
--- /dev/null
+++ b/test/CodeGen/X86/mod128.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
+; RUN: llc < %s -mtriple=x86_64-cygwin | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=WIN64
+
+define i64 @mod128(i128 %x) {
+ ; X86-64: movl $3, %edx
+ ; X86-64: xorl %ecx, %ecx
+ ; X86-64: callq __modti3
+ ; X86-64-NOT: movd %xmm0, %rax
+
+ ; WIN64-NOT: movl $3, %r8d
+ ; WIN64-NOT: xorl %r9d, %r9d
+ ; WIN64-DAG: movq %rdx, 56(%rsp)
+ ; WIN64-DAG: movq %rcx, 48(%rsp)
+ ; WIN64-DAG: leaq 48(%rsp), %rcx
+ ; WIN64-DAG: leaq 32(%rsp), %rdx
+ ; WIN64-DAG: movq $0, 40(%rsp)
+ ; WIN64-DAG: movq $3, 32(%rsp)
+ ; WIN64: callq __modti3
+ ; WIN64: movd %xmm0, %rax
+
+ %1 = srem i128 %x, 3
+ %2 = trunc i128 %1 to i64
+ ret i64 %2
+}
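Editor's note: the new mod128.ll test pins down how a 128-bit signed remainder is lowered; the WIN64 checks reflect that the 128-bit operands are passed in memory on Windows x64 before the __modti3 call. A rough C-level sketch of what the IR computes (illustrative only, not part of the test; the function name is hypothetical):

    /* x % 3 on a 128-bit value, truncated to 64 bits; the test expects this
       to lower to a __modti3 libcall rather than an inline expansion. */
    long long mod128_c(__int128 x) {
      return (long long)(x % 3);
    }
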
diff --git a/test/CodeGen/X86/musttail-indirect.ll b/test/CodeGen/X86/musttail-indirect.ll
new file mode 100644
index 0000000..9d21b5e
--- /dev/null
+++ b/test/CodeGen/X86/musttail-indirect.ll
@@ -0,0 +1,124 @@
+; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-win32 -O0 | FileCheck %s
+
+; IR simplified from the following C++ snippet compiled for i686-windows-msvc:
+
+; struct A { A(); ~A(); int a; };
+;
+; struct B {
+; virtual int f(int);
+; virtual int g(A, int, A);
+; virtual void h(A, int, A);
+; virtual A i(A, int, A);
+; virtual A j(int);
+; };
+;
+; int (B::*mp_f)(int) = &B::f;
+; int (B::*mp_g)(A, int, A) = &B::g;
+; void (B::*mp_h)(A, int, A) = &B::h;
+; A (B::*mp_i)(A, int, A) = &B::i;
+; A (B::*mp_j)(int) = &B::j;
+
+; Each member pointer creates a thunk. The ones with inalloca are required to
+; be tail calls by the ABI, even at O0.
+
+%struct.B = type { i32 (...)** }
+%struct.A = type { i32 }
+
+; CHECK-LABEL: f_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @f_thunk(%struct.B* %this, i32) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, i32)***
+ %vtable = load i32 (%struct.B*, i32)*** %1
+ %2 = load i32 (%struct.B*, i32)** %vtable
+ %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, i32 %0)
+ ret i32 %3
+}
+
+; Inalloca thunks shouldn't require any stores to the stack.
+; CHECK-LABEL: g_thunk:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @g_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+ %vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+ %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+ ret i32 %3
+}
+
+; CHECK-LABEL: h_thunk:
+; CHECK: jmpl
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK-NOT: ret
+define x86_thiscallcc void @h_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+ %vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2
+ %2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+ musttail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+ ret void
+}
+
+; CHECK-LABEL: i_thunk:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc %struct.A* @i_thunk(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)***
+ %vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3
+ %2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn
+ %3 = musttail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca %0)
+ ret %struct.A* %3
+}
+
+; CHECK-LABEL: j_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc void @j_thunk(%struct.A* noalias sret %agg.result, %struct.B* %this, i32) {
+entry:
+ %1 = bitcast %struct.B* %this to void (%struct.A*, %struct.B*, i32)***
+ %vtable = load void (%struct.A*, %struct.B*, i32)*** %1
+ %vfn = getelementptr inbounds void (%struct.A*, %struct.B*, i32)** %vtable, i32 4
+ %2 = load void (%struct.A*, %struct.B*, i32)** %vfn
+ musttail call x86_thiscallcc void %2(%struct.A* sret %agg.result, %struct.B* %this, i32 %0)
+ ret void
+}
+
+; CHECK-LABEL: _stdcall_thunk@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_stdcallcc i32 @stdcall_thunk(<{ %struct.B*, %struct.A }>* inalloca) {
+entry:
+ %this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0
+ %this = load %struct.B** %this_ptr
+ %1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)***
+ %vtable = load i32 (<{ %struct.B*, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (<{ %struct.B*, %struct.A }>*)** %vfn
+ %3 = musttail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* inalloca %0)
+ ret i32 %3
+}
+
+; CHECK-LABEL: @fastcall_thunk@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_fastcallcc i32 @fastcall_thunk(%struct.B* inreg %this, <{ %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)***
+ %vtable = load i32 (%struct.B*, <{ %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (%struct.B*, <{ %struct.A }>*)** %vfn
+ %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0)
+ ret i32 %3
+}
diff --git a/test/CodeGen/X86/musttail-thiscall.ll b/test/CodeGen/X86/musttail-thiscall.ll
new file mode 100644
index 0000000..8ea1248
--- /dev/null
+++ b/test/CodeGen/X86/musttail-thiscall.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+
+; CHECK-LABEL: t1:
+; CHECK: jmp {{_?}}t1_callee
+define x86_thiscallcc void @t1(i8* %this) {
+ %adj = getelementptr i8* %this, i32 4
+ musttail call x86_thiscallcc void @t1_callee(i8* %adj)
+ ret void
+}
+declare x86_thiscallcc void @t1_callee(i8* %this)
+
+; CHECK-LABEL: t2:
+; CHECK: jmp {{_?}}t2_callee
+define x86_thiscallcc i32 @t2(i8* %this, i32 %a) {
+ %adj = getelementptr i8* %this, i32 4
+ %rv = musttail call x86_thiscallcc i32 @t2_callee(i8* %adj, i32 %a)
+ ret i32 %rv
+}
+declare x86_thiscallcc i32 @t2_callee(i8* %this, i32 %a)
+
+; CHECK-LABEL: t3:
+; CHECK: jmp {{_?}}t3_callee
+define x86_thiscallcc i8* @t3(i8* %this, <{ i8*, i32 }>* inalloca %args) {
+ %adj = getelementptr i8* %this, i32 4
+ %a_ptr = getelementptr <{ i8*, i32 }>* %args, i32 0, i32 1
+ store i32 0, i32* %a_ptr
+ %rv = musttail call x86_thiscallcc i8* @t3_callee(i8* %adj, <{ i8*, i32 }>* inalloca %args)
+ ret i8* %rv
+}
+declare x86_thiscallcc i8* @t3_callee(i8* %this, <{ i8*, i32 }>* inalloca %args);
diff --git a/test/CodeGen/X86/musttail.ll b/test/CodeGen/X86/musttail.ll
new file mode 100644
index 0000000..ca5d311
--- /dev/null
+++ b/test/CodeGen/X86/musttail.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+; RUN: llc -march=x86 -disable-tail-calls < %s | FileCheck %s
+
+declare void @t1_callee(i8*)
+define void @t1(i32* %a) {
+; CHECK-LABEL: t1:
+; CHECK: jmp {{_?}}t1_callee
+ %b = bitcast i32* %a to i8*
+ musttail call void @t1_callee(i8* %b)
+ ret void
+}
+
+declare i8* @t2_callee()
+define i32* @t2() {
+; CHECK-LABEL: t2:
+; CHECK: jmp {{_?}}t2_callee
+ %v = musttail call i8* @t2_callee()
+ %w = bitcast i8* %v to i32*
+ ret i32* %w
+}
+
+; Complex frame layout: stack realignment with dynamic alloca.
+define void @t3(i32 %n) alignstack(32) nounwind {
+entry:
+; CHECK: t3:
+; CHECK: pushl %ebp
+; CHECK: pushl %esi
+; CHECK: andl $-32, %esp
+; CHECK: movl %esp, %esi
+; CHECK: popl %esi
+; CHECK: popl %ebp
+; CHECK-NEXT: jmp {{_?}}t3_callee
+ %a = alloca i8, i32 %n
+ call void @capture(i8* %a)
+ musttail call void @t3_callee(i32 %n) nounwind
+ ret void
+}
+
+declare void @capture(i8*)
+declare void @t3_callee(i32)
+
+; Test that stack arguments which aren't forwarded unmodified are actually
+; copied in and out.
+define i32 @t4({}* %fn, i32 %n, i32 %r) {
+; CHECK-LABEL: t4:
+; CHECK: incl %[[r:.*]]
+; CHECK: decl %[[n:.*]]
+; CHECK: movl %[[r]], {{[0-9]+}}(%esp)
+; CHECK: movl %[[n]], {{[0-9]+}}(%esp)
+; CHECK: jmpl *%{{.*}}
+
+entry:
+ %r1 = add i32 %r, 1
+ %n1 = sub i32 %n, 1
+ %fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
+ %r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
+ ret i32 %r2
+}
+
+; Combine the complex stack frame with the parameter modification.
+define i32 @t5({}* %fn, i32 %n, i32 %r) alignstack(32) {
+; CHECK-LABEL: t5:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: pushl %esi
+; Align the stack.
+; CHECK: andl $-32, %esp
+; CHECK: movl %esp, %esi
+; Modify the args.
+; CHECK: incl %[[r:.*]]
+; CHECK: decl %[[n:.*]]
+; Store them through ebp, since that's the only stable arg pointer.
+; CHECK: movl %[[r]], {{[0-9]+}}(%ebp)
+; CHECK: movl %[[n]], {{[0-9]+}}(%ebp)
+; Epilogue.
+; CHECK: leal {{[-0-9]+}}(%ebp), %esp
+; CHECK: popl %esi
+; CHECK: popl %ebp
+; CHECK: jmpl *%{{.*}}
+
+entry:
+ %a = alloca i8, i32 %n
+ call void @capture(i8* %a)
+ %r1 = add i32 %r, 1
+ %n1 = sub i32 %n, 1
+ %fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
+ %r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
+ ret i32 %r2
+}
diff --git a/test/CodeGen/X86/named-reg-alloc.ll b/test/CodeGen/X86/named-reg-alloc.ll
new file mode 100644
index 0000000..9463ea3
--- /dev/null
+++ b/test/CodeGen/X86/named-reg-alloc.ll
@@ -0,0 +1,14 @@
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"eax\00"}
diff --git a/test/CodeGen/X86/named-reg-notareg.ll b/test/CodeGen/X86/named-reg-notareg.ll
new file mode 100644
index 0000000..d85dddd
--- /dev/null
+++ b/test/CodeGen/X86/named-reg-notareg.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"notareg\00"}
diff --git a/test/CodeGen/X86/no-cfi.ll b/test/CodeGen/X86/no-cfi.ll
deleted file mode 100644
index 5bb9bb2..0000000
--- a/test/CodeGen/X86/no-cfi.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -disable-cfi | FileCheck --check-prefix=STATIC %s
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -disable-cfi -relocation-model=pic | FileCheck --check-prefix=PIC %s
-
-; STATIC: .ascii "zPLR"
-; STATIC: .byte 3
-; STATIC-NEXT: .long __gxx_personality_v0
-; STATIC-NEXT: .byte 3
-; STATIC-NEXT: .byte 3
-
-; PIC: .ascii "zPLR"
-; PIC: .byte 155
-; PIC-NEXT: .L
-; PIC-NEXT: .long DW.ref.__gxx_personality_v0-.L
-; PIC-NEXT: .byte 27
-; PIC-NEXT: .byte 27
-
-
-define void @bar() {
-entry:
- %call = invoke i32 @foo()
- to label %invoke.cont unwind label %lpad
-
-invoke.cont:
- ret void
-
-lpad:
- %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
- catch i8* null
- ret void
-}
-
-declare i32 @foo()
-
-declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/X86/peep-test-4.ll b/test/CodeGen/X86/peep-test-4.ll
index 884ee7c..1ae621f 100644
--- a/test/CodeGen/X86/peep-test-4.ll
+++ b/test/CodeGen/X86/peep-test-4.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+bmi,+bmi2,+popcnt | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+bmi,+bmi2,+popcnt,+lzcnt | FileCheck %s
declare void @foo(i32)
+declare void @foo32(i32)
declare void @foo64(i64)
; CHECK-LABEL: neg:
@@ -189,3 +190,76 @@ bb:
return:
ret void
}
+
+; CHECK-LABEL: testCTZ
+; CHECK: tzcntq
+; CHECK-NOT: test
+; CHECK: cmovaeq
+declare i64 @llvm.cttz.i64(i64, i1)
+define i64 @testCTZ(i64 %v) nounwind {
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 255, i64 %cnt
+ ret i64 %cond
+}
+
+; CHECK-LABEL: testCTZ2
+; CHECK: tzcntl
+; CHECK-NEXT: jb
+; CHECK: jmp foo
+declare i32 @llvm.cttz.i32(i32, i1)
+define void @testCTZ2(i32 %v) nounwind {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %cmp = icmp eq i32 %v, 0
+ br i1 %cmp, label %return, label %bb
+
+bb:
+ tail call void @foo(i32 %cnt)
+ br label %return
+
+return:
+ tail call void @foo32(i32 %cnt)
+ ret void
+}
+
+; CHECK-LABEL: testCTZ3
+; CHECK: tzcntl
+; CHECK-NEXT: jae
+; CHECK: jmp foo
+define void @testCTZ3(i32 %v) nounwind {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %cmp = icmp ne i32 %v, 0
+ br i1 %cmp, label %return, label %bb
+
+bb:
+ tail call void @foo(i32 %cnt)
+ br label %return
+
+return:
+ tail call void @foo32(i32 %cnt)
+ ret void
+}
+
+; CHECK-LABEL: testCLZ
+; CHECK: lzcntq
+; CHECK-NOT: test
+; CHECK: cmovaeq
+declare i64 @llvm.ctlz.i64(i64, i1)
+define i64 @testCLZ(i64 %v) nounwind {
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp ne i64 %v, 0
+ %cond = select i1 %tobool, i64 %cnt, i64 255
+ ret i64 %cond
+}
+
+; CHECK-LABEL: testPOPCNT
+; CHECK: popcntq
+; CHECK-NOT: test
+; CHECK: cmovneq
+declare i64 @llvm.ctpop.i64(i64)
+define i64 @testPOPCNT(i64 %v) nounwind {
+ %cnt = tail call i64 @llvm.ctpop.i64(i64 %v)
+ %tobool = icmp ne i64 %v, 0
+ %cond = select i1 %tobool, i64 %cnt, i64 255
+ ret i64 %cond
+}
diff --git a/test/CodeGen/X86/peephole-multiple-folds.ll b/test/CodeGen/X86/peephole-multiple-folds.ll
index d184569..a6cec66 100644
--- a/test/CodeGen/X86/peephole-multiple-folds.ll
+++ b/test/CodeGen/X86/peephole-multiple-folds.ll
@@ -9,8 +9,8 @@ entry:
loopbody:
; CHECK: test_peephole_multi_fold:
-; CHECK: vfmadd231ps (%rdi),
-; CHECK: vfmadd231ps (%rsi),
+; CHECK: vfmadd231ps ({{%rdi|%rcx}}),
+; CHECK: vfmadd231ps ({{%rsi|%rdx}}),
%vsum1 = phi <8 x float> [ %vsum1.next, %loopbody ], [ zeroinitializer, %entry ]
%vsum2 = phi <8 x float> [ %vsum2.next, %loopbody ], [ zeroinitializer, %entry ]
%m1 = load <8 x float>* %p1, align 1
diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
index f3669fb..d8e4572 100644
--- a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
+++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -2,6 +2,16 @@
; Without the last chance recoloring, this test fails with:
; "ran out of registers".
+; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH
+; Test whether failure due to cutoff for depth is reported
+
+; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF
+; Test whether failure due to cutoff for interference is reported
+
+; RUN: llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 -lcr-max-depth=0 -exhaustive-register-search < %s > %t 2>&1
+; RUN: FileCheck --input-file=%t %s --check-prefix=CHECK-EXHAUSTIVE
+; Test whether exhaustive-register-search can bypass the depth and interference cutoffs of last chance recoloring
+
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
target triple = "i386-apple-macosx"
@@ -12,6 +22,9 @@ target triple = "i386-apple-macosx"
; Function Attrs: nounwind ssp
; CHECK-NOT: ran out of registers during register allocation
+; CHECK-INTERF: error: register allocation failed: maximum interference for recoloring reached
+; CHECK-DEPTH: error: register allocation failed: maximum depth for recoloring reached
+; CHECK-EXHAUSTIVE-NOT: error: register allocation failed: maximum {{depth|interference}} for recoloring reached
define void @fp_dh_f870bf31fd8ffe068450366e3f05389a(i8* %arg) #0 {
bb:
indirectbr i8* undef, [label %bb85, label %bb206]
diff --git a/test/CodeGen/X86/rdtsc.ll b/test/CodeGen/X86/rdtsc.ll
index f21a44c..dba614a 100644
--- a/test/CodeGen/X86/rdtsc.ll
+++ b/test/CodeGen/X86/rdtsc.ll
@@ -1,8 +1,49 @@
-; RUN: llc < %s -march=x86 | grep rdtsc
-; RUN: llc < %s -march=x86-64 | grep rdtsc
-declare i64 @llvm.readcyclecounter()
+; RUN: llc < %s -march=x86-64 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+
+; Verify that we correctly lower ISD::READCYCLECOUNTER.
+
+
+define i64 @test_builtin_readcyclecounter() {
+ %1 = tail call i64 @llvm.readcyclecounter()
+ ret i64 %1
+}
+; CHECK-LABEL: test_builtin_readcyclecounter
+; CHECK: rdtsc
+; X86-NOT: shlq
+; X86-NOT: or
+; CHECK-NOT: mov
+; CHECK: ret
+
+
+; Verify that we correctly lower the Read Cycle Counter GCC x86 builtins
+; (i.e. RDTSC and RDTSCP).
-define i64 @foo() {
- %tmp.1 = call i64 @llvm.readcyclecounter( ) ; <i64> [#uses=1]
- ret i64 %tmp.1
+define i64 @test_builtin_rdtsc() {
+ %1 = tail call i64 @llvm.x86.rdtsc()
+ ret i64 %1
}
+; CHECK-LABEL: test_builtin_rdtsc
+; CHECK: rdtsc
+; X86-NOT: shlq
+; X86-NOT: or
+; CHECK-NOT: mov
+; CHECK: ret
+
+
+define i64 @test_builtin_rdtscp(i8* %A) {
+ %1 = tail call i64 @llvm.x86.rdtscp(i8* %A)
+ ret i64 %1
+}
+; CHECK-LABEL: test_builtin_rdtscp
+; CHECK: rdtscp
+; X86-NOT: shlq
+; CHECK: movl %ecx, (%{{[a-z0-9]+}})
+; X86-NOT: shlq
+; CHECK: ret
+
+
+declare i64 @llvm.readcyclecounter()
+declare i64 @llvm.x86.rdtscp(i8*)
+declare i64 @llvm.x86.rdtsc()
+
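Editor's note: the rewritten rdtsc.ll coverage corresponds roughly to the following C-level use of the GCC/Clang x86 builtins (an illustrative sketch, not part of the test; function names are made up):

    #include <stdint.h>

    /* RDTSC: returns the 64-bit time-stamp counter in a single register pair. */
    static uint64_t read_tsc(void) {
      return __builtin_ia32_rdtsc();
    }

    /* RDTSCP: additionally stores the IA32_TSC_AUX value through the pointer,
       which is what the "movl %ecx, (...)" check above is matching. */
    static uint64_t read_tscp(unsigned int *aux) {
      return __builtin_ia32_rdtscp(aux);
    }
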
diff --git a/test/CodeGen/X86/remat-invalid-liveness.ll b/test/CodeGen/X86/remat-invalid-liveness.ll
new file mode 100644
index 0000000..d285e83
--- /dev/null
+++ b/test/CodeGen/X86/remat-invalid-liveness.ll
@@ -0,0 +1,85 @@
+; RUN: llc %s -mcpu=core2 -o - | FileCheck %s
+; This test was failing while tracking the liveness in the register scavenger
+; during the branch folding pass. The allocation of the subregisters was
+; incorrect.
+; I.e., the faulty pattern looked like:
+; CH = movb 64
+; ECX = movl 3 <- CH was killed here.
+; CH = subb CH, ...
+;
+; This reduced test case triggers the crash before the fix, but does not
+; strictly speaking check that the resulting code is correct.
+; To check that the code is actually correct we would need to check the
+; liveness of the produced code.
+;
+; Currently, we check that after ECX = movl 3, we do not have subb CH,
+; whereas CH could have been redefined in between and that would have been
+; totally fine.
+; <rdar://problem/16582185>
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9"
+
+%struct.A = type { %struct.B, %struct.C, %struct.D*, [1 x i8*] }
+%struct.B = type { i32, [4 x i8] }
+%struct.C = type { i128 }
+%struct.D = type { {}*, [0 x i32] }
+%union.E = type { i32 }
+
+; CHECK-LABEL: __XXX1:
+; CHECK: movl $3, %ecx
+; CHECK-NOT: subb %{{[a-z]+}}, %ch
+; Function Attrs: nounwind optsize ssp
+define fastcc void @__XXX1(%struct.A* %ht) #0 {
+entry:
+ %const72 = bitcast i128 72 to i128
+ %const3 = bitcast i128 3 to i128
+ switch i32 undef, label %if.end196 [
+ i32 1, label %sw.bb.i
+ i32 3, label %sw.bb2.i
+ ]
+
+sw.bb.i: ; preds = %entry
+ %call.i.i.i = tail call i32 undef(%struct.A* %ht, i8 zeroext 22, i32 undef, i32 0, %struct.D* undef)
+ %bf.load.i.i = load i128* undef, align 4
+ %bf.lshr.i.i = lshr i128 %bf.load.i.i, %const72
+ %shl1.i.i = shl nuw nsw i128 %bf.lshr.i.i, 8
+ %shl.i.i = trunc i128 %shl1.i.i to i32
+ br i1 undef, label %cond.false10.i.i, label %__XXX2.exit.i.i
+
+__XXX2.exit.i.i: ; preds = %sw.bb.i
+ %extract11.i.i.i = lshr i128 %bf.load.i.i, %const3
+ %extract.t12.i.i.i = trunc i128 %extract11.i.i.i to i32
+ %bf.cast7.i.i.i = and i32 %extract.t12.i.i.i, 3
+ %arrayidx.i.i.i = getelementptr inbounds %struct.A* %ht, i32 0, i32 3, i32 %bf.cast7.i.i.i
+ br label %cond.end12.i.i
+
+cond.false10.i.i: ; preds = %sw.bb.i
+ %arrayidx.i6.i.i = getelementptr inbounds %struct.A* %ht, i32 0, i32 3, i32 0
+ br label %cond.end12.i.i
+
+cond.end12.i.i: ; preds = %cond.false10.i.i, %__XXX2.exit.i.i
+ %.sink.in.i.i = phi i8** [ %arrayidx.i.i.i, %__XXX2.exit.i.i ], [ %arrayidx.i6.i.i, %cond.false10.i.i ]
+ %.sink.i.i = load i8** %.sink.in.i.i, align 4
+ %tmp = bitcast i8* %.sink.i.i to %union.E*
+ br i1 undef, label %for.body.i.i, label %if.end196
+
+for.body.i.i: ; preds = %for.body.i.i, %cond.end12.i.i
+ %weak.i.i = getelementptr inbounds %union.E* %tmp, i32 undef, i32 0
+ %tmp1 = load i32* %weak.i.i, align 4
+ %cmp36.i.i = icmp ne i32 %tmp1, %shl.i.i
+ %or.cond = and i1 %cmp36.i.i, false
+ br i1 %or.cond, label %for.body.i.i, label %if.end196
+
+sw.bb2.i: ; preds = %entry
+ %bf.lshr.i85.i = lshr i128 undef, %const72
+ br i1 undef, label %if.end196, label %__XXX2.exit.i95.i
+
+__XXX2.exit.i95.i: ; preds = %sw.bb2.i
+ %extract11.i.i91.i = lshr i128 undef, %const3
+ br label %if.end196
+
+if.end196: ; preds = %__XXX2.exit.i95.i, %sw.bb2.i, %for.body.i.i, %cond.end12.i.i, %entry
+ ret void
+}
+
+attributes #0 = { nounwind optsize ssp "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
diff --git a/test/CodeGen/X86/ret-mmx.ll b/test/CodeGen/X86/ret-mmx.ll
index 091fd53..fc9c78d 100644
--- a/test/CodeGen/X86/ret-mmx.ll
+++ b/test/CodeGen/X86/ret-mmx.ll
@@ -34,6 +34,7 @@ define double @t4() nounwind {
ret double bitcast (<2 x i32> <i32 1, i32 0> to double)
; CHECK-LABEL: t4:
; CHECK: movl $1
+; CHECK-NOT: pshufd
; CHECK: movd {{.*}}, %xmm0
}
diff --git a/test/CodeGen/X86/rotate3.ll b/test/CodeGen/X86/rotate3.ll
deleted file mode 100644
index b92f7c2..0000000
--- a/test/CodeGen/X86/rotate3.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; Check that (or (shl x, y), (srl x, (sub 32, y))) is folded into (rotl x, y)
-; and (or (shl x, (sub 32, y)), (srl x, r)) into (rotr x, y) even if the
-; argument is zero extended. Fix for PR16726.
-
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
-
-define zeroext i8 @rolbyte(i32 %nBits_arg, i8 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i8 %x_arg to i32
- %tmp3 = shl i32 %tmp1, %nBits_arg
- %tmp8 = sub i32 8, %nBits_arg
- %tmp10 = lshr i32 %tmp1, %tmp8
- %tmp11 = or i32 %tmp3, %tmp10
- %tmp12 = trunc i32 %tmp11 to i8
- ret i8 %tmp12
-}
-; CHECK: rolb %cl, %{{[a-z0-9]+}}
-
-
-define zeroext i8 @rorbyte(i32 %nBits_arg, i8 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i8 %x_arg to i32
- %tmp3 = lshr i32 %tmp1, %nBits_arg
- %tmp8 = sub i32 8, %nBits_arg
- %tmp10 = shl i32 %tmp1, %tmp8
- %tmp11 = or i32 %tmp3, %tmp10
- %tmp12 = trunc i32 %tmp11 to i8
- ret i8 %tmp12
-}
-; CHECK: rorb %cl, %{{[a-z0-9]+}}
-
-define zeroext i16 @rolword(i32 %nBits_arg, i16 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i16 %x_arg to i32
- %tmp3 = shl i32 %tmp1, %nBits_arg
- %tmp8 = sub i32 16, %nBits_arg
- %tmp10 = lshr i32 %tmp1, %tmp8
- %tmp11 = or i32 %tmp3, %tmp10
- %tmp12 = trunc i32 %tmp11 to i16
- ret i16 %tmp12
-}
-; CHECK: rolw %cl, %{{[a-z0-9]+}}
-
-define zeroext i16 @rorword(i32 %nBits_arg, i16 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i16 %x_arg to i32
- %tmp3 = lshr i32 %tmp1, %nBits_arg
- %tmp8 = sub i32 16, %nBits_arg
- %tmp10 = shl i32 %tmp1, %tmp8
- %tmp11 = or i32 %tmp3, %tmp10
- %tmp12 = trunc i32 %tmp11 to i16
- ret i16 %tmp12
-}
-; CHECK: rorw %cl, %{{[a-z0-9]+}}
-
-define i64 @roldword(i64 %nBits_arg, i32 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i32 %x_arg to i64
- %tmp3 = shl i64 %tmp1, %nBits_arg
- %tmp8 = sub i64 32, %nBits_arg
- %tmp10 = lshr i64 %tmp1, %tmp8
- %tmp11 = or i64 %tmp3, %tmp10
- ret i64 %tmp11
-}
-; CHECK: roll %cl, %{{[a-z0-9]+}}
-
-define zeroext i64 @rordword(i64 %nBits_arg, i32 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i32 %x_arg to i64
- %tmp3 = lshr i64 %tmp1, %nBits_arg
- %tmp8 = sub i64 32, %nBits_arg
- %tmp10 = shl i64 %tmp1, %tmp8
- %tmp11 = or i64 %tmp3, %tmp10
- ret i64 %tmp11
-}
-; CHECK: rorl %cl, %{{[a-z0-9]+}}
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index e170762..b82be41 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -segmented-stacks -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define i32 @test_basic(i32 %l) {
+define i32 @test_basic(i32 %l) #0 {
%mem = alloca i32, i32 %l
call void @dummy_use (i32* %mem, i32 %l)
%terminate = icmp eq i32 %l, 0
@@ -62,3 +62,5 @@ false:
; X64: movq %rax, %rdi
}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index c02152b..9dab3cd 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
-; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
-; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-FreeBSD
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
+; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -verify-machineinstrs | FileCheck %s -check-prefix=X64-FreeBSD
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
; We used to crash with filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks -filetype=obj
-
-; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris -segmented-stacks 2> %t.log
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -filetype=obj
+
+; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris 2> %t.log
; RUN: FileCheck %s -input-file=%t.log -check-prefix=X64-Solaris
-; RUN: not llc < %s -mcpu=generic -mtriple=i686-freebsd -segmented-stacks 2> %t.log
+; RUN: not llc < %s -mcpu=generic -mtriple=i686-freebsd 2> %t.log
; RUN: FileCheck %s -input-file=%t.log -check-prefix=X32-FreeBSD
; X64-Solaris: Segmented stacks not supported on this platform
@@ -26,7 +26,7 @@
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define void @test_basic() {
+define void @test_basic() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -104,16 +104,18 @@ define void @test_basic() {
}
-define i32 @test_nested(i32 * nest %closure, i32 %other) {
+define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
%addend = load i32 * %closure
%result = add i32 %other, %addend
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
ret i32 %result
; X32-Linux: cmpl %gs:48, %esp
; X32-Linux-NEXT: ja .LBB1_2
; X32-Linux: pushl $4
-; X32-Linux-NEXT: pushl $0
+; X32-Linux-NEXT: pushl $60
; X32-Linux-NEXT: calll __morestack
; X32-Linux-NEXT: ret
@@ -121,7 +123,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X64-Linux-NEXT: ja .LBB1_2
; X64-Linux: movq %r10, %rax
-; X64-Linux-NEXT: movabsq $0, %r10
+; X64-Linux-NEXT: movabsq $56, %r10
; X64-Linux-NEXT: movabsq $0, %r11
; X64-Linux-NEXT: callq __morestack
; X64-Linux-NEXT: ret
@@ -132,7 +134,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X32-Darwin-NEXT: ja LBB1_2
; X32-Darwin: pushl $4
-; X32-Darwin-NEXT: pushl $0
+; X32-Darwin-NEXT: pushl $60
; X32-Darwin-NEXT: calll ___morestack
; X32-Darwin-NEXT: ret
@@ -140,7 +142,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X64-Darwin-NEXT: ja LBB1_2
; X64-Darwin: movq %r10, %rax
-; X64-Darwin-NEXT: movabsq $0, %r10
+; X64-Darwin-NEXT: movabsq $56, %r10
; X64-Darwin-NEXT: movabsq $0, %r11
; X64-Darwin-NEXT: callq ___morestack
; X64-Darwin-NEXT: ret
@@ -150,7 +152,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X32-MinGW-NEXT: ja LBB1_2
; X32-MinGW: pushl $4
-; X32-MinGW-NEXT: pushl $0
+; X32-MinGW-NEXT: pushl $52
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -159,7 +161,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X64-MinGW-NEXT: ja .LBB1_2
; X64-MinGW: movq %r10, %rax
-; X64-MinGW-NEXT: movabsq $0, %r10
+; X64-MinGW-NEXT: movabsq $88, %r10
; X64-MinGW-NEXT: movabsq $32, %r11
; X64-MinGW-NEXT: callq __morestack
; X64-MinGW-NEXT: retq
@@ -169,7 +171,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X64-FreeBSD-NEXT: ja .LBB1_2
; X64-FreeBSD: movq %r10, %rax
-; X64-FreeBSD-NEXT: movabsq $0, %r10
+; X64-FreeBSD-NEXT: movabsq $56, %r10
; X64-FreeBSD-NEXT: movabsq $0, %r11
; X64-FreeBSD-NEXT: callq __morestack
; X64-FreeBSD-NEXT: ret
@@ -177,7 +179,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
}
-define void @test_large() {
+define void @test_large() #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 0)
ret void
@@ -249,7 +251,7 @@ define void @test_large() {
}
-define fastcc void @test_fastcc() {
+define fastcc void @test_fastcc() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -327,7 +329,7 @@ define fastcc void @test_fastcc() {
}
-define fastcc void @test_fastcc_large() {
+define fastcc void @test_fastcc_large() #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 0)
ret void
@@ -412,7 +414,7 @@ define fastcc void @test_fastcc_large() {
}
-define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) {
+define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 %a)
ret void
@@ -434,3 +436,30 @@ define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) {
; X32-Darwin-NEXT: ret
}
+
+define void @test_nostack() #0 {
+ ret void
+
+; X32-Linux-LABEL: test_nostack:
+; X32-Linux-NOT: calll __morestack
+
+; X64-Linux-LABEL: test_nostack:
+; X32-Linux-NOT: callq __morestack
+
+; X32-Darwin-LABEL: test_nostack:
+; X32-Darwin-NOT: calll __morestack
+
+; X64-Darwin-LABEL: test_nostack:
+; X64-Darwin-NOT: callq __morestack
+
+; X32-MinGW-LABEL: test_nostack:
+; X32-MinGW-NOT: calll __morestack
+
+; X64-MinGW-LABEL: test_nostack:
+; X64-MinGW-NOT: callq __morestack
+
+; X64-FreeBSD-LABEL: test_nostack:
+; X64-FreeBSD-NOT: callq __morestack
+}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index 628dba0..e8d3d6f 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -221,3 +221,21 @@ entry:
%double2float.i = fptrunc <4 x double> %0 to <4 x float>
ret <4 x float> %double2float.i
}
+
+define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
+; CHECK-LABEL: test_insert_64_zext
+; CHECK-NOT: xor
+; CHECK: movq
+ %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %1
+}
+
+define <4 x i32> @PR19721(<4 x i32> %i) {
+ %bc = bitcast <4 x i32> %i to i128
+ %insert = and i128 %bc, -4294967296
+ %bc2 = bitcast i128 %insert to <4 x i32>
+ ret <4 x i32> %bc2
+
+; CHECK-LABEL: PR19721
+; CHECK: punpckldq
+}
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 6d5b192..18bdcb3 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -209,7 +209,7 @@ entry:
; X64-LABEL: t13:
; X64: punpcklqdq %xmm0, %xmm1
; X64: pextrw $3, %xmm1, %eax
-; X64: pshufd $52, %xmm1, %xmm0
+; X64: pshufhw $12, %xmm1, %xmm0
; X64: pinsrw $4, %eax, %xmm0
; X64: ret
}
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index 4681fde..8ad7987 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
;CHECK-LABEL: vsel_float:
-;CHECK: blendvps
+;CHECK: blendps
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
@@ -10,7 +10,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendvps
+;CHECK: blendps
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
@@ -18,7 +18,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
}
;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendvps
+;CHECK: blendps
;CHECK: ret
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
;CHECK-LABEL: vsel_i32:
-;CHECK: blendvps
+;CHECK: blendps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
@@ -88,3 +88,35 @@ entry:
store double %extract214vector_func.i, double addrspace(1)* undef, align 8
ret void
}
+
+; If we can figure out that a blend has a constant mask, we should emit the
+; blend instruction with an immediate mask.
+define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
+; In this case, we emit a simple movsd
+; CHECK-LABEL: constant_blendvpd
+; CHECK: movsd
+; CHECK: ret
+ %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab
+ ret <2 x double> %1
+}
+
+define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
+; CHECK-LABEL: constant_blendvps
+; CHECK-NOT: mov
+; CHECK: blendps $7
+; CHECK: ret
+ %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd
+ ret <4 x float> %1
+}
+
+define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
+; CHECK-LABEL: constant_pblendvb:
+; CHECK: movaps
+; CHECK: pblendvb
+; CHECK: ret
+ %1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd
+ ret <16 x i8> %1
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index c15e24c..a3c6201 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
@g16 = external global i16
@@ -249,3 +249,446 @@ entry:
; X64: ret
}
+define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+entry:
+ %0 = load <4 x float>* %pb, align 16
+ %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_1:
+; CHECK-NOT: movss
+; CHECK-NOT: shufps
+; CHECK: insertps $48,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
+entry:
+ %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+ ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_2:
+; CHECK-NOT: shufps
+; CHECK: insertps $96,
+; CHECK: ret
+}
+
+; For loading an i32 from memory into an xmm register we use pinsrd
+; instead of insertps
+define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+entry:
+ %0 = load <4 x i32>* %pb, align 16
+ %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit6
+; CHECK-LABEL: pinsrd_from_shufflevector_i32:
+; CHECK-NOT: movss
+; CHECK-NOT: shufps
+; CHECK: pinsrd $3,
+; CHECK: ret
+}
+
+define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+ ret <4 x i32> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_i32_2:
+; CHECK-NOT: shufps
+; CHECK-NOT: movaps
+; CHECK: insertps $208,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef:
+; CHECK-NOT: movss
+; CHECK-NOT: shufps
+; CHECK: insertps $16,
+; CHECK: ret
+ %1 = load float* %b, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+ ret <4 x float> %result
+}
+
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
+; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
+;; aCHECK-NOT: movd
+; CHECK-NOT: shufps
+; CHECK: insertps $32,
+; CHECK: ret
+ %1 = load i32* %b, align 4
+ %2 = insertelement <4 x i32> undef, i32 %1, i32 0
+ %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+ ret <4 x i32> %result
+}
+
+;;;;;; Shuffles optimizable with a single insertps instruction
+define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext3 = extractelement <4 x float> %x, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYW0:
+; CHECK: insertps $232
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext2 = extractelement <4 x float> %x, i32 3
+ %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 3
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+ %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x float> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext3 = extractelement <4 x i32> %x, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
+ %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+ ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
+ %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+ ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYW0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $232
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext2 = extractelement <4 x i32> %x, i32 3
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 3
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+ %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x i32> %vecinit5
+}
+
+;; Test for a bug in the first implementation of LowerBuildVectorv4x32
+define <4 x float> @test_insertps_no_undef(<4 x float> %x) {
+; CHECK-LABEL: test_insertps_no_undef:
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: insertps $8, %xmm1, %xmm1
+; CHECK-NEXT: maxps %xmm1, %xmm0
+; CHECK-NEXT: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext3 = extractelement <4 x float> %x, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ %mask = fcmp olt <4 x float> %vecinit5, %x
+ %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
+ ret <4 x float> %res
+}
+
+define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: blendvb_fallback
+; CHECK: blendvb
+; CHECK: ret
+ %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+ ret <8 x i16> %ret
+}
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+ ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+ ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+ %2 = load <4 x float>* %1, align 16
+ %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+ ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %b, align 4
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+ %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+ %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+ %11 = fadd <4 x float> %7, %8
+ %12 = fadd <4 x float> %9, %10
+ %13 = fadd <4 x float> %11, %12
+ ret <4 x float> %13
+}
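
A note on the insertps immediates checked above ($48, $96, $104, $176, $192, $198, $232): the SSE4.1 encoding uses bits [7:6] of the immediate to select the source element (register form), bits [5:4] to select the destination lane, and bits [3:0] as a zero mask. A minimal standalone sketch, assuming the same i32 form of the @llvm.x86.sse41.insertps intrinsic used in these tests (the function name is illustrative):

; imm8 = (src_elt << 6) | (dst_elt << 4) | zero_mask
; 48 = 0x30: source element 0 goes to destination lane 3, nothing zeroed.
define <4 x float> @insertps_imm_sketch(<4 x float> %a, <4 x float> %b) {
  %r = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %b, i32 48)
  ret <4 x float> %r
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32)
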
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index fb7e2db..cf88ade 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -33,7 +33,7 @@ attributes #0 = { sspreq }
!5 = metadata !{}
!6 = metadata !{metadata !7}
!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
-!8 = metadata !{metadata !9}
+!8 = metadata !{metadata !9, metadata !24, metadata !41, metadata !65}
!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
!10 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll
index 265ec80..4db0f9a 100644
--- a/test/CodeGen/X86/stack-protector.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -16,13 +16,14 @@
%struct.anon.0 = type { %union.anon.1 }
%union.anon.1 = type { [2 x i8] }
%struct.small = type { i8 }
+%struct.small_char = type { i32, [5 x i8] }
@.str = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
; test1a: array of [16 x i8]
; no ssp attribute
; Requires no protector.
-define void @test1a(i8* %a) nounwind uwtable {
+define void @test1a(i8* %a) {
entry:
; LINUX-I386-LABEL: test1a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -53,7 +54,8 @@ entry:
; test1b: array of [16 x i8]
; ssp attribute
; Requires protector.
-define void @test1b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test1b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test1b:
; LINUX-I386: mov{{l|q}} %gs:
@@ -88,7 +90,8 @@ entry:
; test1c: array of [16 x i8]
; sspstrong attribute
; Requires protector.
-define void @test1c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test1c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test1c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -119,7 +122,8 @@ entry:
; test1d: array of [16 x i8]
; sspreq attribute
; Requires protector.
-define void @test1d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test1d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test1d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -150,7 +154,7 @@ entry:
; test2a: struct { [16 x i8] }
; no ssp attribute
; Requires no protector.
-define void @test2a(i8* %a) nounwind uwtable {
+define void @test2a(i8* %a) {
entry:
; LINUX-I386-LABEL: test2a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -183,7 +187,8 @@ entry:
; test2b: struct { [16 x i8] }
; ssp attribute
; Requires protector.
-define void @test2b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test2b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test2b:
; LINUX-I386: mov{{l|q}} %gs:
@@ -216,7 +221,8 @@ entry:
; test2c: struct { [16 x i8] }
; sspstrong attribute
; Requires protector.
-define void @test2c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test2c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test2c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -249,7 +255,8 @@ entry:
; test2d: struct { [16 x i8] }
; sspreq attribute
; Requires protector.
-define void @test2d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test2d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test2d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -282,7 +289,7 @@ entry:
; test3a: array of [4 x i8]
; no ssp attribute
; Requires no protector.
-define void @test3a(i8* %a) nounwind uwtable {
+define void @test3a(i8* %a) {
entry:
; LINUX-I386-LABEL: test3a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -313,7 +320,8 @@ entry:
; test3b: array [4 x i8]
; ssp attribute
; Requires no protector.
-define void @test3b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test3b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test3b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -344,7 +352,8 @@ entry:
; test3c: array of [4 x i8]
; sspstrong attribute
; Requires protector.
-define void @test3c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test3c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test3c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -375,7 +384,8 @@ entry:
; test3d: array of [4 x i8]
; sspreq attribute
; Requires protector.
-define void @test3d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test3d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test3d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -406,7 +416,7 @@ entry:
; test4a: struct { [4 x i8] }
; no ssp attribute
; Requires no protector.
-define void @test4a(i8* %a) nounwind uwtable {
+define void @test4a(i8* %a) {
entry:
; LINUX-I386-LABEL: test4a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -439,7 +449,8 @@ entry:
; test4b: struct { [4 x i8] }
; ssp attribute
; Requires no protector.
-define void @test4b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test4b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test4b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -472,7 +483,8 @@ entry:
; test4c: struct { [4 x i8] }
; sspstrong attribute
; Requires protector.
-define void @test4c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test4c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test4c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -505,7 +517,8 @@ entry:
; test4d: struct { [4 x i8] }
; sspreq attribute
; Requires protector.
-define void @test4d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test4d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test4d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -538,7 +551,7 @@ entry:
; test5a: no arrays / no nested arrays
; no ssp attribute
; Requires no protector.
-define void @test5a(i8* %a) nounwind uwtable {
+define void @test5a(i8* %a) {
entry:
; LINUX-I386-LABEL: test5a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -565,7 +578,8 @@ entry:
; test5b: no arrays / no nested arrays
; ssp attribute
; Requires no protector.
-define void @test5b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test5b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test5b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -592,7 +606,8 @@ entry:
; test5c: no arrays / no nested arrays
; sspstrong attribute
; Requires no protector.
-define void @test5c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test5c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test5c:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -619,7 +634,8 @@ entry:
; test5d: no arrays / no nested arrays
; sspreq attribute
; Requires protector.
-define void @test5d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test5d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test5d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -646,7 +662,7 @@ entry:
; test6a: Address-of local taken (j = &a)
; no ssp attribute
; Requires no protector.
-define void @test6a() nounwind uwtable {
+define void @test6a() {
entry:
; LINUX-I386-LABEL: test6a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -677,7 +693,8 @@ entry:
; test6b: Address-of local taken (j = &a)
; ssp attribute
; Requires no protector.
-define void @test6b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test6b() #0 {
entry:
; LINUX-I386-LABEL: test6b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -708,7 +725,8 @@ entry:
; test6c: Address-of local taken (j = &a)
; sspstrong attribute
; Requires protector.
-define void @test6c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test6c() #1 {
entry:
; LINUX-I386-LABEL: test6c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -739,7 +757,8 @@ entry:
; test6d: Address-of local taken (j = &a)
; sspreq attribute
; Requires protector.
-define void @test6d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test6d() #2 {
entry:
; LINUX-I386-LABEL: test6d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -770,7 +789,7 @@ entry:
; test7a: PtrToInt Cast
; no ssp attribute
; Requires no protector.
-define void @test7a() nounwind uwtable readnone {
+define void @test7a() {
entry:
; LINUX-I386-LABEL: test7a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -796,7 +815,8 @@ entry:
; test7b: PtrToInt Cast
; ssp attribute
; Requires no protector.
-define void @test7b() nounwind uwtable readnone ssp {
+; Function Attrs: ssp
+define void @test7b() #0 {
entry:
; LINUX-I386-LABEL: test7b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -822,7 +842,8 @@ entry:
; test7c: PtrToInt Cast
; sspstrong attribute
; Requires protector.
-define void @test7c() nounwind uwtable readnone sspstrong {
+; Function Attrs: sspstrong
+define void @test7c() #1 {
entry:
; LINUX-I386-LABEL: test7c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -848,7 +869,8 @@ entry:
; test7d: PtrToInt Cast
; sspreq attribute
; Requires protector.
-define void @test7d() nounwind uwtable readnone sspreq {
+; Function Attrs: sspreq
+define void @test7d() #2 {
entry:
; LINUX-I386-LABEL: test7d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -874,7 +896,7 @@ entry:
; test8a: Passing addr-of to function call
; no ssp attribute
; Requires no protector.
-define void @test8a() nounwind uwtable {
+define void @test8a() {
entry:
; LINUX-I386-LABEL: test8a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -892,14 +914,15 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%b = alloca i32, align 4
- call void @funcall(i32* %b) nounwind
+ call void @funcall(i32* %b)
ret void
}
; test8b: Passing addr-of to function call
; ssp attribute
; Requires no protector.
-define void @test8b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test8b() #0 {
entry:
; LINUX-I386-LABEL: test8b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -917,14 +940,15 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%b = alloca i32, align 4
- call void @funcall(i32* %b) nounwind
+ call void @funcall(i32* %b)
ret void
}
; test8c: Passing addr-of to function call
; sspstrong attribute
; Requires protector.
-define void @test8c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test8c() #1 {
entry:
; LINUX-I386-LABEL: test8c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -942,14 +966,15 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%b = alloca i32, align 4
- call void @funcall(i32* %b) nounwind
+ call void @funcall(i32* %b)
ret void
}
; test8d: Passing addr-of to function call
; sspreq attribute
; Requires protector.
-define void @test8d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test8d() #2 {
entry:
; LINUX-I386-LABEL: test8d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -967,14 +992,14 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%b = alloca i32, align 4
- call void @funcall(i32* %b) nounwind
+ call void @funcall(i32* %b)
ret void
}
; test9a: Addr-of in select instruction
; no ssp attribute
; Requires no protector.
-define void @test9a() nounwind uwtable {
+define void @test9a() {
entry:
; LINUX-I386-LABEL: test9a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -992,7 +1017,7 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp2 = fcmp ogt double %call, 0.000000e+00
%y.1 = select i1 %cmp2, double* %x, double* null
@@ -1003,7 +1028,8 @@ entry:
; test9b: Addr-of in select instruction
; ssp attribute
; Requires no protector.
-define void @test9b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test9b() #0 {
entry:
; LINUX-I386-LABEL: test9b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1021,7 +1047,7 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp2 = fcmp ogt double %call, 0.000000e+00
%y.1 = select i1 %cmp2, double* %x, double* null
@@ -1032,7 +1058,8 @@ entry:
; test9c: Addr-of in select instruction
; sspstrong attribute
; Requires protector.
-define void @test9c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test9c() #1 {
entry:
; LINUX-I386-LABEL: test9c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1050,7 +1077,7 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp2 = fcmp ogt double %call, 0.000000e+00
%y.1 = select i1 %cmp2, double* %x, double* null
@@ -1061,7 +1088,8 @@ entry:
; test9d: Addr-of in select instruction
; sspreq attribute
; Requires protector.
-define void @test9d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test9d() #2 {
entry:
; LINUX-I386-LABEL: test9d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1079,7 +1107,7 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp2 = fcmp ogt double %call, 0.000000e+00
%y.1 = select i1 %cmp2, double* %x, double* null
@@ -1090,7 +1118,7 @@ entry:
; test10a: Addr-of in phi instruction
; no ssp attribute
; Requires no protector.
-define void @test10a() nounwind uwtable {
+define void @test10a() {
entry:
; LINUX-I386-LABEL: test10a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1108,13 +1136,13 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp = fcmp ogt double %call, 3.140000e+00
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
- %call1 = call double @testi_aux() nounwind
+ %call1 = call double @testi_aux()
store double %call1, double* %x, align 8
br label %if.end4
@@ -1127,14 +1155,15 @@ if.then3: ; preds = %if.else
if.end4: ; preds = %if.else, %if.then3, %if.then
%y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
- %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind
+ %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0)
ret void
}
; test10b: Addr-of in phi instruction
; ssp attribute
; Requires no protector.
-define void @test10b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test10b() #0 {
entry:
; LINUX-I386-LABEL: test10b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1152,13 +1181,13 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp = fcmp ogt double %call, 3.140000e+00
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
- %call1 = call double @testi_aux() nounwind
+ %call1 = call double @testi_aux()
store double %call1, double* %x, align 8
br label %if.end4
@@ -1171,14 +1200,15 @@ if.then3: ; preds = %if.else
if.end4: ; preds = %if.else, %if.then3, %if.then
%y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
- %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind
+ %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0)
ret void
}
; test10c: Addr-of in phi instruction
; sspstrong attribute
; Requires protector.
-define void @test10c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test10c() #1 {
entry:
; LINUX-I386-LABEL: test10c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1196,13 +1226,13 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp = fcmp ogt double %call, 3.140000e+00
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
- %call1 = call double @testi_aux() nounwind
+ %call1 = call double @testi_aux()
store double %call1, double* %x, align 8
br label %if.end4
@@ -1215,14 +1245,15 @@ if.then3: ; preds = %if.else
if.end4: ; preds = %if.else, %if.then3, %if.then
%y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
- %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind
+ %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0)
ret void
}
; test10d: Addr-of in phi instruction
; sspreq attribute
; Requires protector.
-define void @test10d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test10d() #2 {
entry:
; LINUX-I386-LABEL: test10d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1240,13 +1271,13 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp = fcmp ogt double %call, 3.140000e+00
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
- %call1 = call double @testi_aux() nounwind
+ %call1 = call double @testi_aux()
store double %call1, double* %x, align 8
br label %if.end4
@@ -1259,14 +1290,14 @@ if.then3: ; preds = %if.else
if.end4: ; preds = %if.else, %if.then3, %if.then
%y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
- %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind
+ %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0)
ret void
}
; test11a: Addr-of struct element. (GEP followed by store).
; no ssp attribute
; Requires no protector.
-define void @test11a() nounwind uwtable {
+define void @test11a() {
entry:
; LINUX-I386-LABEL: test11a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1295,7 +1326,8 @@ entry:
; test11b: Addr-of struct element. (GEP followed by store).
; ssp attribute
; Requires no protector.
-define void @test11b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test11b() #0 {
entry:
; LINUX-I386-LABEL: test11b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1324,7 +1356,8 @@ entry:
; test11c: Addr-of struct element. (GEP followed by store).
; sspstrong attribute
; Requires protector.
-define void @test11c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test11c() #1 {
entry:
; LINUX-I386-LABEL: test11c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1353,7 +1386,8 @@ entry:
; test11d: Addr-of struct element. (GEP followed by store).
; sspreq attribute
; Requires protector.
-define void @test11d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test11d() #2 {
entry:
; LINUX-I386-LABEL: test11d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1382,7 +1416,7 @@ entry:
; test12a: Addr-of struct element, GEP followed by ptrtoint.
; no ssp attribute
; Requires no protector.
-define void @test12a() nounwind uwtable {
+define void @test12a() {
entry:
; LINUX-I386-LABEL: test12a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1410,7 +1444,8 @@ entry:
; test12b: Addr-of struct element, GEP followed by ptrtoint.
; ssp attribute
; Requires no protector.
-define void @test12b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test12b() #0 {
entry:
; LINUX-I386-LABEL: test12b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1437,8 +1472,8 @@ entry:
; test12c: Addr-of struct element, GEP followed by ptrtoint.
; sspstrong attribute
-; Requires protector.
-define void @test12c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test12c() #1 {
entry:
; LINUX-I386-LABEL: test12c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1466,7 +1501,8 @@ entry:
; test12d: Addr-of struct element, GEP followed by ptrtoint.
; sspreq attribute
; Requires protector.
-define void @test12d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test12d() #2 {
entry:
; LINUX-I386-LABEL: test12d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1494,7 +1530,7 @@ entry:
; test13a: Addr-of struct element, GEP followed by callinst.
; no ssp attribute
; Requires no protector.
-define void @test13a() nounwind uwtable {
+define void @test13a() {
entry:
; LINUX-I386-LABEL: test13a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1513,14 +1549,15 @@ entry:
; DARWIN-X64: .cfi_endproc
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y)
ret void
}
; test13b: Addr-of struct element, GEP followed by callinst.
; ssp attribute
; Requires no protector.
-define void @test13b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test13b() #0 {
entry:
; LINUX-I386-LABEL: test13b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1539,14 +1576,15 @@ entry:
; DARWIN-X64: .cfi_endproc
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y)
ret void
}
; test13c: Addr-of struct element, GEP followed by callinst.
; sspstrong attribute
; Requires protector.
-define void @test13c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test13c() #1 {
entry:
; LINUX-I386-LABEL: test13c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1565,14 +1603,15 @@ entry:
; DARWIN-X64: callq ___stack_chk_fail
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y)
ret void
}
; test13d: Addr-of struct element, GEP followed by callinst.
; sspreq attribute
; Requires protector.
-define void @test13d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test13d() #2 {
entry:
; LINUX-I386-LABEL: test13d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1591,14 +1630,14 @@ entry:
; DARWIN-X64: callq ___stack_chk_fail
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y)
ret void
}
; test14a: Addr-of a local, optimized into a GEP (e.g., &a - 12)
; no ssp attribute
; Requires no protector.
-define void @test14a() nounwind uwtable {
+define void @test14a() {
entry:
; LINUX-I386-LABEL: test14a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1617,14 +1656,15 @@ entry:
; DARWIN-X64: .cfi_endproc
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32* %a, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
ret void
}
; test14b: Addr-of a local, optimized into a GEP (e.g., &a - 12)
; ssp attribute
; Requires no protector.
-define void @test14b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test14b() #0 {
entry:
; LINUX-I386-LABEL: test14b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1643,14 +1683,15 @@ entry:
; DARWIN-X64: .cfi_endproc
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32* %a, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
ret void
}
; test14c: Addr-of a local, optimized into a GEP (e.g., &a - 12)
; sspstrong attribute
; Requires protector.
-define void @test14c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test14c() #1 {
entry:
; LINUX-I386-LABEL: test14c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1669,14 +1710,15 @@ entry:
; DARWIN-X64: callq ___stack_chk_fail
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32* %a, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
ret void
}
; test14d: Addr-of a local, optimized into a GEP (e.g., &a - 12)
; sspreq attribute
; Requires protector.
-define void @test14d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test14d() #2 {
entry:
; LINUX-I386-LABEL: test14d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1695,7 +1737,7 @@ entry:
; DARWIN-X64: callq ___stack_chk_fail
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32* %a, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
ret void
}
@@ -1703,7 +1745,7 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; no ssp attribute
; Requires no protector.
-define void @test15a() nounwind uwtable {
+define void @test15a() {
entry:
; LINUX-I386-LABEL: test15a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1734,7 +1776,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; ssp attribute
; Requires no protector.
-define void @test15b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test15b() #0 {
entry:
; LINUX-I386-LABEL: test15b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1765,7 +1808,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; sspstrong attribute
; Requires protector.
-define void @test15c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test15c() #1 {
entry:
; LINUX-I386-LABEL: test15c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1796,7 +1840,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; sspreq attribute
; Requires protector.
-define void @test15d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test15d() #2 {
entry:
; LINUX-I386-LABEL: test15d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1827,7 +1872,7 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; no ssp attribute
; Requires no protector.
-define void @test16a() nounwind uwtable {
+define void @test16a() {
entry:
; LINUX-I386-LABEL: test16a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1847,7 +1892,7 @@ entry:
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
- call void @funfloat(float* %0) nounwind
+ call void @funfloat(float* %0)
ret void
}
@@ -1855,7 +1900,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; ssp attribute
; Requires no protector.
-define void @test16b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test16b() #0 {
entry:
; LINUX-I386-LABEL: test16b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1875,7 +1921,7 @@ entry:
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
- call void @funfloat(float* %0) nounwind
+ call void @funfloat(float* %0)
ret void
}
@@ -1883,7 +1929,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; sspstrong attribute
; Requires protector.
-define void @test16c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test16c() #1 {
entry:
; LINUX-I386-LABEL: test16c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1903,7 +1950,7 @@ entry:
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
- call void @funfloat(float* %0) nounwind
+ call void @funfloat(float* %0)
ret void
}
@@ -1911,7 +1958,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; sspreq attribute
; Requires protector.
-define void @test16d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test16d() #2 {
entry:
; LINUX-I386-LABEL: test16d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1931,14 +1979,14 @@ entry:
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
- call void @funfloat(float* %0) nounwind
+ call void @funfloat(float* %0)
ret void
}
; test17a: Addr-of a vector nested in a struct
; no ssp attribute
; Requires no protector.
-define void @test17a() nounwind uwtable {
+define void @test17a() {
entry:
; LINUX-I386-LABEL: test17a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1958,14 +2006,15 @@ entry:
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
ret void
}
; test17b: Addr-of a vector nested in a struct
; ssp attribute
; Requires no protector.
-define void @test17b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test17b() #0 {
entry:
; LINUX-I386-LABEL: test17b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1985,14 +2034,15 @@ entry:
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
ret void
}
; test17c: Addr-of a vector nested in a struct
; sspstrong attribute
; Requires protector.
-define void @test17c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test17c() #1 {
entry:
; LINUX-I386-LABEL: test17c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2012,14 +2062,15 @@ entry:
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
ret void
}
; test17d: Addr-of a vector nested in a struct
; sspreq attribute
; Requires protector.
-define void @test17d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test17d() #2 {
entry:
; LINUX-I386-LABEL: test17d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2039,14 +2090,14 @@ entry:
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
ret void
}
; test18a: Addr-of a variable passed into an invoke instruction.
; no ssp attribute
; Requires no protector.
-define i32 @test18a() uwtable {
+define i32 @test18a() {
entry:
; LINUX-I386-LABEL: test18a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2082,7 +2133,8 @@ lpad:
; test18b: Addr-of a variable passed into an invoke instruction.
; ssp attribute
; Requires no protector.
-define i32 @test18b() uwtable ssp {
+; Function Attrs: ssp
+define i32 @test18b() #0 {
entry:
; LINUX-I386-LABEL: test18b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2118,7 +2170,8 @@ lpad:
; test18c: Addr-of a variable passed into an invoke instruction.
; sspstrong attribute
; Requires protector.
-define i32 @test18c() uwtable sspstrong {
+; Function Attrs: sspstrong
+define i32 @test18c() #1 {
entry:
; LINUX-I386-LABEL: test18c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2154,7 +2207,8 @@ lpad:
; test18d: Addr-of a variable passed into an invoke instruction.
; sspreq attribute
; Requires protector.
-define i32 @test18d() uwtable sspreq {
+; Function Attrs: sspreq
+define i32 @test18d() #2 {
entry:
; LINUX-I386-LABEL: test18d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2186,12 +2240,11 @@ lpad:
catch i8* null
ret i32 0
}
-
; test19a: Addr-of a struct element passed into an invoke instruction.
; (GEP followed by an invoke)
; no ssp attribute
; Requires no protector.
-define i32 @test19a() uwtable {
+define i32 @test19a() {
entry:
; LINUX-I386-LABEL: test19a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2230,7 +2283,8 @@ lpad:
; (GEP followed by an invoke)
; ssp attribute
; Requires no protector.
-define i32 @test19b() uwtable ssp {
+; Function Attrs: ssp
+define i32 @test19b() #0 {
entry:
; LINUX-I386-LABEL: test19b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2269,7 +2323,8 @@ lpad:
; (GEP followed by an invoke)
; sspstrong attribute
; Requires protector.
-define i32 @test19c() uwtable sspstrong {
+; Function Attrs: sspstrong
+define i32 @test19c() #1 {
entry:
; LINUX-I386-LABEL: test19c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2308,7 +2363,8 @@ lpad:
; (GEP followed by an invoke)
; sspreq attribute
; Requires protector.
-define i32 @test19d() uwtable sspreq {
+; Function Attrs: sspreq
+define i32 @test19d() #2 {
entry:
; LINUX-I386-LABEL: test19d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2350,7 +2406,7 @@ lpad:
; test20a: Addr-of a pointer
; no ssp attribute
; Requires no protector.
-define void @test20a() nounwind uwtable {
+define void @test20a() {
entry:
; LINUX-I386-LABEL: test20a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2380,7 +2436,8 @@ entry:
; test20b: Addr-of a pointer
; ssp attribute
; Requires no protector.
-define void @test20b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test20b() #0 {
entry:
; LINUX-I386-LABEL: test20b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2410,7 +2467,8 @@ entry:
; test20c: Addr-of a pointer
; sspstrong attribute
; Requires protector.
-define void @test20c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test20c() #1 {
entry:
; LINUX-I386-LABEL: test20c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2440,7 +2498,8 @@ entry:
; test20d: Addr-of a pointer
; sspreq attribute
; Requires protector.
-define void @test20d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test20d() #2 {
entry:
; LINUX-I386-LABEL: test20d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2470,7 +2529,7 @@ entry:
; test21a: Addr-of a casted pointer
; no ssp attribute
; Requires no protector.
-define void @test21a() nounwind uwtable {
+define void @test21a() {
entry:
; LINUX-I386-LABEL: test21a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2501,7 +2560,8 @@ entry:
; test21b: Addr-of a casted pointer
; ssp attribute
; Requires no protector.
-define void @test21b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test21b() #0 {
entry:
; LINUX-I386-LABEL: test21b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2532,7 +2592,8 @@ entry:
; test21c: Addr-of a casted pointer
; sspstrong attribute
; Requires protector.
-define void @test21c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test21c() #1 {
entry:
; LINUX-I386-LABEL: test21c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2563,7 +2624,8 @@ entry:
; test21d: Addr-of a casted pointer
; sspreq attribute
; Requires protector.
-define void @test21d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test21d() #2 {
entry:
; LINUX-I386-LABEL: test21d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2594,7 +2656,7 @@ entry:
; test22a: [2 x i8] in a class
; no ssp attribute
; Requires no protector.
-define signext i8 @test22a() nounwind uwtable {
+define signext i8 @test22a() {
entry:
; LINUX-I386-LABEL: test22a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2621,7 +2683,8 @@ entry:
; test22b: [2 x i8] in a class
; ssp attribute
; Requires no protector.
-define signext i8 @test22b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define signext i8 @test22b() #0 {
entry:
; LINUX-I386-LABEL: test22b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2648,7 +2711,8 @@ entry:
; test22c: [2 x i8] in a class
; sspstrong attribute
; Requires protector.
-define signext i8 @test22c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define signext i8 @test22c() #1 {
entry:
; LINUX-I386-LABEL: test22c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2675,7 +2739,8 @@ entry:
; test22d: [2 x i8] in a class
; sspreq attribute
; Requires protector.
-define signext i8 @test22d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define signext i8 @test22d() #2 {
entry:
; LINUX-I386-LABEL: test22d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2702,7 +2767,7 @@ entry:
; test23a: [2 x i8] nested in several layers of structs and unions
; no ssp attribute
; Requires no protector.
-define signext i8 @test23a() nounwind uwtable {
+define signext i8 @test23a() {
entry:
; LINUX-I386-LABEL: test23a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2733,7 +2798,8 @@ entry:
; test23b: [2 x i8] nested in several layers of structs and unions
; ssp attribute
; Requires no protector.
-define signext i8 @test23b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define signext i8 @test23b() #0 {
entry:
; LINUX-I386-LABEL: test23b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2764,7 +2830,8 @@ entry:
; test23c: [2 x i8] nested in several layers of structs and unions
; sspstrong attribute
; Requires protector.
-define signext i8 @test23c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define signext i8 @test23c() #1 {
entry:
; LINUX-I386-LABEL: test23c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2795,7 +2862,8 @@ entry:
; test23d: [2 x i8] nested in several layers of structs and unions
; sspreq attribute
; Requires protector.
-define signext i8 @test23d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define signext i8 @test23d() #2 {
entry:
; LINUX-I386-LABEL: test23d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2826,7 +2894,7 @@ entry:
; test24a: Variable sized alloca
; no ssp attribute
; Requires no protector.
-define void @test24a(i32 %n) nounwind uwtable {
+define void @test24a(i32 %n) {
entry:
; LINUX-I386-LABEL: test24a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2857,7 +2925,8 @@ entry:
; test24b: Variable sized alloca
; ssp attribute
; Requires protector.
-define void @test24b(i32 %n) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test24b(i32 %n) #0 {
entry:
; LINUX-I386-LABEL: test24b:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2888,7 +2957,8 @@ entry:
; test24c: Variable sized alloca
; sspstrong attribute
; Requires protector.
-define void @test24c(i32 %n) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test24c(i32 %n) #1 {
entry:
; LINUX-I386-LABEL: test24c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2919,7 +2989,8 @@ entry:
; test24d: Variable sized alloca
; sspreq attribute
; Requires protector.
-define void @test24d(i32 %n) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test24d(i32 %n) #2 {
entry:
; LINUX-I386-LABEL: test24d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2950,7 +3021,7 @@ entry:
; test25a: array of [4 x i32]
; no ssp attribute
; Requires no protector.
-define i32 @test25a() nounwind uwtable {
+define i32 @test25a() {
entry:
; LINUX-I386-LABEL: test25a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2976,7 +3047,8 @@ entry:
; test25b: array of [4 x i32]
; ssp attribute
; Requires no protector, except for Darwin which _does_ require a protector.
-define i32 @test25b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define i32 @test25b() #0 {
entry:
; LINUX-I386-LABEL: test25b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -3002,7 +3074,8 @@ entry:
; test25c: array of [4 x i32]
; sspstrong attribute
; Requires protector.
-define i32 @test25c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define i32 @test25c() #1 {
entry:
; LINUX-I386-LABEL: test25c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -3028,7 +3101,8 @@ entry:
; test25d: array of [4 x i32]
; sspreq attribute
; Requires protector.
-define i32 @test25d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define i32 @test25d() #2 {
entry:
; LINUX-I386-LABEL: test25d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -3056,7 +3130,8 @@ entry:
; a stack protector.
; sspstrong attribute
; Requires no protector.
-define void @test26() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test26() #1 {
entry:
; LINUX-I386-LABEL: test26:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -3087,7 +3162,8 @@ entry:
; Verify that the address-of analysis does not get stuck in infinite
; recursion when chasing the alloca through the PHI nodes.
; Requires protector.
-define i32 @test27(i32 %arg) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define i32 @test27(i32 %arg) #1 {
bb:
; LINUX-I386-LABEL: test27:
; LINUX-I386: mov{{l|q}} %gs:
@@ -3105,7 +3181,7 @@ bb:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%tmp = alloca %struct.small*, align 8
- %tmp1 = call i32 (...)* @dummy(%struct.small** %tmp) nounwind
+ %tmp1 = call i32 (...)* @dummy(%struct.small** %tmp)
%tmp2 = load %struct.small** %tmp, align 8
%tmp3 = ptrtoint %struct.small* %tmp2 to i64
%tmp4 = trunc i64 %tmp3 to i32
@@ -3133,10 +3209,239 @@ bb17: ; preds = %bb6
bb21: ; preds = %bb6, %bb
%tmp22 = phi i32 [ %tmp1, %bb ], [ %tmp14, %bb6 ]
- %tmp23 = call i32 (...)* @dummy(i32 %tmp22) nounwind
+ %tmp23 = call i32 (...)* @dummy(i32 %tmp22)
ret i32 undef
}
+; test28a: An array of [32 x i8] and a requested ssp-buffer-size of 33.
+; Requires no protector.
+; Function Attrs: ssp stack-protector-buffer-size=33
+define i32 @test28a() #3 {
+entry:
+; LINUX-I386-LABEL: test28a:
+; LINUX-I386-NOT: calll __stack_chk_fail
+; LINUX-I386: .cfi_endproc
+
+; LINUX-X64-LABEL: test28a:
+; LINUX-X64-NOT: callq __stack_chk_fail
+; LINUX-X64: .cfi_endproc
+
+; LINUX-KERNEL-X64-LABEL: test28a:
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
+; LINUX-KERNEL-X64: .cfi_endproc
+
+; DARWIN-X64-LABEL: test28a:
+; DARWIN-X64-NOT: callq ___stack_chk_fail
+; DARWIN-X64: .cfi_endproc
+ %test = alloca [32 x i8], align 16
+ %arraydecay = getelementptr inbounds [32 x i8]* %test, i32 0, i32 0
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+ ret i32 %call
+}
+
+; test28b: An array of [33 x i8] and a requested ssp-buffer-size of 33.
+; Requires protector.
+; Function Attrs: ssp stack-protector-buffer-size=33
+define i32 @test28b() #3 {
+entry:
+; LINUX-I386-LABEL: test28b:
+; LINUX-I386: mov{{l|q}} %gs:
+; LINUX-I386: calll __stack_chk_fail
+
+; LINUX-X64-LABEL: test28b:
+; LINUX-X64: mov{{l|q}} %fs:
+; LINUX-X64: callq __stack_chk_fail
+
+; LINUX-KERNEL-X64-LABEL: test28b:
+; LINUX-KERNEL-X64: mov{{l|q}} %gs:
+; LINUX-KERNEL-X64: callq __stack_chk_fail
+
+; DARWIN-X64-LABEL: test28b:
+; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
+; DARWIN-X64: callq ___stack_chk_fail
+ %test = alloca [33 x i8], align 16
+ %arraydecay = getelementptr inbounds [33 x i8]* %test, i32 0, i32 0
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+ ret i32 %call
+}
+
+; test29a: An array of [4 x i8] and a requested ssp-buffer-size of 5.
+; Requires no protector.
+; Function Attrs: ssp stack-protector-buffer-size=5
+define i32 @test29a() #4 {
+entry:
+; LINUX-I386-LABEL: test29a:
+; LINUX-I386-NOT: calll __stack_chk_fail
+; LINUX-I386: .cfi_endproc
+
+; LINUX-X64-LABEL: test29a:
+; LINUX-X64-NOT: callq __stack_chk_fail
+; LINUX-X64: .cfi_endproc
+
+; LINUX-KERNEL-X64-LABEL: test29a:
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
+; LINUX-KERNEL-X64: .cfi_endproc
+
+; DARWIN-X64-LABEL: test29a:
+; DARWIN-X64-NOT: callq ___stack_chk_fail
+; DARWIN-X64: .cfi_endproc
+ %test = alloca [4 x i8], align 1
+ %arraydecay = getelementptr inbounds [4 x i8]* %test, i32 0, i32 0
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+ ret i32 %call
+}
+
+; test29b: An array of [5 x i8] and a requested ssp-buffer-size of 5.
+; Requires protector.
+; Function Attrs: ssp stack-protector-buffer-size=5
+define i32 @test29b() #4 {
+entry:
+; LINUX-I386-LABEL: test29b:
+; LINUX-I386: mov{{l|q}} %gs:
+; LINUX-I386: calll __stack_chk_fail
+
+; LINUX-X64-LABEL: test29b:
+; LINUX-X64: mov{{l|q}} %fs:
+; LINUX-X64: callq __stack_chk_fail
+
+; LINUX-KERNEL-X64-LABEL: test29b:
+; LINUX-KERNEL-X64: mov{{l|q}} %gs:
+; LINUX-KERNEL-X64: callq __stack_chk_fail
+
+; DARWIN-X64-LABEL: test29b:
+; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
+; DARWIN-X64: callq ___stack_chk_fail
+ %test = alloca [5 x i8], align 1
+ %arraydecay = getelementptr inbounds [5 x i8]* %test, i32 0, i32 0
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+ ret i32 %call
+}
+
+; test30a: A structure containing an i32 and an array of [5 x i8].
+; Requested ssp-buffer-size of 6.
+; Requires no protector.
+; Function Attrs: ssp stack-protector-buffer-size=6
+define i32 @test30a() #5 {
+entry:
+; LINUX-I386-LABEL: test30a:
+; LINUX-I386-NOT: calll __stack_chk_fail
+; LINUX-I386: .cfi_endproc
+
+; LINUX-X64-LABEL: test30a:
+; LINUX-X64-NOT: callq __stack_chk_fail
+; LINUX-X64: .cfi_endproc
+
+; LINUX-KERNEL-X64-LABEL: test30a:
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
+; LINUX-KERNEL-X64: .cfi_endproc
+
+; DARWIN-X64-LABEL: test30a:
+; DARWIN-X64-NOT: callq ___stack_chk_fail
+; DARWIN-X64: .cfi_endproc
+ %test = alloca %struct.small_char, align 4
+ %test.coerce = alloca { i64, i8 }
+ %0 = bitcast { i64, i8 }* %test.coerce to i8*
+ %1 = bitcast %struct.small_char* %test to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 12, i32 0, i1 false)
+ %2 = getelementptr { i64, i8 }* %test.coerce, i32 0, i32 0
+ %3 = load i64* %2, align 1
+ %4 = getelementptr { i64, i8 }* %test.coerce, i32 0, i32 1
+ %5 = load i8* %4, align 1
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
+ ret i32 %call
+}
+
+; test30b: A structure containing an i32 and an array of [5 x i8].
+; Requested ssp-buffer-size of 5.
+; Requires protector.
+; Function Attrs: ssp stack-protector-buffer-size=5
+define i32 @test30b() #4 {
+entry:
+; LINUX-I386-LABEL: test30b:
+; LINUX-I386: mov{{l|q}} %gs:
+; LINUX-I386: calll __stack_chk_fail
+
+; LINUX-X64-LABEL: test30b:
+; LINUX-X64: mov{{l|q}} %fs:
+; LINUX-X64: callq __stack_chk_fail
+
+; LINUX-KERNEL-X64-LABEL: test30b:
+; LINUX-KERNEL-X64: mov{{l|q}} %gs:
+; LINUX-KERNEL-X64: callq __stack_chk_fail
+
+; DARWIN-X64-LABEL: test30b:
+; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
+; DARWIN-X64: callq ___stack_chk_fail
+ %test = alloca %struct.small_char, align 4
+ %test.coerce = alloca { i64, i8 }
+ %0 = bitcast { i64, i8 }* %test.coerce to i8*
+ %1 = bitcast %struct.small_char* %test to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 12, i32 0, i1 false)
+ %2 = getelementptr { i64, i8 }* %test.coerce, i32 0, i32 0
+ %3 = load i64* %2, align 1
+ %4 = getelementptr { i64, i8 }* %test.coerce, i32 0, i32 1
+ %5 = load i8* %4, align 1
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
+ ret i32 %call
+}
+
+; test31a: An alloca of size 5.
+; Requested ssp-buffer-size of 6.
+; Requires no protector.
+; Function Attrs: ssp stack-protector-buffer-size=6
+define i32 @test31a() #5 {
+entry:
+; LINUX-I386-LABEL: test31a:
+; LINUX-I386-NOT: calll __stack_chk_fail
+; LINUX-I386: .cfi_endproc
+
+; LINUX-X64-LABEL: test31a:
+; LINUX-X64-NOT: callq __stack_chk_fail
+; LINUX-X64: .cfi_endproc
+
+; LINUX-KERNEL-X64-LABEL: test31a:
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
+; LINUX-KERNEL-X64: .cfi_endproc
+
+; DARWIN-X64-LABEL: test31a:
+; DARWIN-X64-NOT: callq ___stack_chk_fail
+; DARWIN-X64: .cfi_endproc
+ %test = alloca i8*, align 8
+ %0 = alloca i8, i64 4
+ store i8* %0, i8** %test, align 8
+ %1 = load i8** %test, align 8
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %1)
+ ret i32 %call
+}
+
+; test31b: An alloca of size 5.
+; Requested ssp-buffer-size of 5.
+; Requires protector.
+define i32 @test31b() #4 {
+entry:
+; LINUX-I386-LABEL: test31b:
+; LINUX-I386: mov{{l|q}} %gs:
+; LINUX-I386: calll __stack_chk_fail
+
+; LINUX-X64-LABEL: test31b:
+; LINUX-X64: mov{{l|q}} %fs:
+; LINUX-X64: callq __stack_chk_fail
+
+; LINUX-KERNEL-X64-LABEL: test31b:
+; LINUX-KERNEL-X64: mov{{l|q}} %gs:
+; LINUX-KERNEL-X64: callq __stack_chk_fail
+
+; DARWIN-X64-LABEL: test31b:
+; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
+; DARWIN-X64: callq ___stack_chk_fail
+ %test = alloca i8*, align 8
+ %0 = alloca i8, i64 5
+ store i8* %0, i8** %test, align 8
+ %1 = load i8** %test, align 8
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %1)
+ ret i32 %call
+}
+
declare double @testi_aux()
declare i8* @strcpy(i8*, i8*)
declare i32 @printf(i8*, ...)
@@ -3148,3 +3453,11 @@ declare void @_Z3exceptPi(i32*)
declare i32 @__gxx_personality_v0(...)
declare i32* @getp()
declare i32 @dummy(...)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+attributes #0 = { ssp }
+attributes #1 = { sspstrong }
+attributes #2 = { sspreq }
+attributes #3 = { ssp "stack-protector-buffer-size"="33" }
+attributes #4 = { ssp "stack-protector-buffer-size"="5" }
+attributes #5 = { ssp "stack-protector-buffer-size"="6" }
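
The #3, #4 and #5 attribute groups added above drive the new ssp-buffer-size tests (test28 through test31): with plain ssp, a guard is emitted only when a protected object is at least "stack-protector-buffer-size" bytes, so a [32 x i8] array stays unprotected under a threshold of 33 while a [33 x i8] array gets a guard. A minimal standalone sketch of the same mechanism at the default threshold of 8; the attribute group number and callee are illustrative:

; [16 x i8] meets the 8-byte threshold, so ssp inserts a guard here;
; shrinking the array to [4 x i8] would drop the guard again.
define void @ssp_threshold_sketch() #6 {
entry:
  %buf = alloca [16 x i8], align 1
  %p = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0
  call void @use_buffer(i8* %p)
  ret void
}
declare void @use_buffer(i8*)
attributes #6 = { ssp "stack-protector-buffer-size"="8" }
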
diff --git a/test/CodeGen/X86/stackpointer.ll b/test/CodeGen/X86/stackpointer.ll
new file mode 100644
index 0000000..80bcfbf
--- /dev/null
+++ b/test/CodeGen/X86/stackpointer.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnueabi | FileCheck %s
+; RUN: opt < %s -O3 -S -mtriple=x86_64-linux-gnueabi | FileCheck %s --check-prefix=OPT
+
+define i64 @get_stack() nounwind {
+entry:
+; CHECK-LABEL: get_stack:
+; CHECK: movq %rsp, %rax
+ %sp = call i64 @llvm.read_register.i64(metadata !0)
+; OPT: @llvm.read_register.i64
+ ret i64 %sp
+}
+
+define void @set_stack(i64 %val) nounwind {
+entry:
+; CHECK-LABEL: set_stack:
+; CHECK: movq %rdi, %rsp
+ call void @llvm.write_register.i64(metadata !0, i64 %val)
+; OPT: @llvm.write_register.i64
+ ret void
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+declare void @llvm.write_register.i64(metadata, i64) nounwind
+
+; register unsigned long current_stack_pointer asm("rsp");
+; CHECK-NOT: .asciz "rsp"
+!0 = metadata !{metadata !"rsp\00"}
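+
+; For reference (not something the checks above verify): the named-register
+; metadata !0 is what a frontend is expected to produce for a global register
+; variable like the C declaration quoted above, so reads and writes of
+; current_stack_pointer become calls to @llvm.read_register.i64 and
+; @llvm.write_register.i64 on !0 instead of loads and stores of a symbol.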
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index 76a8402..75e7fc4 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -2,6 +2,8 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64_LINUX %s
; RUN: llc < %s -march=x86 -mtriple=x86-pc-win32 | FileCheck -check-prefix=X32_WIN %s
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=X64_WIN %s
+; RUN: llc < %s -march=x86 -mtriple=x86-pc-windows-gnu | FileCheck -check-prefix=MINGW32 %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-windows-gnu | FileCheck -check-prefix=X64_WIN %s
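+; For reference (not itself a check): on the 32-bit windows-gnu target added
+; above, the expected lowering is the classic Win32 TLS access sequence -- load
+; this module's slot number from __tls_index, fetch the thread's TLS slot array
+; from %fs:44 (the ThreadLocalStoragePointer field of the 32-bit TEB), index
+; into that array, and finally add the variable's section-relative @SECREL32
+; offset.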
@i1 = thread_local global i32 15
@i2 = external thread_local global i32
@@ -30,6 +32,12 @@ define i32 @f1() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movl i1@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f1:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i1@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i1
@@ -57,6 +65,12 @@ define i32* @f2() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: leaq i1@SECREL32(%rax), %rax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f2:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i1@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i1
@@ -83,6 +97,12 @@ define i32 @f3() nounwind {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movl i2@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f3:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i2@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i2
@@ -110,6 +130,12 @@ define i32* @f4() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: leaq i2@SECREL32(%rax), %rax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f4:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i2@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i2
@@ -134,6 +160,12 @@ define i32 @f5() nounwind {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movl i3@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f5:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i3@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i3
@@ -161,6 +193,12 @@ define i32* @f6() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: leaq i3@SECREL32(%rax), %rax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f6:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i3@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i3
@@ -173,6 +211,12 @@ define i32 @f7() {
; X64_LINUX-LABEL: f7:
; X64_LINUX: movl %fs:i4@TPOFF, %eax
; X64_LINUX-NEXT: ret
+; MINGW32-LABEL: _f7:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i4@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i4
@@ -188,6 +232,12 @@ define i32* @f8() {
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq i4@TPOFF(%rax), %rax
; X64_LINUX-NEXT: ret
+; MINGW32-LABEL: _f8:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i4@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i4
@@ -200,6 +250,12 @@ define i32 @f9() {
; X64_LINUX-LABEL: f9:
; X64_LINUX: movl %fs:i5@TPOFF, %eax
; X64_LINUX-NEXT: ret
+; MINGW32-LABEL: _f9:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i5@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i5
@@ -215,6 +271,12 @@ define i32* @f10() {
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq i5@TPOFF(%rax), %rax
; X64_LINUX-NEXT: ret
+; MINGW32-LABEL: _f10:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i5@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i5
@@ -239,6 +301,12 @@ define i16 @f11() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movzwl s1@SECREL32(%rax), %eax
; X64_WIN: ret
+; MINGW32-LABEL: _f11:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movzwl _s1@SECREL32(%eax), %eax
+; MINGW32: retl
entry:
%tmp1 = load i16* @s1
@@ -264,6 +332,13 @@ define i32 @f12() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movswl s1@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f12:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movswl _s1@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
+
entry:
%tmp1 = load i16* @s1
@@ -290,6 +365,12 @@ define i8 @f13() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movb b1@SECREL32(%rax), %al
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f13:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movb _b1@SECREL32(%eax), %al
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i8* @b1
@@ -315,6 +396,12 @@ define i32 @f14() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movsbl b1@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f14:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movsbl _b1@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i8* @b1
diff --git a/test/CodeGen/X86/vec_shuffle-41.ll b/test/CodeGen/X86/vec_shuffle-41.ll
new file mode 100644
index 0000000..28fdd2f
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-41.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+
+; Use buildFromShuffleMostly, which allows this to be generated as two 128-bit
+; shuffles and an insert.
+
+; This is the (somewhat questionable) LLVM IR that is generated for:
+; x8.s0123456 = x8.s1234567; // x8 is a <8 x float> type
+; x8.s7 = f; // f is float
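+;
+; In other words, the two shufflevectors plus the insertelement below compute
+; <a[1], a[2], a[3], a[4], a[5], a[6], a[7], b>: the first shuffle shifts the
+; lanes of %a down by one, the second widens the <7 x float> back to eight
+; lanes leaving lane 7 undef, and the insertelement then fills that lane with
+; %b.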
+
+
+define <8 x float> @test1(<8 x float> %a, float %b) {
+; CHECK-LABEL: test1:
+; CHECK: vinsertps
+; CHECK-NOT: vinsertps
+entry:
+ %shift = shufflevector <8 x float> %a, <8 x float> undef, <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %extend = shufflevector <7 x float> %shift, <7 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
+ %insert = insertelement <8 x float> %extend, float %b, i32 7
+
+ ret <8 x float> %insert
+}
diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
index 543c96e..a02e383 100644
--- a/test/CodeGen/X86/vec_splat.ll
+++ b/test/CodeGen/X86/vec_splat.ll
@@ -32,3 +32,19 @@ define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
; SSE3-LABEL: test_v2sd:
; SSE3: movddup
}
+
+; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
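+; Roughly speaking (a sketch of the expected folding, not checked textually),
+; the element address is formed directly as %ptr + 16*%i + 4*%j, so the splat
+; can be a single broadcast from memory rather than a vector load, an extract
+; of lane %j, and a spill through the stack.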
+define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
+ %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
+ %2 = load <4 x float>* %1, align 16
+ %3 = extractelement <4 x float> %2, i64 %j
+ %4 = insertelement <4 x float> undef, float %3, i32 0
+ %5 = insertelement <4 x float> %4, float %3, i32 1
+ %6 = insertelement <4 x float> %5, float %3, i32 2
+ %7 = insertelement <4 x float> %6, float %3, i32 3
+ ret <4 x float> %7
+
+; AVX-LABEL: load_extract_splat
+; AVX-NOT: movs
+; AVX: vbroadcastss
+}
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
new file mode 100644
index 0000000..4c30184
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -0,0 +1,217 @@
+; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
+
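+; For reference (not checked literally): division by the constant 7 is expanded
+; with the usual fixed-point-reciprocal trick instead of a hardware divide.
+; For unsigned 32-bit lanes the expected per-lane recipe is
+;   t = umulhi(x, 0x24924925)
+;   q = (((x - t) >> 1) + t) >> 2
+; which is what the pmuludq/psubd/psrld/padd patterns below spell out; the
+; signed cases use a signed high multiply plus a sign fix-up (the psrld $31 and
+; psrad steps), and the rem tests finish with a multiply-and-subtract (pmulld).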
+define <4 x i32> @test1(<4 x i32> %a) {
+ %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %div
+
+; SSE41-LABEL: test1:
+; SSE41: pmuludq
+; SSE41: pshufd $57
+; SSE41: pmuludq
+; SSE41: shufps $-35
+; SSE41: psubd
+; SSE41: psrld $1
+; SSE41: padd
+; SSE41: psrld $2
+
+; AVX-LABEL: test1:
+; AVX: vpmuludq
+; AVX: vpshufd $57
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpsubd
+; AVX: vpsrld $1
+; AVX: vpadd
+; AVX: vpsrld $2
+}
+
+define <8 x i32> @test2(<8 x i32> %a) {
+ %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %div
+
+; AVX-LABEL: test2:
+; AVX: vpermd
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpsubd
+; AVX: vpsrld $1
+; AVX: vpadd
+; AVX: vpsrld $2
+}
+
+define <8 x i16> @test3(<8 x i16> %a) {
+ %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %div
+
+; SSE41-LABEL: test3:
+; SSE41: pmulhuw
+; SSE41: psubw
+; SSE41: psrlw $1
+; SSE41: paddw
+; SSE41: psrlw $2
+
+; AVX-LABEL: test3:
+; AVX: vpmulhuw
+; AVX: vpsubw
+; AVX: vpsrlw $1
+; AVX: vpaddw
+; AVX: vpsrlw $2
+}
+
+define <16 x i16> @test4(<16 x i16> %a) {
+ %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %div
+
+; AVX-LABEL: test4:
+; AVX: vpmulhuw
+; AVX: vpsubw
+; AVX: vpsrlw $1
+; AVX: vpaddw
+; AVX: vpsrlw $2
+; AVX-NOT: vpmulhuw
+}
+
+define <8 x i16> @test5(<8 x i16> %a) {
+ %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %div
+
+; SSE41-LABEL: test5:
+; SSE41: pmulhw
+; SSE41: psrlw $15
+; SSE41: psraw $1
+; SSE41: paddw
+
+; AVX-LABEL: test5:
+; AVX: vpmulhw
+; AVX: vpsrlw $15
+; AVX: vpsraw $1
+; AVX: vpaddw
+}
+
+define <16 x i16> @test6(<16 x i16> %a) {
+ %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %div
+
+; AVX-LABEL: test6:
+; AVX: vpmulhw
+; AVX: vpsrlw $15
+; AVX: vpsraw $1
+; AVX: vpaddw
+; AVX-NOT: vpmulhw
+}
+
+define <16 x i8> @test7(<16 x i8> %a) {
+ %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %div
+}
+
+define <4 x i32> @test8(<4 x i32> %a) {
+ %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %div
+
+; SSE41-LABEL: test8:
+; SSE41: pmuldq
+; SSE41: pshufd $57
+; SSE41-NOT: pshufd $57
+; SSE41: pmuldq
+; SSE41: shufps $-35
+; SSE41: pshufd $-40
+; SSE41: padd
+; SSE41: psrld $31
+; SSE41: psrad $2
+; SSE41: padd
+
+; SSE-LABEL: test8:
+; SSE: psrad $31
+; SSE: pand
+; SSE: paddd
+; SSE: pmuludq
+; SSE: pshufd $57
+; SSE-NOT: pshufd $57
+; SSE: pmuludq
+; SSE: shufps $-35
+; SSE: pshufd $-40
+; SSE: psubd
+; SSE: padd
+; SSE: psrld $31
+; SSE: psrad $2
+; SSE: padd
+
+; AVX-LABEL: test8:
+; AVX: vpmuldq
+; AVX: vpshufd $57
+; AVX-NOT: vpshufd $57
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpshufd $-40
+; AVX: vpadd
+; AVX: vpsrld $31
+; AVX: vpsrad $2
+; AVX: vpadd
+}
+
+define <8 x i32> @test9(<8 x i32> %a) {
+ %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %div
+
+; AVX-LABEL: test9:
+; AVX: vpbroadcastd
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpshufd $-40
+; AVX: vpadd
+; AVX: vpsrld $31
+; AVX: vpsrad $2
+; AVX: vpadd
+}
+
+define <8 x i32> @test10(<8 x i32> %a) {
+ %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %rem
+
+; AVX-LABEL: test10:
+; AVX: vpbroadcastd
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpsubd
+; AVX: vpsrld $1
+; AVX: vpadd
+; AVX: vpsrld $2
+; AVX: vpmulld
+}
+
+define <8 x i32> @test11(<8 x i32> %a) {
+ %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %rem
+
+; AVX-LABEL: test11:
+; AVX: vpbroadcastd
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpshufd $-40
+; AVX: vpadd
+; AVX: vpsrld $31
+; AVX: vpsrad $2
+; AVX: vpadd
+; AVX: vpmulld
+}
+
+define <2 x i16> @test12() {
+ %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
+ %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1
+ %B9 = urem <2 x i16> %I9, %I9
+ ret <2 x i16> %B9
+
+; AVX-LABEL: test12:
+; AVX: xorps
+}
diff --git a/test/CodeGen/X86/win32_sret.ll b/test/CodeGen/X86/win32_sret.ll
index d8ecd44..8728712 100644
--- a/test/CodeGen/X86/win32_sret.ll
+++ b/test/CodeGen/X86/win32_sret.ll
@@ -2,9 +2,11 @@
; some setups (e.g., Atom) from affecting the output.
; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
+; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-cygwin | FileCheck %s -check-prefix=CYGWIN
; RUN: llc < %s -mcpu=core2 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
+; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-cygwin | FileCheck %s -check-prefix=CYGWIN
; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
; The SysV ABI used by most Unixes and Mingw on x86 specifies that an sret pointer
@@ -21,6 +23,9 @@ entry:
; MINGW_X86-LABEL: _sret1:
; MINGW_X86: {{retl$}}
+; CYGWIN-LABEL: _sret1:
+; CYGWIN: retl $4
+
; LINUX-LABEL: sret1:
; LINUX: retl $4
@@ -38,6 +43,9 @@ entry:
; MINGW_X86-LABEL: _sret2:
; MINGW_X86: {{retl$}}
+; CYGWIN-LABEL: _sret2:
+; CYGWIN: retl $4
+
; LINUX-LABEL: sret2:
; LINUX: retl $4
@@ -56,6 +64,9 @@ entry:
; MINGW_X86-LABEL: _sret3:
; MINGW_X86: {{retl$}}
+; CYGWIN-LABEL: _sret3:
+; CYGWIN: retl $4
+
; LINUX-LABEL: sret3:
; LINUX: retl $4
@@ -77,6 +88,9 @@ entry:
; MINGW_X86-LABEL: _sret4:
; MINGW_X86: {{retl$}}
+; CYGWIN-LABEL: _sret4:
+; CYGWIN: retl $4
+
; LINUX-LABEL: sret4:
; LINUX: retl $4
@@ -98,6 +112,7 @@ entry:
ret void
; WIN32-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
; MINGW_X86-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
+; CYGWIN-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
; LINUX-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
; The address of the return structure is passed as an implicit parameter.
@@ -115,6 +130,7 @@ entry:
call x86_thiscallcc void @"\01?foo@C5@@QAE?AUS5@@XZ"(%struct.S5* sret %s, %class.C5* %c)
; WIN32-LABEL: {{^}}_call_foo5:
; MINGW_X86-LABEL: {{^}}_call_foo5:
+; CYGWIN-LABEL: {{^}}_call_foo5:
; LINUX-LABEL: {{^}}call_foo5:
@@ -135,6 +151,7 @@ entry:
define void @test6_f(%struct.test6* %x) nounwind {
; WIN32-LABEL: _test6_f:
; MINGW_X86-LABEL: _test6_f:
+; CYGWIN-LABEL: _test6_f:
; LINUX-LABEL: test6_f:
; The %x argument is moved to %ecx. It will be the this pointer.
@@ -145,6 +162,9 @@ define void @test6_f(%struct.test6* %x) nounwind {
; MINGW_X86: movl 8(%ebp), %eax
; MINGW_X86: movl %eax, (%e{{([a-d]x)|(sp)}})
+; CYGWIN: movl 8(%ebp), %eax
+; CYGWIN: movl %eax, (%e{{([a-d]x)|(sp)}})
+
; The sret pointer is (%esp)
; WIN32: leal 8(%esp), %[[REG:e[a-d]x]]
; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
@@ -153,8 +173,71 @@ define void @test6_f(%struct.test6* %x) nounwind {
; MINGW_X86-NEXT: leal 8(%esp), %ecx
; MINGW_X86-NEXT: calll _test6_g
+; CYGWIN-NEXT: leal 8(%esp), %ecx
+; CYGWIN-NEXT: calll _test6_g
+
%tmp = alloca %struct.test6, align 4
call x86_thiscallcc void @test6_g(%struct.test6* sret %tmp, %struct.test6* %x)
ret void
}
declare x86_thiscallcc void @test6_g(%struct.test6* sret, %struct.test6*)
+
+; Flipping the parameters at the IR level generates the same code.
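+; That is, with thiscall the "this" pointer still travels in %ecx and the sret
+; pointer still goes on the stack, no matter which position the sret argument
+; occupies in the IR signature.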
+%struct.test7 = type { i32, i32, i32 }
+define void @test7_f(%struct.test7* %x) nounwind {
+; WIN32-LABEL: _test7_f:
+; MINGW_X86-LABEL: _test7_f:
+; CYGWIN-LABEL: _test7_f:
+; LINUX-LABEL: test7_f:
+
+; The %x argument is moved to %ecx on all OSes. It will be the this pointer.
+; WIN32: movl 8(%ebp), %ecx
+; MINGW_X86: movl 8(%ebp), %ecx
+; CYGWIN: movl 8(%ebp), %ecx
+
+; The sret pointer is (%esp)
+; WIN32: leal 8(%esp), %[[REG:e[a-d]x]]
+; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+; MINGW_X86: leal 8(%esp), %[[REG:e[a-d]x]]
+; MINGW_X86-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+; CYGWIN: leal 8(%esp), %[[REG:e[a-d]x]]
+; CYGWIN-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+
+ %tmp = alloca %struct.test7, align 4
+ call x86_thiscallcc void @test7_g(%struct.test7* %x, %struct.test7* sret %tmp)
+ ret void
+}
+
+define x86_thiscallcc void @test7_g(%struct.test7* %in, %struct.test7* sret %out) {
+ %s = getelementptr %struct.test7* %in, i32 0, i32 0
+ %d = getelementptr %struct.test7* %out, i32 0, i32 0
+ %v = load i32* %s
+ store i32 %v, i32* %d
+ call void @clobber_eax()
+ ret void
+
+; Make sure we return the second parameter in %eax.
+; WIN32-LABEL: _test7_g:
+; WIN32: calll _clobber_eax
+; WIN32: movl {{.*}}, %eax
+; WIN32: retl
+}
+
+declare void @clobber_eax()
+
+; Test what happens if the first parameter has to be split by codegen.
+; Realistically, no frontend will generate code like this, but here it is for
+; completeness.
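+; Under this 32-bit convention the i64 inreg argument is split across %eax
+; (low half) and %edx (high half), so the sret pointer has to come in on the
+; stack; the two store checks below simply write the two halves back through
+; that pointer.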
+define void @test8_f(i64 inreg %a, i64* sret %out) {
+ store i64 %a, i64* %out
+ call void @clobber_eax()
+ ret void
+
+; WIN32-LABEL: _test8_f:
+; WIN32: movl {{[0-9]+}}(%esp), %[[out:[a-z]+]]
+; WIN32-DAG: movl %edx, 4(%[[out]])
+; WIN32-DAG: movl %eax, (%[[out]])
+; WIN32: calll _clobber_eax
+; WIN32: movl {{.*}}, %eax
+; WIN32: retl
+}
diff --git a/test/CodeGen/X86/x86-64-sret-return-2.ll b/test/CodeGen/X86/x86-64-sret-return-2.ll
new file mode 100644
index 0000000..9f57ee1
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-sret-return-2.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=x86_64-apple-darwin8 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+
+; FIXME: x32 doesn't know how to select this. This isn't a regression, it never
+; worked.
+; RUNX: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
+
+; This used to crash due to topological sorting issues in selection DAG.
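+; Besides not crashing, the function must still hand the incoming sret pointer
+; back in %rax, as the x86-64 ABI requires; that is what the movq/ret pair in
+; the checks below verifies.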
+define void @foo(i32* sret %agg.result, i32, i32, i32, i32, i32, void (i32)* %pred) {
+entry:
+ call void %pred(i32 undef)
+ ret void
+
+; CHECK-LABEL: foo:
+; CHECK: callq
+; CHECK: movq {{.*}}, %rax
+; CHECK: ret
+}
diff --git a/test/CodeGen/XCore/epilogue_prologue.ll b/test/CodeGen/XCore/epilogue_prologue.ll
index 14f04a3..9997814 100644
--- a/test/CodeGen/XCore/epilogue_prologue.ll
+++ b/test/CodeGen/XCore/epilogue_prologue.ll
@@ -206,14 +206,41 @@ entry:
ret i32 %i
}
+; FP + large frame: spill FP+SR+LR = entsp 2 + 256 + extsp 1
+; CHECKFP-LABEL:f8
+; CHECKFP: entsp 258
+; CHECKFP-NEXT: stw r10, sp[1]
+; CHECKFP-NEXT: ldaw r10, sp[0]
+; CHECKFP-NEXT: mkmsk [[REG:r[0-9]+]], 8
+; CHECKFP-NEXT: ldaw r0, r10{{\[}}[[REG]]{{\]}}
+; CHECKFP-NEXT: extsp 1
+; CHECKFP-NEXT: bl f5
+; CHECKFP-NEXT: ldaw sp, sp[1]
+; CHECKFP-NEXT: set sp, r10
+; CHECKFP-NEXT: ldw r10, sp[1]
+; CHECKFP-NEXT: retsp 258
+;
+; !FP + large frame: spill LR = entsp 1 + 256
+; CHECK-LABEL:f8
+; CHECK: entsp 257
+; CHECK-NEXT: ldaw r0, sp[254]
+; CHECK-NEXT: bl f5
+; CHECK-NEXT: retsp 257
+define void @f8() nounwind {
+entry:
+ %0 = alloca [256 x i32]
+ %1 = getelementptr inbounds [256 x i32]* %0, i32 0, i32 253
+ call void @f5(i32* %1)
+ ret void
+}
; FP + large frame: spill FP+SR+LR = entsp 2 + 32768 + extsp 1
-; CHECKFP-LABEL:f8
+; CHECKFP-LABEL:f9
; CHECKFP: entsp 32770
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
-; CHECKFP-NEXT: mkmsk r1, 15
-; CHECKFP-NEXT: ldaw r0, r10[r1]
+; CHECKFP-NEXT: ldc [[REG:r[0-9]+]], 32767
+; CHECKFP-NEXT: ldaw r0, r10{{\[}}[[REG]]{{\]}}
; CHECKFP-NEXT: extsp 1
; CHECKFP-NEXT: bl f5
; CHECKFP-NEXT: ldaw sp, sp[1]
@@ -222,12 +249,12 @@ entry:
; CHECKFP-NEXT: retsp 32770
;
; !FP + large frame: spill SR+SR+LR = entsp 3 + 32768
-; CHECK-LABEL:f8
+; CHECK-LABEL:f9
; CHECK: entsp 32771
; CHECK-NEXT: ldaw r0, sp[32768]
; CHECK-NEXT: bl f5
; CHECK-NEXT: retsp 32771
-define void @f8() nounwind {
+define void @f9() nounwind {
entry:
%0 = alloca [32768 x i32]
%1 = getelementptr inbounds [32768 x i32]* %0, i32 0, i32 32765
diff --git a/test/CodeGen/XCore/llvm-intrinsics.ll b/test/CodeGen/XCore/llvm-intrinsics.ll
index e0acd66..b436282 100644
--- a/test/CodeGen/XCore/llvm-intrinsics.ll
+++ b/test/CodeGen/XCore/llvm-intrinsics.ll
@@ -287,9 +287,8 @@ define void @Unwind1() {
; CHECKFP: .LBB{{[0-9_]+}}
; CHECKFP-NEXT: ldc r2, 40
; CHECKFP-NEXT: add r2, r10, r2
-; CHECKFP-NEXT: add r0, r2, r0
+; CHECKFP-NEXT: add r2, r2, r0
; CHECKFP-NEXT: mov r3, r1
-; CHECKFP-NEXT: mov r2, r0
; CHECKFP-NEXT: ldw r9, r10[4]
; CHECKFP-NEXT: ldw r8, r10[5]
; CHECKFP-NEXT: ldw r7, r10[6]
@@ -337,9 +336,8 @@ define void @Unwind1() {
; CHECK-NEXT: ldc r2, 36
; CHECK-NEXT: ldaw r3, sp[0]
; CHECK-NEXT: add r2, r3, r2
-; CHECK-NEXT: add r0, r2, r0
+; CHECK-NEXT: add r2, r2, r0
; CHECK-NEXT: mov r3, r1
-; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: ldw r10, sp[2]
; CHECK-NEXT: ldw r9, sp[3]
; CHECK-NEXT: ldw r8, sp[4]